# Medical Diagnosis with Logistic Regression Model
## Import Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

## Load Data

In [2]:
column_names = ["pregnancies", "glucose", "bpressure", "skinfold", "insulin", "bmi", "pedigree", "age", "class"]
df= pd.read_csv("data_medicalSVM.csv",names=column_names)
df.index = df.index+1
df.head()

Unnamed: 0,pregnancies,glucose,bpressure,skinfold,insulin,bmi,pedigree,age,class
1,6,148,72,35,0,33.6,0.627,50,1
2,1,85,66,29,0,26.6,0.351,31,0
3,8,183,64,0,0,23.3,0.672,32,1
4,1,89,66,23,94,28.1,0.167,21,0
5,0,137,40,35,168,43.1,2.288,33,1


In [3]:
df.shape

(768, 9)

# Extract X variable

In [4]:
X=df.iloc[:,:8]
X.head()

Unnamed: 0,pregnancies,glucose,bpressure,skinfold,insulin,bmi,pedigree,age
1,6,148,72,35,0,33.6,0.627,50
2,1,85,66,29,0,26.6,0.351,31
3,8,183,64,0,0,23.3,0.672,32
4,1,89,66,23,94,28.1,0.167,21
5,0,137,40,35,168,43.1,2.288,33


## Extract Y variable

In [5]:
Y=df['class']
Y.head()

1    1
2    0
3    1
4    0
5    1
Name: class, dtype: int64

## Split Dataset

In [6]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.25)

In [7]:
X_train.head()

Unnamed: 0,pregnancies,glucose,bpressure,skinfold,insulin,bmi,pedigree,age
599,1,173,74,0,0,36.8,0.088,38
348,3,116,0,0,0,23.5,0.187,23
236,4,171,72,0,0,43.6,0.479,26
31,5,109,75,26,0,36.0,0.546,60
104,1,81,72,18,40,26.6,0.283,24


In [8]:
X_train.shape

(576, 8)

## Standardize train set

In [9]:
scaler=StandardScaler()
scaler.fit(X_train)
X_train=scaler.transform(X_train)
print(X_train[:5])

[[-0.83637202  1.63693373  0.25578249 -1.3013409  -0.69037069  0.61698321
  -1.14339309  0.41828494]
 [-0.24490451 -0.16976302 -3.50758403 -1.3013409  -0.69037069 -1.03699901
  -0.85173587 -0.87984074]
 [ 0.05082924  1.57354086  0.15406988 -1.3013409  -0.69037069  1.46262826
   0.00850565 -0.6202156 ]
 [ 0.34656299 -0.39163806  0.30663879  0.32789178 -0.69037069  0.51749556
   0.20588983  2.32220261]
 [-0.83637202 -1.27913822  0.15406988 -0.17341058 -0.34084664 -0.65148436
  -0.56891674 -0.79329903]]


## Standardize test set

In [10]:
scaler=StandardScaler()
scaler.fit(X_test)
X_test=scaler.transform(X_test)
print(X_test[:5])

[[ 0.93375187  2.03948937 -1.06360606  0.82890881  2.64145342  0.19666295
   1.20503529  0.0435321 ]
 [-0.57027799 -0.34783697 -0.40942871  0.76590386 -0.22317764 -0.98671525
  -1.08787147 -1.01138176]
 [ 1.23455784  0.13567216  0.3537782   0.2618643   4.41479646 -0.51064356
   0.74842491  1.50418207]
 [-0.87108396 -0.98244271  0.46280776  0.45087914 -0.42779414  0.29187729
  -1.17656557 -0.93023454]
 [-0.26947202  0.28676876  1.22601467  1.83698792  0.62086544  0.53671416
   1.67150056 -0.11876234]]


## Training a Logistic Regression model

In [11]:
clf=LogisticRegression().fit(X_train,Y_train)

# predict

In [12]:
y_pred=clf.predict(X_test)
print(y_pred)

[1 0 0 0 0 0 0 1 0 1 1 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 1 0 1 1 1 0 1 0 0 0 0 0 1 0 0 0 1 1 0 1 1 0 0 1 0 0 0 0 0 1 0 0
 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 1 1 1
 0 0 0 1 0 0 0 0 1 1 0 0 1 0 1 1 0 1 0 0 0 1 1 0 0 1 1 1 1 0 0 0 0 0 0 0 0
 0 1 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0]


# Accuracy test


In [13]:
print(accuracy_score(Y_test,y_pred))

0.8020833333333334


# Confusion matrix

In [14]:
print(confusion_matrix(Y_test,y_pred))

[[115  14]
 [ 24  39]]
