In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [2]:
# Load the dataset
df = pd.read_csv('diabetes.csv')

In [3]:
# Display the first few rows of the dataset
print(df.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [4]:
# Check for missing values
print(df.isnull().sum())

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [5]:
# Get summary statistics
print(df.describe())

       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15.952218  115.244002   
min       0.000000    0.000000       0.000000       0.000000    0.000000   
25%       1.000000   99.000000      62.000000       0.000000    0.000000   
50%       3.000000  117.000000      72.000000      23.000000   30.500000   
75%       6.000000  140.250000      80.000000      32.000000  127.250000   
max      17.000000  199.000000     122.000000      99.000000  846.000000   

              BMI  DiabetesPedigreeFunction         Age     Outcome  
count  768.000000                768.000000  768.000000  768.000000  
mean    31.992578                  0.471876   33.240885    0.348958  
std      7.884160                  0.331329   11.760232    0.476951  
min      0.000000                  

In [6]:
df['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [7]:
df.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [8]:
# Split data into features (X) and target (y)
X = df.drop('Outcome', axis=1)
Y = df['Outcome']

In [9]:
print(X)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  
0                       0.627   50  
1                       0.351   31  


In [10]:
print(Y)

NameError: name 'Y' is not defined

KeyError: "None of [Index(['pregnancies', 'glucose', 'bloodpressure', 'skinthickness', 'insulin',\n       'bmi', 'dpf', 'age'],\n      dtype='object')] are in the [columns]"

In [None]:
print(X)
print(Y)


[[ 0.63994726  0.84832379  0.14964075 ...  0.20401277  0.46849198
   1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078
  -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 ... -1.10325546  0.60439732
  -0.10558415]
 ...
 [ 0.3429808   0.00330087  0.14964075 ... -0.73518964 -0.68519336
  -0.27575966]
 [-0.84488505  0.1597866  -0.47073225 ... -0.24020459 -0.37110101
   1.17073215]
 [-0.84488505 -0.8730192   0.04624525 ... -0.20212881 -0.47378505
  -0.87137393]]
0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64


In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, stratify=Y, random_state=2)


In [None]:
feature_names = ['pregnancies', 'glucose', 'bloodpressure', 'skinthickness', 'insulin','bmi', 'dpf', 'age']
X_train = pd.DataFrame(data=X_train, columns=feature_names)
# Assuming X_test is a DataFrame
X_test = pd.DataFrame(data=X_test, columns=feature_names)
print(X_train)


     pregnancies   glucose  bloodpressure  skinthickness   insulin       bmi  \
0      -1.141852 -0.059293      -3.572597      -1.288212 -0.692891  0.051710   
1       0.639947 -0.497453       0.046245       0.719086 -0.102454 -0.151361   
2      -0.844885  2.131507      -0.470732       0.154533  6.652839 -0.240205   
3      -0.547919 -0.497453       0.563223       1.534551  0.965543  0.216705   
4      -1.141852  1.849832      -0.160546       1.158182 -0.692891  1.270134   
..           ...       ...            ...            ...       ...       ...   
609     0.342981 -0.184482       1.493782      -1.288212 -0.692891  2.653554   
610    -0.250952  0.347569       0.563223      -1.288212 -0.692891  0.305548   
611     2.124780 -1.123396       0.253036      -1.288212 -0.692891 -0.240205   
612     0.046014 -0.278373       0.459827       1.220910 -0.692891  0.940144   
613    -1.141852 -1.092099      -0.057150       0.719086 -0.692891  0.483235   

          dpf       age  
0   -0.999286

In [None]:
print(X.shape, X_train.shape, X_test.shape)

(768, 8) (614, 8) (154, 8)


In [None]:
# Assuming X_test is a DataFrame
from sklearn.preprocessing import scale


X_test_scaled = pd.DataFrame(data=scale.transform(X_test), columns=feature_names)


Feature names unseen at fit time:
- age
- bloodpressure
- bmi
- dpf
- glucose
- ...
Feature names seen at fit time, yet now missing:
- Age
- BMI
- BloodPressure
- DiabetesPedigreeFunction
- Glucose
- ...



In [None]:
model= RandomForestClassifier(n_estimators=10)

In [None]:
model.fit(X_train, Y_train)

RandomForestClassifier(n_estimators=10)

In [None]:

# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [None]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.9820846905537459


In [None]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [None]:
print('Accuracy on Test data : ', test_data_accuracy)

Accuracy on Test data :  0.7792207792207793


In [None]:
from numpy import ScalarType


input_data = (1,85,66,29,0,26.6,0.351,31)

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# standardize the input data
std_data = ScalarType.transform(input_data_reshaped)
print(std_data)

prediction = model.predict(std_data)
print(prediction)

if (prediction[0] == 0):
  print('The person is not diabetic')
else:
  print('The person is diabetic')

[[-0.84488505 -1.12339636 -0.16054575  0.53090156 -0.69289057 -0.68442195
  -0.36506078 -0.19067191]]
[0]
The person is not diabetic


  "X does not have valid feature names, but"
  "X does not have valid feature names, but"


In [None]:
import joblib

# Save the Model to a .pkl File
joblib.dump(model, 'diabetes.pkl')

['diabetes.pkl']