In [111]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.neighbors import LocalOutlierFactor

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.pipeline import Pipeline
%matplotlib inline

### Import CSV

In [92]:
df = pd.read_csv('/content/sample_data/diabetes.csv')

In [93]:
print(df.head(10))

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   
5            5      116             74              0        0  25.6   
6            3       78             50             32       88  31.0   
7           10      115              0              0        0  35.3   
8            2      197             70             45      543  30.5   
9            8      125             96              0        0   0.0   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   2

In [94]:
print(df.isnull().sum())

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [95]:
print(df.columns)

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')


### Data Preprocessing

In [96]:
df[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction',
   'Age']] = df[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin','BMI',
               'DiabetesPedigreeFunction', 'Age']].replace(0, np.NaN)

In [97]:
print(df.isnull().sum())

Pregnancies                 111
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64


In [98]:
# Target Based Imputation
def median_target(var):
    temp = df[df[var].notnull()]
    temp = temp[[var, 'Outcome']].groupby(['Outcome'])[[var]].median().reset_index()
    return temp

columns = df.columns
columns = columns.drop("Outcome")
for i in columns:
    median_target(i)
    df.loc[(df['Outcome'] == 0 ) & (df[i].isnull()), i] = median_target(i)[i][0]
    df.loc[(df['Outcome'] == 1 ) & (df[i].isnull()), i] = median_target(i)[i][1]

In [99]:
print(df.head(10))

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0          6.0    148.0           72.0           35.0    169.5  33.6   
1          1.0     85.0           66.0           29.0    102.5  26.6   
2          8.0    183.0           64.0           32.0    169.5  23.3   
3          1.0     89.0           66.0           23.0     94.0  28.1   
4          5.0    137.0           40.0           35.0    168.0  43.1   
5          5.0    116.0           74.0           27.0    102.5  25.6   
6          3.0     78.0           50.0           32.0     88.0  31.0   
7         10.0    115.0           70.0           27.0    102.5  35.3   
8          2.0    197.0           70.0           45.0    543.0  30.5   
9          8.0    125.0           96.0           32.0    169.5  34.3   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   2

In [100]:
# Outlier preprocessing
Q1 = df.Insulin.quantile(0.25)
Q3 = df.Insulin.quantile(0.75)
IQR = Q3-Q1
lower = Q1-1.5*IQR
upper = Q3+1.5*IQR
df.loc[df['Insulin']>upper, "Insulin"] = upper

In [101]:
lof = LocalOutlierFactor(n_neighbors=10)
lof.fit_predict(df)

df_scores = lof.negative_outlier_factor_
np.sort(df_scores)[0:20]

thresold = np.sort(df_scores)[7]
outlier = df_scores>thresold
df = df[outlier]

### Break and Scale the Data

In [102]:
X = pd.DataFrame(df[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age']])

y = pd.Series(df['Outcome'])

In [103]:
# Scale the fking data cuz its ass
scalers = Pipeline([
    ('standard_scaling', StandardScaler()),
    ('robust_scaling', RobustScaler())
])

X = scaler.fit_transform(X)

In [104]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=42)

### RandomForestClassifier

In [126]:
rf = RandomForestClassifier(criterion = 'entropy', max_depth = 15, max_features = 0.75,
                            min_samples_leaf = 2, min_samples_split = 3, n_estimators = 130)
rf.fit(X_train, y_train)

In [127]:
# param_dist = {
#     'n_estimators': [100, 200, 300],           # Number of trees
#     'max_depth': [None, 10, 20, 30],           # Maximum depth of each tree
#     'min_samples_split': [2, 5, 10],           # Minimum samples required to split a node
#     'min_samples_leaf': [1, 2, 4],             # Minimum samples required at a leaf node
#     'max_features': ['sqrt', 'log2', None],    # Number of features to consider when looking for the best split
#     'bootstrap': [True, False]                 # Whether to use bootstrap samples
# }

In [128]:
# random_search = RandomizedSearchCV(
#     estimator=rf,
#     param_distributions=param_dist,
#     cv=5,                    # 5-fold cross-validation
#     scoring='roc_auc',      # Choose scoring metric, e.g., 'roc_auc' for binary classification
#     n_jobs=-1,              # Use all available cores
#     verbose=2               # Print progress
# )

# random_search.fit(X_train, y_train)

# print("Best Hyperparameters:", random_search.best_params_)

# # best_rf = random_search.best_estimator_

In [129]:
y_preds = rf.predict_proba(X_test)

In [130]:
print(classification_report(y_test, (y_preds[:,1] > 0.5)))

              precision    recall  f1-score   support

           0       0.92      0.90      0.91       130
           1       0.79      0.83      0.81        60

    accuracy                           0.88       190
   macro avg       0.86      0.87      0.86       190
weighted avg       0.88      0.88      0.88       190

