In [6]:
# Import Libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.compose import ColumnTransformer
from collections import Counter
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split , RandomizedSearchCV , cross_val_score, cross_validate
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import MultinomialNB,GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import learning_curve
from sklearn.metrics import roc_auc_score,roc_curve,auc,classification_report, accuracy_score, confusion_matrix
import category_encoders as ce
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OrdinalEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
import joblib
import warnings
warnings.filterwarnings('ignore')

In [7]:
# Load the eye-disorder prevalence sample from its CSV file into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path - the notebook only
# runs where this exact location exists; consider a configurable data directory.
df = pd.read_csv(
    filepath_or_buffer=r"D:\Study\Epsilon DS\Eye-Project\sources\Eye-disorders-prevalence_20000.csv"
)

# Preview the first five rows as the cell output
df.head(n=5)

Unnamed: 0,locationdesc,category,response,age,gender,raceethnicity,data_value(%),low_confidence_limit,high_confidence_limit,numerator,sample_size,locationid,longitude,latitude
0,District Of Columbia,Diabetic Eye Diseases,Proliferative diabetic retinopathy,85 years and older,Male,"White, non-Hispanic",0.0,0.0,0.7,50.0,1300.0,11,-77.031961,38.890371
1,Colorado,Cancer and Neoplasms of the Eye,All Cancer and neoplasms of the eye diseases,All ages,All genders,"Hispanic, any race",0.09,0.07,0.11,120.0,135200.0,8,-106.133611,38.843841
2,National,Cornea Disorders,Other corneal disorders,65-84 years,All genders,"Black, non-Hispanic",0.81,0.74,0.89,500.0,60900.0,59,-93.236096,40.439534
3,New Hampshire,Other Visual Disturbances,Visual field defect,18 years and older,Male,North American Native,0.0,0.0,1.48,40.0,250.0,33,-71.500361,43.65595
4,New Mexico,Orbital and External Disease,Disorders of the globe,65-84 years,Male,"Black, non-Hispanic",0.07,0.04,0.29,80.0,11900.0,35,-106.240581,34.520881


In [8]:
# Remove columns that must not reach the models as features:
#   * 'Unnamed: 0' - the header-less first CSV column; in the preview it simply
#     counts rows (0, 1, 2, ...), i.e. a saved DataFrame index, not a measurement.
#     Left in place it would be passed through to every model as a numeric feature.
#   * 'longitude', 'latitude', 'locationid' - location coordinates / id
#     (presumably redundant with 'locationdesc' - confirm).
# errors='ignore' keeps the cell idempotent: `df` is reassigned from itself, so
# without it a second run of this cell raises KeyError on the already-dropped columns.
df = df.drop(
    columns=['Unnamed: 0', 'longitude', 'latitude', 'locationid'],
    errors='ignore',
)

In [9]:
# Separate the prediction target from the predictors.
y = df['category']                  # Target variable: the disorder category
X = df.drop(columns=['category'])   # Features: every remaining column

In [10]:
# Baseline experiment: manually encoded features + a single Decision Tree.

# Encoding
# One-hot encode 'gender' and 'age'; binary-encode the three remaining text columns.
df_encoded = pd.get_dummies(df, columns=['gender','age'])
binary_encoder = ce.BinaryEncoder(cols=['locationdesc','response','raceethnicity'])
# NOTE(review): the encoder is fitted on all rows, i.e. before the train/test
# split below, so test rows take part in fitting it. The pipeline-based cells
# later in the notebook fit their encoders inside cross-validation instead.
df_encoded = binary_encoder.fit_transform(df_encoded)

X_encoded = df_encoded.drop('category', axis=1)  # Features
y_encoded = df_encoded['category']  # Target variable

# Convert any True/False values to 1/0
# Every boolean column is converted (gender AND age dummies), selected by dtype
# rather than by hard-coded names, so the cell does not raise KeyError when a
# dummy column such as 'gender_All genders' is absent from the data.
bool_columns = X_encoded.select_dtypes(include='bool').columns
X_encoded = X_encoded.astype({column: int for column in bool_columns})

# Split data (80% train / 20% test, fixed seed for reproducibility)
X_train_encoded, X_test_encoded, y_train_encoded, y_test_encoded = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

# Standardize features: statistics are learned on the training split only and
# then applied unchanged to the test split
scaler = StandardScaler()
X_train_encoded = scaler.fit_transform(X_train_encoded)
X_test_encoded = scaler.transform(X_test_encoded)

# Initialize and train Decision Tree model
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train_encoded, y_train_encoded)

# Predict and evaluate the model on both splits to expose over-fitting
y_train_encoded_pred = model.predict(X_train_encoded)
y_test_encoded_pred = model.predict(X_test_encoded)

train_accuracy = accuracy_score(y_train_encoded, y_train_encoded_pred)
test_accuracy = accuracy_score(y_test_encoded, y_test_encoded_pred)

print(f'Training Accuracy: {train_accuracy:.2f}')
print(f'Testing Accuracy: {test_accuracy:.2f}')

# Classification report for test data (per-class precision / recall / f1)
print('Classification Report (Test Data):')
print(classification_report(y_test_encoded, y_test_encoded_pred))


Training Accuracy: 1.00
Testing Accuracy: 0.98
Classification Report (Test Data):
                                                     precision    recall  f1-score   support

                   Age Related Macular Degeneration       0.94      0.94      0.94       536
                    Cancer and Neoplasms of the Eye       1.00      1.00      1.00       145
                                   Cornea Disorders       0.98      0.95      0.96       341
                              Diabetic Eye Diseases       0.92      0.92      0.92       433
       Disorders of Optic Nerve and Visual Pathways       1.00      1.00      1.00       151
                                           Glaucoma       0.95      0.98      0.97       365
               Infectious and Inflammatory Diseases       1.00      1.00      1.00       414
Injury, Burns and Surgical Complications of the Eye       1.00      1.00      1.00       222
                       Orbital and External Disease       1.00      1.00      1.

In [11]:
# Confusion matrix of the baseline model on the TRAINING split, rendered as an
# annotated heatmap (one cell per true-class / predicted-class pair).
cm_lr = confusion_matrix(y_true=y_train_encoded, y_pred=y_train_encoded_pred)
px.imshow(
    cm_lr,
    text_auto=True,
    color_continuous_scale=px.colors.sequential.Blues,
    template='plotly_dark',
    width=1300,
    height=500,
)

In [12]:
# Confusion matrix of the baseline model on the TEST split, rendered as an
# annotated heatmap (one cell per true-class / predicted-class pair).
cm_lr = confusion_matrix(y_true=y_test_encoded, y_pred=y_test_encoded_pred)
px.imshow(
    cm_lr,
    text_auto=True,
    color_continuous_scale=px.colors.sequential.Blues,
    template='plotly_dark',
    width=1300,
    height=500,
)

In [13]:
# Model selection, round 1: encoders + scaler + classifier live in one Pipeline
# and are tuned together with a randomized search.

# Hold out 20% of the rows for the final test; the search only sees the rest
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Categorical columns, and the subset each encoder is responsible for
categorical_features = ['age', 'gender', 'locationdesc', 'response', 'raceethnicity']
onehot_columns = ['age', 'gender']
binary_columns = ['locationdesc', 'response', 'raceethnicity']

# Column-wise encoding; columns not listed here are passed through untouched
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), list(onehot_columns)),
        ('bin', ce.BinaryEncoder(cols=list(binary_columns)), list(binary_columns)),
    ],
    remainder='passthrough',
)

# Candidate scalers (the search picks one per sampled configuration)
scalers = [
    ('standard', StandardScaler()),
    ('minmax', MinMaxScaler()),
    ('robust', RobustScaler()),
]

# Candidate classifiers, in the order their grids appear in `model_grids`
models = [
    ('random_forest', RandomForestClassifier(random_state=42)),
    ('logistic_regression', LogisticRegression(max_iter=1000, random_state=42)),
    ('svm', SVC(random_state=42)),
    ('knn', KNeighborsClassifier()),
]

# Pipeline skeleton; the 'scaler' and 'model' steps are placeholders that the
# search overwrites for every sampled configuration
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier()),
])

# Hyper-parameter values per classifier, aligned position-by-position with `models`
model_grids = [
    {'model__n_estimators': [50, 100, 200], 'model__max_depth': [None, 5, 10]},   # random forest
    {'model__C': [0.1, 1, 10], 'model__penalty': ['l2']},                         # logistic regression
    {'model__C': [0.1, 1, 10], 'model__kernel': ['linear', 'rbf']},               # SVM
    {'model__n_neighbors': [3, 5], 'model__weights': ['uniform', 'distance']},    # k-NN
]

# One search-space dict per classifier: the shared encoder / scaler choices plus
# that classifier's own grid
param_dist = [
    {
        'preprocessor__cat': [OneHotEncoder(handle_unknown='ignore')],
        'preprocessor__bin': [ce.BinaryEncoder(cols=list(binary_columns))],
        'scaler': [candidate for _, candidate in scalers],
        'model': [estimator],
        **grid,
    }
    for (_, estimator), grid in zip(models, model_grids)
]

# Sample 20 configurations and score each with 5-fold CV accuracy
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=20,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Report the winning configuration and its mean CV accuracy
print("Best Model:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

# Score the refitted winner once on the held-out test set
y_pred = random_search.best_estimator_.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy on test data: {accuracy:.2f}')
print('Classification Report:')
print(classification_report(y_test, y_pred))
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Model: {'scaler': StandardScaler(), 'preprocessor__cat': OneHotEncoder(handle_unknown='ignore'), 'preprocessor__bin': BinaryEncoder(cols=['locationdesc', 'response', 'raceethnicity']), 'model__n_estimators': 100, 'model__max_depth': None, 'model': RandomForestClassifier(random_state=42)}
Best Score: 0.9813750000000001
Accuracy on test data: 0.98
Classification Report:
                                                     precision    recall  f1-score   support

                   Age Related Macular Degeneration       0.93      0.94      0.94       536
                    Cancer and Neoplasms of the Eye       1.00      0.99      1.00       145
                                   Cornea Disorders       1.00      0.99      1.00       341
                              Diabetic Eye Diseases       0.92      0.92      0.92       433
       Disorders of Optic Nerve and Visual Pathways       0.99      1.00      1.00       151
   

In [14]:
# Model selection, round 2: the same pipeline and search procedure, applied to
# four further classifier families.

# Candidate classifiers, in the order their grids appear in `additional_grids`
additional_models = [
    ('gradient_boosting', GradientBoostingClassifier(random_state=42)),
    ('ada_boost', AdaBoostClassifier(random_state=42)),
    ('naive_bayes', GaussianNB()),
    ('decision_tree', DecisionTreeClassifier(random_state=42)),
]

# Hyper-parameter values per classifier, aligned position-by-position with
# `additional_models` (GaussianNB has nothing to tune here, hence the empty grid)
additional_grids = [
    {
        'model__n_estimators': [100, 200],
        'model__learning_rate': [0.01, 0.1, 0.2],
        'model__max_depth': [3, 5, 7],
    },
    {
        'model__n_estimators': [50, 100, 200],
        'model__learning_rate': [0.01, 0.1, 0.5],
    },
    {},
    {
        'model__max_depth': [None, 10, 20],
        'model__min_samples_split': [2, 5, 10],
    },
]

# One search-space dict per classifier: the shared encoder / scaler choices plus
# that classifier's own grid
additional_param_dist = [
    {
        'preprocessor__cat': [OneHotEncoder(handle_unknown='ignore')],
        'preprocessor__bin': [ce.BinaryEncoder(cols=['locationdesc', 'response', 'raceethnicity'])],
        'scaler': [candidate for _, candidate in scalers],
        'model': [estimator],
        **grid,
    }
    for (_, estimator), grid in zip(additional_models, additional_grids)
]

# Sample 20 configurations and score each with 5-fold CV accuracy
additional_random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=additional_param_dist,
    n_iter=20,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
additional_random_search.fit(X_train, y_train)

# Report the winning configuration and its mean CV accuracy
print("Best Model & Parameters (Additional):", additional_random_search.best_params_)
print("Best Score (Additional):", additional_random_search.best_score_)

# Score the refitted winner once on the held-out test set
y_pred_additional = additional_random_search.best_estimator_.predict(X_test)
accuracy_additional = accuracy_score(y_test, y_pred_additional)

print(f'Accuracy on test data (Additional): {accuracy_additional:.2f}')
print('Classification Report (Additional):')
print(classification_report(y_test, y_pred_additional))
print('Confusion Matrix (Additional):')
print(confusion_matrix(y_test, y_pred_additional))

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Model & Parameters (Additional): {'scaler': StandardScaler(), 'preprocessor__cat': OneHotEncoder(handle_unknown='ignore'), 'preprocessor__bin': BinaryEncoder(cols=['locationdesc', 'response', 'raceethnicity']), 'model__n_estimators': 100, 'model__max_depth': 7, 'model__learning_rate': 0.1, 'model': GradientBoostingClassifier(random_state=42)}
Best Score (Additional): 0.98475
Accuracy on test data (Additional): 0.98
Classification Report (Additional):
                                                     precision    recall  f1-score   support

                   Age Related Macular Degeneration       0.94      0.94      0.94       536
                    Cancer and Neoplasms of the Eye       1.00      1.00      1.00       145
                                   Cornea Disorders       1.00      1.00      1.00       341
                              Diabetic Eye Diseases       0.92      0.93      0.92       433
       Disor

In [15]:
# Compare train vs. test accuracy of the second search's winner; a large gap
# between the two numbers indicates over-fitting.
best_additional = additional_random_search.best_estimator_
y_pred_additional = best_additional.predict(X_test)
y_train_additional = best_additional.predict(X_train)  # predictions on the training rows
accuracy_train_additional = accuracy_score(y_train, y_train_additional)
accuracy_additional = accuracy_score(y_test, y_pred_additional)

# Derive the printed label from the model the search actually selected instead
# of hard-coding 'gradient_boosting': look the winning estimator's type up in
# `additional_models`, falling back to its class name if it is not listed there.
winning_step = best_additional.named_steps['model']
winning_name = next(
    (name for name, candidate in additional_models if type(candidate) is type(winning_step)),
    type(winning_step).__name__,
)

print(f'Accuracy on train data ({winning_name}): {accuracy_train_additional:.2f}')
print(f'Accuracy on test data ({winning_name}): {accuracy_additional:.2f}')

Accuracy on train data (gradient_boosting): 1.00
Accuracy on test data (gradient_boosting): 0.98


## Cross-Validation on the selected model to check its stability across different subsets of the data.

In [16]:
# 5-fold cross-validation of the selected pipeline on the training data: a small
# spread between folds means the score does not hinge on one particular split.
# NOTE(review): `best_model` is taken from the first search only; the second
# search (`additional_random_search`) is not considered here although its
# captured best CV score is higher - confirm this choice is intentional.
best_model = random_search.best_estimator_
cv_scores = cross_val_score(
    estimator=best_model,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=5,
)

# Per-fold scores followed by their mean and standard deviation
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Mean Cross-Validation Score: {np.mean(cv_scores):.4f}")
print(f"Standard Deviation of Cross-Validation Score: {np.std(cv_scores):.4f}")

Cross-Validation Scores: [0.9815625 0.980625  0.980625  0.9828125 0.98125  ]
Mean Cross-Validation Score: 0.9814
Standard Deviation of Cross-Validation Score: 0.0008


In [17]:
# Learning curve: refit the selected pipeline on growing portions of the
# training data and record 5-fold CV accuracy on the fitted and held-out folds.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=random_search.best_estimator_,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

# One line per score type; each point is the mean over the CV folds (axis=1)
fig = go.Figure(data=[
    go.Scatter(
        x=train_sizes,
        y=fold_scores.mean(axis=1),
        mode='lines+markers',
        name=curve_name,
    )
    for curve_name, fold_scores in (
        ('Training Score', train_scores),
        ('Validation Score', test_scores),
    )
])

fig.update_layout(
    title='Learning Curve',
    xaxis_title='Training Set Size',
    yaxis_title='Accuracy',
    legend_title='Score Type',
    template='plotly_dark',
)

fig.show()


In [18]:
# ROC analysis of the selected pipeline on the held-out test set.
#
# The target is multiclass, so a single ROC curve is not defined for the model as
# a whole. Previously only the class at index 1 was plotted under a generic
# "ROC Curve" title; now one one-vs-rest curve (that class vs. all others) is
# drawn per class, each labelled with its own AUC.

# Class-membership probabilities: column i belongs to roc_model.classes_[i]
roc_model = random_search.best_estimator_
y_prob = roc_model.predict_proba(X_test)

fig_roc = go.Figure()

# One-vs-rest ROC curve for every class
for class_index, class_label in enumerate(roc_model.classes_):
    fpr, tpr, _ = roc_curve(y_test, y_prob[:, class_index], pos_label=class_label)
    roc_auc = auc(fpr, tpr)
    fig_roc.add_trace(go.Scatter(
        x=fpr,
        y=tpr,
        mode='lines',
        name=f'{class_label} (area = {roc_auc:.2f})'
    ))

# Diagonal reference line: the expected curve of a random classifier
fig_roc.add_trace(go.Scatter(
    x=[0, 1],
    y=[0, 1],
    mode='lines',
    name='Random (area = 0.5)',
    line=dict(color='red', dash='dash')
))
fig_roc.update_layout(
    title='Receiver Operating Characteristic (ROC) Curve (one-vs-rest, per class)',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    legend_title='Legend',template='plotly_dark'
)
# Show plot
fig_roc.show()


In [19]:
# Persist the fitted pipeline held in `best_model` to 'best_model.pkl' in the
# current working directory; the cell output is the list of files written.
joblib.dump(value=best_model, filename='best_model.pkl')


['best_model.pkl']