In [256]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import joblib
import requests
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse as sp
from sklearn.metrics import classification_report


## Load the data and train the model

In [257]:
# loads data
eye_data = pd.read_csv("eye_data_set.csv")

# clean column titles
eye_data.columns = [col.lower() for col in eye_data]
eye_data.columns = [col.replace('-','_').replace(' ','_') for col in eye_data]

# Drop unnecessary rows
cleaned_eye_data = eye_data.drop(columns=['id','left_fundus', 'right_fundus','filepath','filename'])

# View data sets 
#eye_data.head(1)
cleaned_eye_data.head(1)

Unnamed: 0,patient_age,patient_sex,left_diagnostic_keywords,right_diagnostic_keywords,n,d,g,c,a,h,m,o,labels,target
0,69,Female,cataract,normal fundus,0,0,0,1,0,0,0,0,['N'],"[1, 0, 0, 0, 0, 0, 0, 0]"


In [258]:
# view the structure of the data
cleaned_eye_data.info()

# scikit-learn can not handle columns with type object so we will need to transform them with the process of 1 hot encoding

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6392 entries, 0 to 6391
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   patient_age                6392 non-null   int64 
 1   patient_sex                6392 non-null   object
 2   left_diagnostic_keywords   6392 non-null   object
 3   right_diagnostic_keywords  6392 non-null   object
 4   n                          6392 non-null   int64 
 5   d                          6392 non-null   int64 
 6   g                          6392 non-null   int64 
 7   c                          6392 non-null   int64 
 8   a                          6392 non-null   int64 
 9   h                          6392 non-null   int64 
 10  m                          6392 non-null   int64 
 11  o                          6392 non-null   int64 
 12  labels                     6392 non-null   object
 13  target                     6392 non-null   object
dtypes: int64

In [259]:
# When the person enters a value they will 
# makes the paitent age lower
cleaned_eye_data['patient_sex'] = cleaned_eye_data['patient_sex'].str.lower()

cleaned_eye_data['patient'] = np.where(
                                        cleaned_eye_data['patient_sex'].isna(),
                                        cleaned_eye_data['patient_age'],
                                        cleaned_eye_data['patient_sex'].str.strip()+ '_' + cleaned_eye_data['patient_age'].astype(str)
                                        )

cleaned_eye_data = cleaned_eye_data.drop(columns=["patient_age", "patient_sex"])


cleaned_eye_data.head(5)

Unnamed: 0,left_diagnostic_keywords,right_diagnostic_keywords,n,d,g,c,a,h,m,o,labels,target,patient
0,cataract,normal fundus,0,0,0,1,0,0,0,0,['N'],"[1, 0, 0, 0, 0, 0, 0, 0]",female_69
1,normal fundus,normal fundus,1,0,0,0,0,0,0,0,['N'],"[1, 0, 0, 0, 0, 0, 0, 0]",male_57
2,laser spot，moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,0,1,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",male_42
3,macular epiretinal membrane,mild nonproliferative retinopathy,0,1,0,0,0,0,0,1,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",male_53
4,moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,0,0,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",female_50


In [260]:
num_col = pd.DataFrame(cleaned_eye_data[cleaned_eye_data.select_dtypes(include=["int"]).columns])

num_col.head(3)

Unnamed: 0,n,d,g,c,a,h,m,o
0,0,0,0,1,0,0,0,0
1,1,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,1


In [261]:
# need to convert to text columns to work with vectorizer
# we are going to assign 
# 1 = its true value 
# 0 = z ( showing false)
def replace_number_with_letter(full_data, partial_data):
    # changes the data for num_col 
    for index ,letter in partial_data.iterrows():
            for col in partial_data.columns:
                if letter[col] == 1:
                    partial_data.at[index, col] = col
                elif letter[col] == 0:
                    partial_data.at[index, col] = 'z'
                elif letter[col] != 0 and letter[col] != 1:
                    break
    # replaces the appropriate columsn for the cleaned_eye_data
    for col in full_data.columns:
        if col in partial_data:
            full_data[col] = partial_data[col]
    return full_data

# applies the function
cleaned_eye_data = replace_number_with_letter(cleaned_eye_data, num_col)

cleaned_eye_data

  partial_data.at[index, col] = 'z'
  partial_data.at[index, col] = 'z'
  partial_data.at[index, col] = 'z'
  partial_data.at[index, col] = col
  partial_data.at[index, col] = 'z'
  partial_data.at[index, col] = 'z'
  partial_data.at[index, col] = 'z'
  partial_data.at[index, col] = 'z'


Unnamed: 0,left_diagnostic_keywords,right_diagnostic_keywords,n,d,g,c,a,h,m,o,labels,target,patient
0,cataract,normal fundus,z,z,z,c,z,z,z,z,['N'],"[1, 0, 0, 0, 0, 0, 0, 0]",female_69
1,normal fundus,normal fundus,n,z,z,z,z,z,z,z,['N'],"[1, 0, 0, 0, 0, 0, 0, 0]",male_57
2,laser spot，moderate non proliferative retinopathy,moderate non proliferative retinopathy,z,d,z,z,z,z,z,o,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",male_42
3,macular epiretinal membrane,mild nonproliferative retinopathy,z,d,z,z,z,z,z,o,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",male_53
4,moderate non proliferative retinopathy,moderate non proliferative retinopathy,z,d,z,z,z,z,z,z,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",female_50
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6387,severe nonproliferative retinopathy,proliferative diabetic retinopathy,z,d,z,z,z,z,z,z,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",male_63
6388,moderate non proliferative retinopathy,moderate non proliferative retinopathy,z,d,z,z,z,z,z,z,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",male_42
6389,mild nonproliferative retinopathy,normal fundus,z,d,z,z,z,z,z,z,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",male_54
6390,mild nonproliferative retinopathy,mild nonproliferative retinopathy,z,d,z,z,z,z,z,z,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",male_57


In [262]:
# redefined the data into great alphabet 
def convert_items(data):
    for index , column in data.iterrows():
        for value in data.columns:
            #target column
            if column[value] == "[1, 0, 0, 0, 0, 0, 0, 0]":
                data.at[index, value] = "alpha"
            elif column[value] == "[0, 1, 0, 0, 0, 0, 0, 0]":
                data.at[index, value] = "beta"
            elif column[value] == "[0, 0, 1, 0, 0, 0, 0, 0]":
                data.at[index, value] = "gamma"
            elif column[value] == "[0, 0, 0, 1, 0, 0, 0, 0]":
                data.at[index, value] = "delta"
            elif column[value] == "[0, 0, 0, 0, 1, 0, 0, 0]":
                data.at[index, value] = 'epsilon'
            elif column[value] == "[0, 0, 0, 0, 0, 1, 0, 0]":
                data.at[index, value] = 'zeta'
            elif column[value] == "[0, 0, 0, 0, 0, 0, 1, 0]":
                data.at[index, value] = 'eta'
            elif column[value] == "[0, 0, 0, 0, 0, 0, 0, 1]":
                data.at[index, value] = 'theta'
            # labels column
            elif column[value] == "['N']":
                data.at[index, value] = 'normal'
            elif column[value] == "['D']":
                data.at[index, value] = 'diabetes'
            elif column[value] == "['G']":
                data.at[index, value] = 'Glaucoma'
            elif column[value] == "['C']":
                data.at[index, value] = 'Cataract'
            elif column[value] == "['A']":
                data.at[index, value] = 'Age related Macular Degeneration'
            elif column[value] == "['H']":
                data.at[index, value] = 'Hypertension'
            elif column[value] == "['M']":
                data.at[index, value] = 'Pathological Myopia'
            elif column[value] == "['O']":
                data.at[index, value] = 'Other diseases/abnormalities'
            else:
                break
    return data

In [263]:
'''
changes the target into hierarchal order by greek letters 

'[1, 0, 0, 0, 0, 0, 0, 0]' - Represents Alpha (1st letter)
'[0, 1, 0, 0, 0, 0, 0, 0]' - Represents Beta (2nd letter)
'[0, 0, 1, 0, 0, 0, 0, 0]' - Represents Gamma (3rd letter)
'[0, 0, 0, 1, 0, 0, 0, 0]' - Represents Delta (4th letter)
'[0, 0, 0, 0, 1, 0, 0, 0]' - Represents Epsilon (5th letter)
'[0, 0, 0, 0, 0, 1, 0, 0]' - Represents Zeta (6th letter)
'[0, 0, 0, 0, 0, 0, 1, 0]' - Represents Eta (7th letter)
'[0, 0, 0, 0, 0, 0, 0, 1]' - Represents Theta (8th letter)

Normal (N),
Diabetes (D),
Glaucoma (G),
Cataract (C),
Age related Macular Degeneration (A),
Hypertension (H),
Pathological Myopia (M),
Other diseases/abnormalities (O)
'''
# view the unique values unique value 
unique_eye_data = pd.DataFrame(cleaned_eye_data['target'].unique())
unique_eye_data = pd.DataFrame(cleaned_eye_data['labels'].unique())
# loads the target dataframe into its own dataframe
target = pd.DataFrame(cleaned_eye_data['target'])
labels = pd.DataFrame(cleaned_eye_data['labels'])

# applies the function
target = convert_items(target)
labels = convert_items(labels)

# changes the target column
cleaned_eye_data['target'] = target

# changes the label column
cleaned_eye_data['labels'] = labels

cleaned_eye_data.head(10)

Unnamed: 0,left_diagnostic_keywords,right_diagnostic_keywords,n,d,g,c,a,h,m,o,labels,target,patient
0,cataract,normal fundus,z,z,z,c,z,z,z,z,normal,alpha,female_69
1,normal fundus,normal fundus,n,z,z,z,z,z,z,z,normal,alpha,male_57
2,laser spot，moderate non proliferative retinopathy,moderate non proliferative retinopathy,z,d,z,z,z,z,z,o,diabetes,beta,male_42
3,macular epiretinal membrane,mild nonproliferative retinopathy,z,d,z,z,z,z,z,o,diabetes,beta,male_53
4,moderate non proliferative retinopathy,moderate non proliferative retinopathy,z,d,z,z,z,z,z,z,diabetes,beta,female_50
5,macular epiretinal membrane,moderate non proliferative retinopathy，epireti...,z,d,z,z,z,z,z,o,diabetes,beta,male_60
6,drusen,mild nonproliferative retinopathy,z,d,z,z,z,z,z,o,diabetes,beta,female_60
7,normal fundus,normal fundus,n,z,z,z,z,z,z,z,normal,alpha,male_59
8,normal fundus,vitreous degeneration,z,z,z,z,z,z,z,o,Other diseases/abnormalities,theta,male_54
9,epiretinal membrane,normal fundus,z,z,z,z,z,z,z,o,normal,alpha,male_70


In [265]:
# remove null values 
target_values = cleaned_eye_data[['labels']]
print(f'shape of target data values before null removal: {target_values.shape}')
target_values =target_values[pd.notnull(target_values)]
print(f'shape of target data values after null removal: {target_values.shape}')
print(target_values.head(5))

#combine the items you want to train on
prep_training_data = cleaned_eye_data[['patient','target']]
training_data = prep_training_data['patient'] +"_" + prep_training_data['target']

print(f'shape of training data values before null removal: {training_data.shape}')
training_data = training_data[pd.notnull(training_data)]
print(f'shape of training data values after null removal: {training_data.shape}')

print(training_data.head(5))
# no null values found

shape of target data values before null removal: (6392, 1)
shape of target data values after null removal: (6392, 1)
     labels
0    normal
1    normal
2  diabetes
3  diabetes
4  diabetes
shape of training data values before null removal: (6392,)
shape of training data values after null removal: (6392,)
0    female_69_alpha
1      male_57_alpha
2       male_42_beta
3       male_53_beta
4     female_50_beta
dtype: object


In [249]:
# We are dropping target: label from the x and keeping it in the y
# pop out all the labels
x = training_data
y = target_values

## Split the data into train, test and validate sets
## This will allow for you to tune the models hyperparameters or do any regularization if needed.

# Splitting the dataset into 70% training and 30% temporary test set
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.3, random_state=42)

# Splits the 30% test set into 15% hold and 15% validation
x_hold, x_val, y_hold, y_val = train_test_split(x_test,y_test, test_size=0.5, random_state=42)

#validate the split
print(f"size of x_train: {x_train.shape}")
print(f"size of y_train {y_train.shape}")
print(" ")
print(f"initial size of x_test {x_test.shape}")
print(f"initial size of y_test {y_test.shape}")
print(" ")
print(f"size of x_val {x_val.shape}")
print(f"size of y_val {y_val.shape}")
print(" ")
print(f"size of final test x_hold {x_hold.shape}")
print(f"size of final test y_hold {y_hold.shape}")

size of x_train: (4474,)
size of y_train (4474, 1)
 
initial size of x_test (1918,)
initial size of y_test (1918, 1)
 
size of x_val (959,)
size of y_val (959, 1)
 
size of final test x_hold (959,)
size of final test y_hold (959, 1)


In [250]:
# Random forest Classification model
# We want to classify the data and measure it using metrics like accuracy, precision, recall, F1 score, etc.

# initial training of the model
def train_the_model(model, x_var, y_var, name ="Default"):
    #fits the model
    model = model.fit(x_var,y_var)
    #predicts on the model
    preds = model.predict(x_var)
    #prints information
    print(f"Classification Report for {name}:")
    print(classification_report(y_var, preds, zero_division=1))

In [251]:
# sets up the random forest classifier train and validate instance
# to tweak you can add n_estimators and max_depth to changes the parameters of the Random Forest parameters
# you will tune at this level
rclf = RandomForestClassifier(n_estimators= 130, max_depth=100,random_state = 42)


vectorizer = TfidfVectorizer()
print(f'x train going in to vvectorizer: {x_train.shape}')
x_train_vec =  vectorizer.fit_transform(x_train)
print(f'x train coming out of vectorizer {x_train_vec.shape}')

# parameters
# x_train,y_train
train_the_model(rclf,x_train_vec, y_train, name="Training Random Forest Classifier")

x train going in to vvectorizer: (4474,)
x train coming out of vectorizer (4474, 639)


  return fit_method(estimator, *args, **kwargs)


Classification Report for Training Random Forest Classifier:
                                  precision    recall  f1-score   support

Age related Macular Degeneration       1.00      0.21      0.35       187
                        Cataract       1.00      0.13      0.23       201
                        Glaucoma       1.00      0.16      0.28       181
                    Hypertension       1.00      0.09      0.17        86
    Other diseases/abnormalities       1.00      0.57      0.73       510
             Pathological Myopia       1.00      0.29      0.45       157
                        diabetes       1.00      0.84      0.91      1122
                          normal       0.66      1.00      0.79      2030

                        accuracy                           0.76      4474
                       macro avg       0.96      0.41      0.49      4474
                    weighted avg       0.84      0.76      0.73      4474



In [252]:
# Validation Step

# parameters
# x_val,y_val

#vectorize the x_val step
x_val_vec = vectorizer.transform(x_val)

train_the_model(rclf, x_val_vec,y_val, name="Validating Random Forest Classifier")


  return fit_method(estimator, *args, **kwargs)


Classification Report for Validating Random Forest Classifier:
                                  precision    recall  f1-score   support

Age related Macular Degeneration       1.00      0.50      0.67        38
                        Cataract       1.00      0.50      0.67        44
                        Glaucoma       1.00      0.52      0.68        54
                    Hypertension       1.00      0.57      0.72        23
    Other diseases/abnormalities       1.00      0.72      0.84       101
             Pathological Myopia       1.00      0.29      0.44        28
                        diabetes       1.00      0.89      0.94       255
                          normal       0.73      1.00      0.85       416

                        accuracy                           0.84       959
                       macro avg       0.97      0.62      0.73       959
                    weighted avg       0.88      0.84      0.83       959



In [253]:
# hold set (final testing set)

# parameters
# x_hold,y_hold

#vectorizes the holdset
x_hold_vec = vectorizer.transform(x_hold)

train_the_model(rclf, x_hold_vec,y_hold, name="Final testing of Random Forest Classifier")

  return fit_method(estimator, *args, **kwargs)


Classification Report for Final testing of Random Forest Classifier:
                                  precision    recall  f1-score   support

Age related Macular Degeneration       1.00      0.46      0.63        41
                        Cataract       1.00      0.46      0.63        48
                        Glaucoma       1.00      0.39      0.56        49
                    Hypertension       1.00      0.37      0.54        19
    Other diseases/abnormalities       1.00      0.66      0.80        97
             Pathological Myopia       1.00      0.49      0.66        47
                        diabetes       1.00      0.81      0.90       231
                          normal       0.69      1.00      0.82       427

                        accuracy                           0.80       959
                       macro avg       0.96      0.58      0.69       959
                    weighted avg       0.86      0.80      0.79       959



In [254]:
# Save the model into a joblib file 
# we will need this to save the results of the machine learning model
# This can be useful for saving time and resources, as the results can be loaded 
# and used later without having to run the code again.

# Serialize the model
joblib.dump(rclf, 'random_for_eye_dataset_model.joblib')
# Ports over the vectorizer
joblib.dump(vectorizer,'vectorizer.joblib')

['vectorizer.joblib']

### Create API KEY

In [150]:
# First we are going to create our api key 
# import your library
import secrets

# function to create the api key
def generate_api_key():
    return secrets.token_urlsafe(32)
# Call the function 
new_api_key = generate_api_key()

# print out your key
print(f"Your new API KEY: {new_api_key}")

Your new API KEY: vYQiXbd4u422O2yP8iOXhmGg9yVOAlrbMQ7i_dW9Zwc


### Test your if your API works

In [225]:
'''
Alpha 
Beta 
Gamma 
Delta 
Epsilon 
Zeta 
Eta 
Theta 

Example inputs:
female_69_theta
male_69_theta
male_70_gamma
female_70_delta
'''
api_key = 'Cfoh1MsvhLPH-qDPxY6IhOGQNkBRKVp-L2-Lw8cjKeU'

headers = {
    "accept": "application/json",
    "EYE-DISEASE-API-KEY": api_key
}

try:
    response = requests.get("http://127.0.0.1:8000/v1/predict?value=female_70_delta", headers=headers)
    response.raise_for_status()  # This will raise an exception for HTTP errors
    result_json = response.json()  # Using .json() to directly get the JSON response
    print(result_json)
except requests.exceptions.HTTPError as err:
    print(f"HTTP Error: {err}")
except Exception as e:
    print(f"An error occurred: {e}")


{'prediction': 'Cataract'}
