In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from plotly.offline import init_notebook_mode, iplot 

# Apply seaborn's default theme to the figures drawn in this notebook
sns.set()

In [None]:
# Colab-only step: mount Google Drive at /content/drive so that files stored
# there can be read by path in the following cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the cataract CSV on the mounted Google Drive
csv_path = "/content/drive/My Drive/ML Project/cataract.csv"

# Load the raw dataset into a DataFrame
data = pd.read_csv(csv_path)

In [None]:
# Show every column when a DataFrame is displayed (no column truncation)
pd.options.display.max_columns = None

In [None]:
# Preview the first rows of the loaded dataset
data.head()

In [None]:
#Data preprocessing

#To check variables data type
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called directly; wrapping it in print() would append a stray
# "None" line to the cell output.
data.info()

#Checking missing values
# Number of null entries per column (the full boolean mask from
# data.isnull() is not shown; only the per-column totals are useful here).
print('Data columns with null values:\n',data.isnull().sum())

In [None]:
# Descriptive statistics: describe() with no arguments summarises the numeric
# columns, while include=["object"] summarises the object-dtype columns.
summary_num = data.describe()
summary_cat = data.describe(include=["object"])

# Print the numerical summary first, then the categorical one
for summary_table in (summary_num, summary_cat):
    print(summary_table)

In [None]:
#Data visualization/Exploratory analysis

#Relationship between independent variables
# Correlation is only defined for numeric data. The frame also holds
# object (string) columns, and recent pandas versions raise on those instead
# of silently dropping them, so the numeric/bool columns are selected first.
correlation = data.select_dtypes(include=['number', 'bool']).corr()

#Numerical Data Analysis
# Columns stored as float64
num_attributes = data.select_dtypes(include=['float64'])
print(num_attributes.columns)

#Categorical Data Analysis
# Columns stored as object (string categories); used by the count plots below
cat_attributes = data.select_dtypes(include=['object'])
print(cat_attributes.columns)

In [None]:
# Have Myopia vs. Risk of cataracts

# Overall proportion of records in each 'Have Myopia' category
Myopia = pd.crosstab(
    index=data["Have Myopia"],
    columns='count',
    normalize=True,
)

# Risk-of-cataracts distribution within each 'Have Myopia' category
# (row-normalised, margins included)
Myopia_Cataract = pd.crosstab(
    index=data["Have Myopia"],
    columns=data["Risk of cataracts"],
    normalize='index',
    margins=True,
)

# Horizontal count plot of 'Have Myopia', split by risk level
sns.countplot(data=cat_attributes, y='Have Myopia', hue='Risk of cataracts')

In [None]:
# Past Eye surgery vs. Risk of cataracts

# Overall proportion of records in each 'Past Eye surgery' category
PastEyeSurgery = pd.crosstab(
    index=data["Past Eye surgery"],
    columns='count',
    normalize=True,
)

# Risk-of-cataracts distribution within each 'Past Eye surgery' category
# (row-normalised, margins included)
Surgery_Cataract = pd.crosstab(
    index=data["Past Eye surgery"],
    columns=data["Risk of cataracts"],
    normalize='index',
    margins=True,
)

# Horizontal count plot of 'Past Eye surgery', split by risk level
sns.countplot(data=cat_attributes, y='Past Eye surgery', hue='Risk of cataracts')



In [None]:
# Past Eye surgery vs. Risk of cataracts
# NOTE(review): this cell recomputes exactly what the preceding cell already
# computed (same variables, same plot); it looks like a leftover copy.

# Overall proportion of records in each 'Past Eye surgery' category
PastEyeSurgery = pd.crosstab(
    index=data["Past Eye surgery"],
    columns='count',
    normalize=True,
)

# Risk-of-cataracts distribution within each 'Past Eye surgery' category
# (row-normalised, margins included)
Surgery_Cataract = pd.crosstab(
    index=data["Past Eye surgery"],
    columns=data["Risk of cataracts"],
    normalize='index',
    margins=True,
)

# Horizontal count plot of 'Past Eye surgery', split by risk level
sns.countplot(data=cat_attributes, y='Past Eye surgery', hue='Risk of cataracts')



In [None]:
# Ethnicity vs. Risk of cataracts

# Overall proportion of records in each 'Ethnicity' category
Ethnicity = pd.crosstab(
    index=data["Ethnicity"],
    columns='count',
    normalize=True,
)

# Risk-of-cataracts distribution within each 'Ethnicity' category
# (row-normalised, margins included)
Ethnicity_Cataract = pd.crosstab(
    index=data["Ethnicity"],
    columns=data["Risk of cataracts"],
    normalize='index',
    margins=True,
)

# Horizontal count plot of 'Ethnicity', split by risk level
sns.countplot(data=cat_attributes, y='Ethnicity', hue='Risk of cataracts')



In [None]:
# Smoking vs. Risk of cataracts

# Overall proportion of records in each 'Smoking' category
Smoking = pd.crosstab(
    index=data["Smoking"],
    columns='count',
    normalize=True,
)

# Risk-of-cataracts distribution within each 'Smoking' category
# (row-normalised, margins included)
Smoking_Frequency = pd.crosstab(
    index=data["Smoking"],
    columns=data["Risk of cataracts"],
    normalize='index',
    margins=True,
)

# Horizontal count plot of 'Smoking', split by risk level
sns.countplot(data=cat_attributes, y='Smoking', hue='Risk of cataracts')

In [None]:
# Import label encoder
from sklearn import preprocessing

# LabelEncoder maps each distinct label to an integer code
# (codes follow the sorted order of the labels).
label_encoder = preprocessing.LabelEncoder()

# Encode the target column 'Risk of cataracts' as integer class codes.
data['Risk of cataracts'] = label_encoder.fit_transform(data['Risk of cataracts'])

# One-hot encode the remaining categorical columns; drop_first=True drops one
# level per variable so the dummies are not perfectly collinear.
new_data = pd.get_dummies(data, drop_first=True)

#Storing the column names
columns_list = list(new_data.columns)
print(columns_list)

#Separating the input names from data
# Keep the DataFrame's own column order. Building this list through a set
# difference yields an order that depends on string hashing, which changes
# from one kernel session to the next and makes the feature matrix layout
# (and anything sensitive to it) non-reproducible.
features = [column for column in columns_list if column != 'Risk of cataracts']
print(features)

['Weight(kg)', 'Height(metres)', 'Risk of cataracts', 'Age(years)_19 to 35', 'Age(years)_Above 35', 'Age(years)_Below 10', 'Education_Primary', 'Education_Secondary', 'Education_University', 'Occupation_Civil servant', 'Occupation_Military', 'Occupation_Self', 'Occupation_Student', 'Occupation_Teacher', 'Occupation_Trader', 'Marital Status_Single', 'Ethnicity_Yoruba', 'Religion_Islam', 'BMI Class_Obese', 'BMI Class_Overweight', 'Use Lenses_Yes', 'Family History_No', 'Smoking_Yes', 'Smoke Frequency_Pack/day', 'Smoke Frequency_Pack/month', 'Smoke Frequency_Pack/week', 'High Cholesterol_Yes', 'Diabetes_Yes', 'Hypertension_Yes', 'Hypertensive_Yes', 'Corticosteroid medications_Yes', 'Past Eye surgery_Yes', 'Hormone Replacement_Yes', 'Have Myopia_Yes', 'Myopia intensity_Low', 'Myopia intensity_Moderate', 'Myopia intensity_Nil', 'Alcohol_Yes', 'Alcohol frequency_Monthly', 'Alcohol frequency_Nil', 'Alcohol frequency_Weekly']
['BMI Class_Overweight', 'Occupation_Self', 'BMI Class_Obese', 'Hormo

In [None]:
# Utility used to partition the data
from sklearn.model_selection import train_test_split

# Input features as a NumPy array (one column per entry in `features`)
x = new_data[features].to_numpy()
print(x)

# Target values ('Risk of cataracts') as a NumPy array
y = new_data['Risk of cataracts'].to_numpy()
print(y)

# Hold out 30% of the rows for testing; random_state fixes the shuffle
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

[[1. 0. 0. ... 1. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 1. ... 1. 0. 0.]]
[0 1 0 1 0 0 2 1 1 1 0 0 0 0 2 0 0 0 0 0 0 2 1 0 2 2 0 2 2 0 2 0 1 0 1 0 0
 2 1 1 1 0 0 0 0 2 0 0 0 0 0 0 2 1 0 2 2 0 2 2 0 2 0 1 0 1 0 0 2 1 1 1 0 0
 0 0 2 0 0 0 0 0 0 2 1 0 2 2 0 2 2 0 2]


In [None]:
#Importing library for logistic regression
from sklearn.linear_model import LogisticRegression

#Importing performance metrics - accuracy score, confusion matrix & recall
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score


# Make an instance of the Model
# The default iteration budget (max_iter=100) is too small here: the lbfgs
# solver stops at the limit with a ConvergenceWarning, so the fitted
# coefficients are not the optimum. A larger budget lets the solver finish.
# NOTE(review): if the warning persists, standardise the inputs (weight and
# height are on a very different scale from the 0/1 dummy columns).
logistic = LogisticRegression(max_iter=1000)

# Fitting the values for x and y
logistic.fit(train_x, train_y)

# Prediction from test data
prediction1 = logistic.predict(test_x)
print(prediction1)

# Confusion matrix (rows: true classes, columns: predicted classes)
confusion_matrix1 = confusion_matrix(test_y, prediction1)
print(confusion_matrix1)

# Accuracy calculation
accuracy_score1 = accuracy_score(test_y, prediction1)
print(accuracy_score1)

# Recall calculation (macro: unweighted mean of the per-class recalls)
recall_score1 = recall_score(test_y, prediction1, average='macro')
print(recall_score1)

[0 0 2 0 0 0 0 2 0 0 0 1 2 0 0 2 1 0 2 0 0 0 2 2 0 1 0 0]
[[14  0  0]
 [ 1  3  0]
 [ 3  0  7]]
0.8571428571428571
0.8166666666666668



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



In [None]:
# KNN Classifier

# importing library for KNN
from sklearn.neighbors import KNeighborsClassifier

# Storing the K nearest neighbors classifier
KNN_classifier = KNeighborsClassifier(n_neighbors = 18)

# Fitting the values for x and y
KNN_classifier.fit(train_x,train_y)

# Predicting the test values with model
prediction2 = KNN_classifier.predict(test_x)

# Performance metric check (rows: true classes, columns: predicted classes)
confusion_matrix2 = confusion_matrix(test_y,prediction2)
print("\t","Predicted values")
print("Original values","\n",confusion_matrix2)

# Accuracy calculation
accuracy_score2 = accuracy_score(test_y,prediction2)
print(accuracy_score2)

# Missclassified values from prediction
print("Missclassified samples : %d" % (test_y != prediction2).sum())

# Recall calculation (macro: unweighted mean of the per-class recalls).
# This must score the KNN predictions (prediction2); scoring prediction1
# would simply repeat the logistic-regression recall.
recall_score2 = recall_score(test_y,prediction2, average='macro')
print(recall_score2)

In [None]:
# Import the model we are using
# The target is a set of label-encoded classes, so a classifier is used.
# Regressing on the integer codes and rounding the averaged output treats the
# codes as a numeric scale and can produce a class that no tree voted for.
from sklearn.ensemble import RandomForestClassifier

# Instantiate model with 10 decision trees; random_state makes the bootstrap
# samples and feature sub-sampling repeatable across runs.
rf = RandomForestClassifier(n_estimators = 10, random_state = 0)

# Train the model on training data
rf.fit(train_x, train_y)

# Use the forest's predict method on the test data
# (the classifier returns class codes directly; no rounding/casting needed)
prediction3 = rf.predict(test_x)
print(prediction3)

# Confusion matrix (rows: true classes, columns: predicted classes)
confusion_matrix3= confusion_matrix(test_y,prediction3)
print(confusion_matrix3)

# Accuracy calculation
accuracy_score3 = accuracy_score(test_y,prediction3)
print(accuracy_score3)

# Calculate the absolute errors
# NOTE(review): difference between class codes; a non-zero entry marks a
# misclassified sample, but the magnitude is only meaningful if the encoded
# classes happen to be ordered - confirm before interpreting it.
errors = abs(prediction3 - test_y)
print(errors)

# Missclassified values from prediction
print("Missclassified samples : %d" % (test_y != prediction3).sum())