# Diabetes Prediction Model

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, LogisticRegression

In [None]:
import os

# Ask for the CSV through Colab's upload widget only when it is not already in
# the working directory, so re-running the cell does not prompt again and the
# notebook can also run outside Colab once diabetes.csv is present.
if not os.path.exists('diabetes.csv'):
    from google.colab import files
    # upload() returns a dict of file name -> file bytes, not a DataFrame, so it
    # is kept under its own name instead of `df`.
    uploaded = files.upload()

In [None]:
# Import the diabetes dataset
# Data from kaggle = https://www.kaggle.com/datasets/saurabh00007/diabetescsv?resource=download

In [None]:
# Load the diabetes CSV from the working directory into a DataFrame.
df = pd.read_csv('diabetes.csv')

# Show Data

In [None]:
# Preview the first rows of the loaded data.
df.head()

# Column information (dtype, non-null count)

In [None]:
# Print each column's name, dtype and non-null count.
df.info()

The describe() method is used for calculating some statistical data like percentile, mean and std of the numerical
values of the Series or DataFrame. It analyzes both numeric and object series and also the DataFrame column sets of
mixed data types.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

In [None]:
'''df.isnull().sum() returns the number of missing values in each column of the data set.'''

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
'''
df.corr(): pandas df.corr() computes the pairwise correlation of all columns in the DataFrame. NA values are automatically excluded, and only numeric columns can be correlated. Here corr() is called with its default method (Pearson) to find the correlation among all columns, including 'Outcome'.
Heatmap: A heatmap is a graphical representation of data that uses colors to visualize the values of a matrix. Brighter, typically reddish, colors represent more common values or higher activity, and darker colors represent less common values or lower activity. A heatmap is also known as a shading matrix. Heatmaps can be plotted in Seaborn with the seaborn.heatmap() function.

'''

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(15, 7))
sns.heatmap(corr_matrix, annot=True, fmt='0.2f', ax=ax)

In [None]:
# Pairwise relationships between all attributes, coloured by the Outcome column.
sns.pairplot(data=df, hue='Outcome')

In [None]:
'''
To see Dependent and independent variable
'''

In [None]:
# Horizontal bar chart of each column's correlation with Outcome.
fig, ax = plt.subplots(figsize=(15, 7))
outcome_corr = df.corr()['Outcome']
outcome_corr.plot(kind='barh', ax=ax)

In [None]:
'''
Extracting independent and dependent variable
The independent variable is the cause. Its value is independent of other variables in our study. Independent is = X
The dependent variable is the effect. Its value depends on changes in the independent variable. Dependent is = Y

'''

In [None]:
# Target is the Outcome column; features are every other column.
y = df['Outcome']
x = df.drop(columns=['Outcome'])

In [None]:
'''
Split the dataset into a training set and a test set. Note that 80% of the data is used for training and 20% for testing; the rows in each set are selected randomly.
'''

In [None]:
from sklearn.model_selection import train_test_split as tts

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# same split is produced on every run.
xtrain, xtest, ytrain, ytest = tts(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
'''Show xtrain data '''

In [None]:
# Display the training feature matrix produced by the split.
xtrain

In [None]:
''' Show xtest data '''

In [None]:
# Display the held-out test feature matrix produced by the split.
xtest

In [None]:
''' Call some libraries and import algorithms from them.'''

In [None]:
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import LabelEncoder

In [None]:
'''
Run algorithms and store in variable.
'''

In [None]:
# Instantiate the five candidate regressors.
lg = LinearRegression()
knn = KNeighborsRegressor()
support = SVR(gamma='auto')
# The tree-based estimators are randomized; fixing random_state makes a
# Restart Kernel -> Run All produce the same fitted models and scores.
tree = DecisionTreeRegressor(random_state=1)
forest = RandomForestRegressor(n_estimators=49, random_state=1)

In [None]:
'''
Fit each algorithm on the training data and record its score.
'''

In [None]:
# Fit each regressor on the training split and record its score() (R^2 for
# regressors) on that same training split. fit() returns the estimator itself,
# so fitting and scoring are chained; models are processed in the listed order.
lg_score, knn_score, support_score, tree_score, forest_score = (
    model.fit(xtrain, ytrain).score(xtrain, ytrain)
    for model in (lg, knn, support, tree, forest)
)

In [None]:
'''
The algorithm names are stored in the 'Algorithoms' variable and the scores in the 'Score' variable.
'''

In [None]:
# One-column frames holding the training scores and the matching model names,
# listed in the same order so they line up row by row.
Score = pd.DataFrame({
    'Score': [lg_score, knn_score, support_score, tree_score, forest_score],
})
Algorithoms = pd.DataFrame({
    'Algorithoms': [
        'Linear Regression',
        'K Neighbors Regressor',
        'Support Vector Machine',
        ' Decision Tree Regressor',
        'Random Forest Regressor',
    ],
})

In [None]:
''' Concat 'Algorithms' and 'Score' variable '''

In [None]:
# Place the name column and the score column side by side.
table = pd.concat(objs=[Algorithoms, Score], axis='columns')
table

In [None]:
'''The best-scoring algorithm on the training data is the 'Decision Tree Regressor'. '''

In [None]:
'''Taken all algorithms for error finding'''

In [None]:
# Models to evaluate; the order must match the algorithm-name list used when
# the error table is assembled, because the columns are joined by position.
algo = [lg, knn, support, tree, forest]

In [None]:
# Refit every model on the training split and collect its metrics on the
# held-out test split: R^2, mean squared error and mean absolute error.
score, mse, mae = [], [], []
for model in algo:
    predictions = model.fit(xtrain, ytrain).predict(xtest)
    score.append(r2_score(ytest, predictions))
    mse.append(mean_squared_error(ytest, predictions))
    mae.append(mean_absolute_error(ytest, predictions))

In [None]:
# Wrap each metric list, and the model names in the same order, in a
# one-column frame.
scr = pd.DataFrame({'r2_score': score})
me_ab_er = pd.DataFrame({'Mean Absolute Error': mae})
me_sc_er = pd.DataFrame({'Mean Squared Error': mse})
algorithm = pd.DataFrame({
    'Algorithoms': [
        'Linear Regression',
        'K Neighbors Regressor',
        'SVR',
        'Decision Tree Regressor',
        'Random Forest Regressor',
    ],
})

In [None]:
# Combine the names and the three test-set metrics into one comparison table.
err_table = pd.concat(
    objs=[algorithm, scr, me_sc_er, me_ab_er],
    axis='columns',
)
err_table

In [None]:
'''Got the less error Algorithm which is the 'Decision Tree Regression'''

In [None]:
''' It can be concluded that the 'Decision tree algorithm' is the best algorithm which will give the best result for Diabetes Prediction. '''

In [None]:
# Read one patient's measurements from the keyboard and run them through the
# fitted decision tree.
# input() returns text, so every answer is converted to float straight away:
# scikit-learn's input validation expects numeric data, not a string array, and
# a mistyped value now fails at the offending prompt instead of inside predict().
# The first prompt asks for the number of pregnancies, which is what the
# Pregnancies feature holds.
Pregnancies = float(input('Number of pregnancies:'))
Glucose = float(input('Glucose level:'))
BloodPressure = float(input('Please input -> Blood Pressure level:'))
SkinThickness = float(input('Now input -> Skin Thickness:'))
Insulin = float(input('Now input -> Insulin Level:'))
BMI = float(input('Now input -> BMI:'))
DiabetesPedigreeFunction = float(input('Now input -> Diabetes Pedigree Function:'))
Age = float(input('Now input -> Age:'))
sample = np.array([
    Pregnancies, Glucose, BloodPressure, SkinThickness,
    Insulin, BMI, DiabetesPedigreeFunction, Age,
])
# Pass the sample under the training column names so the model receives the
# same labelled layout it was fitted on.
# NOTE(review): assumes the order above matches xtrain's column order - confirm.
pred = tree.predict(pd.DataFrame([sample], columns=xtrain.columns))
# predict() returns a one-element array; branch on its scalar value.
outcome = pred[0]
if outcome == 0:
    print('Prediction Successfully completed, You have no diabetes')
elif outcome == 1:
    print('Prediction Successfully completed, You have diabetes, Be Care full')
else:
    # Reached when the regressor returns a value other than exactly 0 or 1.
    print('Data is not correct')