# Projet prédiction diabète

## Import des librairies

In [None]:
# Analyse
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Evaluation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

## Import des données

In [None]:
# Load the dataset from "diabetes.csv" (path is relative to the working directory).
df = pd.read_csv("diabetes.csv")
# Bare expression: shown as the cell output when run in a notebook.
df

## Analyse du dataset

In [None]:
# Print a concise summary of the frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Descriptive statistics for the frame's columns.
df.describe()

In [None]:
# One box plot per column, skipping the last column of the frame,
# laid out on a 2 x 4 grid of axes.
fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(20, 10))

for ax, name in zip(axes.flat, df.columns[:-1]):
    ax.boxplot(df[name])
    ax.set_title(name)

## Gestion des valeurs aberrantes

In [None]:
# Replace zeros with NaN in the columns below, so that they are handled as
# missing values rather than as real measurements.
# np.nan is used instead of the np.NaN alias: the alias was removed in
# NumPy 2.0 and raises AttributeError there.
for _col in ['BMI', 'BloodPressure', 'SkinThickness', 'Insulin', 'Glucose']:
    df[_col] = df[_col].replace(0, np.nan)

In [None]:
# Fill each NaN with the mean of its column, computed among the rows that
# share the same Outcome value.
# NOTE(review): the fill value depends on the target and is computed on the
# whole frame before the train/test split, so label information reaches the
# features of the evaluation rows — confirm this is intended.
for name in ('BMI', 'BloodPressure', 'SkinThickness', 'Insulin', 'Glucose'):
    df[name] = df.groupby('Outcome')[name].transform(lambda s: s.fillna(s.mean()))

## Tests Modèles

### Train split

In [None]:
# Target: the Outcome column.
y = df['Outcome']

# Features: every column except Outcome.
x = df.drop(columns=['Outcome'])

In [None]:
# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so that re-running the notebook reproduces
# the same split (and therefore comparable scores between models);
# stratify=y keeps the Outcome class proportions the same in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

### Log Reg

In [None]:
# Logistic regression with default settings.
logReg = LogisticRegression()

# Fit on the training split only. Fitting on x_test / y_test and then
# predicting x_test would score the model on the very rows it learned from.
logReg.fit(x_train, y_train)

# Predict the held-out rows and report the confusion matrix and per-class metrics.
y_pred = logReg.predict(x_test)

print(f"Matrice de confusion: \n{confusion_matrix(y_test, y_pred)}\n")

print(classification_report(y_test, y_pred))

### Random Forest

In [None]:
# Random forest with 100 trees, fitted on the training split
# (fit returns the estimator itself, so the call can be chained).
forest = RandomForestClassifier(n_estimators=100).fit(x_train, y_train)

# Predict the held-out rows, then print the confusion matrix and per-class metrics.
y_pred = forest.predict(x_test)
print(f"Matrice de confusion: \n{confusion_matrix(y_test, y_pred)}\n")
print(classification_report(y_test, y_pred))

In [None]:
# Pair each feature column (every column except Outcome) with the importance
# reported by the fitted forest.
importances = forest.feature_importances_
colones = df.columns.drop('Outcome').tolist()

imp_df = pd.DataFrame({'feature': colones, 'importance': importances})

# Most important feature first.
imp_df = imp_df.sort_values('importance', ascending=False)

# Bar chart with one bar per feature and vertical tick labels.
imp_df.plot.bar(x='feature', y='importance', rot=90)

### SVM

In [None]:
# Support vector classifier with default settings, fitted on the training split.
clf = SVC().fit(x_train, y_train)

# Predict the held-out rows, then print the confusion matrix and per-class metrics.
y_pred = clf.predict(x_test)
print(f"Matrice de confusion: \n{confusion_matrix(y_test, y_pred)}\n")
print(classification_report(y_test, y_pred))

### Gradient Boosting

In [None]:
# Gradient-boosted trees with default settings, fitted on the training split.
# Despite the variable name, this is scikit-learn's GradientBoostingClassifier,
# not the XGBoost library.
xg_boost = GradientBoostingClassifier().fit(x_train, y_train)

# Predict the held-out rows, then print the confusion matrix and per-class metrics.
y_pred = xg_boost.predict(x_test)
print(f"Matrice de confusion: \n{confusion_matrix(y_test, y_pred)}\n")
print(classification_report(y_test, y_pred))

### KNeighborsClassifier

In [None]:
# Choose n_neighbors (1..19) by 5-fold cross-validated accuracy on the
# training split. The held-out split is deliberately not consulted here:
# picking k on x_test and then reporting metrics on x_test would make those
# metrics optimistic.
best_score = 0
best_n = 1  # fallback so best_n is always bound, even if no candidate scores above 0
for i in range(1, 20):
    knn = KNeighborsClassifier(n_neighbors=i)
    cv_accuracy = cross_val_score(knn, x_train, y_train, cv=5, scoring="accuracy").mean()

    # Accuracy as a percentage, rounded to one decimal place.
    score = round(cv_accuracy, 3) * 100

    # Strict '>' keeps the smallest k among equal scores.
    if score > best_score:
        best_score = score
        best_n = i

print(best_n)

In [None]:
# k-nearest-neighbours classifier using the selected best_n, fitted on the training split.
knn = KNeighborsClassifier(n_neighbors=best_n).fit(x_train, y_train)

# Predict the held-out rows, then print the confusion matrix and per-class metrics.
y_pred = knn.predict(x_test)
print(f"Matrice de confusion: \n{confusion_matrix(y_test, y_pred)}\n")
print(classification_report(y_test, y_pred))