# Exploration et prétraitement des données

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

: 

### Importation du jeu de données et nettoyage

In [None]:
water_data = pd.read_csv('water_potability.csv')
water_data.head()

: 

In [None]:
water_data.shape

: 

Suppression des valeurs manquantes

In [None]:
# Turn the textual placeholders 'Missing' and 'NA' into real NaN values so
# that the NaN-based counting and row dropping in the next cells see them.
water_data.replace(['Missing', 'NA'], np.nan, inplace=True)

: 

In [None]:
water_data.isna().sum(axis = 0)

: 

In [None]:
# Discard every row that contains at least one NaN.
water_data.dropna(axis=0, inplace=True)
# Renumber the remaining rows; drop=True discards the old index instead of
# keeping it as an extra column.
water_data.reset_index(drop = True, inplace = True)
water_data.head()

: 

In [None]:
water_data.info()

: 

In [None]:
water_data.columns

: 

In [None]:
# Persist the cleaned table. The row index is written as well (pandas default),
# which is why the reloaded frame has an 'Unnamed: 0' column that is dropped later.
water_data.to_csv('Cleaned_water_data.csv')

: 

## Visualisation des données

In [None]:
clean_water = pd.read_csv('Cleaned_water_data.csv')

: 

In [None]:
clean_water.columns

: 

In [None]:
clean_water.drop(columns = 'Unnamed: 0', axis = 1, inplace = True)

: 

In [None]:
clean_water.head()

: 

In [None]:
clean_water.isnull().sum(axis = 0)

: 

In [None]:
clean_water.Potability.value_counts()

: 

### Visualisation Matplotlib

In [None]:
# Histogram of every column, restricted to the samples with ph > 7.
# The subset lives in its own variable: rebinding `clean_water` in this
# plotting cell would silently remove every row with ph <= 7 from all later
# cells (correlations, feature selection and model training).
ph_above_7 = clean_water[clean_water['ph'] > 7]
ph_above_7.hist(figsize=(12, 10))

: 

In [None]:
clean_water.Potability.value_counts().plot(kind = 'barh')

: 

In [None]:
# Sulfate against Conductivity, one semi-transparent marker per sample,
# coloured by the Potability label.
plt.figure(figsize=(8, 4))
plt.scatter(
    clean_water['Sulfate'],
    clean_water['Conductivity'],
    c=clean_water['Potability'],
    s=100,
    alpha=0.5,
)
plt.xlabel('Sulfate')
plt.ylabel('Conductivity')

: 

In [None]:
# Solids against Organic_carbon, coloured by the Potability label.
plt.figure(figsize=(8, 4))
plt.scatter(
    clean_water['Solids'],
    clean_water['Organic_carbon'],
    c=clean_water['Potability'],
)
plt.xlabel('Solids')
plt.ylabel('Organic_carbon')

: 

In [None]:
correl = clean_water.corr()
correl.round(2)

: 

In [None]:
# Heatmap of the pairwise correlation matrix computed in the previous cell.
plt.figure(figsize = (10, 8))
# center = 0 anchors the middle of the 'RdBu' colour map at zero correlation.
sns.heatmap(correl, cmap = 'RdBu', center = 0, linewidths = 0.5) 
# Alternative kept for reference: also pin the colour scale to the full [-1, 1] range.
# sns.heatmap(correl, cmap = 'RdBu', vmin = -1, vmax = 1, center = 0, linewidths = 0.5) 
plt.title('heatmap des correlations lineaires croisées')

: 

In [None]:
# One histogram per feature on a 3x3 grid (seven of the nine slots are used).
# Each entry is (column, legend label, bar colour); a colour of None keeps
# matplotlib's default.
histogram_specs = [
    ('ph', 'PH', 'red'),
    ('Hardness', 'Hardness', None),
    ('Solids', 'Solids', None),
    ('Chloramines', 'Chloramines', 'g'),
    ('Sulfate', 'Sulfate', None),
    ('Conductivity', 'Conductivity', None),
    ('Organic_carbon', 'Organic_carbon', None),
]

plt.figure(figsize=(14, 10))
for position, (column, label, colour) in enumerate(histogram_specs, start=1):
    plt.subplot(3, 3, position)
    plt.hist(clean_water[column], color=colour, label=label)
    # A single legend call per subplot is enough.
    plt.legend()

: 

In [None]:
# fig, axs = plt.subplots(3, 3)
# for i in range(1, 9):
#     plt.figure(figsize=(14,10))
#     plt.subplot(3, 3, i)
#     plt.hist(X_.iloc[:, i], label = "Courbe de "+X_.columns[i])
#     plt.legend()
#     plt.grid()

: 

## Opération de preprocessing

### Preprocessing

Nous appliquerons quelques opérations de preprocessing sur notre dataset, à savoir la normalisation et la sélection de variables.

In [None]:
from sklearn.feature_selection import chi2, SelectKBest, SelectFromModel
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import MinMaxScaler, RobustScaler # utilisé pour effectuer de la normalisation
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, learning_curve

: 

In [None]:
clean_water.head()

: 

In [None]:
X_ = clean_water.drop(columns = 'Potability', axis = 1)
y_  = clean_water['Potability']

: 

### Opération de sélection

In [None]:
clean_water.head()

: 

In [None]:
# Variance of each candidate feature column.
X_.var()

# Observation: ph, Chloramines and Turbidity vary very little.

: 

Premier test avec SGDClassifier

In [None]:
# Model-based selection: fit a linear SGD classifier and use the mean feature
# importance as the selection threshold.
sgd_estimator = SGDClassifier(random_state=0)
select = SelectFromModel(sgd_estimator, threshold='mean')
select.fit_transform(X_, y_)
# Boolean mask over the columns of X_: True where the feature was selected.
select.get_support()

: 

In [None]:
X_.columns[select.get_support()]

: 

2e test avec SelectKBest

In [None]:
chi2(X_, y_)

: 

In [None]:
# Univariate selection: keep the 4 features with the best chi-squared score
# against the target.
selector2 = SelectKBest(score_func=chi2, k=4)
selector2.fit_transform(X_, y_)
# Boolean mask over the columns of X_: True where the feature was selected.
selector2.get_support()

: 

In [None]:
df = X_.var(axis = 0)
df.to_frame()

: 

### Graphes pour afficher les colonnes qui varient très peu

In [None]:
# Every feature drawn as a line over the row index, zoomed on index 0-100 and
# on the 2.5-50 value band.
plt.figure(figsize=(10, 8))
plt.plot(X_)
plt.xlim(0, 100)
plt.ylim(2.5, 50)
plt.legend(list(X_.columns))

: 

In [None]:
# Same line plot as the previous cell, this time with the value axis widened
# to 0-600.
plt.figure(figsize=(10, 8))
plt.plot(X_)
plt.xlim(0, 100)
plt.ylim(0, 600)
plt.legend(list(X_.columns))

: 

In [None]:
# The selected variables that will be used to train the model are:

X_.columns[selector2.get_support()]

: 

In [None]:
# Remove, in place, the features that are not kept for training.
# NOTE(review): the list is hard-coded, presumably from the SelectKBest mask
# displayed above — confirm it still matches if the data or k changes.
X_.drop(columns = ['ph','Organic_carbon', 'Conductivity', 'Trihalomethanes', 'Turbidity'], axis = 1, inplace = True)

: 

In [None]:
X_.head()

: 

In [None]:
X_.shape

: 

In [None]:
y_.head()

: 

In [None]:
X = np.array(X_)
y = np.array(y_)

: 

In [None]:
print(type(X))
print(type(y))

: 

In [None]:
print(X.shape)
print(y.shape)

: 

### Opération de normalisation

In [None]:
# Scale the selected features with RobustScaler.
# NOTE(review): the scaler is fitted on the full matrix before the train/test
# split, and the fitted instance is not kept, so new samples cannot be scaled
# the same way later — consider fitting on the training split and storing it.
X__ = RobustScaler().fit_transform(X)
X__


: 

In [None]:
# select.estimator_.coef_

: 

## Machine Learning : Entrainement du modèle qui servira à faire de la prédiction

Creation des variables de test et des variables d'entrainement avec train_test_split()

In [None]:
# Hold out 20% of the samples for testing. The fixed seed makes the split, and
# therefore every score computed from it, reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(X__, y, test_size=0.2, random_state=0)

: 

### Voting

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier

# Three different base learners.
model_1 = SGDClassifier()
model_2 = DecisionTreeClassifier()
model_3 = KNeighborsClassifier(n_neighbors=3)

# Ensemble combining the three base learners with voting='hard'.
named_estimators = [('SGD', model_1), ('Tree', model_2), ('KNN', model_3)]
model_4 = VotingClassifier(named_estimators, voting='hard')

# Fit each model and print its class name with its test-set score.
for classifier in [model_1, model_2, model_3, model_4]:
    classifier.fit(X_train, y_train)
    accuracy = classifier.score(X_test, y_test)
    print(type(classifier).__name__, accuracy)

: 

### Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

: 

In [None]:
# Bagging ensemble of 100 k-nearest-neighbours classifiers.
# The base estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, so the positional form works on both sides of the rename.
model = BaggingClassifier(KNeighborsClassifier(), n_estimators=100)
model.fit(X_train, y_train)
model.score(X_test, y_test)

: 

### Boosting

In [None]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

: 

In [None]:
model = AdaBoostClassifier(n_estimators = 100)
model.fit(X_train, y_train)
model.score(X_test, y_test)

: 

In [None]:
print(X_train.shape)
print(X_test.shape)

: 

In [None]:
# Baseline k-nearest-neighbours classifier with default hyper-parameters.
model = KNeighborsClassifier()
model.fit(X_train, y_train) # Train the model
model.score(X_test, y_test) # Check the reliability of the model on the test set

: 

In [None]:
print('Train score : ',model.score(X_train, y_train))
print('Test score : ',model.score(X_test, y_test))

: 

### Validation Set

### Opération de Cross Validation pour valider notre modele

In [None]:
cross_val_score(KNeighborsClassifier(), X_train, y_train, cv = 5, scoring = 'accuracy').mean()

: 

In [None]:
# Mean 5-fold cross-validation score of k-NN for k = 1..49 neighbours.
# Author's observation: the best performance is reached with roughly 15-18
# neighbours.
neighbor_counts = range(1, 50)
val_score = [
    cross_val_score(KNeighborsClassifier(k), X_train, y_train, cv=5).mean()
    for k in neighbor_counts
]

# Plot against the actual k values: with no explicit x, the curve is drawn
# against list positions 0..48 and every k reads one too low on the axis.
plt.plot(neighbor_counts, val_score, label='validation')
plt.xlabel('n_neighbors')
plt.legend()

: 

### Validation Curve

In [None]:
# model = KNeighborsClassifier()

# k = np.arange(1, 50)
# train_score, val_score = validation_curve(model, X_train, y_train, k, cv = 5)
# val_score.shape

: 

### Amélioration du modèle avec GridSearchCV (permet de trouver les meilleurs hyperparamètres)

In [None]:
# Hyper-parameter grid for k-NN: 1..19 neighbours crossed with two distance
# metrics. 'euclidean' is the spelling scikit-learn expects; a misspelt metric
# name makes every fit for that candidate fail.
param_grid = {
    'n_neighbors': np.arange(1, 20),
    'metric': ['euclidean', 'manhattan'],
}

# Exhaustive search over the grid with 5-fold cross-validation.
grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)

grid.fit(X_train, y_train)

: 

In [None]:
print('Meilleur score : ', grid.best_score_)
print('Meilleurs parametre : ', grid.best_params_)

: 

In [None]:
modeell = grid.best_estimator_

: 

In [None]:
modeell.score(X_test, y_test)

: 

### Learning Curves

In [None]:
# Learning curve of the tuned model: scores for 10 training-set sizes spread
# from 5% to 100% of the training data, each evaluated with 5-fold CV.
N, train_score, val_score = learning_curve(
    modeell,
    X_train,
    y_train,
    train_sizes=np.linspace(0.05, 1.0, 10),
    cv=5,
)

# Average over the folds (axis 1) to get one point per training size.
mean_train = train_score.mean(axis=1)
mean_val = val_score.mean(axis=1)
plt.plot(N, mean_train, label='train')
plt.plot(N, mean_val, label='Validation')
plt.xlabel('Train sizes')
plt.legend()
print(N)

: 

In [None]:
X_.head()

: 

In [None]:
def potable(modeell, Hardness, Solids, Chloramines, Sulfate, scaler=None):
    """Predict the potability of a single water sample.

    Args:
        modeell: Fitted classifier exposing ``predict`` and ``predict_proba``.
        Hardness: Hardness value of the sample.
        Solids: Solids value of the sample.
        Chloramines: Chloramines value of the sample.
        Sulfate: Sulfate value of the sample.
        scaler: Optional fitted transformer exposing ``transform``. The models
            in this notebook are trained on RobustScaler-transformed features,
            so pass the same fitted scaler to feed raw measurements; when
            omitted, the values are used as given.

    Returns:
        A tuple ``(prediction, probabilities)`` as returned by
        ``modeell.predict`` and ``modeell.predict_proba`` for the sample.
    """
    # Single sample laid out as one row of four features.
    H = np.array([Hardness, Solids, Chloramines, Sulfate]).reshape(1, 4)
    if scaler is not None:
        H = scaler.transform(H)
    # Both outputs come from the model passed in, not from a notebook global.
    return modeell.predict(H), modeell.predict_proba(H)

: 

In [None]:
potable(modeell, 20.34, 2500.5467, 5.5, 28.98)

: 