#*AUTHORS : COUTAREL Allan, DEVOUCOUX Maxime*

## IMPORT OF LIBRARIES AND METHOD DEFINITIONS

In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
import random
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline

from bagging import Bagging
from decisionTree import DecisionTree

le = LabelEncoder()

In [2]:
def getKNeighborsClassifier(X_train, y_train) :
  """
    KNeighbors Classifier

    Parameters
    ----------
    X_train : dataset
      training features
    y_train : dataset
      training labels
    
    Returns
    -------
    sklearn KNeighbors model
      Model after training
  """
  model = KNeighborsClassifier()
  model.fit(X_train, y_train)
  print("Accuracy of the model (with training data) :", model.score(X_train, y_train))
  return model

In [3]:
def getSVClassifier(X_train, y_train):
  """
    Support Vector Classifier

    Parameters
    ----------
    X_train : dataset
      training features
    y_train : dataset
      training labels
    
    Returns
    -------
    sklearn Support Vector model
      Model after training
  """
  model = svm.SVC(kernel='linear')
  model.fit(X_train, y_train)
  print("Accuracy of the model (with training data) :", model.score(X_train, y_train))
  return model

In [4]:
def getGaussianNBClassifier(X_train, y_train):
  """
    Gaussian Naive Bayes Classifier

    Parameters
    ----------
    X_train : dataset
      training features
    y_train : dataset
      training labels
    
    Returns
    -------
    sklearn Gaussian Naive Bayes model
      Model after training
  """
  model = GaussianNB()
  model.fit(X_train, y_train)
  print("Accuracy of the model (with training data) :", model.score(X_train, y_train))
  return model

In [5]:
def getLogisticRegressionClassifier(X_train, y_train):
  """
    Logistic Regression

    Parameters
    ----------
    X_train : dataset
      training features
    y_train : dataset
      training labels
    
    Returns
    -------
    sklearn Logistic Regression model
      Model after training
  """
  model = LogisticRegression()
  model.fit(X_train, y_train)
  print("Accuracy of the model (with training data) :", model.score(X_train, y_train))
  return model

In [6]:
def evaluateModel(model, X_test, y_test, isBinary):
  """
    Evaluates the model with 5 metrics and returns expected values and predicted values in a dataframe

    Parameters
    ----------
    model : sklearn model
      model used to make the predictions
    X_test : dataset
      testing features
    y_test : dataset
      testing labels
    isBinary : boolean
      True for binary classes, False for multi classes

    Returns
    -------
    dataframe
      expected values and predicted values in a dataframe
  """
  y_pred = pd.DataFrame(model.predict(X_test))
  print("Accuracy :", metrics.accuracy_score(y_test, y_pred))
  print("Balanced accuracy :", metrics.balanced_accuracy_score(y_test, y_pred))
  print("Confusion matrix :", metrics.confusion_matrix(y_test, y_pred))
  if isBinary:
    print("Precision :", metrics.precision_score(y_test, y_pred))
    print("Recall :", metrics.recall_score(y_test, y_pred))
  else:
    print("Precision :", metrics.precision_score(y_test, y_pred, average='macro', zero_division=0))
    print("Recall :", metrics.recall_score(y_test, y_pred, average='macro', zero_division=0))
  return get_Results_Dataframe(y_pred, y_test)

In [7]:
def get_Results_Dataframe(predict, y_test):
  """
    Makes a dataframe with the expected values and the predicted values 

    Parameters
      ----------
      predict : dataset
        predicted values
      y_test : dataset
        testing labels = expected values
    
    Returns
    -------
    dataframe
      expected values and predicted values in a dataframe
  """
  predict.set_index(y_test.index, inplace=True)
  result = pd.concat([y_test, predict], axis=1)
  result.rename({result.columns[0]: 'y_test', result.columns[1]: 'y_pred'}, axis=1, inplace=True)
  return result

In [8]:
def printResults(data, nbRowsMin, nbRowsMax):
  """
    Prints the dataframe and the occurrence of dataset elements

    Parameters
      ----------
      data : dataset
        dataset to print
      nbRowsMin : int
        minimum of rows to print
      nbRowsMax : int
        maximum of rows to print
      
    Returns
    -------
  """
  pd.set_option("display.min_rows", nbRowsMin, "display.max_rows", nbRowsMax, "display.max_columns", None)
  print(data)

## H1N1 DATASET

In [9]:
"""
H1N1 flu vaccines dataset import
"""
X_vaccines = pd.read_csv('https://raw.githubusercontent.com/a-coutarel/Projet-4A-IA/main/vaccines_data/training_set_features.csv')  
y_vaccines = pd.read_csv('https://raw.githubusercontent.com/a-coutarel/Projet-4A-IA/main/vaccines_data/training_set_labels.csv')
dataset_vaccines = pd.merge(X_vaccines, y_vaccines, on=['respondent_id','respondent_id'])

In [10]:
"""
Counts the occurrences of each class
"""
y_vaccines['h1n1_vaccine'].value_counts()

0    21033
1     5674
Name: h1n1_vaccine, dtype: int64

In [11]:
"""
Converts categorical variables into numbers
"""
dataset_vaccines['sex'] = le.fit_transform(dataset_vaccines['sex'])
dataset_vaccines['race'] = le.fit_transform(dataset_vaccines['race'])
dataset_vaccines['age_group'] = le.fit_transform(dataset_vaccines['age_group'])
dataset_vaccines['education'] = le.fit_transform(dataset_vaccines['education'])
dataset_vaccines['income_poverty'] = le.fit_transform(dataset_vaccines['income_poverty'])
dataset_vaccines['marital_status'] = le.fit_transform(dataset_vaccines['marital_status'])
dataset_vaccines['rent_or_own'] = le.fit_transform(dataset_vaccines['rent_or_own'])
dataset_vaccines['employment_status'] = le.fit_transform(dataset_vaccines['employment_status'])
dataset_vaccines['census_msa'] = le.fit_transform(dataset_vaccines['census_msa'])
dataset_vaccines['hhs_geo_region'] = le.fit_transform(dataset_vaccines['hhs_geo_region'])
dataset_vaccines['employment_industry'] = le.fit_transform(dataset_vaccines['employment_industry'])
dataset_vaccines['employment_occupation'] = le.fit_transform(dataset_vaccines['employment_occupation'])

"""
Deletes dataset lines which contains missing values (NaN) and splitting features and labels
"""
dataset_vaccines = dataset_vaccines.dropna()
y_vaccines = dataset_vaccines.pop('h1n1_vaccine')
X_vaccines = dataset_vaccines

In [14]:
"""
Train/test split and removal of outliers + model training and testing 
"""
X_train_vaccines, X_test_vaccines, y_train_vaccines, y_test_vaccines = train_test_split(X_vaccines, y_vaccines, test_size=0.3, random_state=42)

IF = IsolationForest(random_state=42)
IF.fit(X_train_vaccines)
y_pred = IF.predict(X_test_vaccines)
X_test_vaccines = X_test_vaccines[y_pred != -1]
y_test_vaccines = y_test_vaccines[y_pred != -1]

scaler = StandardScaler().fit(X_train_vaccines)
X_train_vaccines = scaler.transform(X_train_vaccines)
X_test_vaccines = scaler.transform(X_test_vaccines)

"""
KNN Classifier for H1N1 vaccines
"""
print("\nKNN CLASSIFIER :\n")
KNNmodel = getKNeighborsClassifier(X_train_vaccines, y_train_vaccines)
KNN = evaluateModel(KNNmodel, X_test_vaccines, y_test_vaccines, True)
#printResults(KNN, 0, 15)

"""
SVC Classifier for H1N1 vaccines
"""
print("\nSVC CLASSIFIER :\n")
SVCmodel = getSVClassifier(X_train_vaccines, y_train_vaccines)
SVC = evaluateModel(SVCmodel, X_test_vaccines, y_test_vaccines, True)
#printResults(SVC, 0, 15)

"""
GNB Classifier for H1N1 vaccines
"""
print("\nGNB CLASSIFIER :\n")
GNBmodel = getGaussianNBClassifier(X_train_vaccines, y_train_vaccines)
GNB = evaluateModel(GNBmodel, X_test_vaccines, y_test_vaccines, True)
#printResults(GNB, 0, 15)

"""
LR Classifier for H1N1 vaccines
"""
print("\nLR CLASSIFIER :\n")
LRmodel = getLogisticRegressionClassifier(X_train_vaccines, y_train_vaccines)
LR = evaluateModel(LRmodel, X_test_vaccines, y_test_vaccines, True)
#printResults(LR, 0, 15)

"""
Bagging of decision trees for H1N1 vaccines
"""
print("\nBAGGING CLASSIFIER :\n")
bagging = Bagging(10, .02, 15).fit(X_train_vaccines, y_train_vaccines)
bagging = evaluateModel(bagging, X_test_vaccines, y_test_vaccines, False)
#printResults(bagging, 0, 15)


KNN CLASSIFIER :

Accuracy of the model (with training data) : 0.8537127141950497
Accuracy : 0.808532249873032
Balanced accuracy : 0.7019666720770438
Confusion matrix : [[1348  116]
 [ 261  244]]
Precision : 0.6777777777777778
Recall : 0.48316831683168315

SVC CLASSIFIER :

Accuracy of the model (with training data) : 0.8422889782102814
Accuracy : 0.8481462671406805
Balanced accuracy : 0.7655744467889412
Confusion matrix : [[1369   95]
 [ 204  301]]
Precision : 0.76010101010101
Recall : 0.596039603960396

GNB CLASSIFIER :

Accuracy of the model (with training data) : 0.7743812143008251
Accuracy : 0.7836465210766886
Balanced accuracy : 0.7442514743277606
Confusion matrix : [[1208  256]
 [ 170  335]]
Precision : 0.5668358714043993
Recall : 0.6633663366336634

LR CLASSIFIER :

Accuracy of the model (with training data) : 0.8416543262111276
Accuracy : 0.845606907059421
Balanced accuracy : 0.7619210896499486
Confusion matrix : [[1367   97]
 [ 207  298]]
Precision : 0.7544303797468355
Recal

  prop = prop / len(partition)


Accuracy : 0.8252920264093448
Balanced accuracy : 0.7638255423903046
Confusion matrix : [[1303  161]
 [ 183  322]]
Precision : 0.7717586361597129
Recall : 0.7638255423903046


In [None]:
"""
KNN Classifier with kfolds for H1N1 vaccines
"""
kf = KFold(n_splits=5, shuffle=True, random_state=42)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', KNeighborsClassifier())
])
scoring = ['accuracy', 'balanced_accuracy', 'precision', 'recall']
scores = cross_validate(pipe, X=X_vaccines, y=y_vaccines, cv=kf, scoring=scoring, return_train_score=True)
print("Accuracy :", scores['test_accuracy'])
print("Mean Accuracy :", scores['test_accuracy'].mean())
print("Balanced accuracy :", scores['test_balanced_accuracy'])
print("Mean Balanced accuracy :", scores['test_balanced_accuracy'].mean())
print("Precision :", scores['test_precision'])
print("Mean Precision :", scores['test_precision'].mean())
print("Recall :", scores['test_recall'])
print("Mean Recall :", scores['test_recall'].mean())

In [None]:
"""
SVC Classifier with kfolds for H1N1 vaccines
"""
kf = KFold(n_splits=5, shuffle=True, random_state=42)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', svm.SVC(kernel='linear'))
])
scoring = ['accuracy', 'balanced_accuracy', 'precision', 'recall']
scores = cross_validate(pipe, X=X_vaccines, y=y_vaccines, cv=kf, scoring=scoring, return_train_score=True)
print("Accuracy :", scores['test_accuracy'])
print("Mean Accuracy :", scores['test_accuracy'].mean())
print("Balanced accuracy :", scores['test_balanced_accuracy'])
print("Mean Balanced accuracy :", scores['test_balanced_accuracy'].mean())
print("Precision :", scores['test_precision'])
print("Mean Precision :", scores['test_precision'].mean())
print("Recall :", scores['test_recall'])
print("Mean Recall :", scores['test_recall'].mean())

In [None]:
"""
GNB Classifier with kfolds for H1N1 vaccines
"""
kf = KFold(n_splits=5, shuffle=True, random_state=42)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', GaussianNB())
])
scoring = ['accuracy', 'balanced_accuracy', 'precision', 'recall']
scores = cross_validate(pipe, X=X_vaccines, y=y_vaccines, cv=kf, scoring=scoring, return_train_score=True)
print("Accuracy :", scores['test_accuracy'])
print("Mean Accuracy :", scores['test_accuracy'].mean())
print("Balanced accuracy :", scores['test_balanced_accuracy'])
print("Mean Balanced accuracy :", scores['test_balanced_accuracy'].mean())
print("Precision :", scores['test_precision'])
print("Mean Precision :", scores['test_precision'].mean())
print("Recall :", scores['test_recall'])
print("Mean Recall :", scores['test_recall'].mean())

In [None]:
"""
LR Classifier with kfolds for H1N1 vaccines
"""
kf = KFold(n_splits=5, shuffle=True, random_state=42)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression())
])
scoring = ['accuracy', 'balanced_accuracy', 'precision', 'recall']
scores = cross_validate(pipe, X=X_vaccines, y=y_vaccines, cv=kf, scoring=scoring, return_train_score=True)
print("Accuracy :", scores['test_accuracy'])
print("Mean Accuracy :", scores['test_accuracy'].mean())
print("Balanced accuracy :", scores['test_balanced_accuracy'])
print("Mean Balanced accuracy :", scores['test_balanced_accuracy'].mean())
print("Precision :", scores['test_precision'])
print("Mean Precision :", scores['test_precision'].mean())
print("Recall :", scores['test_recall'])
print("Mean Recall :", scores['test_recall'].mean())

In [None]:
"""
Bagging Classifier with kfolds for H1N1 vaccines
"""
kf = KFold(n_splits=5, shuffle=True, random_state=42)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', Bagging(10, .02, 15))
])
scoring = ['accuracy', 'balanced_accuracy', 'precision', 'recall']
scores = cross_validate(pipe, X=X_vaccines, y=y_vaccines, cv=kf, scoring=scoring, return_train_score=True)
print("Accuracy :", scores['test_accuracy'])
print("Mean Accuracy :", scores['test_accuracy'].mean())
print("Balanced accuracy :", scores['test_balanced_accuracy'])
print("Mean Balanced accuracy :", scores['test_balanced_accuracy'].mean())
print("Precision :", scores['test_precision'])
print("Mean Precision :", scores['test_precision'].mean())
print("Recall :", scores['test_recall'])
print("Mean Recall :", scores['test_recall'].mean())

In [None]:
"""
KNN Classifier with Stratified kfolds for H1N1 vaccines
"""
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', KNeighborsClassifier())
])
scoring = ['accuracy', 'balanced_accuracy', 'precision', 'recall']
scores = cross_validate(pipe, X=X_vaccines, y=y_vaccines, cv=skf, scoring=scoring, return_train_score=True)
print("Accuracy :", scores['test_accuracy'])
print("Mean Accuracy :", scores['test_accuracy'].mean())
print("Balanced accuracy :", scores['test_balanced_accuracy'])
print("Mean Balanced accuracy :", scores['test_balanced_accuracy'].mean())
print("Precision :", scores['test_precision'])
print("Mean Precision :", scores['test_precision'].mean())
print("Recall :", scores['test_recall'])
print("Mean Recall :", scores['test_recall'].mean())

In [None]:
"""
SVC Classifier with Stratified kfolds for H1N1 vaccines
"""
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', svm.SVC(kernel='linear'))
])
scoring = ['accuracy', 'balanced_accuracy', 'precision', 'recall']
scores = cross_validate(pipe, X=X_vaccines, y=y_vaccines, cv=skf, scoring=scoring, return_train_score=True)
print("Accuracy :", scores['test_accuracy'])
print("Mean Accuracy :", scores['test_accuracy'].mean())
print("Balanced accuracy :", scores['test_balanced_accuracy'])
print("Mean Balanced accuracy :", scores['test_balanced_accuracy'].mean())
print("Precision :", scores['test_precision'])
print("Mean Precision :", scores['test_precision'].mean())
print("Recall :", scores['test_recall'])
print("Mean Recall :", scores['test_recall'].mean())

In [None]:
"""
GNB Classifier with Stratified kfolds for H1N1 vaccines
"""
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', GaussianNB())
])
scoring = ['accuracy', 'balanced_accuracy', 'precision', 'recall']
scores = cross_validate(pipe, X=X_vaccines, y=y_vaccines, cv=skf, scoring=scoring, return_train_score=True)
print("Accuracy :", scores['test_accuracy'])
print("Mean Accuracy :", scores['test_accuracy'].mean())
print("Balanced accuracy :", scores['test_balanced_accuracy'])
print("Mean Balanced accuracy :", scores['test_balanced_accuracy'].mean())
print("Precision :", scores['test_precision'])
print("Mean Precision :", scores['test_precision'].mean())
print("Recall :", scores['test_recall'])
print("Mean Recall :", scores['test_recall'].mean())

In [None]:
"""
LR Classifier with Stratified kfolds for H1N1 vaccines
"""
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression())
])
scoring = ['accuracy', 'balanced_accuracy', 'precision', 'recall']
scores = cross_validate(pipe, X=X_vaccines, y=y_vaccines, cv=skf, scoring=scoring, return_train_score=True)
print("Accuracy :", scores['test_accuracy'])
print("Mean Accuracy :", scores['test_accuracy'].mean())
print("Balanced accuracy :", scores['test_balanced_accuracy'])
print("Mean Balanced accuracy :", scores['test_balanced_accuracy'].mean())
print("Precision :", scores['test_precision'])
print("Mean Precision :", scores['test_precision'].mean())
print("Recall :", scores['test_recall'])
print("Mean Recall :", scores['test_recall'].mean())

In [None]:
"""
Bagging Classifier with Stratified kfolds for H1N1 vaccines
"""
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', Bagging(10, .02, 15))
])
scoring = ['accuracy', 'balanced_accuracy', 'precision', 'recall']
scores = cross_validate(pipe, X=X_vaccines, y=y_vaccines, cv=skf, scoring=scoring, return_train_score=True)
print("Accuracy :", scores['test_accuracy'])
print("Mean Accuracy :", scores['test_accuracy'].mean())
print("Balanced accuracy :", scores['test_balanced_accuracy'])
print("Mean Balanced accuracy :", scores['test_balanced_accuracy'].mean())
print("Precision :", scores['test_precision'])
print("Mean Precision :", scores['test_precision'].mean())
print("Recall :", scores['test_recall'])
print("Mean Recall :", scores['test_recall'].mean())

## WATER PUMPS DATASET

In [None]:
"""
water pumps dataset import
"""
X_water_pumps = pd.read_csv('https://raw.githubusercontent.com/a-coutarel/Projet-4A-IA/main/water_pumps_data/training_set_values.csv')  
y_water_pumps = pd.read_csv('https://raw.githubusercontent.com/a-coutarel/Projet-4A-IA/main/water_pumps_data/training_set_labels.csv')
dataset_water_pumps = pd.merge(X_water_pumps, y_water_pumps, on=['id','id'])

In [None]:
"""
Counts the occurrences of each class
"""
y_water_pumps['status_group'].value_counts()

In [None]:
"""
Converts categorical variables into numbers
"""
dataset_water_pumps['date_recorded'] = le.fit_transform(dataset_water_pumps['date_recorded'])
dataset_water_pumps['funder'] = le.fit_transform(dataset_water_pumps['funder'])
dataset_water_pumps['installer'] = le.fit_transform(dataset_water_pumps['installer'])
dataset_water_pumps['wpt_name'] = le.fit_transform(dataset_water_pumps['wpt_name'])
dataset_water_pumps['basin'] = le.fit_transform(dataset_water_pumps['basin'])
dataset_water_pumps['subvillage'] = le.fit_transform(dataset_water_pumps['subvillage'])
dataset_water_pumps['region'] = le.fit_transform(dataset_water_pumps['region'])
dataset_water_pumps['lga'] = le.fit_transform(dataset_water_pumps['lga'])
dataset_water_pumps['ward'] = le.fit_transform(dataset_water_pumps['ward'])
dataset_water_pumps['public_meeting'] = le.fit_transform(dataset_water_pumps['public_meeting'])
dataset_water_pumps['recorded_by'] = le.fit_transform(dataset_water_pumps['recorded_by'])
dataset_water_pumps['scheme_management'] = le.fit_transform(dataset_water_pumps['scheme_management'])
dataset_water_pumps['scheme_name'] = le.fit_transform(dataset_water_pumps['scheme_name'])
dataset_water_pumps['permit'] = le.fit_transform(dataset_water_pumps['permit'])
dataset_water_pumps['extraction_type'] = le.fit_transform(dataset_water_pumps['extraction_type'])
dataset_water_pumps['extraction_type_group'] = le.fit_transform(dataset_water_pumps['extraction_type_group'])
dataset_water_pumps['extraction_type_class'] = le.fit_transform(dataset_water_pumps['extraction_type_class'])
dataset_water_pumps['management'] = le.fit_transform(dataset_water_pumps['management'])
dataset_water_pumps['management_group'] = le.fit_transform(dataset_water_pumps['management_group'])
dataset_water_pumps['payment'] = le.fit_transform(dataset_water_pumps['payment'])
dataset_water_pumps['payment_type'] = le.fit_transform(dataset_water_pumps['payment_type'])
dataset_water_pumps['water_quality'] = le.fit_transform(dataset_water_pumps['water_quality'])
dataset_water_pumps['quality_group'] = le.fit_transform(dataset_water_pumps['quality_group'])
dataset_water_pumps['quantity'] = le.fit_transform(dataset_water_pumps['quantity'])
dataset_water_pumps['quantity_group'] = le.fit_transform(dataset_water_pumps['quantity_group'])
dataset_water_pumps['source'] = le.fit_transform(dataset_water_pumps['source'])
dataset_water_pumps['source_type'] = le.fit_transform(dataset_water_pumps['source_type'])
dataset_water_pumps['source_class'] = le.fit_transform(dataset_water_pumps['source_class'])
dataset_water_pumps['waterpoint_type'] = le.fit_transform(dataset_water_pumps['waterpoint_type'])
dataset_water_pumps['waterpoint_type_group'] = le.fit_transform(dataset_water_pumps['waterpoint_type_group'])
dataset_water_pumps['status_group'] = le.fit_transform(dataset_water_pumps['status_group'])

"""
Deletes dataset lines which contains missing values (NaN) and splitting features and labels
"""
dataset_water_pumps = dataset_water_pumps.dropna()
y_water_pumps = dataset_water_pumps.pop('status_group')
X_water_pumps = dataset_water_pumps

In [None]:
"""
Train/test split and removal of outliers + model training and testing 
"""
X_train_wp, X_test_wp, y_train_wp, y_test_wp = train_test_split(X_water_pumps, y_water_pumps, test_size=0.3, random_state=42)

IF = IsolationForest(random_state=42)
IF.fit(X_train_wp)
y_pred = IF.predict(X_test_wp)
X_test_wp = X_test_wp[y_pred != -1]
y_test_wp = y_test_wp[y_pred != -1]

scaler = StandardScaler().fit(X_train_wp)
X_train_wp = scaler.transform(X_train_wp)
X_test_wp = scaler.transform(X_test_wp)


"""
KNN Classifier for water pumps
"""
print("\nKNN CLASSIFIER :\n")
KNNmodel = getKNeighborsClassifier(X_train_wp, y_train_wp)
KNN = evaluateModel(KNNmodel, X_test_wp, y_test_wp, False)
#printResults(KNN, 0, 15)

"""
SVC Classifier for water pumps
"""
print("\nSVC CLASSIFIER :\n")
X_train_subset = X_train_wp[:int(X_train_wp.shape[0]*0.5),:]
y_train_subset = y_train_wp[:int(y_train_wp.shape[0]*0.5)]
SVCmodel = getSVClassifier(X_train_subset, y_train_subset)
SVC = evaluateModel(SVCmodel, X_test_wp, y_test_wp, False)
#printResults(SVC, 0, 15)

"""
GNB Classifier for water pumps
"""
print("\nGNB CLASSIFIER :\n")
GNBmodel = getGaussianNBClassifier(X_train_wp, y_train_wp)
GNB = evaluateModel(GNBmodel, X_test_wp, y_test_wp, False)
#printResults(GNB, 0, 15)

"""
LR Classifier for water pumps
"""
print("\nLR CLASSIFIER :\n")
LRmodel = getLogisticRegressionClassifier(X_train_wp, y_train_wp)
LR = evaluateModel(LRmodel, X_test_wp, y_test_wp, False)
#printResults(LR, 0, 15)

"""
Bagging of decision trees for water pumps
"""
print("\nBAGGING CLASSIFIER :\n")
bagging = Bagging(10, .01, 15).fit(X_train_wp, y_train_wp)
bagging = evaluateModel(bagging, X_test_wp, y_test_wp, False)
#printResults(bagging, 0, 15)

In [None]:
"""
KNN Classifier with kfolds for water pumps
"""
kf = KFold(n_splits=3, shuffle=True, random_state=42)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', KNeighborsClassifier())
])
scoring = ['accuracy', 'balanced_accuracy', 'precision_micro', 'recall_micro']
scores = cross_validate(pipe, X=X_water_pumps, y=y_water_pumps, cv=kf, scoring=scoring, return_train_score=True)
print("Accuracy :", scores['test_accuracy'])
print("Mean Accuracy :", scores['test_accuracy'].mean())
print("Balanced accuracy :", scores['test_balanced_accuracy'])
print("Mean Balanced accuracy :", scores['test_balanced_accuracy'].mean())
print("Precision :", scores['test_precision_micro'])
print("Mean Precision :", scores['test_precision_micro'].mean())
print("Recall :", scores['test_recall_micro'])
print("Mean Recall :", scores['test_recall_micro'].mean())

In [None]:
"""
SVC Classifier with kfolds for water pumps
"""
kf = KFold(n_splits=3, shuffle=True, random_state=42)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', svm.SVC(kernel='linear'))
])
scoring = ['accuracy', 'balanced_accuracy', 'precision_micro', 'recall_micro']
random_sample = random.sample(range(X_water_pumps.shape[0]), int(X_water_pumps.shape[0]*0.5))
X_subset = X_water_pumps.iloc[random_sample]
y_subset = y_water_pumps.iloc[random_sample]
scores = cross_validate(pipe, X=X_subset, y=y_subset, cv=kf, scoring=scoring, return_train_score=True)
print("Accuracy :", scores['test_accuracy'])
print("Mean Accuracy :", scores['test_accuracy'].mean())
print("Balanced accuracy :", scores['test_balanced_accuracy'])
print("Mean Balanced accuracy :", scores['test_balanced_accuracy'].mean())
print("Precision :", scores['test_precision_micro'])
print("Mean Precision :", scores['test_precision_micro'].mean())
print("Recall :", scores['test_recall_micro'])
print("Mean Recall :", scores['test_recall_micro'].mean())

In [None]:
"""
GNB Classifier with kfolds for water pumps
"""
kf = KFold(n_splits=3, shuffle=True, random_state=42)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', GaussianNB())
])
scoring = ['accuracy', 'balanced_accuracy', 'precision_micro', 'recall_micro']
scores = cross_validate(pipe, X=X_water_pumps, y=y_water_pumps, cv=kf, scoring=scoring, return_train_score=True)
print("Accuracy :", scores['test_accuracy'])
print("Mean Accuracy :", scores['test_accuracy'].mean())
print("Balanced accuracy :", scores['test_balanced_accuracy'])
print("Mean Balanced accuracy :", scores['test_balanced_accuracy'].mean())
print("Precision :", scores['test_precision_micro'])
print("Mean Precision :", scores['test_precision_micro'].mean())
print("Recall :", scores['test_recall_micro'])
print("Mean Recall :", scores['test_recall_micro'].mean())

In [None]:
"""
LR Classifier with kfolds for water pumps
"""
kf = KFold(n_splits=3, shuffle=True, random_state=42)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression())
])
scoring = ['accuracy', 'balanced_accuracy', 'precision_micro', 'recall_micro']
scores = cross_validate(pipe, X=X_water_pumps, y=y_water_pumps, cv=kf, scoring=scoring, return_train_score=True)
print("Accuracy :", scores['test_accuracy'])
print("Mean Accuracy :", scores['test_accuracy'].mean())
print("Balanced accuracy :", scores['test_balanced_accuracy'])
print("Mean Balanced accuracy :", scores['test_balanced_accuracy'].mean())
print("Precision :", scores['test_precision_micro'])
print("Mean Precision :", scores['test_precision_micro'].mean())
print("Recall :", scores['test_recall_micro'])
print("Mean Recall :", scores['test_recall_micro'].mean())

In [None]:
"""
Bagging Classifier with kfolds for water pumps
"""
kf = KFold(n_splits=3, shuffle=True, random_state=42)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', Bagging(10, .01, 15))
])
scoring = ['accuracy', 'balanced_accuracy', 'precision_micro', 'recall_micro']
scores = cross_validate(pipe, X=X_water_pumps, y=y_water_pumps, cv=kf, scoring=scoring, return_train_score=True)
print("Accuracy :", scores['test_accuracy'])
print("Mean Accuracy :", scores['test_accuracy'].mean())
print("Balanced accuracy :", scores['test_balanced_accuracy'])
print("Mean Balanced accuracy :", scores['test_balanced_accuracy'].mean())
print("Precision :", scores['test_precision_micro'])
print("Mean Precision :", scores['test_precision_micro'].mean())
print("Recall :", scores['test_recall_micro'])
print("Mean Recall :", scores['test_recall_micro'].mean())

In [None]:
"""
KNN Classifier with Stratified kfolds for water pumps
"""
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', KNeighborsClassifier())
])
scoring = ['accuracy', 'balanced_accuracy', 'precision_micro', 'recall_micro']
scores = cross_validate(pipe, X=X_water_pumps, y=y_water_pumps, cv=skf, scoring=scoring, return_train_score=True)
print("Accuracy :", scores['test_accuracy'])
print("Mean Accuracy :", scores['test_accuracy'].mean())
print("Balanced accuracy :", scores['test_balanced_accuracy'])
print("Mean Balanced accuracy :", scores['test_balanced_accuracy'].mean())
print("Precision :", scores['test_precision_micro'])
print("Mean Precision :", scores['test_precision_micro'].mean())
print("Recall :", scores['test_recall_micro'])
print("Mean Recall :", scores['test_recall_micro'].mean())

In [None]:
"""
SVC Classifier with Stratified kfolds for water pumps
"""
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', svm.SVC(kernel='linear'))
])
scoring = ['accuracy', 'balanced_accuracy', 'precision_micro', 'recall_micro']
random_sample = random.sample(range(X_water_pumps.shape[0]), int(X_water_pumps.shape[0]*0.5))
X_subset = X_water_pumps.iloc[random_sample]
y_subset = y_water_pumps.iloc[random_sample]
scores = cross_validate(pipe, X=X_subset, y=y_subset, cv=skf, scoring=scoring, return_train_score=True)
print("Accuracy :", scores['test_accuracy'])
print("Mean Accuracy :", scores['test_accuracy'].mean())
print("Balanced accuracy :", scores['test_balanced_accuracy'])
print("Mean Balanced accuracy :", scores['test_balanced_accuracy'].mean())
print("Precision :", scores['test_precision_micro'])
print("Mean Precision :", scores['test_precision_micro'].mean())
print("Recall :", scores['test_recall_micro'])
print("Mean Recall :", scores['test_recall_micro'].mean())

In [None]:
"""
GNB Classifier with Stratified kfolds for water pumps 
"""
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', GaussianNB())
])
scoring = ['accuracy', 'balanced_accuracy', 'precision_micro', 'recall_micro']
scores = cross_validate(pipe, X=X_water_pumps, y=y_water_pumps, cv=skf, scoring=scoring, return_train_score=True)
print("Accuracy :", scores['test_accuracy'])
print("Mean Accuracy :", scores['test_accuracy'].mean())
print("Balanced accuracy :", scores['test_balanced_accuracy'])
print("Mean Balanced accuracy :", scores['test_balanced_accuracy'].mean())
print("Precision :", scores['test_precision_micro'])
print("Mean Precision :", scores['test_precision_micro'].mean())
print("Recall :", scores['test_recall_micro'])
print("Mean Recall :", scores['test_recall_micro'].mean())

In [None]:
"""
LR Classifier with Stratified kfolds for water pumps
"""
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression())
])
scoring = ['accuracy', 'balanced_accuracy', 'precision_micro', 'recall_micro']
scores = cross_validate(pipe, X=X_water_pumps, y=y_water_pumps, cv=skf, scoring=scoring, return_train_score=True)
print("Accuracy :", scores['test_accuracy'])
print("Mean Accuracy :", scores['test_accuracy'].mean())
print("Balanced accuracy :", scores['test_balanced_accuracy'])
print("Mean Balanced accuracy :", scores['test_balanced_accuracy'].mean())
print("Precision :", scores['test_precision_micro'])
print("Mean Precision :", scores['test_precision_micro'].mean())
print("Recall :", scores['test_recall_micro'])
print("Mean Recall :", scores['test_recall_micro'].mean())

In [None]:
"""
Bagging Classifier with Stratified kfolds for water pumps
"""
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', Bagging(10, .01, 15))
])
scoring = ['accuracy', 'balanced_accuracy', 'precision_micro', 'recall_micro']
scores = cross_validate(pipe, X=X_water_pumps, y=y_water_pumps, cv=skf, scoring=scoring, return_train_score=True)
print("Accuracy :", scores['test_accuracy'])
print("Mean Accuracy :", scores['test_accuracy'].mean())
print("Balanced accuracy :", scores['test_balanced_accuracy'])
print("Mean Balanced accuracy :", scores['test_balanced_accuracy'].mean())
print("Precision :", scores['test_precision_micro'])
print("Mean Precision :", scores['test_precision_micro'].mean())
print("Recall :", scores['test_recall_micro'])
print("Mean Recall :", scores['test_recall_micro'].mean())