# BINARY CLASSIFICATION

In [62]:
# IMPORTING LIBRARIES

import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Feature selection and engineering
from sklearn.feature_selection import RFE

#Model evaluation
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

import sys
import os

# Add the parent directory of 'modules' to sys.path
sys.path.append(os.path.abspath("../modules"))

# Now import
from modules.EDA import EDA
from modules.dataprocessor import DataProcessor
from modules.testprocessor import TestDatasetProcessor


### 1.0 Load dataset

In [48]:

# Load the dataset
df = pd.read_csv("./data/wells_data_cleaned.csv")
df_valid = pd.read_csv("./data/validataion_data_cleaned.csv")

# Check the first few rows to confirm the structure
df.head()


Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,basin,region,...,construction_year,extraction_type_class,management_group,payment_type,quality_group,quantity,source_class,waterpoint_type,status_group,year_recorded
0,69572,6000.0,2011-03-14,ROMAN,1390.0,ROMAN,34.938093,-9.856322,LAKE NYASA,IRINGA,...,1999,GRAVITY,USER-GROUP,ANNUALLY,GOOD,ENOUGH,GROUNDWATER,COMMUNAL STANDPIPE,FUNCTIONAL,2011
1,8776,0.0,2013-03-06,GRUMETI,1399.0,GRUMETI,34.698766,-2.147466,LAKE VICTORIA,MARA,...,2010,GRAVITY,USER-GROUP,NEVER PAY,GOOD,INSUFFICIENT,SURFACE,COMMUNAL STANDPIPE,FUNCTIONAL,2013
2,34310,25.0,2013-02-25,LOTTERY CLUB,686.0,WORLD VISION,37.460664,-3.821329,PANGANI,MANYARA,...,2009,GRAVITY,USER-GROUP,PER BUCKET,GOOD,ENOUGH,SURFACE,COMMUNAL STANDPIPE MULTIPLE,FUNCTIONAL,2013
3,67743,0.0,2013-01-28,UNICEF,263.0,UNICEF,38.486161,-11.155298,RUVUMA / SOUTHERN COAST,MTWARA,...,1986,SUBMERSIBLE,USER-GROUP,NEVER PAY,GOOD,DRY,GROUNDWATER,COMMUNAL STANDPIPE MULTIPLE,NON FUNCTIONAL,2013
4,19728,0.0,2011-07-13,ACTION IN A,0.0,ARTISAN,31.130847,-1.825359,LAKE VICTORIA,KAGERA,...,0,GRAVITY,OTHER,NEVER PAY,GOOD,SEASONAL,SURFACE,COMMUNAL STANDPIPE,FUNCTIONAL,2011


### 2.0 Recatecorize Target Variable to Binary class

- Functional -> 0 , Non-functional -> 1 , Function needs repir ->1

In [50]:
def recategorize_target_variable(df, target_column, new_column):
   
    df[new_column] = df[target_column].apply(lambda x: 0 if x == 'FUNCTIONAL' else 1)
    return df

# Application for recategorizing target variable
data = recategorize_target_variable(df, target_column='status_group', new_column='status_group_binary')
print("Binary Target Variable Distribution:")
print(data['status_group_binary'].value_counts())

Binary Target Variable Distribution:
0    32259
1    27141
Name: status_group_binary, dtype: int64


In [51]:
# Creating a new dataframe to be used in binary classification
df1 = df.drop('status_group',axis = 1)
df1.to_csv ('./data/binary_clf_data.csv',index = False)

### 3.0 Data Preprocessing

Train test data

In [26]:

# Load data
df = pd.read_csv("./data/binary_clf_data.csv")

# Initialize DataProcessor
processor = DataProcessor(df, target_column="status_group_binary")

# Process the data
processor.process_data()

# Access processed data
X_train = processor.X_train
X_test = processor.X_test
y_train = processor.y_train_encoded
y_test = processor.y_test_encoded

print("Processed training and testing data are ready.")


### Splitting Data ###
Training set size: 47520 rows
Testing set size: 11880 rows

### Encoding Categorical and Boolean Columns with Balanced Approach ###
Boolean columns: ['public_meeting', 'permit']
Categorical columns: ['funder', 'installer', 'basin', 'region', 'lga', 'scheme_management', 'extraction_type_class', 'management_group', 'payment_type', 'quality_group', 'quantity', 'source_class', 'waterpoint_type']
Balanced encoding completed successfully.
Target Encoding Mapping:
{0: 0, 1: 1}
Data processing completed successfully.
Processed training and testing data are ready.


Validation data

In [71]:
import json

# Load reference columns from training
with open("columns.json", "r") as file:
    reference_columns = json.load(file)

# Load new test dataset
validation_df = pd.read_csv("./data/validataion_data_cleaned.csv")

# Initialize and process the dataset
test_processor = TestDatasetProcessor(validation_df, reference_columns)
test_processor.process_data()

# Access the processed data
processed_validation_X = test_processor.processed_X

# Display processed dataset
print("Processed Validation Data (first 5 rows):")
print(processed_validation_X.head())



### Encoding Categorical and Boolean Columns ###
Boolean columns: ['public_meeting', 'permit']
Categorical columns: ['date_recorded', 'funder', 'installer', 'basin', 'region', 'lga', 'scheme_management', 'extraction_type_class', 'management_group', 'payment_type', 'quality_group', 'quantity', 'source_class', 'waterpoint_type']
Balanced encoding completed successfully.
Test dataset processing completed successfully.
Processed Validation Data (first 5 rows):
   amount_tsh  funder  gps_height  installer  longitude  latitude  region  \
0   -0.128571     0.0    1.942225        0.0   0.081843  0.552598     0.0   
1   -0.128571     0.0    1.321841        0.0   0.607162  0.807803     0.0   
2   -0.128571     0.0    1.318935        0.0  -0.119274  0.231366     0.0   
3   -0.128571     0.0   -0.569821        0.0   1.146105 -1.269746     0.0   
4    0.070562     0.0    0.872898        0.0  -0.027641 -1.790621     0.0   

   lga  population  public_meeting  ...  quantity_SEASONAL  quantity_UNKNOW

3.0 Fitting Models

3.1 Dummy Classifier as the baseline model

In [34]:
from sklearn.dummy import DummyClassifier

# Initialize the dummy classifier
dummy_clf = DummyClassifier(strategy="stratified", random_state=42)  # Predicts according to the training set’s class distribution

# Train the dummy classifier
dummy_clf.fit(X_train, y_train)

# Predict on the test set
y_pred_dummy = dummy_clf.predict(X_test)

# Evaluate performance
print("Dummy Classifier Accuracy:", accuracy_score(y_test, y_pred_dummy))
print("\nClassification Report for Dummy Classifier:")
print(classification_report(y_test, y_pred_dummy))

Dummy Classifier Accuracy: 0.4968013468013468

Classification Report for Dummy Classifier:
              precision    recall  f1-score   support

           0       0.54      0.54      0.54      6457
           1       0.45      0.44      0.45      5423

    accuracy                           0.50     11880
   macro avg       0.49      0.49      0.49     11880
weighted avg       0.50      0.50      0.50     11880



- Our dummy classifier with its limitation of using class distribution produced an 50% accuracy. It will act as our baseline

3.2 Logistic regression

In [35]:
# Train Logistic Regression
regr = LogisticRegression()
regr.fit(X_train, y_train)

# Predict on test data
y_pred = regr.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.6876262626262626
Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.78      0.73      6457
           1       0.69      0.58      0.63      5423

    accuracy                           0.69     11880
   macro avg       0.69      0.68      0.68     11880
weighted avg       0.69      0.69      0.68     11880



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


3.3 Decision tree

In [36]:
# Initialize the Decision Tree classifier
decision_tree = DecisionTreeClassifier(random_state=42)

# Train the Decision Tree classifier
decision_tree.fit(X_train, y_train)

# Predict on the test set
y_pred_decision_tree = decision_tree.predict(X_test)

# Evaluate the Decision Tree
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_decision_tree))
print("\nClassification Report for Decision Tree:")
print(classification_report(y_test, y_pred_decision_tree))


Decision Tree Accuracy: 0.7819023569023569

Classification Report for Decision Tree:
              precision    recall  f1-score   support

           0       0.80      0.80      0.80      6457
           1       0.76      0.76      0.76      5423

    accuracy                           0.78     11880
   macro avg       0.78      0.78      0.78     11880
weighted avg       0.78      0.78      0.78     11880



3.4 Random Forest

In [37]:
# Initialize the Random Forest classifier
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the Random Forest classifier
random_forest.fit(X_train, y_train)

# Predict on the test set
y_pred_random_forest = random_forest.predict(X_test)

# Evaluate the Random Forest
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_random_forest))
print("\nClassification Report for Random Forest:")
print(classification_report(y_test, y_pred_random_forest))

Random Forest Accuracy: 0.8181818181818182

Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.82      0.86      0.84      6457
           1       0.82      0.77      0.80      5423

    accuracy                           0.82     11880
   macro avg       0.82      0.81      0.82     11880
weighted avg       0.82      0.82      0.82     11880



**Key Observations:**
- The `Random Forest` significantly outperforms the `Decision Tree` and `Logistic Regression` in both accuracy and F1-scores.
- It maintains a good balance across both classes.
- `Precision` and `Recall` for Class 0 (Functional) and Class 1 (Non-functional/Needs Repair) are more robust, making this model the most reliable so far.

Random Forest Feature Importance

In [38]:
# Extract feature importance
feature_importances = random_forest.feature_importances_

# Create a DataFrame for better visualization
importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

# Display the top features
print(importance_df.head(10))

                  Feature  Importance
4               longitude    0.144291
5                latitude    0.143124
2              gps_height    0.071258
12      construction_year    0.058459
1                  funder    0.050638
8              population    0.048370
43        quantity_ENOUGH    0.045823
3               installer    0.042954
7                     lga    0.035378
54  waterpoint_type_OTHER    0.033875


Key Features and Their Importance:

 *longitude (14.43%) and latitude (14.31%):*
- These spatial features play a significant role, indicating that the location of a well strongly influences its functionality.
- Potential reasons include regional differences in water table levels, terrain, or management practices.

 *gps_height (7.13%):*
- The elevation of the well likely impacts water availability, as higher elevations might face challenges in accessing groundwater.

 *construction_year (5.85%):*
- Older wells may be more prone to failures or require more repairs due to wear and tear over time.

 *funder (5.06%):*
- The organization funding the well influences its quality and durability, as some funders might use better construction standards.

 *population (4.84%):*
- Larger populations might lead to overuse of wells, increasing the chances of failure or repair needs.

 *quantity_ENOUGH (4.58%):*
- This feature suggests whether the well provides sufficient water, a direct indicator of functionality.

 *installer (4.30%):* 
- The party installing the well impacts its reliability, likely reflecting variations in expertise or materials used.

 *lga (3.54%):*
- The Local Government Area (LGA) might reflect regional policies, maintenance, or socio-economic factors affecting well functionality.

 *waterpoint_type_OTHER (3.38%):*
- The type of waterpoint influences its reliability, with some types being more prone to failure than others.


3.5 Hyperparameter Tuning for Random Forest

In [39]:
# Parameter grid for Random Forest
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees
    'max_depth': [None, 10, 20, 30],  # Depth of trees
    'min_samples_split': [2, 5, 10],  # Min samples to split a node
    'min_samples_leaf': [1, 2, 4]     # Min samples at a leaf node
}


In [40]:
# Initialize Random Forest
rf = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,  # 3-fold cross-validation
    scoring='accuracy',  # Evaluate using accuracy
    verbose=2,  # Show progress
    n_jobs=-1  # Use all processors
)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)


Fitting 3 folds for each of 108 candidates, totalling 324 fits


In [42]:
# Best hyperparameters
print("Best Parameters:", grid_search.best_params_)

# Best score from GridSearchCV
print("Best Cross-Validation Accuracy:", grid_search.best_score_)

Best Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Best Cross-Validation Accuracy: 0.8187710437710437


3.6 Training and Evaluating the Best Random Forest Model

In [43]:
# Train the best Random Forest model
best_rf = grid_search.best_estimator_

# Predict on the test set
y_pred_best_rf = best_rf.predict(X_test)

# Evaluate performance
from sklearn.metrics import classification_report, accuracy_score

print("Random Forest Accuracy (Best Model):", accuracy_score(y_test, y_pred_best_rf))
print("\nClassification Report (Best Model):")
print(classification_report(y_test, y_pred_best_rf))


Random Forest Accuracy (Best Model): 0.8247474747474748

Classification Report (Best Model):
              precision    recall  f1-score   support

           0       0.81      0.88      0.84      6457
           1       0.84      0.76      0.80      5423

    accuracy                           0.82     11880
   macro avg       0.83      0.82      0.82     11880
weighted avg       0.83      0.82      0.82     11880



3.7 Validating the Model with an entirely new dataset(Test data dowloaded from taarifa)

In [72]:
# Predict on the validation set
validation_predictions = best_rf.predict(processed_validation_X)

# Output the predictions
print("Predictions for Validation Data:")
print(validation_predictions[:10])  # Display the first 10 predictions

Predictions for Validation Data:
[0 0 1 1 0 1 1 1 1 1]


In [74]:
#Convert predictions to a DataFrame
predictions_df = pd.DataFrame(validation_predictions, columns=["Prediction"])

# Analyze the distribution of predictions
class_distribution = predictions_df["Prediction"].value_counts(normalize=True)
print("Class Distribution of Predictions:")
print(class_distribution)


Class Distribution of Predictions:
1    0.780337
0    0.219663
Name: Prediction, dtype: float64


In [75]:
# Distribution in training data
training_class_distribution = pd.Series(y_train).value_counts(normalize=True)
print("Class Distribution in Training Data:")
print(training_class_distribution)


Class Distribution in Training Data:
0    0.542971
1    0.457029
Name: status_group_binary, dtype: float64
