# TERNARY CLASSIFICATION

### Metrics

The main metric to access my models' performance here is **Accuracy Score**. However, for this specific problem, we would also want to be able to identify non-operational waterpoints as well as those that are in need of repair early on, to help the Tanzanian Ministry of Water dispense resources accordingly. Therefore, I would also be looking at **Recall Score** particularly of the 2 classes `non functional` and `functional needs repair`.

In [21]:
# IMPORTING LIBRARIES

import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Feature selection and engineering
from sklearn.feature_selection import RFE

#Model evaluation
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

### 1.0 Loading Dataset

In [3]:
df = pd.read_csv('../data/wells_data_cleaned.csv')
df.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,basin,region,...,construction_year,extraction_type_class,management_group,payment_type,quality_group,quantity,source_class,waterpoint_type,status_group,year_recorded
0,69572,6000.0,2011-03-14,ROMAN,1390.0,ROMAN,34.938093,-9.856322,LAKE NYASA,IRINGA,...,1999,GRAVITY,USER-GROUP,ANNUALLY,GOOD,ENOUGH,GROUNDWATER,COMMUNAL STANDPIPE,FUNCTIONAL,2011
1,8776,0.0,2013-03-06,GRUMETI,1399.0,GRUMETI,34.698766,-2.147466,LAKE VICTORIA,MARA,...,2010,GRAVITY,USER-GROUP,NEVER PAY,GOOD,INSUFFICIENT,SURFACE,COMMUNAL STANDPIPE,FUNCTIONAL,2013
2,34310,25.0,2013-02-25,LOTTERY CLUB,686.0,WORLD VISION,37.460664,-3.821329,PANGANI,MANYARA,...,2009,GRAVITY,USER-GROUP,PER BUCKET,GOOD,ENOUGH,SURFACE,COMMUNAL STANDPIPE MULTIPLE,FUNCTIONAL,2013
3,67743,0.0,2013-01-28,UNICEF,263.0,UNICEF,38.486161,-11.155298,RUVUMA / SOUTHERN COAST,MTWARA,...,1986,SUBMERSIBLE,USER-GROUP,NEVER PAY,GOOD,DRY,GROUNDWATER,COMMUNAL STANDPIPE MULTIPLE,NON FUNCTIONAL,2013
4,19728,0.0,2011-07-13,ACTION IN A,0.0,ARTISAN,31.130847,-1.825359,LAKE VICTORIA,KAGERA,...,0,GRAVITY,OTHER,NEVER PAY,GOOD,SEASONAL,SURFACE,COMMUNAL STANDPIPE,FUNCTIONAL,2011


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     59400 non-null  int64  
 1   amount_tsh             59400 non-null  float64
 2   date_recorded          59400 non-null  object 
 3   funder                 59400 non-null  object 
 4   gps_height             59400 non-null  float64
 5   installer              59400 non-null  object 
 6   longitude              59400 non-null  float64
 7   latitude               59400 non-null  float64
 8   basin                  59400 non-null  object 
 9   region                 59400 non-null  object 
 10  lga                    59400 non-null  object 
 11  population             59400 non-null  int64  
 12  public_meeting         59400 non-null  bool   
 13  scheme_management      59400 non-null  object 
 14  permit                 59400 non-null  bool   
 15  co

### 2.0 Preprocessing

2.1 Data Train Test Split

In [5]:
# Step 1: Splitting the dataset
def split_data():
    print("\n### Splitting Data ###")
    
    # Separate features and target variable
    X = df.drop(['id', 'date_recorded','status_group'], axis=1)  # Features/Input values/x
    y = df['status_group']  # Target/Labels/Output
    
    # Splitting into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    print(f"Training set size: {X_train.shape[0]} rows")
    print(f"Testing set size: {X_test.shape[0]} rows")
    
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = split_data()


### Splitting Data ###
Training set size: 47520 rows
Testing set size: 11880 rows


2.2 Encoding categoricals

In [6]:
# Replace problematic values with a default category
X_train['funder'] = X_train['funder'].replace('RURAL WATER SUPPLY AND SANITAT', 'OTHER')
X_test['funder'] = X_test['funder'].replace('RURAL WATER SUPPLY AND SANITAT', 'OTHER')

In [7]:
from sklearn.preprocessing import LabelEncoder

def balanced_encoding(X_train, X_test):
    print("\n### Encoding Categorical and Boolean Columns with Balanced Approach ###")

    # Handle boolean columns (convert to integers)
    bool_columns = X_train.select_dtypes(include=['bool']).columns
    print(f"Boolean columns: {list(bool_columns)}")
    for col in bool_columns:
        X_train[col] = X_train[col].astype(int)
        X_test[col] = X_test[col].astype(int)

    # Identify categorical columns
    categorical_columns = X_train.select_dtypes(include=['object']).columns
    print(f"Categorical columns: {list(categorical_columns)}")

    # Split into high and low-cardinality columns
    high_cardinality_cols = [col for col in categorical_columns if X_train[col].nunique() > 10]
    low_cardinality_cols = [col for col in categorical_columns if X_train[col].nunique() <= 10]

    # Label Encoding for high-cardinality columns
    label_encoders = {}
    for col in high_cardinality_cols:
        le = LabelEncoder()
        X_train[col] = le.fit_transform(X_train[col])
        X_test[col] = X_test[col].map(lambda x: le.transform([x])[0] if x in le.classes_ else -1)
        label_encoders[col] = le  # Save encoder for later use

    # One-Hot Encoding for low-cardinality columns
    X_train = pd.get_dummies(X_train, columns=low_cardinality_cols, drop_first=True)
    X_test = pd.get_dummies(X_test, columns=low_cardinality_cols, drop_first=True)

    # Align columns in train and test sets (handle dummy column mismatch)
    X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

    print("Balanced encoding completed successfully.")
    return X_train, X_test, label_encoders


In [8]:
# Apply the balanced encoding function
X_train_encoded, X_test_encoded, label_encoders = balanced_encoding(X_train, X_test)

# Verify the results
print("X_train_encoded shape:", X_train_encoded.shape)
print("X_test_encoded shape:", X_test_encoded.shape)



### Encoding Categorical and Boolean Columns with Balanced Approach ###
Boolean columns: ['public_meeting', 'permit']
Categorical columns: ['funder', 'installer', 'basin', 'region', 'lga', 'scheme_management', 'extraction_type_class', 'management_group', 'payment_type', 'quality_group', 'quantity', 'source_class', 'waterpoint_type']
Balanced encoding completed successfully.
X_train_encoded shape: (47520, 55)
X_test_encoded shape: (11880, 55)


In [9]:
# Initialize LabelEncoder
label_encoder_y = LabelEncoder()

# Fit and transform y_train, ensure it remains a Pandas Series
y_train_encoded = pd.Series(
    label_encoder_y.fit_transform(y_train),
    index=y_train.index,
    name="status_group"
)

# Transform y_test, ensure it remains a Pandas Series
y_test_encoded = pd.Series(
    label_encoder_y.transform(y_test),
    index=y_test.index,
    name="status_group"
)

# Print the mapping for reference
print("Target Encoding Mapping:")
print(dict(zip(label_encoder_y.classes_, range(len(label_encoder_y.classes_)))))

# Verify encoded values
print("Encoded y_train (first 5):\n", y_train_encoded.head())
print("Encoded y_test (first 5):\n", y_test_encoded.head())


Target Encoding Mapping:
{'FUNCTIONAL': 0, 'FUNCTIONAL NEEDS REPAIR': 1, 'NON FUNCTIONAL': 2}
Encoded y_train (first 5):
 3607     0
50870    0
20413    2
52806    2
50091    2
Name: status_group, dtype: int32
Encoded y_test (first 5):
 2980     2
5246     0
22659    0
39888    2
13361    0
Name: status_group, dtype: int32


2.3 Scaling Numerical features

In [10]:
from sklearn.preprocessing import StandardScaler
# Identify numeric columns
numeric_columns = X_train_encoded.select_dtypes(include=['float64', 'int64']).columns

# Initialize scaler
scaler = StandardScaler()

# Scale numeric columns
X_train_encoded[numeric_columns] = scaler.fit_transform(X_train_encoded[numeric_columns])
X_test_encoded[numeric_columns] = scaler.transform(X_test_encoded[numeric_columns])

2.4 Fitting Models

In [11]:
def check_and_fix_encoding(X_train, X_test):
    # Identify problematic columns
    non_numeric_columns = X_train.select_dtypes(include=['object']).columns

    for col in non_numeric_columns:
        print(f"Encoding column: {col}")
        le = LabelEncoder()
        X_train[col] = le.fit_transform(X_train[col].astype(str))
        X_test[col] = X_test[col].apply(lambda x: le.transform([x])[0] if x in le.classes_ else -1)

    return X_train, X_test

X_train_encoded, X_test_encoded = check_and_fix_encoding(X_train_encoded, X_test_encoded)


Logistic regression

In [12]:
# Step 2: Train Logistic Regression
log_reg = LogisticRegression(random_state=42, max_iter=1000)
log_reg.fit(X_train_encoded, y_train_encoded)
y_pred_log_reg = log_reg.predict(X_test_encoded)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Decision Tree

In [13]:
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train_encoded, y_train_encoded)
y_pred_dec_tree = decision_tree.predict(X_test_encoded)

Random Forest

In [14]:
# Step 4: Train Random Forest
random_forest = RandomForestClassifier(random_state=42, n_estimators=100)
random_forest.fit(X_train_encoded, y_train_encoded)
y_pred_rand_forest = random_forest.predict(X_test_encoded)

2.5 Evaluating Models

In [15]:
# Logistic Regression
print("Logistic Regression Report:")
print(classification_report(y_test_encoded, y_pred_log_reg))

# Decision Tree
print("\nDecision Tree Report:")
print(classification_report(y_test_encoded, y_pred_dec_tree))

# Random Forest
print("\nRandom Forest Report:")
print(classification_report(y_test_encoded, y_pred_rand_forest))

Logistic Regression Report:
              precision    recall  f1-score   support

           0       0.68      0.85      0.76      6457
           1       0.00      0.00      0.00       851
           2       0.68      0.56      0.62      4572

    accuracy                           0.68     11880
   macro avg       0.45      0.47      0.46     11880
weighted avg       0.63      0.68      0.65     11880


Decision Tree Report:
              precision    recall  f1-score   support

           0       0.80      0.79      0.80      6457
           1       0.35      0.39      0.37       851
           2       0.76      0.77      0.76      4572

    accuracy                           0.75     11880
   macro avg       0.64      0.65      0.64     11880
weighted avg       0.76      0.75      0.75     11880


Random Forest Report:
              precision    recall  f1-score   support

           0       0.81      0.87      0.84      6457
           1       0.51      0.35      0.41       851
 

Key Observations

`Random Forest:`

- Best overall performance with the highest accuracy (79%).
- Performs well for Functional and Non Functional classes.
- Struggles with the minority class Functional Needs Repair (Class 1), but slightly better than other models.

`Logistic Regression:`

- Performs poorly for Functional Needs Repair (Class 1) with 0% recall.
- Handles Functional and Non Functional classes moderately well.

`Decision Tree:`

- Better balance between classes compared to Logistic Regression.
- Recall for Functional Needs Repair (Class 1) is better (29%) but still needs improvement.

Accuracy score comparison between train and test models

In [16]:
# Accuracy Comparisons
# Logistic Regression Predictions
y_train_pred_log_reg = log_reg.predict(X_train_encoded)
y_test_pred_log_reg = log_reg.predict(X_test_encoded)

# Decision Tree Predictions
y_train_pred_dec_tree = decision_tree.predict(X_train_encoded)
y_test_pred_dec_tree = decision_tree.predict(X_test_encoded)

# Random Forest Predictions
y_train_pred_rand_forest = random_forest.predict(X_train_encoded)
y_test_pred_rand_forest = random_forest.predict(X_test_encoded)
accuracy_data = {
    "Model": ["Logistic Regression", "Decision Tree", "Random Forest"],
    "Training Accuracy": [
        accuracy_score(y_train_encoded, y_train_pred_log_reg),
        accuracy_score(y_train_encoded, y_train_pred_dec_tree),
        accuracy_score(y_train_encoded, y_train_pred_rand_forest),
    ],
    "Test Accuracy": [
        accuracy_score(y_test_encoded, y_test_pred_log_reg),
        accuracy_score(y_test_encoded, y_test_pred_dec_tree),
        accuracy_score(y_test_encoded, y_test_pred_rand_forest),
    ],
}

# Create a DataFrame for comparison
accuracy_comparison = pd.DataFrame(accuracy_data)

# Display the DataFrame
print("\nAccuracy Comparison:")
print(accuracy_comparison)



Accuracy Comparison:
                 Model  Training Accuracy  Test Accuracy
0  Logistic Regression           0.684806       0.679882
1        Decision Tree           0.996886       0.751768
2        Random Forest           0.996801       0.799074


Insights

`Logistic Regression:`

The performance on the training and test sets is close, indicating the model is not overfitting. However, the overall accuracy is lower compared to the other models.

`Decision Tree:`

The significant gap between training and test accuracy indicates overfitting. The model is memorizing the training data but failing to generalize well on the test data.

`Random Forest:`

While the training accuracy is very high, the test accuracy is significantly better than the Decision Tree. The model still shows some signs of overfitting but generalizes better than the Decision Tree.

2.6 Class Balancing - correction for imbalance observed

In [17]:
from imblearn.over_sampling import SMOTE
from collections import Counter

# Initialize SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE on X_train_encoded and y_train_encoded
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_encoded, y_train_encoded)

# Print class distribution before and after
print("Before SMOTE:", Counter(y_train_encoded))
print("After SMOTE:", Counter(y_train_balanced))


Before SMOTE: Counter({0: 25802, 2: 18252, 1: 3466})
After SMOTE: Counter({0: 25802, 2: 25802, 1: 25802})


Refitting models with SMOTE data

In [18]:
# Refit models using the SMOTE-balanced data

# Logistic Regression
log_reg.fit(X_train_balanced, y_train_balanced)

# Decision Tree
decision_tree.fit(X_train_balanced, y_train_balanced)

# Random Forest
random_forest.fit(X_train_balanced, y_train_balanced)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
# Predictions after SMOTE
y_test_pred_log_reg = log_reg.predict(X_test_encoded)
y_test_pred_dec_tree = decision_tree.predict(X_test_encoded)
y_test_pred_rand_forest = random_forest.predict(X_test_encoded)

# Classification Reports
print("Logistic Regression Report:")
print(classification_report(y_test_encoded, y_test_pred_log_reg))

print("\nDecision Tree Report:")
print(classification_report(y_test_encoded, y_test_pred_dec_tree))

print("\nRandom Forest Report:")
print(classification_report(y_test_encoded, y_test_pred_rand_forest))


Logistic Regression Report:
              precision    recall  f1-score   support

           0       0.72      0.62      0.67      6457
           1       0.15      0.46      0.23       851
           2       0.69      0.56      0.62      4572

    accuracy                           0.59     11880
   macro avg       0.52      0.55      0.50     11880
weighted avg       0.67      0.59      0.62     11880


Decision Tree Report:
              precision    recall  f1-score   support

           0       0.81      0.78      0.79      6457
           1       0.34      0.45      0.39       851
           2       0.76      0.75      0.75      4572

    accuracy                           0.74     11880
   macro avg       0.63      0.66      0.64     11880
weighted avg       0.75      0.74      0.75     11880


Random Forest Report:
              precision    recall  f1-score   support

           0       0.83      0.83      0.83      6457
           1       0.40      0.45      0.42       851
 

Insights from SMOTE data

- Logistic Regression sees a drop in overall accuracy but better recall for minority classes.
- Decision Tree improves slightly for minority classes but remains lower than Random Forest.
- Random Forest maintains strong performance and gains improvements for minority class recall

2.7 Hyperparameter Tuning for Random Forest

In [22]:

# Define parameter grid for tuning
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Initialize Random Forest classifier
rf = RandomForestClassifier(random_state=42, class_weight='balanced')

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    scoring='f1_weighted',
    verbose=2,
    n_jobs=-1
)

# Fit GridSearchCV
grid_search.fit(X_train_balanced, y_train_balanced)

# Best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

best_params, best_score


Fitting 3 folds for each of 81 candidates, totalling 243 fits


({'max_depth': None,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 200},
 0.8481974512589772)