# BINARY CLASSIFICATION

In [None]:
# IMPORTING LIBRARIES

import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Feature selection and engineering
from sklearn.feature_selection import RFE

#Model evaluation
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

import sys
import os

# Make the project-local ``modules`` package importable.
# ``from modules.X import ...`` is resolved by looking for a ``modules``
# directory inside each sys.path entry, so the entry that matters is the
# directory that CONTAINS ``modules`` (its parent), not ``modules`` itself.
_modules_dir = os.path.abspath("../modules")
_modules_parent = os.path.dirname(_modules_dir)

# The ``modules`` directory itself is still registered (as before) so any
# flat import that relied on the old entry keeps resolving. Entries are
# appended, not inserted, so they never shadow earlier sys.path locations.
for _entry in (_modules_parent, _modules_dir):
    if _entry not in sys.path:
        sys.path.append(_entry)

# Project-local helpers
from modules.EDA import EDA
from modules.dataprocessor import DataProcessor


### 1.0 Load dataset

In [19]:

# Read the cleaned wells data from the local ``data`` folder into a DataFrame
df = pd.read_csv(filepath_or_buffer="./data/wells_data_cleaned.csv")

# Preview the leading rows to sanity-check the columns that were loaded
df.head()


Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,basin,region,...,construction_year,extraction_type_class,management_group,payment_type,quality_group,quantity,source_class,waterpoint_type,status_group,year_recorded
0,69572,6000.0,2011-03-14,ROMAN,1390.0,ROMAN,34.938093,-9.856322,LAKE NYASA,IRINGA,...,1999,GRAVITY,USER-GROUP,ANNUALLY,GOOD,ENOUGH,GROUNDWATER,COMMUNAL STANDPIPE,FUNCTIONAL,2011
1,8776,0.0,2013-03-06,GRUMETI,1399.0,GRUMETI,34.698766,-2.147466,LAKE VICTORIA,MARA,...,2010,GRAVITY,USER-GROUP,NEVER PAY,GOOD,INSUFFICIENT,SURFACE,COMMUNAL STANDPIPE,FUNCTIONAL,2013
2,34310,25.0,2013-02-25,LOTTERY CLUB,686.0,WORLD VISION,37.460664,-3.821329,PANGANI,MANYARA,...,2009,GRAVITY,USER-GROUP,PER BUCKET,GOOD,ENOUGH,SURFACE,COMMUNAL STANDPIPE MULTIPLE,FUNCTIONAL,2013
3,67743,0.0,2013-01-28,UNICEF,263.0,UNICEF,38.486161,-11.155298,RUVUMA / SOUTHERN COAST,MTWARA,...,1986,SUBMERSIBLE,USER-GROUP,NEVER PAY,GOOD,DRY,GROUNDWATER,COMMUNAL STANDPIPE MULTIPLE,NON FUNCTIONAL,2013
4,19728,0.0,2011-07-13,ACTION IN A,0.0,ARTISAN,31.130847,-1.825359,LAKE VICTORIA,KAGERA,...,0,GRAVITY,OTHER,NEVER PAY,GOOD,SEASONAL,SURFACE,COMMUNAL STANDPIPE,FUNCTIONAL,2011


### 2.0 Recategorize Target Variable to a Binary Class

- Functional -> 0, Non-functional -> 1, Functional needs repair -> 1

In [21]:
# Recategorize the target variable into a binary label:
#   0 -> status_group is exactly 'FUNCTIONAL'
#   1 -> any other value (including missing values, which never compare
#        equal to 'FUNCTIONAL')
# A vectorised comparison replaces the per-row ``apply(lambda ...)``; the
# explicit int64 cast keeps the 0/1 integer dtype regardless of platform
# and also when the frame is empty.
df['status_group_binary'] = (df['status_group'] != 'FUNCTIONAL').astype('int64')

# Verify the distribution of the two classes
print(df['status_group_binary'].value_counts())


0    32259
1    27141
Name: status_group_binary, dtype: int64


In [None]:
# Example usage of DataProcessor: load the cleaned data, run the processing
# pipeline, and pull out the train/test splits it exposes.
if __name__ == "__main__":
    # Load data from the same location as the load cell in section 1.0
    # (the bare file name did not match the path used there).
    df = pd.read_csv("./data/wells_data_cleaned.csv")

    # NOTE(review): re-reading the CSV discards the 'status_group_binary'
    # column created in section 2.0, and the target below is the original
    # 'status_group' column rather than the binary one. Confirm whether
    # DataProcessor is expected to receive the binary target instead.

    # Initialize DataProcessor
    processor = DataProcessor(df, target_column="status_group")

    # Process the data (populates the attributes read below)
    processor.process_data()

    # Access processed data
    X_train = processor.X_train
    X_test = processor.X_test
    y_train = processor.y_train_encoded
    y_test = processor.y_test_encoded

    print("Processed training and testing data are ready.")