In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.metrics import  accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

## Importing the Required Libraries

In [None]:
# Read the training set from the Colab working directory and preview its first rows.
train_csv_path = "/content/traindata.csv"
train_data = pd.read_csv(train_csv_path)
train_data.head()

Unnamed: 0,Id,Amount_of_water,Gps_height,Waterpoint_type,Basin_name,Village,Regionname,Region_code,Wardname,District_code,...,Payment_type,Water_quality,Quality_group,Quantity,Quantity_group,Source,Source_type,Source_class,Waterpoint_type_group,Status
0,56421,0.0,1555,hand pump,basin3,village8556,region4,15,ward500,3,...,never pay,salty,salty,insufficient,insufficient,shallow well,shallow well,groundwater,hand pump,0
1,21324,500.0,857,communal standpipe multiple,basin7,village7372,region5,10,ward1800,5,...,monthly,soft,good,insufficient,insufficient,river,river/lake,surface,communal standpipe,0
2,74,0.0,0,communal standpipe,basin1,village9460,region6,17,ward883,3,...,never pay,soft,good,insufficient,insufficient,machine dbh,borehole,groundwater,communal standpipe,0
3,74038,0.0,0,other,basin5,village13878,region13,18,ward1505,7,...,never pay,soft,good,insufficient,insufficient,shallow well,shallow well,groundwater,other,0
4,29929,0.0,0,hand pump,basin4,village5192,region6,14,ward942,4,...,never pay,soft,good,enough,enough,machine dbh,borehole,groundwater,hand pump,1


## Exploratory Data Analysis

In [None]:
# Summary statistics for every column; include='all' reports the non-numeric
# columns (unique/top/freq) alongside the numeric ones (mean/std/quantiles).
train_data.describe(include='all')

Unnamed: 0,Id,Amount_of_water,Gps_height,Waterpoint_type,Basin_name,Village,Regionname,Region_code,Wardname,District_code,...,Payment_type,Water_quality,Quality_group,Quantity,Quantity_group,Source,Source_type,Source_class,Waterpoint_type_group,Status
count,44068.0,44068.0,44068.0,44068,44068,44068,44068,44068.0,44068,44068.0,...,44068,44068,44068,44068,44068,44068,44068,44068,44068,44068.0
unique,,,,7,9,16471,21,,2087,,...,7,8,6,5,5,10,7,3,6,
top,,,,communal standpipe,basin5,village18166,region12,,ward1290,,...,never pay,soft,good,enough,enough,shallow well,shallow well,groundwater,communal standpipe,
freq,,,,21085,7397,382,4190,,240,,...,18752,37551,37551,24610,24610,12617,12617,34384,25401,
mean,37092.516384,329.781607,674.30546,,,,,15.224539,,5.664382,...,,,,,,,,,,0.58564
std,21490.884249,3127.604462,698.062813,,,,,17.566722,,9.670612,...,,,,,,,,,,0.492617
min,0.0,0.0,-63.0,,,,,1.0,,0.0,...,,,,,,,,,,0.0
25%,18363.25,0.0,0.0,,,,,5.0,,2.0,...,,,,,,,,,,0.0
50%,37021.5,0.0,370.0,,,,,12.0,,3.0,...,,,,,,,,,,1.0
75%,55685.25,20.0,1327.0,,,,,17.0,,5.0,...,,,,,,,,,,1.0


#### Null Value Check

As a rule of thumb, a column with fewer than 30% null values is reasonable to impute. If more than 30% of a column's records are null, it is advised to drop the column. This is not a hard rule, but it is a guideline that can be followed.

In [None]:
# Percentage of missing values per column (mean of the boolean null mask, scaled to 0-100).
train_data.isna().mean() * 100

Unnamed: 0,0
Id,0.0
Amount_of_water,0.0
Gps_height,0.0
Waterpoint_type,0.0
Basin_name,0.0
Village,0.0
Regionname,0.0
Region_code,0.0
Wardname,0.0
District_code,0.0


In [None]:
# SchemeName is removed because more than 30% of its values are missing.
# 'Id', 'Village', 'Organization_funding', 'Wardname', 'Company_installed' and
# 'Region_code' are removed for this example because, judging by the attribute
# description and the summary statistics, they do not help predict the target.
columns_to_drop = [
    'Id',
    'Village',
    'SchemeName',
    'Organization_funding',
    'Wardname',
    'Company_installed',
    'Region_code',
]
train_data = train_data.drop(columns=columns_to_drop)
train_data.columns

Index(['Amount_of_water', 'Gps_height', 'Waterpoint_type', 'Basin_name',
       'Regionname', 'District_code', 'Population', 'Public_meeting',
       'Organization_surveyed', 'Scheme_management', 'Permit', 'Management',
       'Management_group', 'Extraction_type', 'Extraction_type_group',
       'Extraction_type_class', 'Payment', 'Payment_type', 'Water_quality',
       'Quality_group', 'Quantity', 'Quantity_group', 'Source', 'Source_type',
       'Source_class', 'Waterpoint_type_group', 'Status'],
      dtype='object')

### Datatype Conversion

In [None]:
# Inspect the dtype pandas inferred for each remaining column.
train_data.dtypes

Unnamed: 0,0
Amount_of_water,float64
Gps_height,int64
Waterpoint_type,object
Basin_name,object
Regionname,object
District_code,int64
Population,int64
Public_meeting,object
Organization_surveyed,object
Scheme_management,object


The datatype conversion is done with respect to the Attribute Description.

In [None]:
# Datatype Conversion
# Every object (text) column, plus the 'Status' target, becomes a pandas categorical.
cat_cols = train_data.select_dtypes('object')
category_dtypes = {name: 'category' for name in [*cat_cols.columns, 'Status']}
train_data = train_data.astype(category_dtypes)
train_data.dtypes

Unnamed: 0,0
Amount_of_water,float64
Gps_height,int64
Waterpoint_type,category
Basin_name,category
Regionname,category
District_code,int64
Population,int64
Public_meeting,category
Organization_surveyed,category
Scheme_management,category


### Train Test Split

In [None]:
# Separate the target from the predictors, then hold out 30% of the rows for
# validation. Stratifying on y keeps the class balance the same in both parts.
y = train_data['Status']
X = train_data.drop(columns=['Status'])

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=421,
    stratify=y,
)

In [None]:
# Preview the three columns that are handed to the imputer further down (null_cols).
X_train[['Public_meeting','Scheme_management','Permit']]

Unnamed: 0,Public_meeting,Scheme_management,Permit
21016,True,VWC,True
41402,True,VWC,False
6854,True,VWC,False
43463,True,,False
43697,True,VWC,True
...,...,...,...
37662,True,VWC,True
19434,True,VWC,True
14083,True,VWC,True
10780,True,VWC,True


### Data Preprocessing/ Data Normalization

Imputing Missing Values

In [None]:
null_cols = ['Public_meeting', 'Scheme_management', 'Permit']


def _stringify_non_missing(frame):
    """Return `frame` with every non-missing value cast to str; missing values stay NaN.

    A plain ``astype(str)`` would turn NaN into the literal string 'nan', which
    the imputer would then treat as an ordinary category instead of a gap.
    """
    as_object = frame.astype(object)
    # where() keeps the original (NaN) cell when the mask is True, else takes the str value.
    return as_object.where(as_object.isna(), as_object.astype(str))


def _impute_as_category(frame):
    """Fill the gaps in `null_cols` with the fitted imputer and return categorical columns."""
    filled = imputer.transform(frame[null_cols])
    # Restore the row index so the column assignment below aligns row-for-row, and
    # cast back to 'category' so the encoding step (which selects category columns)
    # still picks these three features up.
    return pd.DataFrame(filled, index=frame.index, columns=null_cols).astype('category')


# Work on explicit copies so the assignments below never write into a view of X.
X_train = X_train.copy()
X_val = X_val.copy()

# The columns mix booleans and strings; unify the present values as strings
# while preserving the missing-value markers for the imputer.
X_train[null_cols] = _stringify_non_missing(X_train[null_cols])
X_val[null_cols] = _stringify_non_missing(X_val[null_cols])

# The columns are categorical, so the most frequent value is used as the fill.
# The imputer is fitted on the training split only to avoid leaking validation data.
imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
imputer = imputer.fit(X_train[null_cols])
X_train[null_cols] = _impute_as_category(X_train)
X_val[null_cols] = _impute_as_category(X_val)

Scaling Numerical Variables

In [None]:
# Standardise the integer/float columns. The scaler learns mean and variance
# from the training split only and is then applied to both splits.
num_attr = X_train.select_dtypes(include=['int64', 'float64']).columns
scaler = StandardScaler().fit(X_train[num_attr])

# The rebuilt frames carry a fresh positional index (0..n-1), not the original row labels.
X_train_std, X_val_std = (
    pd.DataFrame(scaler.transform(split[num_attr]), columns=num_attr)
    for split in (X_train, X_val)
)

Encoding Categorical Variables

In [None]:
# One-hot encode the categorical columns. The encoder is fitted on the training
# split; drop='first' removes one level per feature and handle_unknown='ignore'
# lets categories unseen during fitting pass through transform without an error.
cat_attr = X_train.select_dtypes(include=['category']).columns
enc = OneHotEncoder(drop='first', handle_unknown='ignore').fit(X_train[cat_attr])

# The rebuilt frames carry a fresh positional index (0..n-1), not the original row labels.
X_train_ohe, X_val_ohe = (
    pd.DataFrame(enc.transform(split[cat_attr]).toarray(), columns=enc.get_feature_names_out())
    for split in (X_train, X_val)
)

Combining both Scaled and Encoded Data

In [None]:
# Place the scaled numeric block and the one-hot block side by side; both
# frames share the same positional index, so rows line up one-to-one.
X_train_con = pd.concat((X_train_std, X_train_ohe), axis='columns')
X_val_con = pd.concat((X_val_std, X_val_ohe), axis='columns')

In [None]:
# Display the combined training matrix (scaled numeric columns followed by one-hot columns).
X_train_con

Unnamed: 0,Amount_of_water,Gps_height,District_code,Population,Waterpoint_type_communal standpipe,Waterpoint_type_communal standpipe multiple,Waterpoint_type_dam,Waterpoint_type_hand pump,Waterpoint_type_improved spring,Waterpoint_type_other,...,Source_type_river/lake,Source_type_shallow well,Source_type_spring,Source_class_surface,Source_class_unknown,Waterpoint_type_group_communal standpipe,Waterpoint_type_group_dam,Waterpoint_type_group_hand pump,Waterpoint_type_group_improved spring,Waterpoint_type_group_other
0,-0.131697,0.817138,-0.172057,-0.249103,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,1.940348,-0.993068,5.605770,-0.332529,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,-0.123409,-0.245748,2.820032,-0.372156,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,-0.123409,1.271841,-0.275233,0.251452,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-0.131697,0.124325,-0.068882,0.105456,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30842,-0.110977,-0.475251,-0.481584,0.147169,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
30843,-0.131697,-0.965814,-0.172057,-0.374242,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
30844,-0.131697,-0.965814,-0.275233,-0.374242,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
30845,-0.131697,-0.965814,-0.275233,-0.374242,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


## Evaluation Metrics

In [None]:
def evaluate_model(act, pred):
    """Print and return the accuracy of `pred` against the true labels `act`.

    Parameters
    ----------
    act : array-like
        Ground-truth labels.
    pred : array-like
        Predicted labels, in the same order as `act`.

    Returns
    -------
    float
        The value computed by ``accuracy_score(act, pred)``, returned so callers
        can collect scores for comparison instead of re-reading printed output.
    """
    score = accuracy_score(act, pred)
    print("Accuracy : ", score)
    return score

## Model Building

1. Logistic Regression

In [None]:
# Logistic regression with scikit-learn's default hyperparameters.
logreg = LogisticRegression().fit(X_train_con, y_train)

# Predict on both splits so train and held-out accuracy can be compared.
train_pred, val_pred = (logreg.predict(features) for features in (X_train_con, X_val_con))

print("train accuracy")
evaluate_model(y_train, train_pred)

print("test accuracy")
evaluate_model(y_val, val_pred)

train accuracy
Accurcay :  0.788180374104451
test accuracy
Accurcay :  0.7888208153694879


2. Decision Tree

In [None]:
# Decision tree with default hyperparameters. random_state is fixed (same seed
# as the train/validation split) so that re-running the notebook reproduces the
# same tree and therefore the same scores.
dtclf3 = DecisionTreeClassifier(random_state=421)
dtclf3.fit(X_train_con, y_train)

# Predict on both splits so train and held-out accuracy can be compared.
train_pred = dtclf3.predict(X_train_con)
val_pred = dtclf3.predict(X_val_con)

print("train accuracy")
evaluate_model(y_train, train_pred)
print("test accuracy")
evaluate_model(y_val, val_pred)

train accuracy
Accurcay :  0.9599636917690537
test accuracy
Accurcay :  0.8019816957870055


3. SVM

In [None]:
# Support vector classifier with scikit-learn's default hyperparameters.
svc = SVC().fit(X_train_con, y_train)

# Predict on both splits so train and held-out accuracy can be compared.
train_pred, val_pred = (svc.predict(features) for features in (X_train_con, X_val_con))

print("train accuracy")
evaluate_model(y_train, train_pred)

print("test accuracy")
evaluate_model(y_val, val_pred)

train accuracy
Accurcay :  0.8305183648328849
test accuracy
Accurcay :  0.8171091445427728


4. KNN

In [None]:
# k-nearest-neighbours classifier with scikit-learn's default hyperparameters.
knn = KNeighborsClassifier().fit(X_train_con, y_train)

# Predict on both splits so train and held-out accuracy can be compared.
train_pred, val_pred = (knn.predict(features) for features in (X_train_con, X_val_con))

print("train accuracy")
evaluate_model(y_train, train_pred)

print("test accuracy")
evaluate_model(y_val, val_pred)

train accuracy
Accurcay :  0.8607968359970175
test accuracy
Accurcay :  0.8158989486423115


5. Random Forest

In [None]:
# Random forest with default hyperparameters. random_state is fixed (same seed
# as the train/validation split) because the bootstrap samples and feature
# subsets are drawn at random; without a seed the scores change on every run.
clf2 = RandomForestClassifier(random_state=421)
clf2.fit(X_train_con, y_train)

# Predict on both splits so train and held-out accuracy can be compared.
train_pred = clf2.predict(X_train_con)
val_pred = clf2.predict(X_val_con)

print("train accuracy")
evaluate_model(y_train, train_pred)
print("test accuracy")
evaluate_model(y_val, val_pred)

train accuracy
Accurcay :  0.9599312737057089
test accuracy
Accurcay :  0.8277739959155889
