In [176]:
import pandas as pd
import numpy as np

In [177]:
# Read the test and training tables from the working directory.
test_df = pd.read_csv("test.csv")
train_df = pd.read_csv("train.csv")

In [178]:
# Preview the first four training rows.
train_df.head(n=4)

Unnamed: 0,policy_id,policy_tenure,age_of_car,age_of_policyholder,area_cluster,population_density,make,segment,model,fuel_type,...,is_brake_assist,is_power_door_locks,is_central_locking,is_power_steering,is_driver_seat_height_adjustable,is_day_night_rear_view_mirror,is_ecw,is_speed_alert,ncap_rating,is_claim
0,ID00001,0.515874,0.05,0.644231,C1,4990,1,A,M1,CNG,...,No,No,No,Yes,No,No,No,Yes,0,0
1,ID00002,0.672619,0.02,0.375,C2,27003,1,A,M1,CNG,...,No,No,No,Yes,No,No,No,Yes,0,0
2,ID00003,0.84111,0.02,0.384615,C3,4076,1,A,M1,CNG,...,No,No,No,Yes,No,No,No,Yes,0,0
3,ID00004,0.900277,0.11,0.432692,C4,21622,1,C1,M2,Petrol,...,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,2,0


In [179]:
# Preview the first four test rows.
test_df.head(n=4)

Unnamed: 0,policy_id,policy_tenure,age_of_car,age_of_policyholder,area_cluster,population_density,make,segment,model,fuel_type,...,is_rear_window_defogger,is_brake_assist,is_power_door_locks,is_central_locking,is_power_steering,is_driver_seat_height_adjustable,is_day_night_rear_view_mirror,is_ecw,is_speed_alert,ncap_rating
0,ID58593,0.341732,0.0,0.586538,C3,4076,1,A,M1,CNG,...,No,No,No,No,Yes,No,No,No,Yes,0
1,ID58594,0.307241,0.13,0.442308,C8,8794,1,B2,M6,Petrol,...,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,2
2,ID58595,0.327924,0.12,0.451923,C8,8794,2,A,M3,Petrol,...,No,No,Yes,Yes,Yes,No,Yes,Yes,Yes,2
3,ID58596,0.782654,0.01,0.461538,C5,34738,1,A,M1,CNG,...,No,No,No,No,Yes,No,No,No,Yes,0


In [180]:
# Remove the policy identifier column from both frames before encoding.
id_column = ["policy_id"]
train_df = train_df.drop(id_column, axis="columns")
test_df = test_df.drop(id_column, axis="columns")

In [181]:
# Names of the object-dtype (string-valued) columns in the training frame.
cat_cols = train_df.select_dtypes(include="object").columns

In [182]:
# Partition the object-dtype columns: exactly two distinct values (missing
# values count as a value) -> binary, everything else -> multi-class.
bin_cols, class_cols = [], []
for col in cat_cols:
    is_binary = train_df[col].nunique(dropna=False) == 2
    (bin_cols if is_binary else class_cols).append(col)

In [183]:
from sklearn.preprocessing import LabelEncoder

# Each two-valued column gets its own encoder. It is fitted on the training
# values only and then applied to both frames so they share one mapping.
for col in bin_cols:
    le = LabelEncoder().fit(train_df[col])
    train_df[col] = le.transform(train_df[col])
    test_df[col] = le.transform(test_df[col])

In [184]:
# One-hot encode the multi-class columns. The training frame defines the
# feature layout; the test frame is re-indexed onto that layout so a category
# that appears in only one of the two files cannot leave the frames with
# different (or differently ordered) columns.
train_df_enc = pd.get_dummies(train_df, columns=class_cols)

# The target column exists only in the training frame, so it is excluded
# from the layout the test frame is aligned to.
feature_cols = train_df_enc.columns.drop("is_claim", errors="ignore")
test_df_enc = pd.get_dummies(test_df, columns=class_cols).reindex(
    columns=feature_cols,
    fill_value=False,  # a dummy column missing from the test file means "category absent"
)

In [185]:
# Preview the encoded training frame.
train_df_enc.head(n=4)

Unnamed: 0,policy_tenure,age_of_car,age_of_policyholder,population_density,make,airbags,is_esc,is_adjustable_steering,is_tpms,is_parking_sensors,...,engine_type_1.5 Turbocharged Revotorq,engine_type_1.5 Turbocharged Revotron,engine_type_F8D Petrol Engine,engine_type_G12B,engine_type_K Series Dual jet,engine_type_K10C,engine_type_i-DTEC,steering_type_Electric,steering_type_Manual,steering_type_Power
0,0.515874,0.05,0.644231,4990,1,2,0,0,0,1,...,False,False,True,False,False,False,False,False,False,True
1,0.672619,0.02,0.375,27003,1,2,0,0,0,1,...,False,False,True,False,False,False,False,False,False,True
2,0.84111,0.02,0.384615,4076,1,2,0,0,0,1,...,False,False,True,False,False,False,False,False,False,True
3,0.900277,0.11,0.432692,21622,1,2,1,1,0,1,...,False,False,False,False,False,False,False,True,False,False


In [186]:
from sklearn.model_selection import train_test_split

# Separate the claim flag (target) from the encoded feature columns.
X = train_df_enc.copy()
y = X.pop("is_claim")

In [188]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column; the fitted scaler stays available as `ss`.
# NOTE(review): the scaler is fitted on all rows before cross-validation runs,
# so each validation fold contributes to the scaling statistics.
ss = StandardScaler().fit(X)
x_scaled = ss.transform(X)

In [193]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score, make_scorer

# Candidate classifiers and their display labels (parallel lists).
names = ["KNN", "DecisionTree", "LogisticRegression", "Naive Bayes"]
models = [
    KNeighborsClassifier(n_neighbors=3),
    DecisionTreeClassifier(random_state=0),
    LogisticRegression(random_state=0),
    GaussianNB(),
]

# Metrics collected per fold; recall is tracked separately for each class.
scoring = {
    "accuracy": make_scorer(accuracy_score),
    "recall 0": make_scorer(recall_score, pos_label=0),
    "recall 1": make_scorer(recall_score, pos_label=1),
    "f1_score": make_scorer(f1_score),
}

# cross_validate reports each metric under the key "test_<metric name>".
result_keys = ["test_accuracy", "test_recall 0", "test_recall 1", "test_f1_score"]

# 5-fold cross-validation per model; store the fold-averaged value of each metric.
model_score = {}
for name, model in zip(names, models):
    scores = cross_validate(model, x_scaled, y, cv=5, scoring=scoring)
    model_score[name] = [scores[key].mean() for key in result_keys]

In [195]:
# One row per model, one column per fold-averaged metric.
metric_columns = ['accuracy', 'recall 0', 'recall 1', 'f1 score']
first_data_set = pd.DataFrame.from_dict(model_score, orient='index', columns=metric_columns)

In [196]:
# Display the cross-validation summary for the first data set.
first_data_set

Unnamed: 0,accuracy,recall 0,recall 1,f1 score
KNN,0.924153,0.986051,0.01841,0.029947
DecisionTree,0.871075,0.923857,0.098721,0.089311
LogisticRegression,0.936032,1.0,0.0,0.0
Naive Bayes,0.621756,0.633908,0.443988,0.130938


In [197]:
# Second data set: replaces the previous contents of `train_df`.
college_csv = "data_college.csv"
train_df = pd.read_csv(college_csv)
train_df

Unnamed: 0,type_school,school_accreditation,gender,interest,residence,parent_age,parent_salary,house_area,average_grades,parent_was_in_college,will_go_to_college
0,Academic,A,Male,Less Interested,Urban,56,6950000,83.0,84.09,False,True
1,Academic,A,Male,Less Interested,Urban,57,4410000,76.8,86.91,False,True
2,Academic,B,Female,Very Interested,Urban,50,6500000,80.6,87.43,False,True
3,Vocational,B,Male,Very Interested,Rural,49,6600000,78.2,82.12,True,True
4,Academic,A,Female,Very Interested,Urban,57,5250000,75.1,86.79,False,False
...,...,...,...,...,...,...,...,...,...,...,...
995,Vocational,A,Female,Very Interested,Rural,49,7420000,63.6,85.99,True,True
996,Academic,B,Female,Less Interested,Rural,51,7480000,84.3,89.72,True,True
997,Vocational,A,Male,Less Interested,Urban,49,5550000,75.2,79.56,False,True
998,Academic,B,Male,Uncertain,Rural,53,5840000,105.8,87.18,True,True


In [198]:
# Object-dtype columns are split by cardinality: two distinct values -> label
# encoding in place, anything else -> one-hot encoding.
cat_cols = train_df.select_dtypes(include="object").columns

bin_cols = [col for col in cat_cols if train_df[col].nunique(dropna=False) == 2]
class_cols = [col for col in cat_cols if col not in bin_cols]

# A fresh encoder per column, so each column gets its own label mapping.
for col in bin_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])

train_df_enc = pd.get_dummies(train_df, columns=class_cols)
train_df_enc.head(n=4)

Unnamed: 0,type_school,school_accreditation,gender,residence,parent_age,parent_salary,house_area,average_grades,parent_was_in_college,will_go_to_college,interest_Interested,interest_Less Interested,interest_Not Interested,interest_Uncertain,interest_Very Interested
0,0,0,1,1,56,6950000,83.0,84.09,False,True,False,True,False,False,False
1,0,0,1,1,57,4410000,76.8,86.91,False,True,False,True,False,False,False
2,0,1,0,1,50,6500000,80.6,87.43,False,True,False,False,False,False,True
3,1,1,1,0,49,6600000,78.2,82.12,True,True,False,False,False,False,True


In [199]:
# Separate the target from the encoded feature columns, then show the target.
target_col = 'will_go_to_college'
X = train_df_enc.drop(target_col, axis=1)
y = train_df_enc[target_col]
y

0       True
1       True
2       True
3       True
4      False
       ...  
995     True
996     True
997     True
998     True
999    False
Name: will_go_to_college, Length: 1000, dtype: bool

In [200]:
# Standardise the feature columns; the fitted scaler stays available as `ss`.
ss = StandardScaler().fit(X)
x_scaled = ss.transform(X)

In [201]:
# Candidate classifiers and their display labels (parallel lists).
names = ["KNN", "DecisionTree", "LogisticRegression", "Naive Bayes"]
models = [
    KNeighborsClassifier(n_neighbors=3),
    DecisionTreeClassifier(random_state=0),
    LogisticRegression(random_state=0),
    GaussianNB(),
]

# Metrics collected per fold; recall is tracked separately for each class.
scoring = {
    "accuracy": make_scorer(accuracy_score),
    "recall 0": make_scorer(recall_score, pos_label=0),
    "recall 1": make_scorer(recall_score, pos_label=1),
    "f1_score": make_scorer(f1_score),
}

# cross_validate reports each metric under the key "test_<metric name>".
result_keys = ["test_accuracy", "test_recall 0", "test_recall 1", "test_f1_score"]

# 5-fold cross-validation per model; store the fold-averaged value of each metric.
model_score = {}
for name, model in zip(names, models):
    scores = cross_validate(model, x_scaled, y, cv=5, scoring=scoring)
    model_score[name] = [scores[key].mean() for key in result_keys]

In [203]:
# One row per model, one column per fold-averaged metric.
metric_columns = ['accuracy', 'recall 0', 'recall 1', 'f1 score']
second_data_set = pd.DataFrame.from_dict(model_score, orient='index', columns=metric_columns)

In [204]:
# Display the cross-validation summary for the second data set.
second_data_set

Unnamed: 0,accuracy,recall 0,recall 1,f1 score
KNN,0.861,0.87,0.852,0.859298
DecisionTree,0.825,0.824,0.826,0.825202
LogisticRegression,0.861,0.846,0.876,0.863089
Naive Bayes,0.781,0.714,0.848,0.79477


In [231]:
# Third data set: replaces the previous contents of `train_df`.
water_csv = "water_potability.csv"
train_df = pd.read_csv(water_csv)
train_df

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.990970,2.963135,0
1,3.716080,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.546600,310.135738,398.410813,11.558279,31.997993,4.075075,0
...,...,...,...,...,...,...,...,...,...,...
3271,4.668102,193.681735,47580.991603,7.166639,359.948574,526.424171,13.894419,66.687695,4.435821,1
3272,7.808856,193.553212,17329.802160,8.061362,,392.449580,19.903225,,2.798243,1
3273,9.419510,175.762646,33155.578218,7.350233,,432.044783,11.039070,69.845400,3.298875,1
3274,5.126763,230.603758,11983.869376,6.303357,,402.883113,11.168946,77.488213,4.708658,1


In [232]:
# Keep only the rows that have a value in every column.
train_df = train_df.dropna(axis=0, how="any")
train_df

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.546600,310.135738,398.410813,11.558279,31.997993,4.075075,0
5,5.584087,188.313324,28748.687739,7.544869,326.678363,280.467916,8.399735,54.917862,2.559708,0
6,10.223862,248.071735,28749.716544,7.513408,393.663396,283.651634,13.789695,84.603556,2.672989,0
7,8.635849,203.361523,13672.091764,4.563009,303.309771,474.607645,12.363817,62.798309,4.401425,0
...,...,...,...,...,...,...,...,...,...,...
3267,8.989900,215.047358,15921.412018,6.297312,312.931022,390.410231,9.899115,55.069304,4.613843,1
3268,6.702547,207.321086,17246.920347,7.708117,304.510230,329.266002,16.217303,28.878601,3.442983,1
3269,11.491011,94.812545,37188.826022,9.263166,258.930600,439.893618,16.172755,41.558501,4.369264,1
3270,6.069616,186.659040,26138.780191,7.747547,345.700257,415.886955,12.067620,60.419921,3.669712,1


In [233]:
# Build the target and feature matrix from the NaN-free water frame.
# This cell previously read `train_df_enc`, which at this point still holds
# the encoded college data and has no 'Potability' column, so it failed with
# a KeyError on a fresh Restart & Run All. No encoding step exists for this
# data set, so the features come straight from `train_df`.
y = train_df['Potability']
X = train_df.drop(columns='Potability')

In [234]:
# Standardise the feature columns; the fitted scaler stays available as `ss`.
ss = StandardScaler().fit(X)
x_scaled = ss.transform(X)

In [235]:
# Candidate classifiers and their display labels (parallel lists).
names = ["KNN", "DecisionTree", "LogisticRegression", "Naive Bayes"]
models = [
    KNeighborsClassifier(n_neighbors=3),
    DecisionTreeClassifier(random_state=0),
    LogisticRegression(random_state=0),
    GaussianNB(),
]

# Metrics collected per fold; recall is tracked separately for each class.
scoring = {
    "accuracy": make_scorer(accuracy_score),
    "recall 0": make_scorer(recall_score, pos_label=0),
    "recall 1": make_scorer(recall_score, pos_label=1),
    "f1_score": make_scorer(f1_score),
}

# cross_validate reports each metric under the key "test_<metric name>".
result_keys = ["test_accuracy", "test_recall 0", "test_recall 1", "test_f1_score"]

# 5-fold cross-validation per model; store the fold-averaged value of each metric.
model_score = {}
for name, model in zip(names, models):
    scores = cross_validate(model, x_scaled, y, cv=5, scoring=scoring)
    model_score[name] = [scores[key].mean() for key in result_keys]

In [236]:
# One row per model, one column per fold-averaged metric; shown as the cell output.
metric_columns = ['accuracy', 'recall 0', 'recall 1', 'f1 score']
third_data_set = pd.DataFrame.from_dict(model_score, orient='index', columns=metric_columns)
third_data_set

Unnamed: 0,accuracy,recall 0,recall 1,f1 score
KNN,0.59374,0.725833,0.398288,0.44124
DecisionTree,0.594756,0.653333,0.508157,0.500776
LogisticRegression,0.597215,0.991667,0.013573,0.025165
Naive Bayes,0.602694,0.851667,0.234197,0.319272
