In [79]:
import pandas as pd
import numpy as np
import category_encoders as ce
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import SGDClassifier
RANDOM_STATE = 42

In [80]:
df = pd.read_csv('DoctorFeePrediction_Milestone2.csv')

In [81]:
df.head()

Unnamed: 0,Doctor Name,City,Specialization,Doctor Qualification,Experience(Years),Total_Reviews,Patient Satisfaction Rate(%age),Avg Time to Patients(mins),Wait Time(mins),Hospital Address,Doctors Link,Fee Category
0,Dr. Umair Hafeez,GUJRANWALA,Ent Specialist,"MBBS , FCPS ( Otorhinolaryngologic ENT )",6.0,11,100,19,6,"Central Hospital, Jinnah Colony, Gujranwala",https://www.marham.pk/doctors/gujranwala/ent-s...,Expensive
1,Dr. Haris Shakeel,RAJAN-PUR,General Physician,MBBS,1.0,0,94,14,11,No Address Available,No Link Available,Cheap
2,Dr. Iqra Rehman,MIRPUR-KHAS,General Physician,"MBBS, RMP, CFP (USA), Certified in Covid 19 +",6.0,9,100,10,0,"Rehman Clinic, tandoadam naka, Mirpur Khas",https://www.marham.pk/doctors/mirpur-khas/gene...,Medium-Priced
3,Dr. Erum Memon,HYDERABAD,Gynecologist,"MBBS, FCPS (Gynae &amp; Obstetrics)",11.0,71,96,18,10,"Mehmood Hospital, Qasimabad, Hyderabad",https://www.marham.pk/doctors/hyderabad/gyneco...,Cheap
4,Dr. Aisha Ahmad,LAHORE,Dermatologist,"MBBS, FCPS (Dermatology), Certified (Aesthetic...",12.0,199,100,16,2,"Skinnovation, Johar Town, Lahore",https://www.marham.pk/doctors/lahore/dermatolo...,Medium-Priced


In [82]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2386 entries, 0 to 2385
Data columns (total 12 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Doctor Name                      2386 non-null   object 
 1   City                             2386 non-null   object 
 2   Specialization                   2386 non-null   object 
 3   Doctor Qualification             2386 non-null   object 
 4   Experience(Years)                2386 non-null   float64
 5   Total_Reviews                    2386 non-null   int64  
 6   Patient Satisfaction Rate(%age)  2386 non-null   int64  
 7   Avg Time to Patients(mins)       2386 non-null   int64  
 8   Wait Time(mins)                  2386 non-null   int64  
 9   Hospital Address                 2386 non-null   object 
 10  Doctors Link                     2386 non-null   object 
 11  Fee Category                     2386 non-null   object 
dtypes: float64(1), int64

In [83]:
df.describe()

Unnamed: 0,Experience(Years),Total_Reviews,Patient Satisfaction Rate(%age),Avg Time to Patients(mins),Wait Time(mins)
count,2386.0,2386.0,2386.0,2386.0,2386.0
mean,11.844719,92.473177,96.657586,14.092205,11.264459
std,8.784449,282.162526,4.9623,2.722198,5.636885
min,1.0,0.0,33.0,3.0,0.0
25%,6.0,0.0,94.0,14.0,10.0
50%,10.0,8.0,98.0,14.0,11.0
75%,14.0,54.0,100.0,15.0,11.0
max,53.0,5147.0,100.0,50.0,82.0


In [84]:
df['Fee Category'].unique()

array(['Expensive', 'Cheap', 'Medium-Priced'], dtype=object)

In [85]:
df['Fee Category'].value_counts()

Fee Category
Medium-Priced    1042
Cheap             864
Expensive         480
Name: count, dtype: int64

In [86]:
print(df.shape)
df.drop_duplicates(inplace=True)
df.reset_index(inplace=True)
# Costing Features
df=df.drop(['Doctor Name'],axis=1)
df=df.drop(['index'],axis=1)
print(df.shape)


(2386, 12)
(2373, 11)


In [87]:
first_split = df['Hospital Address'].str.split(',', expand=True)
second_split = first_split[0].str.split(';', expand=True)
new_column_zero = second_split.iloc[:, 0]
df_concatenated = pd.concat([df, new_column_zero], axis=1)
df = df_concatenated.rename(columns={0: 'Hospital Name'})
df.drop('Hospital Address', axis=1, inplace=True)

In [88]:
df.head()

Unnamed: 0,City,Specialization,Doctor Qualification,Experience(Years),Total_Reviews,Patient Satisfaction Rate(%age),Avg Time to Patients(mins),Wait Time(mins),Doctors Link,Fee Category,Hospital Name
0,GUJRANWALA,Ent Specialist,"MBBS , FCPS ( Otorhinolaryngologic ENT )",6.0,11,100,19,6,https://www.marham.pk/doctors/gujranwala/ent-s...,Expensive,Central Hospital
1,RAJAN-PUR,General Physician,MBBS,1.0,0,94,14,11,No Link Available,Cheap,No Address Available
2,MIRPUR-KHAS,General Physician,"MBBS, RMP, CFP (USA), Certified in Covid 19 +",6.0,9,100,10,0,https://www.marham.pk/doctors/mirpur-khas/gene...,Medium-Priced,Rehman Clinic
3,HYDERABAD,Gynecologist,"MBBS, FCPS (Gynae &amp; Obstetrics)",11.0,71,96,18,10,https://www.marham.pk/doctors/hyderabad/gyneco...,Cheap,Mehmood Hospital
4,LAHORE,Dermatologist,"MBBS, FCPS (Dermatology), Certified (Aesthetic...",12.0,199,100,16,2,https://www.marham.pk/doctors/lahore/dermatolo...,Medium-Priced,Skinnovation


In [89]:
def encode_link(link):
    if link != 'No Link Available':
        return 1
    else:
        return 0

# Apply the function to encode the 'Doctors Link' column
df['Doctors Link'] = df['Doctors Link'].apply(lambda x: encode_link(x))

# Now 'Doctors Link' column contains 1 if a link is available, 0 otherwise


In [90]:
df.head()

Unnamed: 0,City,Specialization,Doctor Qualification,Experience(Years),Total_Reviews,Patient Satisfaction Rate(%age),Avg Time to Patients(mins),Wait Time(mins),Doctors Link,Fee Category,Hospital Name
0,GUJRANWALA,Ent Specialist,"MBBS , FCPS ( Otorhinolaryngologic ENT )",6.0,11,100,19,6,1,Expensive,Central Hospital
1,RAJAN-PUR,General Physician,MBBS,1.0,0,94,14,11,0,Cheap,No Address Available
2,MIRPUR-KHAS,General Physician,"MBBS, RMP, CFP (USA), Certified in Covid 19 +",6.0,9,100,10,0,1,Medium-Priced,Rehman Clinic
3,HYDERABAD,Gynecologist,"MBBS, FCPS (Gynae &amp; Obstetrics)",11.0,71,96,18,10,1,Cheap,Mehmood Hospital
4,LAHORE,Dermatologist,"MBBS, FCPS (Dermatology), Certified (Aesthetic...",12.0,199,100,16,2,1,Medium-Priced,Skinnovation


In [91]:
df.shape

(2373, 11)

In [92]:
popular_degrees = ['MBBS', 'FCPS', 'MCPS', 'MS', 'MD', 'FRCS']
qualifications_df = pd.DataFrame({'MBBS':[], 'FCPS':[], 'MCPS':[], 'MS':  [], 'MD':  [], 'FRCS':[]})
qualifications_df.head()

Unnamed: 0,MBBS,FCPS,MCPS,MS,MD,FRCS


NOTE: Doctors with non of these qualifications should have his columns set to 0 instead of being dropped

In [93]:
for  index in range(df.shape[0]):
    current_qualifications = df.iloc[index]['Doctor Qualification']
    row = []
    for qualification in popular_degrees:
        if current_qualifications.find(qualification) != -1:
            row.append(1)
        else:
            row.append(0)
    qualifications_df.loc[len(qualifications_df)] = row 
    
qualifications_df.head()

Unnamed: 0,MBBS,FCPS,MCPS,MS,MD,FRCS
0,1,1,0,0,0,0
1,1,0,0,0,0,0
2,1,0,0,0,0,0
3,1,1,0,0,0,0
4,1,1,0,0,0,0


In [94]:
qualifications_df.shape

(2373, 6)

In [95]:
df[popular_degrees] = qualifications_df.copy()

In [96]:
df.head()

Unnamed: 0,City,Specialization,Doctor Qualification,Experience(Years),Total_Reviews,Patient Satisfaction Rate(%age),Avg Time to Patients(mins),Wait Time(mins),Doctors Link,Fee Category,Hospital Name,MBBS,FCPS,MCPS,MS,MD,FRCS
0,GUJRANWALA,Ent Specialist,"MBBS , FCPS ( Otorhinolaryngologic ENT )",6.0,11,100,19,6,1,Expensive,Central Hospital,1,1,0,0,0,0
1,RAJAN-PUR,General Physician,MBBS,1.0,0,94,14,11,0,Cheap,No Address Available,1,0,0,0,0,0
2,MIRPUR-KHAS,General Physician,"MBBS, RMP, CFP (USA), Certified in Covid 19 +",6.0,9,100,10,0,1,Medium-Priced,Rehman Clinic,1,0,0,0,0,0
3,HYDERABAD,Gynecologist,"MBBS, FCPS (Gynae &amp; Obstetrics)",11.0,71,96,18,10,1,Cheap,Mehmood Hospital,1,1,0,0,0,0
4,LAHORE,Dermatologist,"MBBS, FCPS (Dermatology), Certified (Aesthetic...",12.0,199,100,16,2,1,Medium-Priced,Skinnovation,1,1,0,0,0,0


In [97]:
df.isna().count()

City                               2373
Specialization                     2373
Doctor Qualification               2373
Experience(Years)                  2373
Total_Reviews                      2373
Patient Satisfaction Rate(%age)    2373
Avg Time to Patients(mins)         2373
Wait Time(mins)                    2373
Doctors Link                       2373
Fee Category                       2373
Hospital Name                      2373
MBBS                               2373
FCPS                               2373
MCPS                               2373
MS                                 2373
MD                                 2373
FRCS                               2373
dtype: int64

In [98]:
df.rename(columns={'Experience(Years)':'EXP(YRs)',
                   'Total_Reviews' : '#Reviews',
                   f'Patient Satisfaction Rate(%age)': 'Satisfaction Rate'}, inplace=True)


In [99]:
df.shape

(2373, 17)

In [100]:
# X_ = df.drop('Fee Category', axis=1)
# y_ = df['Fee Category']
train, test = train_test_split(df, train_size=0.8, random_state=RANDOM_STATE)

train.reset_index(inplace=True)
test.reset_index(inplace=True)
train.drop(['index'], axis=1, inplace=True)
test.drop(['index'], axis=1, inplace=True)

In [101]:
train.head()

Unnamed: 0,City,Specialization,Doctor Qualification,EXP(YRs),#Reviews,Satisfaction Rate,Avg Time to Patients(mins),Wait Time(mins),Doctors Link,Fee Category,Hospital Name,MBBS,FCPS,MCPS,MS,MD,FRCS
0,SAHIWAL,Orthopedic Surgeon,"MBBS, FCPS (Orthopedic Surgery)",13.0,8,100,11,14,1,Medium-Priced,Saeed surgical complex,1,1,0,0,0,0
1,ISLAMABAD,Nephrologist,"MBBS, FCPS (Nephrology)",10.0,32,94,13,7,1,Medium-Priced,Advanced International Hospital,1,1,0,0,0,0
2,GUJRAT,"General Practitioner, Gynecologist",MBBS,12.0,0,94,14,11,0,Cheap,No Address Available,1,0,0,0,0,0
3,SAHIWAL,General Physician,MBBS,2.0,12,100,14,11,1,Expensive,No Address Available,1,0,0,0,0,0
4,LAHORE,Pediatrician,"MBBS , DCH , MRCPCH (UK) , FRCPCH (UK)",20.0,122,97,15,14,1,Medium-Priced,Hameedah Memorial Hospital,1,0,0,0,0,0


In [132]:
OE = OrdinalEncoder(categories=[['Cheap', 'Medium-Priced', 'Expensive']])

train['Fee Category'] = OE.fit_transform(train[['Fee Category']])
test['Fee Category'] = OE.transform(test[['Fee Category']])

mean_encoder_c, mean_encoder_s, mean_encoder_h = ce.TargetEncoder(), ce.TargetEncoder(), ce.TargetEncoder()
\

mean_encoder_c.fit(train[['City']], train['Fee Category'])
train['City'] = mean_encoder_c.transform(train['City'])
test['City'] = mean_encoder_c.transform(test['City'])

mean_encoder_s.fit(train['Specialization'], train['Fee Category'])
train['Specialization'] = mean_encoder_s.transform(train['Specialization'])
test['Specialization'] = mean_encoder_s.transform(test['Specialization'])


ValueError: could not convert string to float: 'Cheap'

In [133]:
train.head()

Unnamed: 0,City,Specialization,Doctor Qualification,EXP(YRs),#Reviews,Satisfaction Rate,Avg Time to Patients(mins),Wait Time(mins),Doctors Link,Fee Category,Hospital Name,MBBS,FCPS,MCPS,MS,MD,FRCS
0,0.887147,0.968553,"MBBS, FCPS (Orthopedic Surgery)",13.0,8,100,11,14,1,1.0,0.856674,1,1,0,0,0,0
1,1.39469,1.138709,"MBBS, FCPS (Nephrology)",10.0,32,94,13,7,1,1.0,0.927599,1,1,0,0,0,0
2,0.796095,0.724604,MBBS,12.0,0,94,14,11,0,0.0,0.0,1,0,0,0,0,0
3,0.887147,0.234568,MBBS,2.0,12,100,14,11,1,2.0,0.0,1,0,0,0,0,0
4,1.466638,0.532338,"MBBS , DCH , MRCPCH (UK) , FRCPCH (UK)",20.0,122,97,15,14,1,1.0,0.854713,1,0,0,0,0,0


In [104]:
indices1, indices2 = [], []
for i in range(train.shape[0]):
    if train['Hospital Name'].iloc[i] == 'No Address Available':
        indices1.append(i)

for i in range(test.shape[0]):
    if test['Hospital Name'].iloc[i] == 'No Address Available':
        indices2.append(i)

mean_encoder_h.fit(train[['Hospital Name']], train['Fee Category'])
train['Hospital Name'] = mean_encoder_h.transform(train['Hospital Name'])
test['Hospital Name'] = mean_encoder_h.transform(test['Hospital Name'])


for index in indices1:
    train['Hospital Name'].iloc[index] = 0

for index in indices2:
    test['Hospital Name'].iloc[index] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Hospital Name'].iloc[index] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Hospital Name'].iloc[index] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Hospital Name'].iloc[index] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Hospital Name'].iloc[index] = 0
A value is t

In [105]:
train.head()

Unnamed: 0,City,Specialization,Doctor Qualification,EXP(YRs),#Reviews,Satisfaction Rate,Avg Time to Patients(mins),Wait Time(mins),Doctors Link,Fee Category,Hospital Name,MBBS,FCPS,MCPS,MS,MD,FRCS
0,0.887147,0.968553,"MBBS, FCPS (Orthopedic Surgery)",13.0,8,100,11,14,1,1.0,0.856674,1,1,0,0,0,0
1,1.39469,1.138709,"MBBS, FCPS (Nephrology)",10.0,32,94,13,7,1,1.0,0.927599,1,1,0,0,0,0
2,0.796095,0.724604,MBBS,12.0,0,94,14,11,0,0.0,0.0,1,0,0,0,0,0
3,0.887147,0.234568,MBBS,2.0,12,100,14,11,1,2.0,0.0,1,0,0,0,0,0
4,1.466638,0.532338,"MBBS , DCH , MRCPCH (UK) , FRCPCH (UK)",20.0,122,97,15,14,1,1.0,0.854713,1,0,0,0,0,0


In [106]:

included_features = ['City', 'EXP(YRs)', '#Reviews', 'Satisfaction Rate', 'Avg Time to Patients(mins)', 'Wait Time(mins)', 'Doctors Link', 'Hospital Name', 'Specialization']
features = train[included_features]
target = train["Fee Category"]


k_best = SelectKBest(score_func=f_classif, k=6)

X = k_best.fit_transform(features, target)
y = target

#Get the indices of the selected features
selected_features_indices = k_best.get_support(indices=True)

# Get the scores associated with each feature
feature_scores = k_best.scores_

# Create a list of tuples containing feature names and scores
feature_info = list(zip(features.columns, feature_scores))

# Sort the feature info in descending order based on scores
sorted_feature_info = sorted(feature_info, key=lambda x: x[1], reverse=True)

for feature_name, feature_score in sorted_feature_info[:10]:
    print(f"{feature_name}: {feature_score:.2f}")


Specialization: 307.15
City: 277.59
Hospital Name: 229.88
EXP(YRs): 226.06
Doctors Link: 114.86
#Reviews: 106.84
Wait Time(mins): 14.30
Satisfaction Rate: 9.59
Avg Time to Patients(mins): 3.16


In [107]:
df.head()

Unnamed: 0,City,Specialization,Doctor Qualification,EXP(YRs),#Reviews,Satisfaction Rate,Avg Time to Patients(mins),Wait Time(mins),Doctors Link,Fee Category,Hospital Name,MBBS,FCPS,MCPS,MS,MD,FRCS
0,GUJRANWALA,Ent Specialist,"MBBS , FCPS ( Otorhinolaryngologic ENT )",6.0,11,100,19,6,1,Expensive,Central Hospital,1,1,0,0,0,0
1,RAJAN-PUR,General Physician,MBBS,1.0,0,94,14,11,0,Cheap,No Address Available,1,0,0,0,0,0
2,MIRPUR-KHAS,General Physician,"MBBS, RMP, CFP (USA), Certified in Covid 19 +",6.0,9,100,10,0,1,Medium-Priced,Rehman Clinic,1,0,0,0,0,0
3,HYDERABAD,Gynecologist,"MBBS, FCPS (Gynae &amp; Obstetrics)",11.0,71,96,18,10,1,Cheap,Mehmood Hospital,1,1,0,0,0,0
4,LAHORE,Dermatologist,"MBBS, FCPS (Dermatology), Certified (Aesthetic...",12.0,199,100,16,2,1,Medium-Priced,Skinnovation,1,1,0,0,0,0


In [108]:
selected_features = ['City', 'Specialization', 'EXP(YRs)', 'Hospital Name', '#Reviews', 'Doctors Link']
selected_features.extend(popular_degrees)
# selected_features = df.columns
selected_features_df = train[selected_features]
test__ = test[selected_features]
selected_features_df['Fee Category'] = train['Fee Category']
test__['Fee Category'] = test['Fee Category']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_features_df['Fee Category'] = train['Fee Category']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test__['Fee Category'] = test['Fee Category']


In [109]:
selected_features_df.head()


Unnamed: 0,City,Specialization,EXP(YRs),Hospital Name,#Reviews,Doctors Link,MBBS,FCPS,MCPS,MS,MD,FRCS,Fee Category
0,0.887147,0.968553,13.0,0.856674,8,1,1,1,0,0,0,0,1.0
1,1.39469,1.138709,10.0,0.927599,32,1,1,1,0,0,0,0,1.0
2,0.796095,0.724604,12.0,0.0,0,0,1,0,0,0,0,0,0.0
3,0.887147,0.234568,2.0,0.0,12,1,1,0,0,0,0,0,2.0
4,1.466638,0.532338,20.0,0.854713,122,1,1,0,0,0,0,0,1.0


In [110]:
X_train = selected_features_df.drop(['Fee Category'], axis=1).values
y_train = selected_features_df['Fee Category'].values
X_test = test__.drop(['Fee Category'], axis=1).values
y_test = test__['Fee Category'].values
RS = StandardScaler()
X_train = RS.fit_transform(X_train)
X_test = RS.transform(X_test)

In [111]:
# # Assuming X_train, y_train are your features and labels for training data
# smote = SMOTE(random_state=RANDOM_STATE)
# X_train, y_train = smote.fit_resample(X_train, y_train)

In [112]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [113]:
import time
def train_model(model):
    start_train_time = time.time()
    model.fit(X_train, y_train)
    train_preds = model.predict(X_train)
    train_accuracy = accuracy_score(y_train, train_preds)
    end_train_time = time.time()
    train_time = end_train_time - start_train_time
    print('Train Accuracy:', train_accuracy)
    print("Training time:", train_time)
    print("#"*50)
def test_model(model):
    start_predict_time = time.time()
    test_preds = model.predict(X_test)
    accuracy = accuracy_score(y_test, test_preds)
    precision = precision_score(y_test, test_preds, average='weighted')
    recall = recall_score(y_test, test_preds, average='weighted')
    f1 = f1_score(y_test, test_preds, average='weighted')
    end_predict_time = time.time()
    predict_time = end_predict_time - start_predict_time
    print('Test Accuracy:', accuracy)
    print("Prediction time:", predict_time)
    print("#"*50)
    print('Test Precision:', precision)
    print('Test Recall:', recall)
    print('Test F1 Score:', f1)


In [114]:
LR = LogisticRegression(C=0.1 , multi_class='multinomial', random_state=RANDOM_STATE)
train_model(LR)
test_model(LR)

Train Accuracy: 0.6691253951527925
Training time: 0.01730632781982422
##################################################
Test Accuracy: 0.6442105263157895
Prediction time: 0.009794950485229492
##################################################
Test Precision: 0.6619498857209062
Test Recall: 0.6442105263157895
Test F1 Score: 0.6385491146285738


low accuracy on logistic, hence the model is not linearly seperable
in other words, linear kernel for svm will also give poor accuracy

In [115]:
KNN = KNeighborsClassifier(n_neighbors=10) 
train_model(KNN)
test_model(KNN)

Train Accuracy: 0.7349841938883035
Training time: 0.1408524513244629
##################################################
Test Accuracy: 0.6421052631578947
Prediction time: 0.028644323348999023
##################################################
Test Precision: 0.6508650524385925
Test Recall: 0.6421052631578947
Test F1 Score: 0.6398332447883279


In [116]:
svc_l = SVC(kernel='linear', random_state=RANDOM_STATE)
train_model(svc_l)
test_model(svc_l)

Train Accuracy: 0.7128556375131717
Training time: 0.13129568099975586
##################################################
Test Accuracy: 0.6378947368421053
Prediction time: 0.013133764266967773
##################################################
Test Precision: 0.6668214774349606
Test Recall: 0.6378947368421053
Test F1 Score: 0.6336491605090511


In [117]:
svc_w = SVC(kernel='rbf', random_state=RANDOM_STATE)
train_model(svc_w)
test_model(svc_w)

Train Accuracy: 0.7692307692307693
Training time: 0.2517530918121338
##################################################
Test Accuracy: 0.6652631578947369
Prediction time: 0.045792579650878906
##################################################
Test Precision: 0.6812278006845071
Test Recall: 0.6652631578947369
Test F1 Score: 0.6623389799800706


In [118]:
svc_r = SVC(kernel='rbf', random_state=RANDOM_STATE, gamma=1)
train_model(svc_r)
test_model(svc_r)

Train Accuracy: 0.8777660695468915
Training time: 0.268064022064209
##################################################
Test Accuracy: 0.6210526315789474
Prediction time: 0.04740190505981445
##################################################
Test Precision: 0.6420766277871307
Test Recall: 0.6210526315789474
Test F1 Score: 0.6165120943041673


In [119]:
svc_r = SVC(kernel='rbf', random_state=RANDOM_STATE, gamma=0.5)
train_model(svc_r)
test_model(svc_r)

Train Accuracy: 0.857218124341412
Training time: 0.28119516372680664
##################################################
Test Accuracy: 0.6378947368421053
Prediction time: 0.055846214294433594
##################################################
Test Precision: 0.6527383625167879
Test Recall: 0.6378947368421053
Test F1 Score: 0.636164305164044


In [120]:
svc_r = SVC(kernel='rbf', random_state=RANDOM_STATE, gamma=0.1)
train_model(svc_r)
test_model(svc_r)

Train Accuracy: 0.7760800842992623
Training time: 0.3265974521636963
##################################################
Test Accuracy: 0.6526315789473685
Prediction time: 0.055098533630371094
##################################################
Test Precision: 0.6670031927849643
Test Recall: 0.6526315789473685
Test F1 Score: 0.6502445575822965


In [121]:
svc_s = SVC(kernel='sigmoid', random_state=RANDOM_STATE)
train_model(svc_s)
test_model(svc_s)

Train Accuracy: 0.5468914646996839
Training time: 0.18951702117919922
##################################################
Test Accuracy: 0.5473684210526316
Prediction time: 0.02345895767211914
##################################################
Test Precision: 0.5453494126742766
Test Recall: 0.5473684210526316
Test F1 Score: 0.5453698684480562


In [122]:
for deg in range(6):
    svc_d = SVC(kernel='poly', degree=deg+1, random_state=RANDOM_STATE)
    print("degree: ", deg+1)
    train_model(svc_d)
    test_model(svc_d)
    print("|"*50)

degree:  1
Train Accuracy: 0.6775553213909379
Training time: 0.1295325756072998
##################################################
Test Accuracy: 0.6210526315789474
Prediction time: 0.01745009422302246
##################################################
Test Precision: 0.6491847102117149
Test Recall: 0.6210526315789474
Test F1 Score: 0.6137869705042543
||||||||||||||||||||||||||||||||||||||||||||||||||
degree:  2
Train Accuracy: 0.6506849315068494
Training time: 0.15193390846252441
##################################################
Test Accuracy: 0.5894736842105263
Prediction time: 0.020694971084594727
##################################################
Test Precision: 0.6115142601759144
Test Recall: 0.5894736842105263
Test F1 Score: 0.5776798208346376
||||||||||||||||||||||||||||||||||||||||||||||||||
degree:  3
Train Accuracy: 0.7086406743940991
Training time: 0.15357136726379395
##################################################
Test Accuracy: 0.6126315789473684
Prediction time: 0.020

In [123]:
RFC = RandomForestClassifier(max_depth=12, max_leaf_nodes=20,random_state=RANDOM_STATE, n_estimators=20, bootstrap=True)
train_model(RFC)
test_model(RFC)

Train Accuracy: 0.8582718651211801
Training time: 0.0455479621887207
##################################################
Test Accuracy: 0.64
Prediction time: 0.009947776794433594
##################################################
Test Precision: 0.6613046550043203
Test Recall: 0.64
Test F1 Score: 0.6364829935126138


In [124]:
XG = GradientBoostingClassifier(max_depth=7,max_leaf_nodes=100,random_state=RANDOM_STATE, n_estimators=10)
train_model(XG)
test_model(XG)

Train Accuracy: 0.9430979978925185
Training time: 0.26628851890563965
##################################################
Test Accuracy: 0.5957894736842105
Prediction time: 0.007406473159790039
##################################################
Test Precision: 0.6052067513697396
Test Recall: 0.5957894736842105
Test F1 Score: 0.5930928433130894


In [125]:
from sklearn.ensemble import ExtraTreesClassifier
EXT = ExtraTreesClassifier(random_state=RANDOM_STATE, max_depth=9, max_features='sqrt', max_leaf_nodes=40)
train_model(EXT)
test_model(EXT)

Train Accuracy: 0.8071654373024236
Training time: 0.1452343463897705
##################################################
Test Accuracy: 0.6652631578947369
Prediction time: 0.014612197875976562
##################################################
Test Precision: 0.6920995519622165
Test Recall: 0.6652631578947369
Test F1 Score: 0.6606204591196222


In [126]:
SGDC = SGDClassifier(random_state=RANDOM_STATE, loss='log_loss')
train_model(SGDC)
test_model(SGDC)

Train Accuracy: 0.6417281348788199
Training time: 0.06827998161315918
##################################################
Test Accuracy: 0.6168421052631579
Prediction time: 0.009432554244995117
##################################################
Test Precision: 0.619289248762933
Test Recall: 0.6168421052631579
Test F1 Score: 0.6149482578725005


In [127]:
ADA = AdaBoostClassifier(estimator=XG, n_estimators=20, learning_rate=0.01, random_state=RANDOM_STATE)
train_model(ADA)
test_model(ADA)



Train Accuracy: 0.9473129610115911
Training time: 4.926546335220337
##################################################
Test Accuracy: 0.5936842105263158
Prediction time: 0.03506183624267578
##################################################
Test Precision: 0.6023615141647777
Test Recall: 0.5936842105263158
Test F1 Score: 0.5906931308877134


In [128]:
from sklearn.naive_bayes import GaussianNB

NB = GaussianNB(var_smoothing=0.15)

train_model(NB)
test_model(NB)

Train Accuracy: 0.6227608008429927
Training time: 0.0050830841064453125
##################################################
Test Accuracy: 0.6126315789473684
Prediction time: 0.007433652877807617
##################################################
Test Precision: 0.6090762661675914
Test Recall: 0.6126315789473684
Test F1 Score: 0.6028379454179992


In [129]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
bag_clf = BaggingClassifier(
DecisionTreeClassifier(), n_estimators=30,
max_samples=100, bootstrap=True, n_jobs=-1, random_state=RANDOM_STATE)

train_model(bag_clf)
test_model(bag_clf)


Train Accuracy: 0.8556375131717597
Training time: 3.121716022491455
##################################################
Test Accuracy: 0.5557894736842105
Prediction time: 0.04003787040710449
##################################################
Test Precision: 0.5834401388085598
Test Recall: 0.5557894736842105
Test F1 Score: 0.552789423890853


In [130]:
from sklearn.ensemble import VotingClassifier

# best performing models
vr = VotingClassifier(
    estimators=[('XG', XG), ('EXT', EXT), ('RFR', RFC),  ('KNN', KNN), ('LR', LR), ('SGDC', SGDC),],
voting='soft')
train_model(vr)
test_model(vr)

Train Accuracy: 0.832982086406744
Training time: 0.6473374366760254
##################################################
Test Accuracy: 0.68
Prediction time: 0.03671002388000488
##################################################
Test Precision: 0.6973112353721105
Test Recall: 0.68
Test F1 Score: 0.6783275969681761


In [131]:
import pickle as pkl

pkl.dump(RS, open("RS_classif.pkl", "wb"))
pkl.dump(mean_encoder_c, open("mean_encoder_c_classif.pkl", "wb"))
pkl.dump(mean_encoder_s, open("mean_encoder_s_classif.pkl", "wb"))
pkl.dump(mean_encoder_h, open("mean_encoder_h_classif.pkl", "wb"))
pkl.dump(LR, open("voting_classif.pkl", "wb"))