In [1]:
import pandas as pd
import numpy as np
import category_encoders as ce
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import SGDClassifier

In [2]:
df = pd.read_csv('DoctorFeePrediction_Milestone2.csv')

In [3]:
df.head()

Unnamed: 0,Doctor Name,City,Specialization,Doctor Qualification,Experience(Years),Total_Reviews,Patient Satisfaction Rate(%age),Avg Time to Patients(mins),Wait Time(mins),Hospital Address,Doctors Link,Fee Category
0,Dr. Umair Hafeez,GUJRANWALA,Ent Specialist,"MBBS , FCPS ( Otorhinolaryngologic ENT )",6.0,11,100,19,6,"Central Hospital, Jinnah Colony, Gujranwala",https://www.marham.pk/doctors/gujranwala/ent-s...,Expensive
1,Dr. Haris Shakeel,RAJAN-PUR,General Physician,MBBS,1.0,0,94,14,11,No Address Available,No Link Available,Cheap
2,Dr. Iqra Rehman,MIRPUR-KHAS,General Physician,"MBBS, RMP, CFP (USA), Certified in Covid 19 +",6.0,9,100,10,0,"Rehman Clinic, tandoadam naka, Mirpur Khas",https://www.marham.pk/doctors/mirpur-khas/gene...,Medium-Priced
3,Dr. Erum Memon,HYDERABAD,Gynecologist,"MBBS, FCPS (Gynae &amp; Obstetrics)",11.0,71,96,18,10,"Mehmood Hospital, Qasimabad, Hyderabad",https://www.marham.pk/doctors/hyderabad/gyneco...,Cheap
4,Dr. Aisha Ahmad,LAHORE,Dermatologist,"MBBS, FCPS (Dermatology), Certified (Aesthetic...",12.0,199,100,16,2,"Skinnovation, Johar Town, Lahore",https://www.marham.pk/doctors/lahore/dermatolo...,Medium-Priced


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2386 entries, 0 to 2385
Data columns (total 12 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Doctor Name                      2386 non-null   object 
 1   City                             2386 non-null   object 
 2   Specialization                   2386 non-null   object 
 3   Doctor Qualification             2386 non-null   object 
 4   Experience(Years)                2386 non-null   float64
 5   Total_Reviews                    2386 non-null   int64  
 6   Patient Satisfaction Rate(%age)  2386 non-null   int64  
 7   Avg Time to Patients(mins)       2386 non-null   int64  
 8   Wait Time(mins)                  2386 non-null   int64  
 9   Hospital Address                 2386 non-null   object 
 10  Doctors Link                     2386 non-null   object 
 11  Fee Category                     2386 non-null   object 
dtypes: float64(1), int64

In [5]:
df.describe()

Unnamed: 0,Experience(Years),Total_Reviews,Patient Satisfaction Rate(%age),Avg Time to Patients(mins),Wait Time(mins)
count,2386.0,2386.0,2386.0,2386.0,2386.0
mean,11.844719,92.473177,96.657586,14.092205,11.264459
std,8.784449,282.162526,4.9623,2.722198,5.636885
min,1.0,0.0,33.0,3.0,0.0
25%,6.0,0.0,94.0,14.0,10.0
50%,10.0,8.0,98.0,14.0,11.0
75%,14.0,54.0,100.0,15.0,11.0
max,53.0,5147.0,100.0,50.0,82.0


In [6]:
df['Fee Category'].unique()

array(['Expensive', 'Cheap', 'Medium-Priced'], dtype=object)

In [7]:
df['Fee Category'].value_counts()

Fee Category
Medium-Priced    1042
Cheap             864
Expensive         480
Name: count, dtype: int64

In [8]:
print(df.shape)
df.drop_duplicates(inplace=True)
df.reset_index(inplace=True)
# Costing Features
df=df.drop(['Doctor Name'],axis=1)
df=df.drop(['index'],axis=1)
print(df.shape)


(2386, 12)
(2373, 11)


In [9]:
first_split = df['Hospital Address'].str.split(',', expand=True)
second_split = first_split[0].str.split(';', expand=True)
new_column_zero = second_split.iloc[:, 0]
df_concatenated = pd.concat([df, new_column_zero], axis=1)
df = df_concatenated.rename(columns={0: 'Hospital Name'})
df.drop('Hospital Address', axis=1, inplace=True)

In [10]:
df.head()

Unnamed: 0,City,Specialization,Doctor Qualification,Experience(Years),Total_Reviews,Patient Satisfaction Rate(%age),Avg Time to Patients(mins),Wait Time(mins),Doctors Link,Fee Category,Hospital Name
0,GUJRANWALA,Ent Specialist,"MBBS , FCPS ( Otorhinolaryngologic ENT )",6.0,11,100,19,6,https://www.marham.pk/doctors/gujranwala/ent-s...,Expensive,Central Hospital
1,RAJAN-PUR,General Physician,MBBS,1.0,0,94,14,11,No Link Available,Cheap,No Address Available
2,MIRPUR-KHAS,General Physician,"MBBS, RMP, CFP (USA), Certified in Covid 19 +",6.0,9,100,10,0,https://www.marham.pk/doctors/mirpur-khas/gene...,Medium-Priced,Rehman Clinic
3,HYDERABAD,Gynecologist,"MBBS, FCPS (Gynae &amp; Obstetrics)",11.0,71,96,18,10,https://www.marham.pk/doctors/hyderabad/gyneco...,Cheap,Mehmood Hospital
4,LAHORE,Dermatologist,"MBBS, FCPS (Dermatology), Certified (Aesthetic...",12.0,199,100,16,2,https://www.marham.pk/doctors/lahore/dermatolo...,Medium-Priced,Skinnovation


In [11]:
def encode_link(link):
    if link != 'No Link Available':
        return 1
    else:
        return 0

# Apply the function to encode the 'Doctors Link' column
df['Doctors Link'] = df['Doctors Link'].apply(lambda x: encode_link(x))

# Now 'Doctors Link' column contains 1 if a link is available, 0 otherwise


In [12]:
df.head()

Unnamed: 0,City,Specialization,Doctor Qualification,Experience(Years),Total_Reviews,Patient Satisfaction Rate(%age),Avg Time to Patients(mins),Wait Time(mins),Doctors Link,Fee Category,Hospital Name
0,GUJRANWALA,Ent Specialist,"MBBS , FCPS ( Otorhinolaryngologic ENT )",6.0,11,100,19,6,1,Expensive,Central Hospital
1,RAJAN-PUR,General Physician,MBBS,1.0,0,94,14,11,0,Cheap,No Address Available
2,MIRPUR-KHAS,General Physician,"MBBS, RMP, CFP (USA), Certified in Covid 19 +",6.0,9,100,10,0,1,Medium-Priced,Rehman Clinic
3,HYDERABAD,Gynecologist,"MBBS, FCPS (Gynae &amp; Obstetrics)",11.0,71,96,18,10,1,Cheap,Mehmood Hospital
4,LAHORE,Dermatologist,"MBBS, FCPS (Dermatology), Certified (Aesthetic...",12.0,199,100,16,2,1,Medium-Priced,Skinnovation


In [13]:
df.shape

(2373, 11)

In [14]:
popular_degrees = ['MBBS', 'FCPS', 'MCPS', 'MS', 'MD', 'FRCS']
qualifications_df = pd.DataFrame({'MBBS':[], 'FCPS':[], 'MCPS':[], 'MS':  [], 'MD':  [], 'FRCS':[]})
qualifications_df.head()

Unnamed: 0,MBBS,FCPS,MCPS,MS,MD,FRCS


NOTE: Doctors with non of these qualifications should have his columns set to 0 instead of being dropped

In [15]:
for  index in range(df.shape[0]):
    current_qualifications = df.iloc[index]['Doctor Qualification']
    row = []
    for qualification in popular_degrees:
        if current_qualifications.find(qualification) != -1:
            row.append(1)
        else:
            row.append(0)
    qualifications_df.loc[len(qualifications_df)] = row 
    
qualifications_df.head()

Unnamed: 0,MBBS,FCPS,MCPS,MS,MD,FRCS
0,1,1,0,0,0,0
1,1,0,0,0,0,0
2,1,0,0,0,0,0
3,1,1,0,0,0,0
4,1,1,0,0,0,0


In [16]:
qualifications_df.shape

(2373, 6)

In [17]:
df[popular_degrees] = qualifications_df.copy()

In [18]:
df.head()

Unnamed: 0,City,Specialization,Doctor Qualification,Experience(Years),Total_Reviews,Patient Satisfaction Rate(%age),Avg Time to Patients(mins),Wait Time(mins),Doctors Link,Fee Category,Hospital Name,MBBS,FCPS,MCPS,MS,MD,FRCS
0,GUJRANWALA,Ent Specialist,"MBBS , FCPS ( Otorhinolaryngologic ENT )",6.0,11,100,19,6,1,Expensive,Central Hospital,1,1,0,0,0,0
1,RAJAN-PUR,General Physician,MBBS,1.0,0,94,14,11,0,Cheap,No Address Available,1,0,0,0,0,0
2,MIRPUR-KHAS,General Physician,"MBBS, RMP, CFP (USA), Certified in Covid 19 +",6.0,9,100,10,0,1,Medium-Priced,Rehman Clinic,1,0,0,0,0,0
3,HYDERABAD,Gynecologist,"MBBS, FCPS (Gynae &amp; Obstetrics)",11.0,71,96,18,10,1,Cheap,Mehmood Hospital,1,1,0,0,0,0
4,LAHORE,Dermatologist,"MBBS, FCPS (Dermatology), Certified (Aesthetic...",12.0,199,100,16,2,1,Medium-Priced,Skinnovation,1,1,0,0,0,0


In [19]:
df.isna().count()

City                               2373
Specialization                     2373
Doctor Qualification               2373
Experience(Years)                  2373
Total_Reviews                      2373
Patient Satisfaction Rate(%age)    2373
Avg Time to Patients(mins)         2373
Wait Time(mins)                    2373
Doctors Link                       2373
Fee Category                       2373
Hospital Name                      2373
MBBS                               2373
FCPS                               2373
MCPS                               2373
MS                                 2373
MD                                 2373
FRCS                               2373
dtype: int64

In [20]:
df.rename(columns={'Experience(Years)':'EXP(YRs)',
                   'Total_Reviews' : '#Reviews',
                   f'Patient Satisfaction Rate(%age)': 'Satisfaction Rate'}, inplace=True)


In [21]:
df.shape

(2373, 17)

In [22]:
OE = OrdinalEncoder(categories=[['Cheap', 'Medium-Priced', 'Expensive']])
df[['Fee Category']] = OE.fit_transform(df[['Fee Category']])

mean_encoder = ce.TargetEncoder(smoothing=-1)
mean_encoder.fit(df[['City']], df['Fee Category'])
df['City'] = mean_encoder.transform(df['City'])
mean_encoder.fit(df[['Specialization']], df['Fee Category'])
df['Specialization'] = mean_encoder.transform(df['Specialization'])


In [23]:
df.head()

Unnamed: 0,City,Specialization,Doctor Qualification,EXP(YRs),#Reviews,Satisfaction Rate,Avg Time to Patients(mins),Wait Time(mins),Doctors Link,Fee Category,Hospital Name,MBBS,FCPS,MCPS,MS,MD,FRCS
0,0.837337,0.837337,"MBBS , FCPS ( Otorhinolaryngologic ENT )",6.0,11,100,19,6,1,2.0,Central Hospital,1,1,0,0,0,0
1,0.142859,0.837337,MBBS,1.0,0,94,14,11,0,0.0,No Address Available,1,0,0,0,0,0
2,0.5,0.837337,"MBBS, RMP, CFP (USA), Certified in Covid 19 +",6.0,9,100,10,0,1,1.0,Rehman Clinic,1,0,0,0,0,0
3,0.837337,0.837337,"MBBS, FCPS (Gynae &amp; Obstetrics)",11.0,71,96,18,10,1,0.0,Mehmood Hospital,1,1,0,0,0,0
4,0.837337,0.837337,"MBBS, FCPS (Dermatology), Certified (Aesthetic...",12.0,199,100,16,2,1,1.0,Skinnovation,1,1,0,0,0,0


In [24]:
indices = []
for i in range(df.shape[0]):
    if df['Hospital Name'].iloc[i] == 'No Address Available':
        indices.append(i)

mean_encoder.fit(df[['Hospital Name']], df['Fee Category'])
df['Hospital Name'] = mean_encoder.transform(df['Hospital Name'])


for index in indices:
    df['Hospital Name'].iloc[index] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Hospital Name'].iloc[index] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Hospital Name'].iloc[index] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Hospital Name'].iloc[index] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Hospital Name'].iloc[index] = 0
A value is trying to be 

In [25]:
df.head()

Unnamed: 0,City,Specialization,Doctor Qualification,EXP(YRs),#Reviews,Satisfaction Rate,Avg Time to Patients(mins),Wait Time(mins),Doctors Link,Fee Category,Hospital Name,MBBS,FCPS,MCPS,MS,MD,FRCS
0,0.837337,0.837337,"MBBS , FCPS ( Otorhinolaryngologic ENT )",6.0,11,100,19,6,1,2.0,1.5,1,1,0,0,0,0
1,0.142859,0.837337,MBBS,1.0,0,94,14,11,0,0.0,0.0,1,0,0,0,0,0
2,0.5,0.837337,"MBBS, RMP, CFP (USA), Certified in Covid 19 +",6.0,9,100,10,0,1,1.0,1.0,1,0,0,0,0,0
3,0.837337,0.837337,"MBBS, FCPS (Gynae &amp; Obstetrics)",11.0,71,96,18,10,1,0.0,4.691427e-09,1,1,0,0,0,0
4,0.837337,0.837337,"MBBS, FCPS (Dermatology), Certified (Aesthetic...",12.0,199,100,16,2,1,1.0,1.0,1,1,0,0,0,0


In [26]:

included_features = ['City', 'EXP(YRs)', '#Reviews', 'Satisfaction Rate', 'Avg Time to Patients(mins)', 'Wait Time(mins)', 'Doctors Link', 'Hospital Name', 'Specialization']
features = df[included_features]
target = df["Fee Category"]


k_best = SelectKBest(score_func=f_classif, k=6)

X = k_best.fit_transform(features, target)
y = target

#Get the indices of the selected features
selected_features_indices = k_best.get_support(indices=True)

# Get the scores associated with each feature
feature_scores = k_best.scores_

# Create a list of tuples containing feature names and scores
feature_info = list(zip(features.columns, feature_scores))

# Sort the feature info in descending order based on scores
sorted_feature_info = sorted(feature_info, key=lambda x: x[1], reverse=True)

for feature_name, feature_score in sorted_feature_info[:10]:
    print(f"{feature_name}: {feature_score:.2f}")


Hospital Name: 2197.11
EXP(YRs): 262.42
Specialization: 164.43
City: 147.83
Doctors Link: 145.17
#Reviews: 114.45
Satisfaction Rate: 16.18
Wait Time(mins): 10.32
Avg Time to Patients(mins): 3.12


In [27]:
selected_features_df = features[['City', 'Specialization', 'EXP(YRs)', 'Hospital Name', '#Reviews', 'Doctors Link']]
selected_features_df['Fee Category'] = df['Fee Category']
selected_features_df[popular_degrees] = df[popular_degrees]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_features_df['Fee Category'] = df['Fee Category']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_features_df[popular_degrees] = df[popular_degrees]


In [28]:
X_ = selected_features_df.drop(['Fee Category'], axis=1).values
y_ = selected_features_df['Fee Category'].values
RS = RobustScaler()
X_ = RS.fit_transform(X_)

In [29]:
RANDOM_STATE = 42

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X_, y_, train_size=0.8, random_state=RANDOM_STATE)

In [31]:
# Assuming X_train, y_train are your features and labels for training data
smote = SMOTE(random_state=RANDOM_STATE)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [33]:
def evaluate_model(model):
    model.fit(X_train, y_train)
    
    train_preds = model.predict(X_train)
    train_accuracy = accuracy_score(y_train, train_preds)
    print('train accuracy: ' ,train_accuracy)
    print('#'*40)
    test_preds = model.predict(X_test)
    accuracy = accuracy_score(y_test, test_preds)
    precision = precision_score(y_test, test_preds, average='weighted')
    recall = recall_score(y_test, test_preds, average='weighted')
    f1 = f1_score(y_test, test_preds, average='weighted')
    print('test accuracy: ',accuracy)
    print('test precision: ',precision)
    print('test recall: ',recall)
    print('test f1 score', f1)

In [34]:
LR = LogisticRegression(C=0.5, penalty='l2', multi_class='multinomial', random_state=RANDOM_STATE)
evaluate_model(LR)

train accuracy:  0.8007202881152461
########################################
test accuracy:  0.7852631578947369
test precision:  0.7893859071263042
test recall:  0.7852631578947369
test f1 score 0.7802366889445246


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


low accuracy on logistic, hence the model is not linearly seperable
in other words, linear kernel for svm will also give poor accuracy

In [35]:
KNN = KNeighborsClassifier(n_neighbors=3)
evaluate_model(KNN)

train accuracy:  0.8871548619447779
########################################
test accuracy:  0.728421052631579
test precision:  0.7273935677964717
test recall:  0.728421052631579
test f1 score 0.7255703246101481


In [36]:
svc = SVC(kernel='linear', random_state=RANDOM_STATE)
evaluate_model(svc)

train accuracy:  0.8091236494597839
########################################
test accuracy:  0.7957894736842105
test precision:  0.7984559637788794
test recall:  0.7957894736842105
test f1 score 0.7937793520573476


In [37]:
svc_w = SVC(kernel='rbf', random_state=RANDOM_STATE)
evaluate_model(svc_w)

train accuracy:  0.4381752701080432
########################################
test accuracy:  0.35157894736842105
test precision:  0.49857841051018215
test recall:  0.35157894736842105
test f1 score 0.2856332791189124


In [38]:
svc = SVC(kernel='sigmoid', C=0.999, random_state=RANDOM_STATE)
evaluate_model(svc)

train accuracy:  0.4361744697879152
########################################
test accuracy:  0.3368421052631579
test precision:  0.4713458942752219
test recall:  0.3368421052631579
test f1 score 0.29057760448867526


In [39]:
for deg in range(6):
    svc = SVC(kernel='poly', degree=deg, C=0.8, random_state=RANDOM_STATE)
    evaluate_model(svc)

train accuracy:  0.3333333333333333
########################################
test accuracy:  0.2168421052631579
test precision:  0.047020498614958446
test recall:  0.2168421052631579
test f1 score 0.07728282644327081
train accuracy:  0.4305722288915566
########################################


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


test accuracy:  0.33263157894736844
test precision:  0.2964460119370591
test recall:  0.33263157894736844
test f1 score 0.24920999035266372
train accuracy:  0.39295718287314924
########################################
test accuracy:  0.47157894736842104
test precision:  0.422250870563829
test recall:  0.47157894736842104
test f1 score 0.3811256335565079


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


train accuracy:  0.41656662665066024
########################################
test accuracy:  0.32210526315789473
test precision:  0.5644894640371549
test recall:  0.32210526315789473
test f1 score 0.25781169079905003
train accuracy:  0.3933573429371749
########################################
test accuracy:  0.47578947368421054
test precision:  0.6913716873459185
test recall:  0.47578947368421054
test f1 score 0.37409997317356697
train accuracy:  0.40656262505002
########################################
test accuracy:  0.29473684210526313
test precision:  0.711382231873298
test recall:  0.29473684210526313
test f1 score 0.21411892298523597


In [40]:
RFC = RandomForestClassifier(max_depth=15, max_leaf_nodes=50,random_state=RANDOM_STATE, n_estimators=100)
evaluate_model(RFC)

train accuracy:  0.8731492597038816
########################################
test accuracy:  0.8189473684210526
test precision:  0.8239600618325362
test recall:  0.8189473684210526
test f1 score 0.8154607581847542


In [41]:

XG = GradientBoostingClassifier(max_depth=7,max_leaf_nodes=20,random_state=RANDOM_STATE, n_estimators=25)
evaluate_model(XG)

train accuracy:  0.8915566226490597
########################################
test accuracy:  0.8231578947368421
test precision:  0.8311397645805668
test recall:  0.8231578947368421
test f1 score 0.819508676594438


In [42]:
from sklearn.ensemble import ExtraTreesClassifier
EXT = ExtraTreesClassifier(random_state=RANDOM_STATE, max_depth=22)
evaluate_model(EXT)

train accuracy:  0.9615846338535414
########################################
test accuracy:  0.7957894736842105
test precision:  0.7981734332213242
test recall:  0.7957894736842105
test f1 score 0.7937578610619987


In [43]:
SGDC = SGDClassifier()
evaluate_model(SGDC)

train accuracy:  0.7426970788315326
########################################
test accuracy:  0.72
test precision:  0.7247576332706481
test recall:  0.72
test f1 score 0.7132341689372922


In [44]:
ADA = AdaBoostClassifier(estimator=XG, n_estimators=150, algorithm='SAMME.R', learning_rate=0.01, )
evaluate_model(ADA)



train accuracy:  0.9123649459783914
########################################
test accuracy:  0.8168421052631579
test precision:  0.8208214425108111
test recall:  0.8168421052631579
test f1 score 0.8145039096099713


In [45]:
from sklearn.ensemble import VotingClassifier


vr = VotingClassifier(
    estimators=[('XG', XG), ('EXT', EXT), ('RFR', RFC), ('SVC', svc_w), ('KNN', KNN)],
)

evaluate_model(vr)

train accuracy:  0.9079631852741097
########################################
test accuracy:  0.8168421052631579
test precision:  0.8278182034183116
test recall:  0.8168421052631579
test f1 score 0.8122049869418291


In [46]:
NB = GaussianNB(var_smoothing=0.0021)
evaluate_model(NB)

train accuracy:  0.4337735094037615
########################################
test accuracy:  0.4926315789473684
test precision:  0.5389984051036683
test recall:  0.4926315789473684
test f1 score 0.44598910382949636


In [47]:
import pickle as pkl


pkl.dump(XG, open("XG_classif.pkl", "wb"))
