In [40]:
# Importing all Libraries
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")


In [41]:
# Loading the train and test dataset in dataframe using python
train = pd.read_excel("doctor_fee_train.xlsx")

test = pd.read_excel("doctor_fee_test.xlsx")

In [42]:
df = pd.DataFrame(data=train)

In [43]:
# columns of dataframe
df.columns

Index(['Qualification', 'Experience', 'Rating', 'Place', 'Profile',
       'Miscellaneous_Info', 'Fees'],
      dtype='object')

In [44]:
# data types
df.dtypes

Qualification         object
Experience            object
Rating                object
Place                 object
Profile               object
Miscellaneous_Info    object
Fees                   int64
dtype: object

In [45]:
print('Qualification:', df['Qualification'].nunique())
print('Experience:', df['Experience'].nunique())
print('Rating:', df['Rating'].nunique())
print('Place:', df['Place'].nunique())
print('Profile', df['Profile'].nunique())

Qualification: 1420
Experience: 64
Rating: 51
Place: 877
Profile 6


In [46]:
# Extract years of experience
df["Experience"] = df["Experience"].str.split()
df["Experience"] = df["Experience"].str[0].astype("int")

In [47]:
# Extract cities
df["Place"].fillna("Unknown,Unknown",inplace=True)
df["Place"] = df["Place"].str.split(",")
df["City"] = df["Place"].str[-1]
df["Place"] = df["Place"].str[0]

In [48]:
# Seperate Ratings into bins
df["Rating"].fillna("-99%",inplace=True)
df["Rating"] = df["Rating"].str[:-1].astype("int")
bins = [-99,0,10,20,30,40,50,60,70,80,90,100]
labels = [i for i in range(11)]
df["Rating"] = pd.cut(df["Rating"],bins=bins,labels=labels,include_lowest=True)

In [49]:
# Extract relevant qualification
df["Qualification"]=df["Qualification"].str.split(",")
Qualification ={}
for x in df["Qualification"].values:
    for each in x:
        each = each.strip()
        if each in Qualification:
            Qualification[each]+=1
        else:
            Qualification[each]=1

In [50]:
#Identifying  the top 10 qualification that occurs the most as This is the problem of non-standardized data entry
# or data collection.For example, there were entries of ‘MBA -Healthcare’ and ‘MBA’ which I think referred to the 
#same qualification.

In [51]:
most_qua = sorted(Qualification.items(),key=lambda x:x[1],reverse=True)[:10]
final_qua =[]
for tup in most_qua:
    final_qua.append(tup[0])
for title in final_qua:
    df[title]=0
    
for x,y in zip(df["Qualification"].values,np.array([idx for idx in range(len(df))])):
    for q in x:
        q = q.strip()
        if q in final_qua:
            df[q][y] = 1
df.drop("Qualification",axis=1,inplace=True)

The final result is dummies variables for the 10 highest frequency qualification in the dataset.
Now for the ‘Profile’ column we do not have any missing value in this column. 
That is actually pretty neat. Since the whole column only consists of 6 classes, oneHotEncoding the column should do the trick. Before that, a 

quick check on the ‘City’ column we created showed that it also contains a small number of classes (10). However, something weird popped up.

There is an ‘e’ entry out of nowhere and I guessed it should be a mistake (wrong entry). I found that the problem occurred in row 3980 and I changed the 
‘City’ and ‘Place’ columns for that row to ‘unknown’ instead.

In [52]:
df["City"][3980] = "Unknown"
df["Place"][3980] = "Unknown"

In [53]:
# Get dummies
df = pd.get_dummies(df,columns=["City","Profile"],prefix=["City","Profile"])

Taking into account the high percentage of missing values, and the fact that I could not find any 
relevance of the column , I decided to forgo the column and just drop it. 

In [54]:
df.drop("Miscellaneous_Info",axis=1,inplace=True)

In [55]:
X = df.drop("Fees",axis=1)
y = df["Fees"]
# Encoding
enc = OrdinalEncoder()
X = enc.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)
# feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

In [56]:
# support vector machine 
from sklearn.svm import SVR
m = SVR(gamma="scale")
m.fit(scaler.transform(X_train),y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [57]:
def score(y_pred,y):
    y_pred = np.log(y_pred)
    y = np.log(y)
    return 1 - ((np.sum((y_pred-y)**2))/len(y))**1/2
# Prediction
y_pred = m.predict(scaler.transform(X_test))
score(y_pred,y_test)

0.7622484158735299

In [58]:
# Define own scorer
scorer = make_scorer(score,greater_is_better=True)
# Hyperparameter tunning
parameters = {"C":[0.1,1,10],"kernel":["linear","rbf","poly"]}
reg = GridSearchCV(m,param_grid=parameters,scoring=scorer,n_jobs=-1,cv=5)
reg.fit(X_train,y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='scale', kernel='rbf',
                           max_iter=-1, shrinking=True, tol=0.001,
                           verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.1, 1, 10],
                         'kernel': ['linear', 'rbf', 'poly']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=make_scorer(score), verbose=0)

In [59]:
y_pred_tuned = reg.predict(scaler.transform(X_test))
score(y_pred_tuned,y_test)

0.8020369744175728

In [61]:
## Using GridSearchCV to find out the best parameters of the KNeighborsRegressor model
neighbors = {"n_neighbors":range(1,30)}
knr=KNeighborsRegressor()
gknr = GridSearchCV(knr,neighbors,cv=10)
gknr.fit(scaler.transform(X_train),y_train)
gknr.best_params_

{'n_neighbors': 29}

In [64]:
knr=KNeighborsRegressor(n_neighbors=29)
knr.fit(scaler.transform(X_train),y_train)
knr.score(X_train,y_train)
pred_y=knr.predict(X_test)

In [68]:
knrscore= knr.score(X_train,y_train)
print(knrscore)

0.09512299355205078


In [69]:
from sklearn.externals import joblib

In [72]:
#Save the model as a pickle in a file
joblib.dump(m,"mdoctor.pkl")

['mdoctor.pkl']

In [73]:
model=joblib.load("mdoctor.pkl")