## Importing Libraries 

In [1]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Importing the Dataset 

In [2]:
# Read the Sigma Cabs trip records from the CSV file that sits next to this notebook.
dataset = pd.read_csv(filepath_or_buffer='sigma_cabs.csv')
# Show the first five rows as the cell output for a quick sanity check.
dataset.head(n=5)

Unnamed: 0,Trip_ID,Trip_Distance,Type_of_Cab,Customer_Since_Months,Life_Style_Index,Confidence_Life_Style_Index,Destination_Type,Customer_Rating,Cancellation_Last_1Month,Var1,Var2,Var3,Gender,Surge_Pricing_Type
0,T0005689460,6.77,B,1.0,2.42769,A,A,3.905,0,40.0,46,60,Female,2
1,T0005689461,29.47,B,10.0,2.78245,B,A,3.45,0,38.0,56,78,Male,2
2,T0005689464,41.58,,10.0,,,E,3.50125,2,,56,77,Male,2
3,T0005689465,61.56,C,10.0,,,A,3.45375,0,,52,74,Male,3
4,T0005689467,54.95,C,10.0,3.03453,B,A,3.4025,4,51.0,49,102,Male,2


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
dataset.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Trip_Distance,Customer_Since_Months,Life_Style_Index,Customer_Rating,Cancellation_Last_1Month,Var1,Var2,Var3,Surge_Pricing_Type
count,131662.0,125742.0,111469.0,131662.0,131662.0,60632.0,131662.0,131662.0,131662.0
mean,44.200909,6.016661,2.802064,2.849458,0.782838,64.202698,51.2028,75.099019,2.155747
std,25.522882,3.626887,0.225796,0.980675,1.037559,21.820447,4.986142,11.578278,0.738164
min,0.31,0.0,1.59638,0.00125,0.0,30.0,40.0,52.0,1.0
25%,24.58,3.0,2.65473,2.1525,0.0,46.0,48.0,67.0,2.0
50%,38.2,6.0,2.79805,2.895,0.0,61.0,50.0,74.0,2.0
75%,60.73,10.0,2.94678,3.5825,1.0,80.0,54.0,82.0,3.0
max,109.23,10.0,4.87511,5.0,8.0,210.0,124.0,206.0,3.0


In [4]:
# Count the missing entries in every column.
dataset.isna().sum()

Trip_ID                            0
Trip_Distance                      0
Type_of_Cab                    20210
Customer_Since_Months           5920
Life_Style_Index               20193
Confidence_Life_Style_Index    20193
Destination_Type                   0
Customer_Rating                    0
Cancellation_Last_1Month           0
Var1                           71030
Var2                               0
Var3                               0
Gender                             0
Surge_Pricing_Type                 0
dtype: int64

## Taking Care of Null Values

In [5]:
# Fill every gap with a per-column statistic: the most frequent value (mode) for the
# categorical columns and the arithmetic mean for the numeric ones.
# NOTE(review): the statistics are taken over the full dataset, i.e. before the
# train/test split performed later in the notebook.
fill_values = {
    'Type_of_Cab': dataset['Type_of_Cab'].mode()[0],
    'Customer_Since_Months': dataset['Customer_Since_Months'].mean(),
    'Life_Style_Index': dataset['Life_Style_Index'].mean(),
    'Confidence_Life_Style_Index': dataset['Confidence_Life_Style_Index'].mode()[0],
    'Var1': dataset['Var1'].mean(),
}
# A dict passed to fillna is applied column by column; other columns are left as they are.
dataset = dataset.fillna(value=fill_values)


In [6]:
# Re-count the missing entries per column after the imputation step.
dataset.isna().sum()

Trip_ID                        0
Trip_Distance                  0
Type_of_Cab                    0
Customer_Since_Months          0
Life_Style_Index               0
Confidence_Life_Style_Index    0
Destination_Type               0
Customer_Rating                0
Cancellation_Last_1Month       0
Var1                           0
Var2                           0
Var3                           0
Gender                         0
Surge_Pricing_Type             0
dtype: int64

## Converting Categorical Values to Numeric Values

In [7]:
# Integer codes for each categorical column: the outer keys are column names and the
# inner dictionaries map every category label to the number that replaces it.
cat_to_num = {
    "Type_of_Cab": {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5},
    "Confidence_Life_Style_Index": {"A": 1, "B": 2, "C": 3},
    "Destination_Type": {
        "A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7,
        "H": 8, "I": 9, "J": 10, "K": 11, "L": 12, "M": 13, "N": 14,
    },
    "Gender": {"Male": 1, "Female": 2},
}

In [8]:
# Swap each category label for its integer code, column by column, using the nested mapping.
dataset = dataset.replace(to_replace=cat_to_num)

In [9]:
# Preview the first five rows after the categorical columns were encoded.
dataset.head(n=5)

Unnamed: 0,Trip_ID,Trip_Distance,Type_of_Cab,Customer_Since_Months,Life_Style_Index,Confidence_Life_Style_Index,Destination_Type,Customer_Rating,Cancellation_Last_1Month,Var1,Var2,Var3,Gender,Surge_Pricing_Type
0,T0005689460,6.77,2,1.0,2.42769,1,1,3.905,0,40.0,46,60,2,2
1,T0005689461,29.47,2,10.0,2.78245,2,1,3.45,0,38.0,56,78,1,2
2,T0005689464,41.58,2,10.0,2.802064,2,5,3.50125,2,64.202698,56,77,1,2
3,T0005689465,61.56,3,10.0,2.802064,2,1,3.45375,0,64.202698,52,74,1,3
4,T0005689467,54.95,3,10.0,3.03453,2,1,3.4025,4,51.0,49,102,1,2


# Creating the Model

## Splitting into Training and Testing Datasets

In [10]:
# Features: every column except the trip identifier and the target.
# Selecting by name instead of by position keeps the feature matrix correct even if
# the column order of the CSV changes.
x = dataset.drop(columns=['Trip_ID', 'Surge_Pricing_Type']).to_numpy()
# Target: the surge pricing class.
y = dataset['Surge_Pricing_Type'].to_numpy()
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(105329, 12) (26333, 12) (105329,) (26333,)


## Feature Scaling

In [11]:
# Standardise the features. StandardScaler is imported in the notebook's import cell.
sc_x = StandardScaler()
# Learn the scaling parameters from the training split only ...
x_train = sc_x.fit_transform(x_train)
# ... and reuse those same parameters for the test split.
x_test = sc_x.transform(x_test)

In [12]:
# LightGBM classifier with library-default hyperparameters.
classifier = lgb.LGBMClassifier()
# Train on the scaled training split; the fitted estimator is shown as the cell output.
classifier.fit(X=x_train, y=y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent='warn',
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

## Predicting on the Test Dataset

In [13]:
# Predict the surge pricing class for every row of the held-out test split.
y_pred = classifier.predict(X=x_test)

In [14]:
# y_pred already holds the test-set predictions from the previous cell, so the model
# is not queried a second time here.
# Print predicted (left column) and actual (right column) labels side by side.
print(np.column_stack((y_pred, y_test)))

[[2 1]
 [2 2]
 [1 1]
 ...
 [2 2]
 [1 1]
 [2 3]]


## Accuracy Metrics 

In [15]:
# Confusion matrix of the test-set predictions (rows: actual class, columns: predicted class).
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

[[2972 2006  502]
 [ 427 9488 1439]
 [ 417 3277 5805]]


In [16]:
# Fraction of test rows whose predicted class matches the actual class.
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.6936163748908214


### Accuracy Using K-Fold Cross-Validation

In [17]:
# 10-fold cross-validation of the classifier on the training split.
# cross_val_score is imported in the notebook's import cell.
accuracies = cross_val_score(estimator=classifier, X=x_train, y=y_train, cv=10)
# ".2f" prints two decimal places; the former "2f" only set a minimum field width,
# so the values were printed with the default six decimals.
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 69.273416 %
Standard Deviation: 0.631443 %
