In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re 
import pickle
import sklearn
import statsmodels.api as sm

from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

## Functions

In [None]:
def convert(item, map_dict):
    """Return the value stored for ``item`` in ``map_dict``.

    Falls back to the string "unknown" when ``item`` is not a key.
    """
    default_label = "unknown"
    mapped = map_dict.get(item, default_label)
    return mapped

In [None]:
# IQR Function with quantiles (25, 75) 
def IQR(column_name, df, factor=1.5):
    """Drop, in place, the rows of ``df`` whose ``column_name`` value is an IQR outlier.

    A value is an outlier when it lies below ``q1 - factor * iqr`` or above
    ``q3 + factor * iqr``, where ``q1``/``q3`` are the 25th/75th percentiles of
    the column and ``iqr = q3 - q1``.

    Parameters
    ----------
    column_name : label of the column to inspect.
    df : pandas.DataFrame, modified in place (rows are removed; the index is
        NOT reset).
    factor : width of the fences in IQR units (default 1.5).

    Returns
    -------
    The same ``df`` object, to allow chaining. Existing callers that ignore the
    return value keep working.
    """
    values = df[column_name]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)

    # Named `iqr` (not `IQR`) so the local no longer shadows this function.
    iqr = q3 - q1
    lower = q1 - factor * iqr
    upper = q3 + factor * iqr

    # Missing values compare False on both sides and are therefore kept.
    is_outlier = (values > upper) | (values < lower)
    df.drop(index=df.index[is_outlier], inplace=True)
    return df


In [None]:
def skew_kurtoisis(column):
    """Print a one-row table holding the skewness and kurtosis of ``column``.

    ``column`` must expose ``.skew()`` and ``.kurtosis()`` (e.g. a pandas
    Series). Nothing is returned; the table is written to stdout.
    """
    skewness = column.skew()
    kurt = column.kurtosis()
    summary = pd.DataFrame({"Skew": [skewness], "kurtoisis": [kurt]})
    print(summary)

## Reading Data

In [None]:
# Read the bookings CSV through a context manager so the file handle is closed
# as soon as parsing finishes (the bare open(...) call left it open).
with open("Customer_Cancelation.csv", "r") as csv_file:
    data = pd.read_csv(csv_file)
data

In [None]:
# Column dtypes and non-null counts of the freshly loaded frame
data.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the frame
data.describe()

## Check Duplicates & NaNs

In [None]:
# Number of missing values per column
data.isna().sum()

In [None]:
# Number of rows that exactly repeat an earlier row
data.duplicated().sum()

## Preprocessing

In [None]:
# Column labels; later cells address 'average price ' with a trailing space,
# so check the exact spelling here.
data.columns

In [None]:
# The booking identifier is removed in place before any modelling
del data["Booking_ID"]
data.head()

#### Encoding for Visualization

In [None]:
# Integer codes for the meal-plan labels: "Not Selected" -> 0, "Meal Plan k" -> k.
meal = {"Not Selected": 0}
meal.update({f"Meal Plan {plan}": plan for plan in (1, 2, 3)})

# Series.map leaves NaN for any label that is not a key of the dict.
data["type of meal"] = data["type of meal"].map(meal)

In [None]:
# Integer codes for the room labels: "Room_Type k" -> k for k = 1..7.
room = {f"Room_Type {number}": number for number in range(1, 8)}

# Series.map leaves NaN for any label that is not a key of the dict.
data["room type"] = data["room type"].map(room)

In [None]:
# Binary target encoding: kept bookings -> 0, cancelled bookings -> 1.
booking = dict(Not_Canceled=0, Canceled=1)

# Series.map leaves NaN for any label that is not a key of the dict.
data["booking status"] = data["booking status"].map(booking)

In [None]:
# Integer codes for the market segments, numbered in the order listed (0..4).
market = {
    segment: code
    for code, segment in enumerate(
        ["Offline", "Online", "Corporate", "Aviation", "Complementary"]
    )
}
# Series.map leaves NaN for any label that is not a key of the dict.
data["market segment type"] = data["market segment type"].map(market)

In [None]:
# Preview the first rows after the categorical columns were mapped to integers
data.head()

#### Handling Features

In [None]:
# Split the "date of reservation" strings on "/" into three new columns; a value
# with fewer than three parts gets missing values in the trailing columns.
# NOTE(review): assumes the source format is month/day/year — confirm against the CSV.
data[['month', 'day', 'year']] = data["date of reservation"].str.split("/", expand = True)

In [None]:
# Dates that did not split on "/" (the original note counts 37 rows written
# with "-") have no "day" part: drop those rows so every remaining row has a
# full month/day/year, then discard the raw date column.
data.drop(index=data.index[data["day"].isna()], inplace=True)
data.drop(columns="date of reservation", inplace=True)

In [None]:
# Re-format: the split date parts become integers and the lead time becomes float
data = data.astype(
    {"month": "int64", "day": "int64", "year": "int64", "lead time": "float64"}
)


In [None]:
# Re-check dtypes and non-null counts after the date split and the casts
data.info()

#### All Features

In [None]:
# Every column except the target
features = data.loc[:, data.columns != "booking status"]
features

## Continuous features

In [None]:
# The two continuous columns, kept in the order they appear in `data`.
# This is a separate frame built from `data` as it is at this point.
con_features = data.loc[
    :, [name in ("lead time", "average price ") for name in data.columns]
]
con_features

## Discrete Features

In [None]:
# Everything that is neither a continuous column nor the target
disc_features = data.loc[
    :,
    [
        name not in ("lead time", "average price ", "booking status")
        for name in data.columns
    ],
]
disc_features

#### Handling Continuous Features

In [None]:
# Box plot of the first continuous column
sns.boxplot(x=data[con_features.columns[0]])

In [None]:
# Box plot of the second continuous column
sns.boxplot(x=data[con_features.columns[1]])

In [None]:
# Remove IQR outliers one continuous column at a time (each pass only sees the
# rows left by the previous one), then renumber the surviving rows from 0.
for col in con_features.columns:
    IQR(col, data)

data.reset_index(drop=True, inplace=True)

In [None]:
# First continuous column again, now that the outlier rows are gone
sns.boxplot(x=data[con_features.columns[0]])

In [None]:
# Second continuous column again, now that the outlier rows are gone
sns.boxplot(x=data[con_features.columns[1]])

## Final Features

In [None]:
# Model inputs: every column except the target and "P-C"
features = data.loc[
    :, [name not in ("booking status", "P-C") for name in data.columns]
]
features

In [None]:
# Dtypes and non-null counts of the final feature matrix
features.info()

## Target

In [None]:
# Keep the target as a one-column DataFrame (double brackets), not a Series
target = data[["booking status"]]
target

## EDA

### Univariate Analysis

In [None]:
# Class balance of the target (0 = not cancelled, 1 = cancelled)
sns.countplot(data=target, x="booking status")
plt.show()

###### ----lead time

In [None]:
# Distribution of lead time in the final feature frame, with a KDE overlay
sns.histplot(data=features, x="lead time", bins=50, kde=True)
plt.show()

In [None]:
# Skewness / kurtosis of the first continuous column.
# NOTE(review): `con_features` was built before the IQR outlier rows were dropped
# from `data`, so these statistics appear to describe the pre-cleaning values,
# while the histogram above uses `features` (post-cleaning) — confirm which is intended.
skew_kurtoisis(con_features.iloc[:,0])

**Note:** the lead time column is right-skewed (skewness ≈ 1.3) with low kurtosis (≈ 1.2).

###### -----average price

In [None]:
# Distribution of the average price in the final feature frame, with a KDE overlay
sns.histplot(data=features, x="average price ", bins=50, kde=True)
plt.show()

In [None]:
# Skewness / kurtosis of the second continuous column.
# NOTE(review): `con_features` was built before the IQR outlier rows were dropped
# from `data`, so these statistics appear to describe the pre-cleaning values,
# while the histogram above uses `features` (post-cleaning) — confirm which is intended.
skew_kurtoisis(con_features.iloc[:,1])

**Note:** the average price column is slightly right-skewed (skewness ≈ 0.7) with high kurtosis (≈ 3.15).

## Splitting & Scaling

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

In [None]:
# The frames returned by train_test_split are selections of `features`; assigning
# columns into them can raise pandas' SettingWithCopyWarning. Work on explicit
# copies so the writes below clearly target independent frames.
X_train = X_train.copy()
X_test = X_test.copy()

ss = StandardScaler()
# Fit the scaler on the training split only and reuse its statistics for the
# test split, so no information from the test rows leaks into the scaling.
X_train[con_features.columns] = ss.fit_transform(X_train[con_features.columns])
X_test[con_features.columns] = ss.transform(X_test[con_features.columns])
X_train

## Random Forest 

In [None]:
# Random forest with fixed hyper-parameters; the out-of-bag score is computed
# during fit and all CPU cores are used.
RF = RandomForestClassifier(
    n_estimators=326,
    max_depth=22,
    oob_score=True,
    n_jobs=-1,
    random_state=0,
)
# fit expects a 1-D label array, so flatten the one-column target frame
RF.fit(X_train, y_train.to_numpy().ravel())

#### Threshold = .5

In [None]:
# Label a row 1 when the probability in column 1 of predict_proba exceeds 0.5
rf_prediction = np.where(RF.predict_proba(X_test)[:, 1] > 0.5, 1, 0)
accuracy_score(y_test, rf_prediction)

In [None]:
plt.figure(figsize=(10, 8))
# Confusion-matrix cells are integer counts, so annotate them as integers
# rather than as floats with two decimals.
sns.heatmap(confusion_matrix(y_test, rf_prediction), annot=True, fmt="d")

plt.xlabel("Prediction")
plt.ylabel("True")

plt.show()

In [None]:
# Per-class precision, recall and F1 for the current thresholded predictions
print(classification_report(y_test, rf_prediction))

In [None]:
# ROC and AUC must be computed from continuous scores. Feeding the thresholded
# 0/1 labels collapses the curve to a single operating point and turns the "AUC"
# into balanced accuracy. Use the predicted probability of class 1 instead; the
# resulting curve and AUC do not depend on the decision threshold chosen above.
rf_scores = RF.predict_proba(X_test)[:, 1]
false_positive_rate, true_positive_rate, threshold = roc_curve(y_test, rf_scores)
print('roc_auc_score for Random Forest: ', roc_auc_score(y_test, rf_scores))

In [None]:
plt.title('Receiver Operating Characteristic - Random Forest threshold = .5')
plt.plot(false_positive_rate, true_positive_rate)
plt.plot([0, 1], [0, 1], ls="--")  # chance diagonal
plt.plot([0, 0], [1, 0], c=".7")   # grey left edge of the unit square
plt.plot([0, 1], [1, 1], c=".7")   # grey top edge of the unit square
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

#### Threshold = .265

In [None]:
# Label a row 1 when the probability in column 1 of predict_proba exceeds 0.265
rf_prediction = np.where(RF.predict_proba(X_test)[:, 1] > 0.265, 1, 0)
accuracy_score(y_test, rf_prediction)

In [None]:
plt.figure(figsize=(10, 8))
# Confusion-matrix cells are integer counts, so annotate them as integers
# rather than as floats with two decimals.
sns.heatmap(confusion_matrix(y_test, rf_prediction), annot=True, fmt="d")

plt.xlabel("Prediction")
plt.ylabel("True")

plt.show()

In [None]:
# Per-class precision, recall and F1 for the current thresholded predictions
print(classification_report(y_test, rf_prediction))

In [None]:
# ROC and AUC must be computed from continuous scores. Feeding the thresholded
# 0/1 labels collapses the curve to a single operating point and turns the "AUC"
# into balanced accuracy. Use the predicted probability of class 1 instead; the
# resulting curve and AUC do not depend on the decision threshold chosen above.
rf_scores = RF.predict_proba(X_test)[:, 1]
false_positive_rate, true_positive_rate, threshold = roc_curve(y_test, rf_scores)
print('roc_auc_score for Random Forest: ', roc_auc_score(y_test, rf_scores))

In [None]:
plt.title('Receiver Operating Characteristic - Random Forest threshold = .265')
plt.plot(false_positive_rate, true_positive_rate)
plt.plot([0, 1], [0, 1], ls="--")  # chance diagonal
plt.plot([0, 0], [1, 0], c=".7")   # grey left edge of the unit square
plt.plot([0, 1], [1, 1], c=".7")   # grey top edge of the unit square
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

#### Tuning

In [None]:
# param_grid = {"n_estimators" : [400, 500, 600, 700 ],
#               "max_depth": [30, 40, 50]
#              }

# g_cv = GridSearchCV(RF, param_grid=param_grid, scoring="accuracy", cv= 5, verbose= 6, n_jobs=-1)
# g_cv.fit(X_train, np.ravel(y_train))

# g_cv.best_params_

## Save Model

In [None]:
# import joblib
# joblib.dump(RF, 'RF.pkl')

In [None]:
# Feature values of the fourth-from-last row of the test split
X_test.iloc[-4, :]

In [None]:
# True label of that same fourth-from-last test row
y_test.iloc[-4]