## Part A

In [13]:
import warnings
warnings.filterwarnings("ignore")

In [14]:
import numpy as np # linear algebra
import pandas as pd #data pre-processing
import matplotlib #data visualization 
import matplotlib.pyplot as plt #data visualization 
import seaborn as sns #data visualization 
import missingno as msno #Missing value interpretation
import shap

from IPython.display import display



from sklearn.model_selection import KFold,RepeatedStratifiedKFold, cross_val_score, GridSearchCV #Cross validation
from sklearn.model_selection import train_test_split #train_test_split splits dataset to training and test data and able to randomize the data

#feature scaling and label encoding
from sklearn.preprocessing import StandardScaler, MinMaxScaler,LabelEncoder 

#Evaluation metrics
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import log_loss 
from sklearn.metrics import roc_auc_score


#Pipeline to assemble several steps that can be cross-validated together while setting different parameters
from sklearn.pipeline import Pipeline

#Set of different classifiers for the analysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier,Pool
import lightgbm as lgb
from sklearn import svm


#data imputation methods
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from collections import Counter



## Predictor class that compares different classifiers and finds the best classifier for the given dataset.

In [22]:
class predictor:
    """Grid-search a set of classifiers, report the winner and predict the test set.

    The frames handled by this class must contain a ``Class`` target column; the
    ``'missing'`` imputation strategy additionally expects an ``F21`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Training data. A deep copy is stored, so the caller's frame is untouched.
    imp : str
        Imputation strategy: ``'missing'`` (drop ``F21``), ``'mean'``,
        ``'iterative'`` or ``'knn'``. Any other value leaves the features as-is.
    strategy : str
        Feature scaling strategy: ``'standard'`` or ``'minmax'``.
    """

    data = pd.DataFrame()
    imp = ''
    scalingStrategy = ''
    pred = ''

    # Pre-processing state learnt from the training data. It is reused when the
    # test set is transformed so both sets go through the same transformation.
    _fillValues = None   # per-column means used by the 'mean' strategy
    _imputer = None      # fitted IterativeImputer / KNNImputer
    _scaler = None       # fitted StandardScaler / MinMaxScaler

    # Short axis labels for the comparison plot, keyed by estimator class name.
    _CLASSIFIER_LABELS = {
        'CatBoostClassifier': 'CatBoost',
        'XGBClassifier': 'XGB',
        'RandomForestClassifier': 'RandomForest',
        'DecisionTreeClassifier': 'DT',
        'LGBMClassifier': 'LGBM',
        'SVC': 'LSVM',
    }

    def __init__(self, data, imp='missing', strategy='standard'):
        # Work on a private copy so the caller's frame is never modified.
        self.data = data.copy(deep=True)
        self.imp = imp
        self.scalingStrategy = strategy

    def visualizeResults(self, best_model):
        """Plot the mean cross-validation score of every grid-search candidate.

        Parameters
        ----------
        best_model : fitted GridSearchCV
            Its ``cv_results_`` must contain a ``param_classifier`` column.
        """
        # Best-ranked candidates first.
        result = pd.DataFrame(best_model.cv_results_).sort_values('rank_test_score')

        # Fall back to the raw class name for estimators without a short label so
        # the label list always has one entry per candidate.
        labels = []
        for estimator in result['param_classifier']:
            class_name = type(estimator).__name__
            labels.append(self._CLASSIFIER_LABELS.get(class_name, class_name))

        data_plot = pd.DataFrame({"Classifier": labels,
                                  "Results": result['mean_test_score'].to_numpy()})
        sns.lineplot(x="Classifier", y="Results", data=data_plot)
        plt.title('Classifiers comparative analysis')
        plt.show()

    def immputation(self, data, fit=True):
        """Split ``data`` into features/target, impute the features, then scale them.

        Parameters
        ----------
        data : pandas.DataFrame
            Frame with a ``Class`` column. It is not modified.
        fit : bool, default True
            When True the imputation statistics and the scaler are learnt from
            ``data``. When False the previously learnt ones are reused; if
            nothing has been learnt yet they are learnt from ``data``.

        Returns
        -------
        tuple
            ``(scaled_features, encoded_target)`` as returned by ``featureScaling``.
        """
        # Build the mask from the frame being processed, not from the training frame.
        feature = data.loc[:, data.columns != 'Class']
        target = data.loc[:, 'Class']

        if self.imp == 'missing':
            # Drop the column instead of imputing it (returns a new frame).
            feature = feature.drop(columns=['F21'])
        elif self.imp == 'mean':
            # Replace missing values with the mean of their own column.
            if fit or self._fillValues is None:
                self._fillValues = feature.mean(numeric_only=True)
            feature = feature.fillna(self._fillValues)
        elif self.imp in ('iterative', 'knn'):
            if fit or self._imputer is None:
                if self.imp == 'iterative':
                    # Models each feature with missing values as a function of the others.
                    imputer = IterativeImputer()
                else:
                    # Mean of the 3 nearest neighbours.
                    imputer = KNNImputer(n_neighbors=3)
                self._imputer = imputer.fit(feature)
            feature = self._imputer.transform(feature)
        # Any other strategy name leaves the features untouched.

        return self.featureScaling(feature, target, fit=fit)

    def featureScaling(self, feature, target, fit=True):
        """Scale the features and label-encode the target.

        Parameters
        ----------
        feature : array-like
            Feature matrix.
        target : array-like
            Target values; encoded with a fresh ``LabelEncoder``.
        fit : bool, default True
            Learn the scaler from ``feature`` (True) or reuse the stored one (False).

        Returns
        -------
        tuple
            ``(scaled_features, encoded_target)``.

        Raises
        ------
        ValueError
            If a scaler has to be fitted and ``scalingStrategy`` is neither
            ``'minmax'`` nor ``'standard'``.
        """
        if fit or self._scaler is None:
            if self.scalingStrategy == 'minmax':
                scaler = MinMaxScaler()      # scale each feature to a fixed range
            elif self.scalingStrategy == 'standard':
                scaler = StandardScaler()    # zero mean, unit variance
            else:
                raise ValueError(
                    "Unknown scaling strategy %r; expected 'standard' or 'minmax'"
                    % (self.scalingStrategy,))
            self._scaler = scaler.fit(feature)

        feature_scaled = self._scaler.transform(feature)
        target = LabelEncoder().fit_transform(target)
        return (feature_scaled, target)

    def visualizeResult(self, result, best_model):
        """Plot and print how many samples were predicted for each class.

        Parameters
        ----------
        result : array-like
            Predicted (label-encoded) classes.
        best_model : object
            Unused; kept for interface compatibility.
        """
        counts = Counter(pd.Series(result))
        # Sort by class value so the bar order and colours are deterministic.
        df = pd.DataFrame(sorted(counts.items()), columns=['Target', 'Count'])
        df.plot.bar(x='Target', y='Count', rot=0, color=['#C3553A', '#76A3B1'])

        # NOTE(review): assumes encoded class 1 (Class == True) means "profitable"
        # -- confirm against the data description.
        print('\n')
        print('Number of profitable hotels : ', counts.get(1, 0))
        print('Number of hotels that may not be profitable : ', counts.get(0, 0))
        print('\n')

    def predict(self, best_model, test_path='CE802_P2_Test.csv'):
        """Predict the classes of the test file and store them in ``self.pred``.

        Parameters
        ----------
        best_model : fitted estimator
            Model used for the prediction.
        test_path : str, default 'CE802_P2_Test.csv'
            CSV file holding the test set.
        """
        test = pd.read_csv(test_path)
        # Reuse the training-set imputation/scaling so the model sees features on
        # the same scale it was trained on.
        X_test = self.immputation(test, fit=False)[0]
        y_pred = best_model.predict(X_test)
        self.pred = y_pred
        self.visualizeResult(y_pred, best_model)

    @staticmethod
    def _paramGrids():
        """Return the hyper-parameter grid of every supported classifier key."""
        return {
            'catboost': {
                "classifier": [CatBoostClassifier(verbose=False)],
                'classifier__learning_rate': [0.01, 0.05, 0.10, 0.15, 0.20, 0.25, 0.3],
                'classifier__iterations': [10, 100, 500, 750, 1000],
                # CatBoost rejects depths above 16, so deeper candidates could only fail.
                "classifier__max_depth": [3, 5, 7, 9, 10, 15],
            },
            'xgb': {
                "classifier": [XGBClassifier(random_state=42, learning_rate=0.05,
                                             max_depth=6, eval_metric='mlogloss',
                                             min_child_weight=1, gamma=0.0,
                                             colsample_bytree=0.7)],
            },
            'randomforest': {
                "classifier": [RandomForestClassifier()],
                'classifier__max_depth': (3, 5, 7, 9, 10, 15, 20, 25),
                "classifier__n_estimators": [10, 100, 1000],
                "classifier__bootstrap": [True, False],
            },
            'dt': {
                "classifier": [DecisionTreeClassifier()],
                'classifier__max_depth': (3, 5, 7, 9, 10, 15, 20, 25),
                'classifier__criterion': ('gini', 'entropy'),
                'classifier__min_samples_split': (2, 4, 6),
            },
            'svm': {
                "classifier": [svm.SVC()],
                'classifier__C': [0.1, 1, 10, 100, 1000],
                'classifier__gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                'classifier__kernel': ['linear', 'rbf'],
            },
            'lgbm': {
                "classifier": [lgb.LGBMClassifier()],
                "classifier__learning_rate": [0.01, 0.05, 0.10, 0.15, 0.20, 0.25, 0.3],
                "classifier__objective": ["binary"],
                # No space after the double underscore, otherwise the real
                # bagging_freq parameter is never set.
                "classifier__bagging_freq": [5, 8, 10],
                'classifier__max_depth': [3, 5, 7, 9, 10, 15, 20, 25],
                'classifier__min_data_in_leaf': [90, 120],
                'classifier__num_leaves': [1200, 1550],
            },
        }

    def _reportHoldout(self, best_model, X_test, y_test):
        """Print and plot the hold-out metrics of the fitted grid search."""
        print("The mean accuracy of the model is:", best_model.score(X_test, y_test))
        print('\n')

        y_pred = best_model.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        # fmt='d' shows the cells as plain integer counts.
        ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        ax.set_title('Confusion Matrix \n\n')
        ax.set_xlabel('\nPredicted Values')
        ax.set_ylabel('Actual Values ')
        # Tick labels - list must be in alphabetical order
        ax.xaxis.set_ticklabels(['False', 'True'])
        ax.yaxis.set_ticklabels(['False', 'True'])
        print('\n')
        plt.show()

        print(classification_report(y_test, y_pred))
        if hasattr(best_model, 'predict_proba'):
            # Log loss is defined on class probabilities, not on hard labels.
            proba = best_model.predict_proba(X_test)
            print('log loss : ', log_loss(y_test, proba))
            positive_scores = proba[:, 1]
        else:
            # e.g. SVC without probability=True: rank with the decision function.
            print('log loss : ', 'not available (best model has no predict_proba)')
            positive_scores = best_model.decision_function(X_test)
        print('\n')
        print('roc_auc_score :', roc_auc_score(y_test, positive_scores))
        print('\n')

    def _plotFeatureImportance(self, best_model):
        """Plot the feature importances when the winning classifier is CatBoost."""
        final_estimator = best_model.best_estimator_.named_steps['classifier']
        if type(final_estimator).__name__ != 'CatBoostClassifier':
            return
        df_feature_importance = pd.DataFrame(
            final_estimator.get_feature_importance(prettified=True))
        plt.figure(figsize=(12, 6))
        sns.barplot(x="Importances", y="Feature Id", data=df_feature_importance,
                    palette="cool")
        plt.title('Feature importance')
        plt.show()

    def modelFinder(self, clfs):
        """Grid-search the requested classifiers, report the best one and predict.

        Parameters
        ----------
        clfs : iterable of str
            Keys of the classifiers to compare: ``'catboost'``, ``'xgb'``,
            ``'randomforest'``, ``'dt'``, ``'svm'``, ``'lgbm'``.

        Raises
        ------
        KeyError
            If ``clfs`` contains an unknown classifier key.
        """
        # NOTE(review): imputation and scaling are learnt on the full training
        # frame before the hold-out split, so the hold-out scores are slightly
        # optimistic.
        feature, target = self.immputation(self.data)

        # 80% for training, 20% held out for the final report.
        X_train, X_test, y_train, y_test = train_test_split(
            feature, target, test_size=0.2, random_state=0)

        # The placeholder classifier is replaced by each grid's "classifier" entry.
        pipe = Pipeline([("classifier", RandomForestClassifier())])
        grids = self._paramGrids()
        grid_param = [grids[name] for name in clfs]

        gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0, n_jobs=-1)
        best_model = gridsearch.fit(X_train, y_train)

        self._reportHoldout(best_model, X_test, y_test)
        self.visualizeResults(best_model)
        self.predict(best_model)
        print('\n')
        self._plotFeatureImportance(best_model)

## Main class of the machine learning system, which includes separate functions for each stage of data pre-processing

In [23]:
import missingno as msno 
import seaborn as sns

class Mlsystem:
    """Entry point of the analysis: data exploration, feature checks and model selection.

    Parameters
    ----------
    train_path : str, default 'CE802_P2_Data.csv'
        CSV file with the training data (must contain ``Class`` and ``F21``
        columns for the exploration steps).
    test_path : str, default 'CE802_P2_Test.csv'
        CSV file with the test data.
    """

    df = ''
    df_test = pd.DataFrame()
    df_org = ''
    predicted = ''

    # Horizontal rule printed between the exploration sections.
    _RULE = '\n-------------------------------------------------------------------------------------------------------------\n'

    def __init__(self, train_path='CE802_P2_Data.csv', test_path='CE802_P2_Test.csv'):
        # Read both the train and the test set for the experiment.
        self.df = pd.read_csv(train_path)
        self.df_test = pd.read_csv(test_path)
        # Untouched copy of the training data.
        self.df_org = self.df.copy(deep=True)

    def featureEngineering(self):
        """Print and plot an overview of the training data.

        Covers dtypes, shape, summary statistics, target balance, missing values
        and per-feature histograms.
        """
        display(self.df.head())

        print('Columns Data type and Null Counter \n')
        # info() prints its report itself and returns None, so it is not wrapped in print().
        self.df.info()
        print(self._RULE)

        print('Rows * columns of training set : ', self.df.shape)
        print(self._RULE)

        print('Description of given data \n')
        t = self.df.describe().apply(lambda s: s.apply(lambda x: format(x, 'f'))).transpose()
        display(t.head().T)
        print(self._RULE)

        # Distinct target values (two values -> binary classification).
        print('Target balance check    : ', self.df.Class.unique(), '\n')
        sns.set(style='whitegrid', palette="deep", font_scale=1.1, rc={"figure.figsize": [8, 5]})
        # One bar per class; replaces the deprecated distplot histogram.
        fig, ax = plt.subplots(figsize=(8, 5))
        sns.countplot(x=self.df['Class'], ax=ax)
        ax.set(xlabel='Target/Class', ylabel='Count')
        plt.show()
        print(self._RULE)

        # isna() and isnull() are aliases: both lines report whether any value is missing.
        print('Missing value indicator : ', self.df.isna().any().any())
        print(self._RULE)
        print('Null value indicator    : ', self.df.isnull().values.any())
        print(self._RULE)

        # Share of missing F21 values, computed once and reported twice below.
        F21NullValuesPercentage = (self.df['F21'].isnull().sum() / self.df.shape[0] * 100).round(2)

        print('Missing value identification using bar chart \n')
        msno.bar(self.df)
        plt.show()
        print('Above bar chart shows that', F21NullValuesPercentage, '% of the F21 values are missing')
        print(self._RULE)

        print('Check Data spread using Histograms \n')
        self.df.hist(bins=30, figsize=(20, 15))
        plt.show()
        print(self._RULE)

        # Heatmap of the null mask: where in each column the values are missing.
        print('Missing value spread accross the column \n')
        sns.heatmap(self.df.isnull(), yticklabels=False, cbar=False, cmap="viridis")
        plt.show()
        print(self._RULE)

        print('F21 Null value percentage :', F21NullValuesPercentage, '%')
        print(self._RULE)

    def featureSelection(self):
        """Show the correlation heatmap and a box plot of the training data."""
        start = "\033[1m"     # ANSI escape: bold on
        end = "\033[0;0m"     # ANSI escape: reset

        print(start + 'Correlation matrix heatmap' + end + '\n')
        # Restrict to numeric/boolean columns so non-numeric columns cannot break corr().
        corr = self.df.select_dtypes(include=['number', 'bool']).corr()
        fig, ax = plt.subplots(figsize=(20, 20))
        sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns, annot=True,
                    cmap=sns.diverging_palette(220, 20, as_cmap=True), ax=ax)
        plt.show()

        # Outlier detection using box plot
        print('\n')
        print(start + 'Outlier detection using Box plot' + end + '\n')
        fig, ax = plt.subplots(figsize=(15, 5))
        self.df.boxplot(ax=ax)
        plt.show()

    def modelSelection(self, models, imp='missing', scalingStrategy='standard'):
        """Compare the given models and store the test-set predictions.

        Parameters
        ----------
        models : list of str
            Classifier keys forwarded to ``predictor.modelFinder``.
        imp : str, default 'missing'
            Imputation strategy forwarded to ``predictor``.
        scalingStrategy : str, default 'standard'
            Feature scaling strategy forwarded to ``predictor``.
        """
        # Forward the caller's choices instead of hard-coded strategies.
        pred = predictor(self.df, imp=imp, strategy=scalingStrategy)
        pred.modelFinder(models)
        # Keep the predictions produced by the predictor on this instance.
        self.predicted = pred.pred

In [24]:
# Classifier keys that can be compared in the experiment
models = ['catboost', 'dt', 'randomforest', 'svm', 'lgbm', 'xgb']
# Names of the available imputation strategies
impStrategies = ['missing','mean','iterative','knn']
# Names of the available feature scaling strategies
scalingStrategies = ['standard', 'minmax']

# Options requested for this run: 'knn' imputation and 'standard' scaling
requestedImputation = impStrategies[3]
requestedScaling = scalingStrategies[0]

# Load the data, then run the exploration steps
ml = Mlsystem()
ml.featureEngineering()
ml.featureSelection()

# Compare the classifiers; the predictions for the test set end up in ml.predicted
ml.modelSelection(models, imp=requestedImputation, scalingStrategy=requestedScaling)

In [None]:
# Predicted (label-encoded) classes as a plain Python list
val = ml.predicted.tolist()
# Single-column frame with the labels decoded: 1 -> True, then 0 -> False
final_result = (
    pd.DataFrame({'Class': val})
    .replace(1, True)
    .replace(0, False)
)

### Part B

In [None]:
# HERE YOU WILL USE THIS TEMPLATE TO SAVE THE PREDICTIONS ON THE TEST SET

test_path = 'CE802_P2_Test.csv'
predictions_path = 'CE802_P2_Test_Predictions.csv'

# Load the test data
test_df = pd.read_csv(test_path)

# Make sure you work on a copy (every column except the last one)
test_data = test_df.iloc[:, :-1].copy()

# CHANGE HERE -- use your previously trained predictor and apply it to test_data
# (test_data can be modified if needed but make sure you don't change the order of the rows)...
predicted = final_result

# Replace the last (empty) column with your prediction
test_df.iloc[:, -1] = predicted

# Save to the destination file
test_df.to_csv(predictions_path, index=False, float_format='%.8g')

# IMPORTANT!! Make sure only the last column has changed
original_features = pd.read_csv(test_path).iloc[:, :-1]
saved_features = pd.read_csv(predictions_path).iloc[:, :-1]
assert original_features.equals(saved_features)