### Part A

In [6]:
import warnings
warnings.filterwarnings("ignore")

In [7]:
import numpy as np # linear algebra
import pandas as pd #data pre-processing
import matplotlib #data visualization 
import matplotlib.pyplot as plt #data visualization 
import seaborn as sns #data visualization 
import missingno as msno #Missing value interpretation
from IPython.display import display

#import cross-validation and train_test_split classes
from sklearn.model_selection import KFold,RepeatedStratifiedKFold, cross_val_score, GridSearchCV
from sklearn.model_selection import train_test_split

# different scaling strategy classes 
from sklearn.preprocessing import StandardScaler, MinMaxScaler,LabelEncoder

#Pipeline to assemble several steps that can be cross-validated together while setting different parameters
from sklearn.pipeline import Pipeline

#Set of different regression algorithms for the analysis
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.svm import SVR

# different imputation strategies
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from collections import Counter

#evaluation metrics
from sklearn.metrics import  mean_squared_error,mean_absolute_error
from sklearn.metrics import  r2_score

from math import sqrt

## Predictor class which compares different regression models and selects best model

In [11]:
class predictor:
    """Grid-search a set of regressors on the training frame and predict the test CSV.

    Workflow (driven by ``modelFinder``): encode the categorical columns,
    scale the features, hold out 20% of the rows, run ``GridSearchCV`` over
    the requested regressors, report hold-out metrics, then predict the test
    file and keep the result in ``self.pred``.

    Parameters
    ----------
    data : pandas.DataFrame
        Training data; a deep copy is stored so the caller's frame is not modified.
    imp : str, default 'missing'
        Name of the imputation option. It is stored on the instance, but no
        method of this class applies it.
    strategy : str, default 'standard'
        Feature scaling strategy: 'standard' or 'minmax'.
    """

    # Class-level defaults; instances overwrite them in __init__ / predict.
    data = pd.DataFrame()
    test_data = pd.DataFrame()
    imp = ''
    scalingStrategy = ''
    pred = ''
    # Scaler fitted on the training features and reused for the test features.
    _scaler = None

    # Ordinal codes for F20 so the natural ordering of the levels is retained.
    _F20_ORDER = {"Very low": 0, "Low": 1, "Medium": 2, "High": 3, "Very high": 4}
    # One-hot columns derived from F27, in the order the feature matrix uses.
    _F27_DUMMIES = ['F27_Europe', 'F27_Rest', 'F27_UK', 'F27_USA']
    # Short plot labels keyed by estimator class name.
    _SHORT_NAMES = {
        'LinearRegression': 'lr',
        'XGBRegressor': 'xgb',
        'GradientBoostingRegressor': 'gbr',
        'AdaBoostRegressor': 'abr',
        'DecisionTreeRegressor': 'dtr',
        'RandomForestRegressor': 'rfr',
        'SVR': 'svr',
    }

    def __init__(self, data, imp='missing', strategy='standard'):
        # Work on a deep copy to avoid touching the original data.
        self.data = data.copy(deep=True)
        self.imp = imp
        self.scalingStrategy = strategy
        self._scaler = None

    def visualizeResults(self, best_model):
        """Plot the mean cross-validation score of every evaluated candidate.

        ``best_model`` is a fitted ``GridSearchCV``; its ``cv_results_`` are
        sorted by ``rank_test_score`` before plotting.
        """
        result = pd.DataFrame(best_model.cv_results_)
        result = result.sort_values('rank_test_score')

        # One label per candidate. Falling back to the class name keeps the
        # label list aligned with the score column for unmapped estimators.
        clfs = []
        for estimator in result['param_regressor']:
            class_name = type(estimator).__name__
            clfs.append(self._SHORT_NAMES.get(class_name, class_name))

        print('\n')
        # Comparison line graph of the different regression models.
        data_plot = pd.DataFrame({"Regressor": clfs, "Results": result['mean_test_score']})
        sns.lineplot(x="Regressor", y="Results", data=data_plot)
        plt.title('Regressors comparative analysis')
        plt.show()

    def immputation(self, data, fit=True):
        """Encode the categorical columns, then scale the features.

        Parameters
        ----------
        data : pandas.DataFrame
            Frame with columns F1..F36 and Target. It is not modified.
        fit : bool, default True
            Forwarded to ``featureScaling``: fit a new scaler on ``data``
            (training) or reuse the previously fitted one (test).

        Returns
        -------
        tuple
            Whatever ``featureScaling`` returns: ``(scaled_features, target)``.
        """
        # Ordinal-encode F20 on a new frame instead of mutating the argument.
        encoded = data.replace({"F20": self._F20_ORDER})
        # One-hot encode F27.
        encoded = pd.get_dummies(encoded, columns=['F27'])
        # A split that lacks one F27 category would otherwise be missing its
        # dummy column; add it as all-zero so the feature layout stays fixed.
        for column in self._F27_DUMMIES:
            if column not in encoded.columns:
                encoded[column] = 0

        # F1..F36 without F27, followed by the F27 dummies: 39 feature columns.
        feature_columns = ['F%d' % i for i in range(1, 37) if i != 27] + self._F27_DUMMIES
        X = encoded[feature_columns]
        # Target is kept as a one-column frame.
        Y = encoded[['Target']]

        return self.featureScaling(X, Y, fit=fit)

    def _make_scaler(self):
        """Return an unfitted scaler for ``self.scalingStrategy``.

        Raises
        ------
        ValueError
            If the strategy is neither 'minmax' nor 'standard'.
        """
        if self.scalingStrategy == 'minmax':
            # Scale each feature to a given range.
            return MinMaxScaler()
        if self.scalingStrategy == 'standard':
            # Remove the mean and scale to unit variance.
            return StandardScaler()
        raise ValueError(
            "Unknown scaling strategy %r; expected 'standard' or 'minmax'"
            % (self.scalingStrategy,)
        )

    def featureScaling(self, feature, target, fit=True):
        """Scale ``feature`` and flatten ``target``.

        Parameters
        ----------
        feature : array-like
            Feature matrix.
        target : array-like
            Target column(s); returned as a flat array.
        fit : bool, default True
            When True (or when no scaler has been fitted yet) a new scaler is
            fitted on ``feature``. When False the scaler fitted earlier is
            reused, so test features get the training-set statistics.

        Returns
        -------
        tuple
            ``(scaled_features, target_1d)``.

        Raises
        ------
        ValueError
            If a scaler must be created and the strategy is unknown.
        """
        if fit or self._scaler is None:
            self._scaler = self._make_scaler().fit(feature)
        feature_temp = self._scaler.transform(feature)
        # This is a regression target: keep the real values. Label-encoding
        # them would replace each value by the index of its class.
        target = np.ravel(target)
        return (feature_temp, target)

    def predict(self, best_model, test_path='CE802_P3_Test.csv'):
        """Predict the target for the test CSV and store it in ``self.pred``.

        Parameters
        ----------
        best_model : object
            Fitted estimator exposing ``predict``.
        test_path : str, default 'CE802_P3_Test.csv'
            Location of the test CSV.
        """
        self.test_data = pd.read_csv(test_path)
        temp = self.test_data.copy(deep=True)
        # Encode the test set and scale it with the already fitted scaler.
        X_test = self.immputation(temp, fit=False)[0]
        self.pred = best_model.predict(X_test)

    def modelFinder(self, clfs):
        """Compare the requested regressors and predict with the best one.

        Parameters
        ----------
        clfs : list of str
            Keys of the regressors to evaluate: 'lr', 'svr', 'dtr', 'rfr',
            'gbr', 'abr'.

        Raises
        ------
        KeyError
            If ``clfs`` contains a key with no parameter grid.
        """
        # Encode and scale the training set.
        feature, target = self.immputation(self.data)
        # NOTE: the scaler is fitted on all rows before this hold-out split.
        X_train, X_test, y_train, y_test = train_test_split(
            feature, target, test_size=0.2, random_state=0
        )
        print('X_train : ', X_train.shape, 'y_train : ', y_train.shape)

        depths = [3, 5, 7, 9, 10, 15, 20, 25]
        leaf_sizes = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        tree_counts = [1, 2, 4, 8, 16, 32, 64, 100, 200]
        learning_rates = [1, 0.5, 0.25, 0.1, 0.05, 0.01]

        # Candidate algorithms and their hyper-parameter grids. Stochastic
        # estimators are seeded so repeated runs give the same results.
        params = {
            'lr': {"regressor": [LinearRegression()]},
            'svr': {"regressor": [SVR(C=1.0, epsilon=0.2)]},
            'dtr': {
                "regressor": [DecisionTreeRegressor(random_state=0)],
                "regressor__max_depth": depths,
                "regressor__min_samples_leaf": leaf_sizes,
            },
            'rfr': {
                "regressor": [RandomForestRegressor(random_state=0)],
                "regressor__n_estimators": tree_counts,
                "regressor__bootstrap": [True, False],
                "regressor__max_depth": depths,
                "regressor__min_samples_leaf": leaf_sizes,
            },
            'gbr': {
                "regressor": [GradientBoostingRegressor(random_state=0)],
                "regressor__learning_rate": learning_rates,
                "regressor__n_estimators": tree_counts,
                "regressor__max_depth": depths,
            },
            'abr': {
                "regressor": [AdaBoostRegressor(random_state=0)],
                "regressor__n_estimators": [1, 2, 4, 8, 16, 32, 50, 64, 100, 200],
                "regressor__learning_rate": learning_rates,
            },
        }

        unknown = [name for name in clfs if name not in params]
        if unknown:
            raise KeyError(
                "Unknown regressor key(s) %r; expected any of %r"
                % (unknown, sorted(params))
            )

        # The 'regressor' step is a placeholder that every grid replaces.
        pipe = Pipeline([("regressor", RandomForestRegressor())])
        grid_param = [params[name] for name in clfs]
        # Grid search over the pipeline; the best candidate is refitted.
        gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0, n_jobs=-1)
        best_model = gridsearch.fit(X_train, y_train)
        self.visualizeResults(best_model)

        # Evaluate the best model on the hold-out rows.
        pred = best_model.predict(X_test)

        print('\n-------------------------------------------------------------------------------------------------------------\n')
        # For a regressor, score() is the R^2 coefficient, not an accuracy.
        print("R^2 score of the best model (hold-out):", best_model.score(X_test, y_test))
        print('Mean Squared Error               : %.2f' % mean_squared_error(y_test, pred))
        print('Coefficient of Determination     : %.2f' % r2_score(y_test, pred))

        # Predict the test file with the best model.
        self.predict(best_model)




   
      


## Main class of the machine learning system, with a function for each step of the learning procedure

In [12]:
class Mlsystem:
    """Main class of the analysis: loads the data, explores it and builds the model.

    Parameters
    ----------
    data_path : str, default 'CE802_P3_Data.csv'
        Training CSV.
    test_path : str, default 'CE802_P3_Test.csv'
        Test CSV loaded into ``df_test``. ``modelBuilding`` delegates to
        ``predictor``, which reads the test CSV on its own, so this path only
        controls ``df_test``.
    """

    # Class-level placeholders; __init__ / modelBuilding overwrite them.
    df = ''
    df_test = pd.DataFrame()
    df_org = ''
    predicted = ''

    def __init__(self, data_path='CE802_P3_Data.csv', test_path='CE802_P3_Test.csv'):
        # Read both the training and the test set for the experiment.
        self.df = pd.read_csv(data_path)
        self.df_test = pd.read_csv(test_path)
        # Keep an untouched deep copy of the training set.
        self.df_org = self.df.copy(deep=True)

    def space(self):
        """Print a horizontal separator between report sections."""
        print('\n-------------------------------------------------------------------------------------------------------------\n')

    def featureEngineering(self):
        """Print and plot an exploratory overview of the training set."""
        display(self.df.head())
        self.space()

        print('Columns Data type and Null Counter \n')
        # info() writes its report itself and returns None, so wrapping it in
        # print() would add a stray "None" line.
        self.df.info()
        self.space()

        print('Rows * columns of training set : ', self.df.shape)
        self.space()

        print('Description of given data \n')
        t = self.df.describe().T
        display(t)
        self.space()

        print('Missing value indicator : ', self.df.isna().any().any())
        self.space()

        print('Null value indicator    : ', self.df.isnull().values.any())
        self.space()

        # Bar chart of the missing-value frequency per column.
        print('Missing value identification using bar chart \n')
        msno.bar(self.df)
        # No plt.figure() here: it would open an extra, empty figure.
        plt.show()
        self.space()

        # Histogram of each feature to inspect skewness.
        print('Check Data skewness using Bar chart \n')
        self.df.hist(bins=30, figsize=(20, 15))
        plt.show()
        self.space()

    def featureSelection(self):
        """Plot the correlation heatmap and a box plot for outlier detection."""
        start = "\033[1m"
        end = "\033[0;0m"

        print(start+'Correlation matrix heatmap'+end+'\n'.center(200))
        # Correlate numeric columns only; the frame also holds string columns
        # (F20, F27), which recent pandas versions refuse to correlate.
        corr = self.df.select_dtypes(include='number').corr()
        fig, ax = plt.subplots(figsize=(25, 25))  # figsize in inches
        sns.heatmap(
            corr,
            xticklabels=corr.columns,
            yticklabels=corr.columns,
            annot=True,
            cmap=sns.diverging_palette(220, 20, as_cmap=True),
            ax=ax,
        )
        plt.show()

        # Detect outliers using a box plot.
        self.space()
        fig, ax = plt.subplots(figsize=(15, 5))
        print(start+'Outlier detection using Box plot'+end+'\n'.center(110))
        self.df.boxplot(ax=ax)
        plt.show()

    def modelBuilding(self, models, imp='missing', scalingStrategy='standard'):
        """Find the best regressor and store its test predictions.

        Parameters
        ----------
        models : list of str
            Regressor keys passed to ``predictor.modelFinder``.
        imp : str, default 'missing'
            Imputation option name passed to ``predictor``.
        scalingStrategy : str, default 'standard'
            Scaling strategy passed to ``predictor``.
        """
        # Create the predictor that compares the models.
        pred = predictor(self.df, imp, scalingStrategy)
        # Compare, pick the best model and predict the test set.
        pred.modelFinder(models)
        # Keep the predicted values on this instance.
        self.predicted = pred.pred


## Object oriented procedure for the learning system

In [13]:
# Regressor keys to compare (each one has a parameter grid in predictor.modelFinder)
models = ['lr', 'gbr', "abr", 'dtr', 'rfr', 'svr']

# Names of the available imputation options
impStrategies = ['missing', 'mean', 'iterative', 'knn']

# Names of the available feature scaling options
scalingStrategies = ['standard', 'minmax']

# Build the learning system (reads the training and test CSV files)
ml = Mlsystem()

# Exploratory overview of the training data
ml.featureEngineering()

# Correlation heatmap and outlier box plot
ml.featureSelection()

# Compare the regressors, keep the best one and store its test-set
# predictions on ml.predicted
ml.modelBuilding(
    models,
    imp=impStrategies[0],
    scalingStrategy=scalingStrategies[0],
)


In [None]:
# Show the test-set predictions that ml.modelBuilding stored on the instance
# (as the last expression of the cell, the value is rendered as its output).
ml.predicted

## Part B

In [None]:
# Save the predictions on the test set using the provided template.

# Load the test data. This must happen in this cell: without it `test_df`
# is undefined on a fresh kernel.
test_df = pd.read_csv('CE802_P3_Test.csv')

# Make sure you work on a copy
test_data = test_df.iloc[:,:-1].copy()

# Predictions produced earlier by ml.modelBuilding (row order is unchanged)
predicted = ml.predicted

# There must be exactly one prediction per test row
assert len(predicted) == len(test_df), 'number of predictions does not match number of test rows'

# Replace the last (empty) column with your prediction
test_df.iloc[:,-1] = predicted
# display() is needed because this is not the last expression of the cell
display(test_df.head(30))

# Save to the destination file
test_df.to_csv('CE802_P3_Test_Predictions.csv', index=False, float_format='%.8g')

# IMPORTANT!! Make sure only the last column has changed
assert pd.read_csv('CE802_P3_Test.csv').iloc[:,:-1].equals(pd.read_csv('CE802_P3_Test_Predictions.csv').iloc[:,:-1])