In [45]:
# importing libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# splitting data
from sklearn.model_selection import train_test_split

# models
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

# pipeline
from sklearn.pipeline import Pipeline

# scaler
from sklearn.preprocessing import StandardScaler

# metrics
from sklearn import metrics

# model saver
import pickle
import joblib

class RegressionModels:
    def __init__(self, df:pd.DataFrame, target):
        self.df = df
        self.data = self.df
        self.target = target
        self.tts = dict()
        self.scaler = {
            'ss': StandardScaler()
        }
        self.results = pd.DataFrame()
        print('Setting Up the Object...')
    
    def select_x(self, columns=[]):
        # select some columns specified or all
        if columns:
            self.data = self.df[columns]
        
        # check if target is still in x
        # if true remove it
        if self.target in list(self.data.columns):
            temp = list(self.data.columns)
            temp.remove(self.target)
            self.data = self.data[temp]
        return self.data
    
    def split_train_test(self, test_size=0.2, random_state=0):
        xtrain, xtest, ytrain, ytest = train_test_split(self.data, self.df[self.target], test_size=test_size, random_state=random_state)
        self.tts['xtrain'] = xtrain
        self.tts['xtest'] = xtest
        self.tts['ytrain'] = ytrain
        self.tts['ytest'] = ytest
        
    def metricsResults(self, ytrue, ypred):
        # Regression metrics
        explained_variance=metrics.explained_variance_score(ytrue, ypred)
        mean_absolute_error=metrics.mean_absolute_error(ytrue, ypred) 
        mse=metrics.mean_squared_error(ytrue, ypred) 
        # mean_squared_log_error=metrics.mean_squared_log_error(ytrue, ypred)
        median_absolute_error=metrics.median_absolute_error(ytrue, ypred)
        r2=metrics.r2_score(ytrue, ypred)

        return r2
        
        
    def regressionResults(self,model, prediction_training, prediction_testing):
        
        # getting metrics
        results_training = self.metricsResults(self.tts['ytrain'], prediction_training)
        results_testing = self.metricsResults(self.tts['ytest'], prediction_testing)
        
        # creating dataframe for results
        results = [results_training, results_testing]
        return results
    
    def saveModel(self, model, title):
        path_pickle = 'saved models/'+title+'.pkl'
        path_joblib = 'saved models/'+title+'.sav'
        pickle.dump(model, open(path_pickle, 'wb'))
        joblib.dump(model, open(path_joblib, 'wb'))
        print('model saved at',path_pickle,'and',path_joblib)
        
    def linearRegressionModel(self, scaler='ss'):
        pipeline = Pipeline(steps = [
            ('preprocessor', self.scaler[scaler] if scaler in self.scaler else self.scaler['ss']),
            ('regressor', LinearRegression())
        ])
        pipeline.fit(self.tts['xtrain'], self.tts['ytrain'])
        prediction_train = pipeline.predict(self.tts['xtrain'])
        prediction_test = pipeline.predict(self.tts['xtest'])
        # return model, prediction on training, prediction on test
        self.results['Linear Regression'] = self.regressionResults(pipeline, prediction_train, prediction_test)
        
        # save model
        self.saveModel(pipeline, 'linear_regression_model')
        return pipeline
    
    def logisticRegressionModel(self, scaler='ss'):
        pipeline = Pipeline(steps = [
            ('preprocessor', self.scaler[scaler] if scaler in self.scaler else self.scaler['ss']),
            ('regressor', LogisticRegression())
        ])
        pipeline.fit(self.tts['xtrain'], self.tts['ytrain'])
        prediction_train = pipeline.predict(self.tts['xtrain'])
        prediction_test = pipeline.predict(self.tts['xtest'])
        # return model, prediction on training, prediction on test
        self.results['Logistic Regression'] = self.regressionResults(pipeline, prediction_train, prediction_test)
        
        # save model
        self.saveModel(pipeline, 'logistic_regression_model')
        return pipeline
    
    def decisionTreeRegressionModel(self):
        pipeline = Pipeline(steps = [
            ('regressor', DecisionTreeRegressor())
        ])
        pipeline.fit(self.tts['xtrain'], self.tts['ytrain'])
        prediction_train = pipeline.predict(self.tts['xtrain'])
        prediction_test = pipeline.predict(self.tts['xtest'])
        # return model, prediction on training, prediction on test
        self.results['Decision Tree Regression'] = self.regressionResults(pipeline, prediction_train, prediction_test)
        
        # save model
        self.saveModel(pipeline, 'decision_tree_regression_model')
        return pipeline
        
    def polynomialRegressionModel(self, scaler='ss', degree=2):
        pipeline = Pipeline(steps = [
            ('preprocessor', self.scaler[scaler] if scaler in self.scaler else self.scaler['ss']),
            ('polynomial', PolynomialFeatures(degree=degree)),
            ('regressor', DecisionTreeRegressor())
        ])
        pipeline.fit(self.tts['xtrain'], self.tts['ytrain'])
        prediction_train = pipeline.predict(self.tts['xtrain'])
        prediction_test = pipeline.predict(self.tts['xtest'])
        # return model, prediction on training, prediction on tes
        self.results['Polynomial Regression'] = self.regressionResults(pipeline, prediction_train, prediction_test)
        
        # save model
        self.saveModel(pipeline, 'polynomial_regression_model')
        return pipeline
    
    
df = pd.read_csv('../_data/pek-sha.csv')
obj = Preprocessing(df, target='price')
obj.transformToTimestamp(columns=['departureDate','arrivalDate', 'createDate'])
obj.labelEncodingColumns(skip=['departureDate','arrivalDate', 'createDate'])
data, columns = obj.featureSelectionCorrelation(effect=0.3, plot=False)
print(columns)
models = RegressionModels(data, target='price')
models.select_x()
models.split_train_test()
models.linearRegressionModel()
models.decisionTreeRegressionModel()
models.polynomialRegressionModel()
models.results

['cabinClass', 'priceClass', 'price', 'rate']
Setting Up the Object...
model saved at saved models/linear_regression_model.pkl and saved models/linear_regression_model.sav
model saved at saved models/decision_tree_regression_model.pkl and saved models/decision_tree_regression_model.sav
model saved at saved models/polynomial_regression_model.pkl and saved models/polynomial_regression_model.sav


Unnamed: 0,Linear Regression,Decision Tree Regression,Polynomial Regression
0,0.796305,0.980398,0.980398
1,0.797161,0.980002,0.980002
