In [6]:
import itertools

import pandas as pd
import numpy as np
from scipy.stats import boxcox
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
# from sklearn.cross_validation import KFold
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Read the dev ("test") and training CSVs, discarding the day_of_month
# column from each frame as soon as it is loaded.
test = pd.read_csv('test_3.csv').drop(columns=['day_of_month'])
train = pd.read_csv('train_3.csv').drop(columns=['day_of_month'])

In [3]:
class TestModels:
    """Fit an estimator on min-max-scaled features and score it on a dev set.

    The target column is split off from both frames; the remaining columns
    are scaled with a ``MinMaxScaler`` that is fitted on the training
    features only and then applied to the dev features.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``.
    data_train, data_dev : DataFrames that contain the ``target`` column.
    params : parameter grid handed to ``GridSearchCV``; ``None`` (default)
        fits ``model`` directly without any search.
    target : name of the target column (default ``'forecast_volume'``).
    random_state : seed for the shuffled 5-fold split used by the grid
        search, so repeated runs select the same parameters.

    Attributes
    ----------
    scaler : the fitted ``MinMaxScaler``.
    X_train, X_dev : scaled feature arrays.
    y_train, y_dev : target columns of the two frames.
    grid_search : the ``GridSearchCV`` object once ``fit_model`` has run
        with a parameter grid, otherwise ``None``.
    pred : dev-set predictions once ``fit_model`` has run, otherwise ``None``.
    """

    def __init__(self, model, data_train, data_dev, params=None,
                 target='forecast_volume', random_state=42):
        self.model = model
        self.grid_search = None
        self.params = params
        self.random_state = random_state
        self.y_train = data_train[target]
        self.y_dev = data_dev[target]
        # Keep the scaler on the instance so the same transform can be
        # re-applied to other data later.
        self.scaler = MinMaxScaler()
        self.X_train = self.scaler.fit_transform(data_train.drop(columns=[target]))
        self.X_dev = self.scaler.transform(data_dev.drop(columns=[target]))
        self.pred = None

    def _require_predictions(self):
        """Raise if ``fit_model`` has not stored predictions yet."""
        if self.pred is None:
            raise Exception('Firstly fit the model!')

    def fit_model(self):
        """Fit on the training split and predict the dev split.

        With a parameter grid, a ``GridSearchCV`` over a shuffled 5-fold
        split is fitted; otherwise the bare model is fitted.

        Returns
        -------
        tuple
            ``(predictions, fitted_object)`` where ``fitted_object`` is the
            ``GridSearchCV`` instance or the model itself.
        """
        if self.params is not None:
            cv = KFold(n_splits=5, shuffle=True, random_state=self.random_state)
            # No `scoring` is passed, so candidates are ranked with the
            # estimator's own `score` method rather than the MSE that
            # `show_metrics` reports.
            self.grid_search = GridSearchCV(self.model, self.params, cv=cv)
            self.grid_search.fit(self.X_train, self.y_train)
            fitted = self.grid_search
        else:
            self.model.fit(self.X_train, self.y_train)
            fitted = self.model
        self.pred = fitted.predict(self.X_dev)
        return self.pred, fitted

    def show_metrics(self):
        """Print the dev-set MSE and, after a grid search, the best parameters.

        Raises
        ------
        Exception
            If ``fit_model`` has not been called yet.
        """
        self._require_predictions()
        mse = mean_squared_error(self.y_dev, self.pred)
        print("{:<30}{}".format('MSE score on dev:', mse))
        if self.grid_search is not None:
            print("{:<30}{}".format('Best params:', self.grid_search.best_params_))

    # Function adapted from the official sklearn documentation
    def plot_confusion_matrix(self, normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
        """Print and plot the confusion matrix of the dev-set predictions.

        Normalization (each row divided by its sum) can be applied by
        setting ``normalize=True``.

        NOTE(review): a confusion matrix needs discrete class labels, and the
        tick labels below are fixed to two classes ('0' and '1'). With the
        regressors imported in this notebook this method is not expected to
        be usable - confirm whether it is still needed.

        Raises
        ------
        Exception
            If ``fit_model`` has not been called yet.
        """
        # Validate before touching pyplot so a failed call leaves no empty figure.
        self._require_predictions()

        cm = confusion_matrix(self.y_dev, self.pred)
        np.set_printoptions(precision=2)
        classes = ['0', '1']

        if normalize:
            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
            print("Normalized confusion matrix")
        else:
            print('Confusion matrix, without normalization')

        print(cm)

        plt.figure()
        plt.imshow(cm, interpolation='nearest', cmap=cmap)
        plt.title(title)
        plt.colorbar()
        tick_marks = np.arange(len(classes))
        plt.xticks(tick_marks, classes, rotation=45)
        plt.yticks(tick_marks, classes)

        fmt = '.2f' if normalize else 'd'
        # Cells darker than half the maximum get white text for contrast.
        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j, i, format(cm[i, j], fmt),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        plt.tight_layout()
        plt.show()

In [7]:
# Alternative experiment, kept disabled: grid search over a random forest.
# param_grid = [{
#     'n_estimators': [10, 20, 40],
#     'max_depth': [4, 5, 8, 10],
#     'min_samples_leaf': [1, 10, 25],
# }]
# randomforest_model = TestModels(RandomForestRegressor(), train, test, params=param_grid)

# Gradient boosting with fixed hyperparameters; params=None means the model
# is fitted directly, without a grid search.
gradient_model = TestModels(
    model=GradientBoostingRegressor(
        n_estimators=30,
        max_depth=14,
        min_samples_leaf=10,
        random_state=42,
    ),
    data_train=train,
    data_dev=test,
    params=None,
)

In [8]:
# Fit on the training split; the cell output is the returned
# (dev-set predictions, fitted estimator) tuple.
gradient_model.fit_model()

(array([0.33994076, 0.35801697, 0.32106581, ..., 0.27570879, 0.55154273,
        0.31161389]),
 GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='ls', max_depth=14, max_features=None,
              max_leaf_nodes=None, min_impurity_decrease=0.0,
              min_impurity_split=None, min_samples_leaf=10,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=30, presort='auto', random_state=42,
              subsample=1.0, verbose=0, warm_start=False))

In [24]:
# Print the mean squared error of the stored predictions against the dev targets.
gradient_model.show_metrics()

MSE score on dev:             7.669691674880589
