In [None]:
pip install catboost

In [None]:
pip install lightgbm

In [None]:
!pip install setuptools numpy scipy scikit-learn -U
!pip install xgboost
!pip install linearmodels
!pip install mlxtend

In [None]:
# data analysis and wrangling
import researchpy as rp
import pandas as pd
import numpy as np
import statistics
from itertools import chain
# visualization
from pandas.plotting import scatter_matrix
from statsmodels.graphics.tsaplots import plot_acf
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from pprint import pprint
%matplotlib inline
#plotly
import plotly.io as pio
import plotly.express as px
from plotly.offline import download_plotlyjs,init_notebook_mode, plot, iplot
import plotly as py 
import plotly.graph_objs as go # plotly graphical object
import plotly.io as pio
pio.renderers.default='notebook'
# setting the general visualization style
sns.set_style('whitegrid')
# feature engineering
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
#Libraries for Statistical Models
import statsmodels.api as sm
# ignoring warnings in the notebook
import warnings 
warnings.filterwarnings('ignore') 
# To display full output 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# feature engineering
import pandas_profiling as pp
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# machine learning models
from sklearn.linear_model import Lasso,ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
# model selection
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

# Tuning and Ensembling
from sklearn.model_selection import GridSearchCV        
from sklearn.model_selection import RandomizedSearchCV 
from mlxtend.regressor import StackingCVRegressor

In [None]:
def feature_importance(regressor, title, feature_names=None, top_n=15):
    """Plot the highest feature importances of a fitted regressor as horizontal bars.

    Parameters
    ----------
    regressor : fitted estimator
        Must expose a ``feature_importances_`` array.
    title : str
        Model name; prefixed to " feature importance" in the plot title.
    feature_names : sequence of str, optional
        Names in the same order as ``regressor.feature_importances_``.
        When omitted, the columns of the notebook-level ``X_train`` are used,
        which is what this helper always did before the parameter existed.
    top_n : int, default 15
        How many of the top-ranked features to draw.

    Raises
    ------
    ValueError
        If the number of feature names differs from the number of importances,
        which would otherwise mislabel the bars.
    """
    importances = np.asarray(regressor.feature_importances_)

    if feature_names is None:
        # Backward-compatible default: fall back to the global training frame.
        feature_names = X_train.columns
    feature_names = np.asarray(feature_names)

    if len(feature_names) != len(importances):
        raise ValueError(
            "feature_names has %d entries but the regressor reports %d importances"
            % (len(feature_names), len(importances))
        )

    # Positions of the top_n largest importances, in descending order.
    # The same index array selects both labels and values so they cannot drift apart.
    top_idx = np.argsort(importances)[::-1][:top_n]

    # setting up the frame and drawing explicitly on its axes
    fig, ax = plt.subplots(figsize=(8, 15))
    g = sns.barplot(y=feature_names[top_idx],
                    x=importances[top_idx],
                    orient='h',
                    ax=ax)

    # labeling
    g.set_xlabel("Relative importance", fontsize=15)
    g.set_ylabel("Features", fontsize=15)
    g.tick_params(labelsize=9)
    g.set_title(title + " feature importance")

In [None]:
def cv_score_defaultmodel(score, figure_name, model_names=None):
    """Cross-validate every model in ``regressors`` and summarise the scores.

    Each estimator in the notebook-level ``regressors`` list is scored with
    ``cross_val_score`` on ``X_train`` / ``y_train`` using the shared ``kfold``
    splitter. The per-model mean and standard deviation are collected in a
    table and drawn as a horizontal bar chart with error bars.

    Parameters
    ----------
    score : str
        Scoring name passed to ``cross_val_score`` (e.g. ``'r2'``); also used
        in the x-axis label.
    figure_name : str
        Prefix for the plot title.
    model_names : sequence of str, optional
        Display labels, one per entry of ``regressors`` and in the same order.
        Defaults to the labels of the eight default models used in this notebook.

    Returns
    -------
    pandas Styler
        The results table with the highest ``CrossValMeans`` highlighted.

    Raises
    ------
    ValueError
        If the number of labels does not match the number of regressors.
    """
    if model_names is None:
        model_names = ['Lasso Regression',
                       'Elastic Nets',
                       'DecisionTreeRegressor',
                       'RandomForestRegressor',
                       'CatBoostRegressor',
                       'LightGBM Regressor',
                       'AdaBoostRegressor',
                       'XGBRegressor']
    model_names = list(model_names)

    # Fail early with a clear message instead of a cryptic DataFrame length error.
    if len(model_names) != len(regressors):
        raise ValueError(
            "got %d model names for %d regressors; pass model_names explicitly"
            % (len(model_names), len(regressors))
        )

    # get cv results for every model in the regressor list
    cv_results = [
        cross_val_score(model, X_train, y_train, scoring=score, cv=kfold, n_jobs=-1)
        for model in regressors
    ]
    # mean and standard deviation of the fold scores, per model
    cv_means = [fold_scores.mean() for fold_scores in cv_results]
    cv_std = [fold_scores.std() for fold_scores in cv_results]

    # creating results table
    cv_res = pd.DataFrame({'Models': model_names,
                           'CrossValMeans': cv_means,
                           'CrossValerrors': cv_std})

    # highlight the model with highest score
    final_cvtable = cv_res.style.highlight_max(color='yellow', axis=0, subset=['CrossValMeans'])

    # plotting the cv scores for each model; x/y must be keywords because
    # recent seaborn releases no longer accept them positionally
    g = sns.barplot(x='CrossValMeans', y='Models', data=cv_res,
                    palette='Set3', orient='h', xerr=cv_std)
    # labeling
    g.set_xlabel('Mean ' + score + ' Score')
    g.set_title(figure_name + ' Cross validation scores')

    return final_cvtable

In [None]:
# Directory holding the engineered features and the two target files.
DATA_DIR = "/project/data_for_models"

df_cursed = pd.read_csv(DATA_DIR + "/feature_eng_cursed.csv", index_col=0)

Target1 = pd.read_csv(DATA_DIR + "/Target_mkt.csv", index_col=0)
Target2 = pd.read_csv(DATA_DIR + "/Target_sub.csv", index_col=0)

random_state = 42
# Split into train and validation set (Target2 is the target modelled below)
X_train, X_test, y_train, y_test = train_test_split(
    df_cursed, Target2, test_size=0.20, random_state=random_state, shuffle=True
)
# Seed the shuffled splitter too; without random_state the folds (and therefore
# every cross-validation score below) change on each run.
kfold = KFold(n_splits=3, shuffle=True, random_state=random_state)
# Keep only the target column as a flat 1-D array
y_train = y_train['Excess_sales_sub'].values.ravel()
y_test = y_test['Excess_sales_sub'].values.ravel()

# Default-parameter models to benchmark, all seeded identically.
# The order matters: it must line up with the labels used in cv_score_defaultmodel.
regressors = [
    Lasso(random_state=random_state),
    ElasticNet(random_state=random_state),
    DecisionTreeRegressor(random_state=random_state),
    RandomForestRegressor(random_state=random_state),
    CatBoostRegressor(random_state=random_state),
    LGBMRegressor(random_state=random_state),
    AdaBoostRegressor(random_state=random_state),
    XGBRegressor(random_state=random_state),
]

In [None]:
# Baseline check: negative RMSE per fold for a default random forest,
# scored with the shared kfold splitter.
rf_baseline = RandomForestRegressor(random_state=random_state)
cross_val_score(
    rf_baseline,
    X_train,
    y_train,
    scoring='neg_root_mean_squared_error',
    cv=kfold,
    n_jobs=-1,
)


In [None]:
# Compare all default models on cross-validated negative RMSE
cv_score_defaultmodel(
    score='neg_root_mean_squared_error',
    figure_name='CURSED neg_root_mean_squared_error',
)

In [None]:
# Compare all default models on cross-validated R^2
cv_score_defaultmodel(
    score='r2',
    figure_name='CURSED r2',
)

In [None]:
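# Disabled: feature_importance() reads `feature_importances_`, which Lasso does not
# provide (it exposes `coef_`) -- presumably why this call is commented out.
#feature_importance(Lasso(random_state=random_state).fit(X_train, y_train) , 'Lasso')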

In [None]:
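# Disabled: feature_importance() reads `feature_importances_`, which ElasticNet does not
# provide (it exposes `coef_`) -- presumably why this call is commented out.
#feature_importance(ElasticNet(random_state=random_state).fit(X_train, y_train) , 'ElasticNet')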

In [None]:
# Fit a default decision tree on the training split, then chart its top-ranked features
decision_tree_fitted = DecisionTreeRegressor(random_state=random_state).fit(X_train, y_train)
feature_importance(decision_tree_fitted, 'DecisionTreeRegressor')

In [None]:
# Fit a default random forest on the training split, then chart its top-ranked features
random_forest_fitted = RandomForestRegressor(random_state=random_state).fit(X_train, y_train)
feature_importance(random_forest_fitted, 'RandomForestRegressor')

In [None]:
# Fit a default CatBoost model on the training split, then chart its top-ranked features
catboost_fitted = CatBoostRegressor(random_state=random_state).fit(X_train, y_train)
feature_importance(catboost_fitted, 'CatBoostRegressor')

In [None]:
# Fit a default LightGBM model on the training split, then chart its top-ranked features
lgbm_fitted = LGBMRegressor(random_state=random_state).fit(X_train, y_train)
feature_importance(lgbm_fitted, 'LGBMRegressor')

In [None]:
# Fit a default AdaBoost model on the training split, then chart its top-ranked features
adaboost_fitted = AdaBoostRegressor(random_state=random_state).fit(X_train, y_train)
feature_importance(adaboost_fitted, 'AdaBoostRegressor')

In [None]:
# Fit a default XGBoost model on the training split, then chart its top-ranked features
xgb_fitted = XGBRegressor(random_state=random_state).fit(X_train, y_train)
feature_importance(xgb_fitted, 'XGBRegressor')