## Confidentiality

The programmatic cases in this notebook are drawn from various internet resources (in this notebook primarily kaggle.com) and are for demonstration purposes only.

Please do not copy or distribute this notebook.


## Table of contents

Mercedes-Benz 

1. Programmatic case 1 
2. Programmatic case 2

## Introduction

The programmatic cases in this notebook are very advanced. The goal of this notebook is to build an increasingly detailed understanding through repeated analysis.

## Previous knowledge

For a good understanding of this notebook you should have a few years of data-science and programming experience and have studied the advanced programming notebooks.


#### Programmatic case 1

In [None]:
# Ask Colab for the input files; a later cell reads the result by file name
# (uploaded['train.csv'], uploaded['test.csv']).
from google.colab import files
uploaded = files.upload()

Saving test.csv to test.csv
Saving train.csv to train.csv


In [None]:
import io
import pandas as pd
# Parse the uploaded payloads into DataFrames; each payload is wrapped in
# BytesIO so that read_csv receives a file-like object.
train = pd.read_csv(io.BytesIO(uploaded['train.csv']))
test = pd.read_csv(io.BytesIO(uploaded['test.csv']))



In [None]:
import numpy as np
from sklearn.base import BaseEstimator,TransformerMixin, ClassifierMixin
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.linear_model import ElasticNetCV, LassoLarsCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline, make_union
from sklearn.utils import check_array
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
from sklearn.decomposition import PCA, FastICA
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import r2_score



class StackingEstimator(BaseEstimator, TransformerMixin):
    """Transformer that prepends the wrapped estimator's outputs to the features.

    ``fit`` trains the wrapped estimator; ``transform`` returns the input
    matrix with the estimator's prediction added as the first column. When the
    estimator is a classifier exposing ``predict_proba``, the class
    probabilities are inserted between the prediction and the original
    features.
    """

    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, y=None, **fit_params):
        """Fit the wrapped estimator on ``X``/``y`` and return ``self``."""
        self.estimator.fit(X, y, **fit_params)
        return self

    def transform(self, X):
        """Return ``X`` augmented with the wrapped estimator's predictions."""
        X = check_array(X)
        augmented = np.copy(X)

        # Classifiers with probability support contribute their class
        # probabilities as synthetic features placed in front of X.
        supports_probabilities = (
            isinstance(self.estimator, ClassifierMixin)
            and hasattr(self.estimator, 'predict_proba')
        )
        if supports_probabilities:
            augmented = np.hstack((self.estimator.predict_proba(X), X))

        # The prediction itself always becomes the leading column.
        prediction_column = np.reshape(self.estimator.predict(X), (-1, 1))
        return np.hstack((prediction_column, augmented))


# Label-encode every object-dtype column of `train`, applying the same
# encoder to the matching column of `test`. Both frames are modified in place.
for c in train.columns:
    if train[c].dtype == 'object':
        lbl = LabelEncoder()
        # Fit on the values of both frames so that a value occurring only in
        # `test` can still be transformed.
        lbl.fit(list(train[c].values) + list(test[c].values))
        train[c] = lbl.transform(list(train[c].values))
        test[c] = lbl.transform(list(test[c].values))



# Number of components produced by each decomposition / projection below.
n_comp = 12

# Every transformer is fitted on the training features (target 'y' dropped)
# and then applied to `test`; all of them share random_state=420.

# tSVD (truncated singular value decomposition)
tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
tsvd_results_train = tsvd.fit_transform(train.drop(["y"], axis=1))
tsvd_results_test = tsvd.transform(test)

# PCA (principal component analysis)
pca = PCA(n_components=n_comp, random_state=420)
pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1))
pca2_results_test = pca.transform(test)

# ICA (independent component analysis)
ica = FastICA(n_components=n_comp, random_state=420)
ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
ica2_results_test = ica.transform(test)

# GRP (Gaussian random projection)
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_train = grp.fit_transform(train.drop(["y"], axis=1))
grp_results_test = grp.transform(test)

# SRP (sparse random projection)
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results_train = srp.fit_transform(train.drop(["y"], axis=1))
srp_results_test = srp.transform(test)

# Save the column list before the decomposition components are added.
# Built as an ordered list rather than a set difference: iteration order of a
# set of strings can change between interpreter runs, which would shuffle the
# feature order fed to the stacked model and make results non-reproducible.
usable_columns = [col for col in train.columns if col != 'y']

# Appending decomposition components to datasets.
# Column i-1 of each result array becomes feature '<method>_i' in both frames.
for i in range(1, n_comp + 1):
    train['pca_' + str(i)] = pca2_results_train[:, i - 1]
    test['pca_' + str(i)] = pca2_results_test[:, i - 1]

    train['ica_' + str(i)] = ica2_results_train[:, i - 1]
    test['ica_' + str(i)] = ica2_results_test[:, i - 1]

    train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
    test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]

    train['grp_' + str(i)] = grp_results_train[:, i - 1]
    test['grp_' + str(i)] = grp_results_test[:, i - 1]

    train['srp_' + str(i)] = srp_results_train[:, i - 1]
    test['srp_' + str(i)] = srp_results_test[:, i - 1]

# Disabled on purpose: recomputing the list here would also pick up the
# component columns that were just added.
#usable_columns = list(set(train.columns) - set(['y']))

y_train = train['y'].values
y_mean = np.mean(y_train)
id_test = test['ID'].values
# finaltrainset and finaltestset feed only the stacked model; they are limited
# to usable_columns and so do not contain the PCA, SVD, ... component columns.
finaltrainset = train[usable_columns].values
finaltestset = test[usable_columns].values


'''Train the xgb model then predict the test data'''

# Booster parameters for the native xgboost API.
# - The number of trees is set by `num_boost_round` below; the former
#   'n_trees' entry is not a native XGBoost parameter and was dropped.
# - 'reg:squarederror' is the current name of the squared-error objective
#   (previously spelled 'reg:linear').
# - 'verbosity' replaces the old 'silent' flag (0 = silent, 1 = warnings).
xgb_params = {
    'eta': 0.0045,
    'max_depth': 4,
    'subsample': 0.93,
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'base_score': y_mean,  # base prediction = mean(target)
    'verbosity': 0,
}

# The training matrix uses every column except the target, so it includes the
# decomposition components appended above.
dtrain = xgb.DMatrix(train.drop('y', axis=1), y_train)
dtest = xgb.DMatrix(test)

num_boost_rounds = 1250
# Training model (warnings re-enabled for the actual run, as before).
model = xgb.train(dict(xgb_params, verbosity=1), dtrain, num_boost_round=num_boost_rounds)
y_pred = model.predict(dtest)

'''Train the stacked models then predict the test data'''

# Two stacking stages feed their predictions, as extra leading columns, into
# a final LassoLarsCV regressor.
# NOTE(review): the `normalize` argument of LassoLarsCV is believed to have
# been removed in scikit-learn 1.2 — confirm against the installed version
# before running this cell.
stacked_pipeline = make_pipeline(
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    StackingEstimator(
        estimator=GradientBoostingRegressor(
            learning_rate=0.001,
            loss="huber",
            max_depth=3,
            max_features=0.55,
            min_samples_leaf=18,
            min_samples_split=14,
            subsample=0.7,
            # Row subsampling and feature subsampling are random; seed them
            # like every other estimator in this cell so reruns agree.
            random_state=420,
        )
    ),
    LassoLarsCV()
)


stacked_pipeline.fit(finaltrainset, y_train)
results = stacked_pipeline.predict(finaltestset)

'''R2 Score on the entire Train data when averaging'''

# Training-set R2 of a weighted blend: 0.2855 * stacked pipeline + 0.7145 * XGBoost.
# This is scored on the same rows the models were fitted on, so it is not a
# held-out estimate.
print('R2 score on train data:')
print(r2_score(y_train,stacked_pipeline.predict(finaltrainset)*0.2855 + model.predict(dtrain)*0.7145))

'''Average the preditionon test data  of both models then save it on a csv file'''

# NOTE(review): the submission blends with 0.75 / 0.25, not the 0.7145 / 0.2855
# used for the score printed above — confirm that the difference is intended.
sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = y_pred*0.75 + results*0.25
sub.to_csv('stacked-models.csv', index=False)


# Any results you write to the current directory are saved as output.

#### Programmatic case 2 - Functions



In [None]:
def basic_details(df):
    """Return a per-column summary of ``df``.

    The result is indexed by the column names of ``df`` and has three columns:
    'Missing value' (count of nulls), 'N unique value' (count of distinct
    non-null values) and 'dtype' (the column's dtype).
    """
    summary = pd.DataFrame({
        'Missing value': df.isnull().sum(),
        'N unique value': df.nunique(),
        'dtype': df.dtypes,
    })
    return summary
# Show null count, distinct-value count and dtype for every column of `train`.
basic_details(train)

Unnamed: 0,Missing value,N unique value,dtype
ID,0,4209,int64
y,0,2545,float64
X0,0,47,object
X1,0,27,object
X2,0,44,object
...,...,...,...
X380,0,2,int64
X382,0,2,int64
X383,0,2,int64
X384,0,2,int64


In [None]:
def feature_creation(df):
    """Add string interaction columns for every ordered pair of base columns.

    For each ordered pair (a, b) drawn from X0, X1, X2, X3, X5, X6 and X8
    (pairs with a == b included) a column named 'a_b' is written, holding the
    two values converted to strings and joined by an underscore. ``df`` is
    modified in place and also returned.
    """
    base_columns = ['X0', 'X1', 'X2', 'X3', 'X5', 'X6', 'X8']
    ordered_pairs = [(left, right) for left in base_columns for right in base_columns]

    for left, right in ordered_pairs:
        combined_name = left + "_" + right
        left_text = df[left].astype('str')
        right_text = df[right].astype('str')
        df[combined_name] = left_text + "_" + right_text

    return df

# Add the pairwise interaction columns; feature_creation writes into the frame
# it receives and returns that same frame.
train = feature_creation(train)

# Spot-check three random rows.
train.sample(3)

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,X10,X11,X12,X13,X14,X15,X16,X17,X18,X19,X20,X21,X22,X23,X24,X26,X27,X28,X29,X30,X31,X32,X33,X34,X35,X36,X37,X38,X39,X40,...,X1_X2,X1_X3,X1_X5,X1_X6,X1_X8,X2_X0,X2_X1,X2_X2,X2_X3,X2_X5,X2_X6,X2_X8,X3_X0,X3_X1,X3_X2,X3_X3,X3_X5,X3_X6,X3_X8,X5_X0,X5_X1,X5_X2,X5_X3,X5_X5,X5_X6,X5_X8,X6_X0,X6_X1,X6_X2,X6_X3,X6_X5,X6_X6,X6_X8,X8_X0,X8_X1,X8_X2,X8_X3,X8_X5,X8_X6,X8_X8
486,927,129.0,af,s,as,c,d,d,h,q,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,s_as,s_c,s_d,s_h,s_q,as_af,as_s,as_as,as_c,as_d,as_h,as_q,c_af,c_s,c_as,c_c,c_d,c_h,c_q,d_af,d_s,d_as,d_c,d_d,d_h,d_q,h_af,h_s,h_as,h_c,h_d,h_h,h_q,q_af,q_s,q_as,q_c,q_d,q_h,q_q
1412,2794,93.57,o,l,as,f,d,ac,d,a,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,l_as,l_f,l_ac,l_d,l_a,as_o,as_l,as_as,as_f,as_ac,as_d,as_a,f_o,f_l,f_as,f_f,f_ac,f_d,f_a,ac_o,ac_l,ac_as,ac_f,ac_ac,ac_d,ac_a,d_o,d_l,d_as,d_f,d_ac,d_d,d_a,a_o,a_l,a_as,a_f,a_ac,a_d,a_a
1209,2411,113.39,x,aa,ai,e,d,ab,g,f,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,...,aa_ai,aa_e,aa_ab,aa_g,aa_f,ai_x,ai_aa,ai_ai,ai_e,ai_ab,ai_g,ai_f,e_x,e_aa,e_ai,e_e,e_ab,e_g,e_f,ab_x,ab_aa,ab_ai,ab_e,ab_ab,ab_g,ab_f,g_x,g_aa,g_ai,g_e,g_ab,g_g,g_f,f_x,f_aa,f_ai,f_e,f_ab,f_g,f_f


In [None]:
# Reducing memory for a clear mind

def reduce_memory_usage(df):
    """Downcast the numeric columns of ``df`` in place to save memory.

    Integer columns (any NumPy signed or unsigned width) are converted to the
    narrowest unsigned type that holds their values when the minimum is
    non-negative, otherwise to the narrowest signed type. Range limits are
    inclusive, so a column whose maximum is exactly 255 becomes ``uint8``.
    ``float64`` columns are converted to ``float32``. All other columns,
    including object columns, are left untouched and never inspected.

    Memory usage in MB is printed before and after the conversion.

    Note: Apply this function after removing missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; modified in place.

    Returns
    -------
    None
    """
    intial_memory = df.memory_usage().sum()/1024**2
    print('Intial memory usage:',intial_memory,'MB')

    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    signed_types = (np.int8, np.int16, np.int32, np.int64)

    for col in df.columns:
        col_dtype = df[col].dtype
        # Only plain NumPy dtypes are handled; pandas extension dtypes are skipped.
        if not isinstance(col_dtype, np.dtype):
            continue

        if col_dtype.kind in 'iu':
            if df[col].empty:
                # No values to derive a range from.
                continue
            # min/max are taken only for integer columns, so object columns
            # with mixed value types can no longer raise here. Converting to
            # Python ints keeps the comparisons against the uint64 limit exact.
            mn = int(df[col].min())
            mx = int(df[col].max())
            candidates = unsigned_types if mn >= 0 else signed_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= mn and mx <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_dtype == np.float64:
            df[col] = df[col].astype(np.float32)

    red_memory = df.memory_usage().sum()/1024**2
    print('Memory usage after complition: ',red_memory,'MB')
    
# Downcast the training frame in place; memory usage is printed before and after.
reduce_memory_usage(train)

Intial memory usage: 12.138504028320312 MB
Memory usage after complition:  1.758260726928711 MB


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

def mae(y_true, y_pred):
    """Return the mean absolute error between ``y_true`` and ``y_pred``.

    ``y_pred`` may be an array-like of matching shape or a single scalar
    compared against every element of ``y_true``.
    """
    absolute_errors = abs(y_true - y_pred)
    return np.mean(absolute_errors)
    
# Baseline score
# X and y must exist before they are split; the original order raised a
# NameError on a fresh kernel because the split ran first.
X = train.drop('y', axis=1)
y = train.y

# Seeded so the hold-out rows, and therefore the baseline MAE, are the same
# on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=420)

# Constant prediction: the median of the full target column.
baseline_guess = np.median(y)

# Defined here but not fitted in this cell.
rf = RandomForestRegressor(max_depth=4, n_estimators=5)

print('The baseline guess based on Y-median value %0.2f' % baseline_guess)
print("Baseline Performance based on Y-median value: MAE = %0.4f" % mae(y_test, baseline_guess))


The baseline guess based on Y-median value 99.15
Baseline Performance based on Y-median value: MAE = 9.9475


In [None]:
def xgb_r2_score(preds, dtrain):
    """Compute R2 between the labels stored in ``dtrain`` and ``preds``.

    Returns a ``(name, value)`` tuple whose name is 'r2'.
    """
    y_true = dtrain.get_label()
    score = r2_score(y_true, preds)
    return 'r2', score

# Target vector and feature frame for the XGBoost matrix built below.
train_y = train['y'].values
# Drop the ID, the target and the X0-X8 columns listed here.
# NOTE(review): the pairwise columns added by feature_creation hold strings
# and are not dropped — confirm they are encoded before this point.
train_X = train.drop(columns=['ID', 'y','X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8'])

# NOTE(review): this rebinds `dtrain`, which an earlier cell also assigns —
# confirm no later cell still expects the earlier matrix.
dtrain = xgb.DMatrix(train_X, train_y, feature_names=train_X.columns.values)

