In [1]:
#importing modules
from statsmodels.stats.outliers_influence import variance_inflation_factor
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer

In [2]:
#reading train and test dataset
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [3]:
#Take a quick peak of data 
train_df.head()

In [4]:
train_df.shape

In [5]:
test_df.head()

In [6]:
test_df.shape

In [7]:
#Remove Id column from dataframes
train_df = train_df.drop("Id",axis=1)
test_df = test_df.drop("Id",axis=1)

In [8]:
#Create 'Month' and 'Year' int columns, remove 'Open Date' column

def time_feature_convert(dataset):
    dataset['Date'] = pd.to_datetime(dataset['Open Date'])  #Adding new features-Date,Month,Year,Years _old
    dataset['Month'] = [x.month for x in dataset['Date']]
    dataset['Year'] = [x.year for x in dataset['Date']]
    #dataset['Years_old'] = pd.to_datetime('25-01-2015').year - dataset['Year']  #25-01-2014 is the latest date in the
    dataset.drop(['Open Date','Date'],axis=1,inplace=True)                      #dataset.So taking 25-01-2015 as reference.
    return dataset


In [9]:
time_feature_convert(train_df)
time_feature_convert(test_df)

Unnamed: 0,City,City Group,Type,P1,P2,P3,P4,P5,P6,P7,...,P30,P31,P32,P33,P34,P35,P36,P37,Month,Year
0,Niğde,Other,FC,1,4.0,4.0,4.0,1,2,5,...,0,0,0,0,0,0,0,0,1,2011
1,Konya,Other,IL,3,4.0,4.0,4.0,2,2,5,...,0,0,0,0,0,0,0,0,3,2011
2,Ankara,Big Cities,FC,3,4.0,4.0,4.0,2,2,5,...,0,0,0,0,0,0,0,0,10,2013
3,Kocaeli,Other,IL,2,4.0,4.0,4.0,2,3,5,...,0,4,0,0,0,0,0,0,5,2013
4,Afyonkarahisar,Other,FC,2,4.0,4.0,4.0,1,2,5,...,0,0,0,0,0,0,0,0,7,2013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,Antalya,Other,FC,5,5.0,4.0,4.0,2,2,5,...,0,0,0,0,0,0,0,0,1,2000
99996,Niğde,Other,IL,1,2.0,4.0,3.0,1,1,1,...,5,0,0,0,4,0,0,0,7,2011
99997,İstanbul,Big Cities,IL,4,5.0,4.0,4.0,1,2,5,...,5,5,3,2,4,4,4,2,12,2012
99998,İstanbul,Big Cities,FC,12,7.5,6.0,6.0,4,4,10,...,0,0,0,4,0,0,0,0,10,2013


In [10]:
#Take a quick peak of data & Shape of data

train_df.info()
train_df.head()

test_df.info()
test_df.head()

In [11]:
#correlation heatmap

quantitative_feats = [i for i in train_df if train_df[i].dtype != np.object]

def heatmap(df, figsize):
    fig, axs = plt.subplots(figsize=figsize)

    sns.set_theme()
    sns.heatmap(df.corr(), annot=True, linewidths=.5, ax=axs)
    #sns.heatmap(df.corr(), annot=True, linewidths=.7, cmap='coolwarm', fmt='.1f', ax=axs)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  quantitative_feats = [i for i in train_df if train_df[i].dtype != np.object]


In [12]:
# display the overall correlation heatmap
heatmap(df=train_df[quantitative_feats], figsize=(25, 25))

In [13]:
#calculate VIF 

def calc_vif(X):
    # Calculating VIF
    vif = pd.DataFrame()
    vif["variables"] = X.columns
    vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

    return (vif)

In [14]:
## Calcualte the VIF of the quantitaive features
vif = calc_vif(train_df[quantitative_feats])

## VIF < 50
low_vif = vif[vif["VIF"] < 50].sort_values("VIF",ascending=True) # 

print(low_vif)

   variables        VIF
37   revenue   5.657841
26       P27   6.099163
38     Month   6.927757
21       P22  10.535807
22       P23  12.434122
20       P21  13.501885
16       P17  14.666256
36       P37  15.530037
32       P33  17.187369
4         P5  17.470356
5         P6  19.508310
29       P30  23.759225
13       P14  25.083089
30       P31  25.356186
19       P20  25.488478
18       P19  25.602905
10       P11  27.073864
28       P29  32.589613
27       P28  32.780340
24       P25  38.572113
14       P15  40.525114
34       P35  43.889129
23       P24  46.906321
6         P7  49.465125


In [15]:
# define the function for label and one-hot encoding
def label_encode_transform(df, cols):
    cols = cols
    le = preprocessing.LabelEncoder()
    df[cols] = df[cols].apply(le.fit_transform)
    return df


def onehot_encode_transform(df, cols):
    cols = cols
    df = pd.get_dummies(df, columns=cols)
    return df

In [16]:
#one-hot and label encodings
train_df=label_encode_transform(df=train_df, cols=['City'])
train_df=onehot_encode_transform(df=train_df, cols=['City Group', 'Type']) #_encode

test_df=label_encode_transform(df=test_df, cols=['City'])
test_df=onehot_encode_transform(df=test_df, cols=['City Group', 'Type']) #_encode

In [17]:
train_df.info()
train_df.head()

In [18]:
# define the function for Standart(Standartization) or MinMax(Normalization) Scaler

def scaler(df):
    sc = StandardScaler()
    #sc = MinMaxScaler()
    data = df[features_to_standardize]
    scalered_data = sc.fit_transform(data)
    scalered_data_df = pd.DataFrame(scalered_data, columns=[features_to_standardize])
    
    return scalered_data_df

In [19]:
#Standartization or Normalization of train features

#features_to_standardize = quantitative_feats[:-3]
#features_to_standardize = ["P2", "P6", "P28", "P11", "P17", "P21", "P22", "P26", "P23", "P33"]
features_to_standardize = ["P2", "P6", "P28", "P27","P22","P23"]
features_to_drop=["P1","P2","P3","P4","P5","P6","P7","P8","P9","P10","P11","P12","P13","P14","P15","P16","P17","P18","P19","P20",
                  "P21","P22","P23","P24","P25","P26","P27","P28","P29","P30","P31","P32","P33","P34","P35","P36","P37"]
scalered_data_df = scaler(train_df)
train_df = train_df.drop(features_to_drop, axis=1)
train_df[features_to_standardize] = scalered_data_df[features_to_standardize]


In [20]:
#Standartization or Normalization of test features

scalered_data_df = scaler(test_df)
test_df = test_df.drop(features_to_drop, axis=1)
test_df[features_to_standardize] = scalered_data_df[features_to_standardize]

In [21]:
train_df

Unnamed: 0,City,revenue,Month,Year,City Group_Big Cities,City Group_Other,Type_DT,Type_FC,Type_IL,P2,P6,P28,P27,P22,P23
0,31,5653753.0,7,1999,True,False,False,False,True,0.391716,-0.638471,-0.531493,1.385793,0.630997,-0.093190
1,3,6923131.0,2,2008,True,False,False,True,False,0.391716,-0.638471,-0.096779,-0.556444,0.630997,-0.313312
2,10,2055379.0,3,2013,False,True,False,False,True,-0.270816,-0.168199,-0.966207,-0.556444,-1.000071,-0.533434
3,28,2675511.0,2,2012,False,True,False,False,True,0.060450,0.302072,-0.314136,0.657454,-1.000071,1.447663
4,14,4316715.0,5,2009,False,True,False,False,True,-0.270816,-0.638471,-0.966207,1.871352,-0.184537,-0.533434
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132,29,5787594.0,6,2008,False,True,False,True,False,-0.933348,-0.638471,-0.531493,-0.556444,-1.000071,-0.533434
133,32,9262754.0,10,2006,True,False,False,True,False,0.391716,-0.168199,-0.096779,-0.556444,-1.000071,-0.533434
134,18,2544857.0,7,2006,False,True,False,True,False,-0.270816,-0.168199,-0.531493,-0.556444,-0.184537,-0.313312
135,31,7217634.0,10,2010,True,False,False,True,False,0.391716,-0.638471,-0.096779,-0.556444,-1.000071,-0.533434


In [22]:
train_df.head(5)

Unnamed: 0,City,revenue,Month,Year,City Group_Big Cities,City Group_Other,Type_DT,Type_FC,Type_IL,P2,P6,P28,P27,P22,P23
0,31,5653753.0,7,1999,True,False,False,False,True,0.391716,-0.638471,-0.531493,1.385793,0.630997,-0.09319
1,3,6923131.0,2,2008,True,False,False,True,False,0.391716,-0.638471,-0.096779,-0.556444,0.630997,-0.313312
2,10,2055379.0,3,2013,False,True,False,False,True,-0.270816,-0.168199,-0.966207,-0.556444,-1.000071,-0.533434
3,28,2675511.0,2,2012,False,True,False,False,True,0.06045,0.302072,-0.314136,0.657454,-1.000071,1.447663
4,14,4316715.0,5,2009,False,True,False,False,True,-0.270816,-0.638471,-0.966207,1.871352,-0.184537,-0.533434


In [23]:
test_df.head(5)

Unnamed: 0,City,Month,Year,City Group_Big Cities,City Group_Other,Type_DT,Type_FC,Type_IL,Type_MB,P2,P6,P28,P27,P22,P23
0,38,1,2011,False,True,False,True,False,False,-0.299599,-0.57587,-0.57743,-0.534273,-1.049298,0.076289
1,27,3,2011,False,True,False,False,True,False,-0.299599,-0.57587,-1.045445,-0.534273,-0.315718,-0.55942
2,3,10,2013,True,False,False,True,False,False,-0.299599,-0.57587,-0.57743,-0.534273,1.885022,0.288193
3,26,5,2013,False,True,False,False,True,False,-0.299599,0.077118,-0.57743,-0.534273,-0.315718,-0.347517
4,1,7,2013,False,True,False,True,False,False,-0.299599,-0.57587,0.826615,-0.534273,-1.049298,-0.55942


## Data Processing

In [24]:
## dependant vairable 
y = train_df["revenue"]
train_df = train_df.drop("revenue",axis=1)
test_df = test_df.drop("Type_MB",axis=1)

## combining
df_all = pd.concat([train_df, test_df],axis=0)

## check on the shape
df_all.shape

(100137, 14)

In [25]:
df_all.head()

Unnamed: 0,City,Month,Year,City Group_Big Cities,City Group_Other,Type_DT,Type_FC,Type_IL,P2,P6,P28,P27,P22,P23
0,31,7,1999,True,False,False,False,True,0.391716,-0.638471,-0.531493,1.385793,0.630997,-0.09319
1,3,2,2008,True,False,False,True,False,0.391716,-0.638471,-0.096779,-0.556444,0.630997,-0.313312
2,10,3,2013,False,True,False,False,True,-0.270816,-0.168199,-0.966207,-0.556444,-1.000071,-0.533434
3,28,2,2012,False,True,False,False,True,0.06045,0.302072,-0.314136,0.657454,-1.000071,1.447663
4,14,5,2009,False,True,False,False,True,-0.270816,-0.638471,-0.966207,1.871352,-0.184537,-0.533434


In [26]:
## Missing value
def missing_value(df):
    
    ## number of missing values
    number = df.isnull().sum().sort_values(ascending=False)
    number = number[number > 0]
    
    ## percentage of missing value
    percentage = df.isnull().sum()*100 /df.shape[0]
    percentage = percentage[percentage > 0].sort_values(ascending=False)
    
    return pd.concat([number,percentage],axis=1,keys=["Total","Percentage"])

missing_value(df_all)

Unnamed: 0,Total,Percentage


In [27]:
## Split the dataset back into train and test dataset
n = len(y)

## train dataset
train_df = df_all[:n]

## test dataset
test_df = df_all[n:]

## Check on thier shapes
print("Shape of train dataset: {}".format(train_df.shape))
print("Shape of test dataset: {}".format(test_df.shape))
test_df.head()

Shape of train dataset: (137, 14)
Shape of test dataset: (100000, 14)


## Model Building

In [28]:
## import package
from sklearn.model_selection import train_test_split

## Split the data into train and test set
X_train, X_test, y_train, y_test =  train_test_split(train_df,y,test_size=0.33,random_state=42)


## Check on the dataset shape
print("Shapes: ", X_train.shape, X_test.shape, y_train.shape, y_test.shape)

Shapes:  (91, 14) (46, 14) (91,) (46,)


In [30]:
# GridSearchCV Best Parameters 

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor, VotingRegressor, GradientBoostingRegressor, AdaBoostRegressor

### Random Forest

In [33]:
## Parameters
params = {
    "max_depth": ["None",10, 30, 50, 75, 100],
    "max_features": ["auto",0.3, 0.6],
    "min_samples_leaf": [1,3,5,7],
    "min_samples_split": [2, 4, 8, 12],
    "n_estimators": [30, 50, 100, 200],
    "random_state" : [42]
}

## RandomForestRegressor
RFR = RandomForestRegressor()
RFR_grid = GridSearchCV(RFR, params, scoring='r2', cv=7, n_jobs=-1)
RFR_grid.fit(X_train, y_train)

## Output
print("Best parameters:  {}:".format(RFR_grid.best_params_))
print("Best score: {}".format(RFR_grid.best_score_))

best_RF_model = RFR_grid.best_estimator_

Best parameters:  {'max_depth': 10, 'max_features': 0.3, 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 50, 'random_state': 42}:
Best score: -0.10048726435896627


3584 fits failed out of a total of 8064.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1344 fits failed with the following error:
Traceback (most recent call last):
  File "d:\practice\neuralnetwork\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\practice\neuralnetwork\venv\lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "d:\practice\neuralnetwork\venv\lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "d:\practice\neuralnetwork\venv\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constrai

In [34]:
#RF Prediction
## Fit the model 
best_RF_model = best_RF_model.fit(train_df,y) 

## Submission
submission = pd.read_csv("sampleSubmission.csv")

submission.iloc[:,1] = best_RF_model.predict(test_df)
submission.to_csv('submission', index=False)
submission

Unnamed: 0,Id,Prediction
0,0,5.165639e+06
1,1,3.239814e+06
2,2,3.909606e+06
3,3,3.267896e+06
4,4,3.597067e+06
...,...,...
99995,99995,4.473815e+06
99996,99996,3.906508e+06
99997,99997,5.176956e+06
99998,99998,5.390989e+06


### Gradient Boosting

In [36]:
## Parameters
params = {
    "max_depth": [2, 3, 6, 10],
    "max_features": ["auto",0.3, 0.6],
    "min_samples_leaf": [1,3],
    "min_samples_split": [2, 5],
    "n_estimators": [30, 50, 100, 200],
    "random_state" : [42],
    "tol" : [0.0001, 0.001, 0.01, 0.1]
}

## GradientBoostingRegressor
GBR = GradientBoostingRegressor()
GBR_grid = GridSearchCV(GBR, params, scoring='r2', cv=7, n_jobs=-1)
GBR_grid.fit(X_train, y_train)

## Output
print("Best parameters:  {}:".format(GBR_grid.best_params_))
print("Best score: {}".format(GBR_grid.best_score_))

best_GBR_model = GBR_grid.best_estimator_

Best parameters:  {'max_depth': 2, 'max_features': 0.6, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 30, 'random_state': 42, 'tol': 0.0001}:
Best score: -0.21722566697141055


1792 fits failed out of a total of 5376.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1146 fits failed with the following error:
Traceback (most recent call last):
  File "d:\practice\neuralnetwork\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\practice\neuralnetwork\venv\lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "d:\practice\neuralnetwork\venv\lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "d:\practice\neuralnetwork\venv\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constrai

In [37]:
#GBR Prediction
## Fit the model 
best_GBR_model = best_GBR_model.fit(train_df,y) 

## Submission
submission = pd.read_csv("sampleSubmission.csv")

submission.iloc[:,1] = best_GBR_model.predict(test_df)
submission.to_csv('submission', index=False)
submission

Unnamed: 0,Id,Prediction
0,0,5.066354e+06
1,1,3.207280e+06
2,2,3.104824e+06
3,3,2.975386e+06
4,4,3.539329e+06
...,...,...
99995,99995,6.657067e+06
99996,99996,4.651120e+06
99997,99997,4.924446e+06
99998,99998,3.990451e+06


### AdaBoostRegressor

In [39]:
## parameters
params = {
    "n_estimators": [10, 30, 50, 100],
    "learning_rate": [.01, 0.1, 0.5, 0.9, 0.95, 1],
    "random_state" : [42]
}

## XGBoost Regressor
AdaBoostR =   AdaBoostRegressor()
AdaBoostR_grid = GridSearchCV(AdaBoostR, params, scoring='r2', cv=7, n_jobs=-1)
AdaBoostR_grid.fit(X_train, y_train)

## Output
print("Best parameters:  {}:".format(AdaBoostR_grid.best_params_))
print("Best score: {}".format(AdaBoostR_grid.best_score_))

## Append to list
best_AdaBoostR_model = AdaBoostR_grid.best_estimator_

Best parameters:  {'learning_rate': 0.01, 'n_estimators': 30, 'random_state': 42}:
Best score: -0.12345480209324537


In [40]:
#AdaBoostR Prediction
## Fit the model 
best_AdaBoostR_model = best_AdaBoostR_model.fit(train_df,y) 

## Submission
submission = pd.read_csv("sampleSubmission.csv")

submission.iloc[:,1] = best_AdaBoostR_model.predict(test_df)
submission.to_csv('submission', index=False)
submission

Unnamed: 0,Id,Prediction
0,0,4.942454e+06
1,1,3.518630e+06
2,2,3.084603e+06
3,3,2.897992e+06
4,4,3.403823e+06
...,...,...
99995,99995,4.543628e+06
99996,99996,4.543628e+06
99997,99997,5.066216e+06
99998,99998,4.102491e+06


### Voting Regressor

In [43]:
VR_model = VotingRegressor([('RF', best_RF_model), ('GBR', best_GBR_model), ('AdaBoostR', best_AdaBoostR_model)])
VR_model = VR_model.fit(train_df,y) 

## Submission
submission = pd.read_csv("sampleSubmission.csv")

submission.iloc[:,1] = VR_model.predict(test_df)
submission.to_csv('submission', index=False)
submission

Unnamed: 0,Id,Prediction
0,0,5.058149e+06
1,1,3.321908e+06
2,2,3.366344e+06
3,3,3.047091e+06
4,4,3.513406e+06
...,...,...
99995,99995,5.224836e+06
99996,99996,4.367085e+06
99997,99997,5.055873e+06
99998,99998,4.494644e+06


### Evaluation

In [44]:
#Evaluate model, calculate RMSE and MAPE

from sklearn.metrics import mean_squared_error, mean_absolute_error,  mean_absolute_percentage_error

sub = submission["Prediction"]

print(mean_squared_error(y_test,sub[:46])**0.5)
print(mean_absolute_percentage_error(y_test,sub[:46]))
#mean_absolute_error(y_test, sub[:46])

3063197.578798027
0.41574932764686456


In [33]:
y_test