### Importing Required Libraries

In [None]:
import numpy as np
import pandas as pd
# Raise the pandas display limits so long/wide frames are shown in cell output
# instead of being truncated.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)

from lightgbm import LGBMRegressor
import time
from sklearn.model_selection import KFold

### Loading Data

In [None]:
# The Date column is written day-first (e.g. 19/4/2019). Tell the parser so
# explicitly: without `dayfirst=True` an ambiguous value such as 2/3/2017 may be
# read month-first, or the column may be left as plain strings, depending on the
# pandas version.
train = pd.read_csv('Train.csv', parse_dates=['Date'], dayfirst=True)
test = pd.read_csv('Test.csv', parse_dates=['Date'], dayfirst=True)

# Submission template that later receives the predictions.
sub = pd.read_excel('Sample_Submission.xlsx')

### Basic Understanding of Data

In [None]:
# Preview the first rows of the training set.
train.head()

Unnamed: 0,Product,Product_Brand,Item_Category,Subcategory_1,Subcategory_2,Item_Rating,Date,Selling_Price
0,P-2610,B-659,bags wallets belts,bags,hand bags,4.3,2/3/2017,291.0
1,P-2453,B-3078,clothing,women s clothing,western wear,3.1,7/1/2015,897.0
2,P-6802,B-1810,home decor festive needs,showpieces,ethnic,3.5,1/12/2019,792.0
3,P-4452,B-3078,beauty and personal care,eye care,h2o plus eye care,4.0,12/12/2014,837.0
4,P-8454,B-3078,clothing,men s clothing,t shirts,4.3,12/12/2013,470.0


In [None]:
# Preview the last rows of the training set.
train.tail()

Unnamed: 0,Product,Product_Brand,Item_Category,Subcategory_1,Subcategory_2,Item_Rating,Date,Selling_Price
2447,P-8870,B-2292,clothing,kids clothing,girls wear,2.3,11/3/2017,741.0
2448,P-10672,B-3078,footwear,women s footwear,casual shoes,1.9,4/12/2016,1590.0
2449,P-2134,B-479,clothing,men s clothing,t shirts,1.9,19/4/2019,995.0
2450,P-724,B-133,automotive,accessories spare parts,car interior exterior,2.7,1/12/2014,1598.0
2451,P-1154,B-174,bags wallets belts,bags,pouches and potlis,4.1,3/3/2019,397.0


In [None]:
# Preview the first rows of the test set.
test.head()

Unnamed: 0,Product,Product_Brand,Item_Category,Subcategory_1,Subcategory_2,Item_Rating,Date
0,P-11284,B-2984,computers,network components,routers,4.3,1/12/2018
1,P-6580,B-1732,jewellery,bangles bracelets armlets,bracelets,3.0,20/12/2012
2,P-5843,B-3078,clothing,women s clothing,western wear,1.5,1/12/2014
3,P-5334,B-1421,jewellery,necklaces chains,necklaces,3.9,1/12/2019
4,P-5586,B-3078,clothing,women s clothing,western wear,1.4,1/12/2017


In [None]:
# Preview the last rows of the test set.
test.tail()

Unnamed: 0,Product,Product_Brand,Item_Category,Subcategory_1,Subcategory_2,Item_Rating,Date
1046,P-9758,B-2543,sports fitness,outdoor adventure,camping hiking,2.0,28/2/2014
1047,P-11898,B-3197,jewellery,necklaces chains,necklaces,4.7,1/12/2019
1048,P-904,B-133,automotive,accessories spare parts,car interior exterior,3.5,1/12/2014
1049,P-1714,B-358,kitchen dining,lighting,bulbs,2.4,23/6/2013
1050,P-620,B-133,automotive,accessories spare parts,car interior exterior,3.1,1/12/2012


In [None]:
# Inspect the dtype pandas assigned to each training column.
train.dtypes

Unnamed: 0,0
Product,object
Product_Brand,object
Item_Category,object
Subcategory_1,object
Subcategory_2,object
Item_Rating,float64
Date,object
Selling_Price,float64


In [None]:
# Summary statistics for the numeric training columns.
train.describe()

Unnamed: 0,Item_Rating,Selling_Price
count,2452.0,2452.0
mean,3.078467,2494.375612
std,1.187137,7115.256516
min,1.0,33.0
25%,2.0,371.0
50%,3.1,596.0
75%,4.1,1195.25
max,5.0,116289.0


### Concatenating train and test for preprocessing and feature engineering

In [None]:
# Tag each frame with its origin so the combined frame can be split again
# after feature engineering, then stack the two frames row-wise.
for frame, origin in ((train, 'train'), (test, 'test')):
    frame['train_or_test'] = origin

df = pd.concat((train, test))

### Time Based Features

In [None]:
def create_date_featues(df):
    """
    Add calendar features derived from the ``Date`` column.

    ``Date`` is parsed a single time (day-first, e.g. ``19/4/2019``) and the
    parsed values are reused for every feature instead of re-parsing the
    column once per feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Date`` column holding day-first date strings or
        datetimes. The frame is modified in place.

    Return
    ------
    pandas.DataFrame
        The same ``df`` object with the columns ``Month``, ``Day``,
        ``DayOfyear``, ``Quarter`` and ``Is_month_start`` added.
    """
    dates = pd.to_datetime(df['Date'], dayfirst=True).dt

    df['Month'] = dates.month
    df['Day'] = dates.day
    df['DayOfyear'] = dates.dayofyear
    df['Quarter'] = dates.quarter
    df['Is_month_start'] = dates.is_month_start

    return df

In [None]:
# Add Month / Day / DayOfyear / Quarter / Is_month_start to the combined frame.
df=create_date_featues(df)

#### Replaced 'unknown' with NaN and created a boolean feature that is True wherever a value is NaN. Imputing the NaNs in Subcategory_1 did not give useful results, so they were filled back with 'unknown'.

In [None]:
# Treat the literal 'unknown' as a real missing value so it can be flagged
# (and, for Subcategory_2, imputed later).
df['Subcategory_1'] = df['Subcategory_1'].replace('unknown', np.nan)
df['Subcategory_2'] = df['Subcategory_2'].replace('unknown', np.nan)

# One boolean indicator per column that currently contains any null.
# NOTE: test rows carry no Selling_Price after the concat, so the target column
# is picked up here too and yields a 'Selling_Price_was_missing' indicator.
cols_with_missing = [col for col in df.columns
                     if df[col].isnull().any()]
for col in cols_with_missing:
    df[col + '_was_missing'] = df[col].isnull()

# Subcategory_1 goes back to 'unknown'; the indicator column keeps the
# information. Assign the result instead of calling fillna(inplace=True) on the
# column selection, which is not guaranteed to write through to `df`.
df['Subcategory_1'] = df['Subcategory_1'].fillna('unknown')

#### Replaced most NaNs in Subcategory_2 with the mode of Subcategory_2 within the same Subcategory_1 group; the few remaining NaNs are set to 'unknown'.

In [None]:
# The concatenated frame repeats row labels from train and test; renumber the
# rows so label-aligned assignments are unambiguous. The old labels land in a
# temporary 'index' column that is deleted further down in this cell.
df = df.reset_index()
def fast_mode(df, key_cols, value_col):
    """
    Calculate a column mode, by group, ignoring null values.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame over which to calculate the mode.
    key_cols : list of str
        Columns to group by for calculation of the mode.
    value_col : str
        Column for which to calculate the mode.

    Return
    ------
    pandas.DataFrame
        One row per ``key_cols`` group holding the mode of ``value_col``.
        Groups whose ``value_col`` is entirely null produce no row. On ties
        the value that sorts first is returned.
    """
    counts = (df.groupby(key_cols + [value_col]).size()
                .to_frame('counts').reset_index())
    # groupby returns rows ordered by (keys, value). A stable sort keeps that
    # order among equal counts, which is what makes the tie-break promised in
    # the docstring deterministic; the default sort gives no such guarantee.
    ranked = counts.sort_values('counts', ascending=False, kind='mergesort')
    return ranked.drop_duplicates(subset=key_cols).drop(columns='counts')


# Lookup: most frequent Subcategory_2 for each Subcategory_1.
s2_mode_by_s1 = (fast_mode(df, ['Subcategory_1'], 'Subcategory_2')
                 .set_index('Subcategory_1').Subcategory_2)

# Fill only the rows where Subcategory_2 is missing; groups without any known
# Subcategory_2 have no entry in the lookup and therefore stay NaN here.
missing_s2 = df.Subcategory_2.isnull()
df.loc[missing_s2, 'Subcategory_2'] = df.loc[missing_s2, 'Subcategory_1'].map(s2_mode_by_s1)

# Drop the helper column created by the reset_index() call at the top of the cell.
del df['index']

# Whatever is still missing becomes 'unknown'. Assign the result back rather
# than calling fillna(inplace=True) on the column selection, which is not
# guaranteed to write through to `df`.
df['Subcategory_2'] = df['Subcategory_2'].fillna('unknown')

In [None]:
### Interaction Features

In [None]:
# Each new column is the '_'-joined pair of two existing categorical columns.
# Insertion order is kept so the new columns are appended in the same order.
interaction_pairs = {
    'PB_S1': ('Product_Brand', 'Subcategory_1'),
    'PB_S2': ('Product_Brand', 'Subcategory_2'),
    'IC_S1': ('Item_Category', 'Subcategory_1'),
    'IC_S2': ('Item_Category', 'Subcategory_2'),
    'S1_S2': ('Subcategory_1', 'Subcategory_2'),
}

for new_col, (left, right) in interaction_pairs.items():
    df[new_col] = df[left] + '_' + df[right]

### Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns that are replaced in place by integer codes.
label_encoded_cols = ['Item_Category', 'Product', 'Subcategory_1', 'IC_S1', 'S1_S2']

le = LabelEncoder()
for name in label_encoded_cols:
    # The single encoder is re-fitted on every column, so the codes of one
    # column are unrelated to those of another.
    df[name] = le.fit(df[name]).transform(df[name])


### Frequency Encoding

In [None]:
def add_frequency_encoding(frame, column):
    """
    Add ``<column>_fe``: the share of rows in which each value of ``column`` occurs.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing ``column``. It is modified in place.
    column : str
        Name of the categorical column to encode.

    Return
    ------
    pandas.DataFrame
        The same ``frame`` object with the ``<column>_fe`` column added.
    """
    frequencies = frame.groupby(column).size() / len(frame)
    # Vectorised lookup instead of a per-row Python lambda.
    frame[column + '_fe'] = frame[column].map(frequencies)
    return frame


# Same encoding for every column; one helper replaces four copy-pasted cells.
for col in ['Product_Brand', 'Subcategory_1', 'Subcategory_2', 'PB_S2']:
    df = add_frequency_encoding(df, col)

### Binary Encoding

In [None]:
#pip install category-encoders

In [None]:
import category_encoders as ce


def binary_encode(frame, column):
    """
    Replace ``column`` with its binary-encoded representation.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of the categorical column to encode.

    Return
    ------
    pandas.DataFrame
        New frame in which ``column`` is removed and the encoder's output
        columns are appended at the end.
    """
    encoder = ce.BinaryEncoder(cols=[column])
    encoded = encoder.fit_transform(frame[column])
    return pd.concat([frame.drop(columns=column), encoded], axis=1)


# Encoded in this order so the appended columns keep the same order as before.
for col in ['Product_Brand', 'Subcategory_2', 'PB_S1', 'PB_S2', 'IC_S2']:
    df = binary_encode(df, col)

### Getting back train and test

In [None]:
# Split the engineered frame back into its two parts. `.drop(...)` returns a
# new frame, so `train` and `test` are independent objects rather than slices
# of `df`; this avoids the SettingWithCopyWarning that an in-place drop on a
# slice triggers, and later column assignments on them are safe as well.
train = df.loc[df['train_or_test'] == 'train'].drop(columns='train_or_test')
test = df.loc[df['train_or_test'] == 'test'].drop(columns='train_or_test')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.drop(columns={'train_or_test'},axis=1,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.drop(columns={'train_or_test'},axis=1,inplace=True)


### Log transforming target variable

In [None]:
# Model the natural log of the price. `assign` builds a new frame, so nothing is
# written into a slice of `df` (no SettingWithCopyWarning).
# NOTE: running this cell twice applies the log twice.
train = train.assign(Selling_Price=np.log(train['Selling_Price']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Selling_Price']=np.log(train['Selling_Price'])


In [None]:
# List the engineered columns available for modelling.
train.columns

Index(['Product', 'Item_Category', 'Subcategory_1', 'Item_Rating', 'Date',
       'Selling_Price', 'Month', 'Day', 'DayOfyear', 'Quarter',
       'Is_month_start', 'Subcategory_1_was_missing',
       'Subcategory_2_was_missing', 'Selling_Price_was_missing', 'IC_S1',
       'S1_S2', 'Product_Brand_fe', 'Subcategory_1_fe', 'Subcategory_2_fe',
       'PB_S2_fe', 'Product_Brand_0', 'Product_Brand_1', 'Product_Brand_2',
       'Product_Brand_3', 'Product_Brand_4', 'Product_Brand_5',
       'Product_Brand_6', 'Product_Brand_7', 'Product_Brand_8',
       'Product_Brand_9', 'Product_Brand_10', 'Subcategory_2_0',
       'Subcategory_2_1', 'Subcategory_2_2', 'Subcategory_2_3',
       'Subcategory_2_4', 'Subcategory_2_5', 'Subcategory_2_6',
       'Subcategory_2_7', 'Subcategory_2_8', 'PB_S1_0', 'PB_S1_1', 'PB_S1_2',
       'PB_S1_3', 'PB_S1_4', 'PB_S1_5', 'PB_S1_6', 'PB_S1_7', 'PB_S1_8',
       'PB_S1_9', 'PB_S1_10', 'PB_S2_0', 'PB_S2_1', 'PB_S2_2', 'PB_S2_3',
       'PB_S2_4', 'PB_S2_5', 'PB_S2

In [None]:
# Columns that are not model inputs: the target itself and the raw date.
non_feature_cols = ['Selling_Price', 'Date']

y = train['Selling_Price']
x = train.drop(columns=non_feature_cols)
test = test.drop(columns=non_feature_cols)

### Standard Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Remember the column labels: the scaler returns plain arrays.
col = x.columns

# Fit on the training features only, then apply the same scaling to both sets.
st = StandardScaler().fit(x)
x = pd.DataFrame(st.transform(x), columns=col)
test = pd.DataFrame(st.transform(test), columns=col)


### Since the target is log-transformed, RMSE on the transformed target is the log-scale error (RMSLE computed with log(y) rather than log(1 + y))

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(y, y_pred):
    """Return the root mean squared error between `y` and `y_pred`."""
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

### Cross validating with kfold

In [None]:
from sklearn.ensemble import RandomForestRegressor

N_SPLITS = 5

err = []            # validation RMSE of each fold
y_pred_tot_rf = []  # test-set predictions of each fold's model

fold = KFold(n_splits=N_SPLITS, shuffle=True, random_state=2020)

for i, (train_index, test_index) in enumerate(fold.split(x, y), start=1):
    x_train, x_val = x.iloc[train_index], x.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]

    m = RandomForestRegressor(bootstrap=True, max_features=0.45, min_samples_leaf=1, min_samples_split=5, n_estimators=100,random_state=42)
    m.fit(x_train, y_train)

    # Validation predictions are scaled by 0.999 (presumably chosen with the
    # 0.990-1.009 weight search that used to live in this cell - confirm);
    # the in-sample predictions are left unscaled.
    pred_y = m.predict(x_val)*0.999
    pred_x = m.predict(x_train)

    val_err = rmse(y_val, pred_y)
    print(i, " err_rf: ", val_err)                 # validation error
    print(i, " err_rf: ", rmse(y_train, pred_x))   # training error
    err.append(val_err)

    y_pred_tot_rf.append(m.predict(test))

# Mean validation RMSE; follows N_SPLITS instead of hard-coding five terms.
np.mean(err)

1  err_rf:  0.6756123358428542
1  err_rf:  0.30330605848411646
2  err_rf:  0.5901124491888043
2  err_rf:  0.3116627512975582
3  err_rf:  0.6460923683539763
3  err_rf:  0.3058781111933884
4  err_rf:  0.662325453560842
4  err_rf:  0.3062157508912076
5  err_rf:  0.6217398333822917
5  err_rf:  0.30772734200197993


0.6391764880657537

In [None]:
# NOTE(review): bare literal - looks like a hand-recorded CV score kept for
# comparison; confirm, and move it into a markdown note or remove it.
0.637

0.637

### Final RF Model

In [None]:
# Refit on the full training set with the same tree hyper-parameters as the
# cross-validation cells, then predict the test set (scaled by 0.999 as there).
m = RandomForestRegressor(
    n_estimators=100,
    max_features=0.45,
    min_samples_split=5,
    min_samples_leaf=1,
    bootstrap=True,
    random_state=2020,
)
rfpred = m.fit(x, y).predict(test) * 0.999

### This random forest prediction is blended with the LightGBM prediction; see the LightGBM notebook

In [None]:
# The model was fitted on the log-transformed target and the np.exp
# back-transform below is left disabled, so the values written to the file are
# on the log scale.
#sub['Selling_Price']=np.exp(rfpred)
sub['Selling_Price']=rfpred
sub.to_excel('randomforest.xlsx',index=False)

### Cross-validating the blended (random forest + LightGBM) predictions

In [None]:
from sklearn.ensemble import RandomForestRegressor

N_SPLITS = 5
RF_WEIGHT, LGB_WEIGHT = 0.55, 0.45   # blend weights of the two models

err = []   # validation RMSE of the blend, one entry per fold

fold = KFold(n_splits=N_SPLITS, shuffle=True, random_state=2020)

for i, (train_index, test_index) in enumerate(fold.split(x, y), start=1):
    x_train, x_val = x.iloc[train_index], x.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]

    m = RandomForestRegressor(bootstrap=True, max_features=0.45, min_samples_leaf=1, min_samples_split=5, n_estimators=100,random_state=2020)
    m.fit(x_train, y_train)
    # Fold-local name: the global `rfpred` holds the final test-set prediction
    # and must not be overwritten by validation-fold predictions.
    rf_val_pred = m.predict(x_val)*0.999

    # verbose=-1 silences LightGBM's per-fit info log; it does not affect the fit.
    m = LGBMRegressor(n_estimators=442, num_leaves=127, max_depth=8,min_child_samples=4,
                      learning_rate=0.02, colsample_bytree=0.4, reg_alpha=0.5, reg_lambda=2,random_state=2020,
                      verbose=-1)
    m.fit(x_train, y_train)
    lgb_val_pred = m.predict(x_val)*0.999

    average = RF_WEIGHT*rf_val_pred + LGB_WEIGHT*lgb_val_pred

    fold_err = rmse(y_val, average)
    # The label says "xgb", but the value is the RF + LightGBM blend error.
    print(i, " err_xgb: ", fold_err)
    err.append(fold_err)

# Mean validation RMSE; follows N_SPLITS instead of hard-coding five terms.
np.mean(err)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000723 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1126
[LightGBM] [Info] Number of data points in the train set: 1961, number of used features: 67
[LightGBM] [Info] Start training from score 6.638816
1  err_xgb:  0.6678665916743816
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000752 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1131
[LightGBM] [Info] Number of data points in the train set: 1961, number of used features: 67
[LightGBM] [Info] Start training from score 6.639528
2  err_xgb:  0.592119921342133
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000751 seconds.
You can set `force_row_

0.6348623086288017