In [1]:
import numpy as np
import pandas as pd


from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.metrics import r2_score,mean_squared_error

In [2]:
def drop_cols(df,cols_list):
    """Return ``df`` without the columns named in ``cols_list``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to remove columns from. It is left untouched because
        ``DataFrame.drop`` is called without ``inplace``.
    cols_list : list
        Column labels to remove.

    Returns
    -------
    pandas.DataFrame
        A new frame that lacks the listed columns.
    """
    return df.drop(columns=cols_list)

In [23]:
def oh_transform(df, cols_to_oh):
    """One-hot encode the listed columns and return a new DataFrame.

    Each column in ``cols_to_oh`` is replaced by indicator columns named
    ``<column>_<value>``. The first level of every encoded column is
    dropped (``drop_first=True``). The columns that are not encoded keep
    their original order and come first; the indicator columns follow.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    cols_to_oh : list-like
        Labels of the columns to encode. They are encoded whatever their
        dtype is.

    Returns
    -------
    pandas.DataFrame
        Frame with the listed columns replaced by their indicator columns.
    """
    cols = list(cols_to_oh)
    if not cols:
        # Nothing to encode: return a copy so the result never aliases the input.
        return df.copy()
    # Passing `columns=` makes get_dummies encode exactly these columns.
    # Without it only object/category columns are encoded, and a numeric
    # column in the list would pass through unencoded and then be dropped,
    # silently losing the feature.
    return pd.get_dummies(df, columns=cols, drop_first=True)

In [20]:
def label_transform(df,cols_to_le):
    """Label-encode the listed columns and return a new DataFrame.

    Each column in ``cols_to_le`` is replaced by the integer codes produced
    by ``LabelEncoder.fit_transform``. The fitted encoders are not kept, so
    the codes cannot be mapped back to the original labels afterwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied first, so the caller's frame is not
        modified.
    cols_to_le : iterable
        Labels of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the listed columns replaced by their codes.
    """
    # Work on a copy: assigning into `df` directly would also rewrite the
    # caller's frame as a side effect.
    encoded = df.copy()
    for col in cols_to_le:
        # fit_transform refits from scratch, so each column gets its own encoder.
        encoded[col] = LabelEncoder().fit_transform(encoded[col])
    return encoded

In [5]:
def scale_data(df):
    """Standardise ``df`` with a newly created ``StandardScaler``.

    The scaler is fitted on ``df`` and applied to it in one step; the fitted
    scaler is discarded, so the same scaling cannot be re-applied to other
    data later.

    Parameters
    ----------
    df : array-like or pandas.DataFrame
        Numeric data to scale.

    Returns
    -------
    The value returned by ``StandardScaler.fit_transform``.
    """
    return StandardScaler().fit_transform(df)

In [6]:
def split_data(features,target,test_size=0.3,random_state=32):
    """Split features and target into train and test parts.

    Thin wrapper around ``train_test_split``.

    Parameters
    ----------
    features : array-like
        Feature matrix.
    target : array-like
        Target values aligned with ``features``.
    test_size : float or int, default 0.3
        Forwarded to ``train_test_split``.
    random_state : int, default 32
        Seed forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X_train,X_test,y_train,y_test = train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=random_state,
    )
    return X_train,X_test,y_train,y_test

In [30]:
def model_train(model,X_train,y_train):
    """Fit ``model`` on the training data.

    Parameters
    ----------
    model : object
        Any object exposing ``fit(X, y)``.
    X_train, y_train
        Training features and targets, passed to ``fit`` unchanged.

    Returns
    -------
    Whatever ``model.fit`` returns (scikit-learn style estimators return
    themselves).
    """
    return model.fit(X_train,y_train)

def model_predict(model,X_test):
    """Return the predictions of ``model`` for ``X_test``.

    Parameters
    ----------
    model : object
        Any object exposing ``predict(X)``.
    X_test
        Features to predict on, passed to ``predict`` unchanged.

    Returns
    -------
    Whatever ``model.predict`` returns.
    """
    return model.predict(X_test)

def model_performance(y_pred,y_test):
    """Score predictions against the true values.

    Note the argument order: predictions come first, true values second.
    Both metric functions are called with the true values first.

    Parameters
    ----------
    y_pred : array-like
        Predicted values.
    y_test : array-like
        True values.

    Returns
    -------
    tuple
        ``(r2, rmse)``: the unrounded result of ``r2_score`` and the root
        of ``mean_squared_error`` rounded to 4 decimals.
    """
    score = r2_score(y_test,y_pred)
    mse = mean_squared_error(y_test,y_pred)
    return score,round(np.sqrt(mse),4)

In [34]:
# --- Load the data -----------------------------------------------------------
df = pd.read_csv('./cleaned_data.csv')


# Columns removed before modelling ('Unnamed: 0' is presumably a saved index
# column - TODO confirm against the CSV).
cols_to_drop = ['Unnamed: 0', 'Item_Identifier','Outlet_Identifier']
df = drop_cols(df,cols_to_drop)

# --- Decide how each column is encoded ---------------------------------------
cols = list(df.columns)
cat_cols = list(df.select_dtypes('object').columns)
# Order-preserving filters instead of set arithmetic, so the lists come out in
# the same (column) order on every run.
num_cols = [col for col in cols if col not in cat_cols]

# Categoricals with fewer than 10 distinct values are one-hot encoded; the
# remaining categoricals are label encoded.
cols_to_oh = [col for col in cat_cols if df[col].nunique()<10]
cols_to_le = [col for col in cat_cols if col not in cols_to_oh]

df = oh_transform(df,cols_to_oh)

df = label_transform(df,cols_to_le)

# --- Features / target -------------------------------------------------------
df_X = df.drop(columns=['Item_Outlet_Sales'])
# Series.ravel() is deprecated in pandas 2.2; to_numpy() yields the same 1-D array.
df_y = df['Item_Outlet_Sales'].to_numpy()

# NOTE(review): the scaler is fitted on the full data set before the split, so
# test-set statistics leak into the training features. Consider fitting the
# scaler on X_train only and applying it to X_test.
df_X_scaled = scale_data(df_X)

X_train,X_test,y_train,y_test = split_data(df_X_scaled,df_y)

# --- Models ------------------------------------------------------------------
ri = Ridge(alpha=0.9)
# Seeded so the forest, and therefore its reported metrics, are reproducible.
rf = RandomForestRegressor(n_estimators=1000,random_state=32)
svr = SVR(C=0.9)
xgb = XGBRegressor(n_estimators=100,max_depth=5,eta=0.1)

# `model_name` and `models` are parallel lists: entry i of one names entry i
# of the other.
model_name = ['Ridge','Random Forest', 'SVM', 'XGboost']
models = [ri,rf,svr,xgb]

# --- Train and evaluate ------------------------------------------------------
r2_scores = []
rmses = []

for m in models:
    model = model_train(m,X_train,y_train)
    y_pred = model_predict(model,X_test)
    r2_sc,rmse= model_performance(y_pred,y_test)
    r2_scores.append(r2_sc)
    rmses.append(rmse)

In [35]:
# Gather the per-model metrics into one table, one row per model.
d = {
    'model': model_name,
    'r2_score': r2_scores,
    'RMSE': rmses,
}
Evaluation = pd.DataFrame(d)

# Bare last expression so the notebook renders the table.
Evaluation

Unnamed: 0,model,r2_score,RMSE
0,Ridge,0.674575,2.0395
1,Random Forest,0.653026,2.1059
2,SVM,0.684511,2.0081
3,XGboost,0.68186,2.0165
