**IMPORTING DEPENDENCIES**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVR

**IMPORTING DATASET**

In [2]:
df = pd.read_csv('hrtrain_set.csv')

**DATA CLEANING**

In [3]:
df.dtypes

Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
dtype: object

In [4]:
df.isnull().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

Will replace null values in 'Item_Weight' column with mean values and 'Outlet_Size' with most common value.

In [5]:
df["Item_Weight"].fillna(df["Item_Weight"].mean(), inplace = True)
df["Outlet_Size"].fillna(df['Outlet_Size'].value_counts().index[0], inplace = True)

In [6]:
df.isnull().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64

In [7]:
df.head(15)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,Medium,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052
5,FDP36,10.395,Regular,0.0,Baking Goods,51.4008,OUT018,2009,Medium,Tier 3,Supermarket Type2,556.6088
6,FDO10,13.65,Regular,0.012741,Snack Foods,57.6588,OUT013,1987,High,Tier 3,Supermarket Type1,343.5528
7,FDP10,12.857645,Low Fat,0.12747,Snack Foods,107.7622,OUT027,1985,Medium,Tier 3,Supermarket Type3,4022.7636
8,FDH17,16.2,Regular,0.016687,Frozen Foods,96.9726,OUT045,2002,Medium,Tier 2,Supermarket Type1,1076.5986
9,FDU28,19.2,Regular,0.09445,Frozen Foods,187.8214,OUT017,2007,Medium,Tier 2,Supermarket Type1,4710.535


**TIME FOR ENCODING THE CATEGORICAL DATA**

In [8]:
lb = LabelEncoder()

df['Item_Identifier'] = lb.fit_transform(df['Item_Identifier'])

df['Item_Fat_Content'] = lb.fit_transform(df['Item_Fat_Content'])

df['Item_Type'] = lb.fit_transform(df['Item_Type'])

df['Outlet_Identifier'] = lb.fit_transform(df['Outlet_Identifier'])

df['Outlet_Size'] = lb.fit_transform(df['Outlet_Size'])

df['Outlet_Location_Type'] = lb.fit_transform(df['Outlet_Location_Type'])

df['Outlet_Type'] = lb.fit_transform(df['Outlet_Type'])


In [9]:
df.dtypes

Item_Identifier                int64
Item_Weight                  float64
Item_Fat_Content               int64
Item_Visibility              float64
Item_Type                      int64
Item_MRP                     float64
Outlet_Identifier              int64
Outlet_Establishment_Year      int64
Outlet_Size                    int64
Outlet_Location_Type           int64
Outlet_Type                    int64
Item_Outlet_Sales            float64
dtype: object

**CREATING DATASET**

In [10]:
x = df.iloc[: , :-1].values
y = df.iloc[: , -1].values

**TIME TO SPLIT DATASET TO TEST AND TRAIN**

In [11]:
x_train , x_test , y_train , y_test = train_test_split(x , y , test_size = 0.2 , random_state = 0)

**FEATURE SCALING**

In [12]:
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.fit_transform(x_test)

**TRAINING MODEL**

**MODEL 1 : DECISION TREE REGRESSOR**

In [28]:
model_1 = DecisionTreeRegressor(max_depth=15, min_samples_leaf=100,random_state = 0)
model_1.fit(x_train, y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=15,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=100, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=0, splitter='best')

In [29]:
y_pred = model_1.predict(x_test)

In [30]:
score=r2_score(y_test,y_pred)
print('r2 socre is : ',score)
print('mean_sqrd_error is : ',mean_squared_error(y_test,y_pred))
print('root_mean_squared error of is : ',np.sqrt(mean_squared_error(y_test,y_pred)))

r2 socre is :  0.5833316855691364
mean_sqrd_error is :  1219529.594792497
root_mean_squared error of is :  1104.3231387562687


**MODEL 2 : RANDOM FOREST TREE REGRESSOR**

In [16]:
model_2 = RandomForestRegressor(n_estimators=1000,max_depth=10, min_samples_leaf=100,n_jobs=4)
model_2.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=10, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=100,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=1000, n_jobs=4, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [17]:
y_pred = model_2.predict(x_test)

In [18]:
score=r2_score(y_test,y_pred)
print('r2 socre is : ',score)
print('mean_sqrd_error is : ',mean_squared_error(y_test,y_pred))
print('root_mean_squared error of is : ',np.sqrt(mean_squared_error(y_test,y_pred)))

r2 socre is :  0.5885329788150978
mean_sqrd_error is :  1204306.1404885417
root_mean_squared error of is :  1097.4088301487927


**MODEL 3: MULTIPLE LINEAR REGRESSION**

In [19]:
model_3 = LinearRegression()
model_3.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [20]:
y_pred = model_3.predict(x_test)

In [21]:
score=r2_score(y_test,y_pred)
print('r2 socre is : ',score)
print('mean_sqrd_error is : ',mean_squared_error(y_test,y_pred))
print('root_mean_squared error of is : ',np.sqrt(mean_squared_error(y_test,y_pred)))

r2 socre is :  0.5126899755005854
mean_sqrd_error is :  1426287.9516716918
root_mean_squared error of is :  1194.2729803824968


**MODEL 4 : POLYNOMIAL REGRESSION**

In [22]:
poly_reg = PolynomialFeatures(degree = 2)
x_poly = poly_reg.fit_transform(x)
model_4 = LinearRegression()
model_4.fit(x_poly, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [23]:
y_pred = model_4.predict(poly_reg.fit_transform(x_test))

In [24]:
score=r2_score(y_test,y_pred)
print('r2 socre is : ',score)
print('mean_sqrd_error is : ',mean_squared_error(y_test,y_pred))
print('root_mean_squared error of is : ',np.sqrt(mean_squared_error(y_test,y_pred)))

r2 socre is :  -1529.5066814868735
mean_sqrd_error is :  4479577948.350565
root_mean_squared error of is :  66929.64924718018


**MODEL 5:SUPPORT VECTOR REGRESSOR**

In [25]:
model_5 = SVR(kernel = 'linear')
model_5.fit(x_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [26]:
y_pred = model_5.predict(x_test)

In [27]:
score=r2_score(y_test,y_pred)
print('r2 socre is : ',score)
print('mean_sqrd_error is : ',mean_squared_error(y_test,y_pred))
print('root_mean_squared error of is : ',np.sqrt(mean_squared_error(y_test,y_pred)))

r2 socre is :  0.46555432864899726
mean_sqrd_error is :  1564247.365225173
root_mean_squared error of is :  1250.6987507890033
