In [36]:
# imports

import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd 
import seaborn as sns

from plotly.offline import init_notebook_mode,download_plotlyjs,iplot

import cufflinks as cf
init_notebook_mode(connected=True)
cf.go_offline()

import warnings
warnings.filterwarnings('ignore')

In [37]:
#importing dataset

df_train = pd.read_csv('Train.csv')
df_test = pd.read_csv('Test.csv')

In [38]:
df_train.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [39]:
df_test.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDW58,20.75,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,FDW14,8.3,reg,0.038428,Dairy,87.3198,OUT017,2007,,Tier 2,Supermarket Type1
2,NCN55,14.6,Low Fat,0.099575,Others,241.7538,OUT010,1998,,Tier 3,Grocery Store
3,FDQ58,7.315,Low Fat,0.015388,Snack Foods,155.034,OUT017,2007,,Tier 2,Supermarket Type1
4,FDY38,,Regular,0.118599,Dairy,234.23,OUT027,1985,Medium,Tier 3,Supermarket Type3


In [40]:
df_train.shape

(8523, 12)

In [41]:
df_test.shape

(5681, 11)

In [42]:
#Basic analysis and features engineering

#1. Removing Unwanted cloumns and features
try:
    df_train.drop(labels=['Item_Identifier', 'Outlet_Identifier', 'Outlet_Establishment_Year'], axis=1, inplace=True)
    df_test.drop(labels=['Item_Identifier', 'Outlet_Identifier', 'Outlet_Establishment_Year'], axis=1, inplace=True)
except Exception as e:
    pass

In [43]:
df_train.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,9.3,Low Fat,0.016047,Dairy,249.8092,Medium,Tier 1,Supermarket Type1,3735.138
1,5.92,Regular,0.019278,Soft Drinks,48.2692,Medium,Tier 3,Supermarket Type2,443.4228
2,17.5,Low Fat,0.01676,Meat,141.618,Medium,Tier 1,Supermarket Type1,2097.27
3,19.2,Regular,0.0,Fruits and Vegetables,182.095,,Tier 3,Grocery Store,732.38
4,8.93,Low Fat,0.0,Household,53.8614,High,Tier 3,Supermarket Type1,994.7052


In [44]:
df_test.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,20.75,Low Fat,0.007565,Snack Foods,107.8622,Medium,Tier 1,Supermarket Type1
1,8.3,reg,0.038428,Dairy,87.3198,,Tier 2,Supermarket Type1
2,14.6,Low Fat,0.099575,Others,241.7538,,Tier 3,Grocery Store
3,7.315,Low Fat,0.015388,Snack Foods,155.034,,Tier 2,Supermarket Type1
4,,Regular,0.118599,Dairy,234.23,Medium,Tier 3,Supermarket Type3


In [45]:
#2. Getting information about null values

temp_df = df_train.isnull().sum().reset_index()
temp_df['Percentage'] = (temp_df[0]/len(df_train))*100

temp_df.columns = ['Column Name','Number of null values', 'Null values in percentage']

print(f'The length of dataset is \t {len(df_train)}')

The length of dataset is 	 8523


In [46]:
temp_df

Unnamed: 0,Column Name,Number of null values,Null values in percentage
0,Item_Weight,1463,17.165317
1,Item_Fat_Content,0,0.0
2,Item_Visibility,0,0.0
3,Item_Type,0,0.0
4,Item_MRP,0,0.0
5,Outlet_Size,2410,28.276428
6,Outlet_Location_Type,0,0.0
7,Outlet_Type,0,0.0
8,Item_Outlet_Sales,0,0.0


In [47]:
#3. Making correction in 'Item_Fat_content' column

def convert(x):
    if x in ['low fat','LF']:
        return 'Low Fat'
    elif x=='reg':
        return 'Regular'
    else:
        return x
    
df_train['Item_Fat_Content'] = df_train['Item_Fat_Content'].apply(convert)

df_test['Item_Fat_Content'] = df_test['Item_Fat_Content'].apply(convert)

print(f'Now Unique values in this column in Train set are \t {df_train["Item_Fat_Content"].unique()}')
print(f'Now unique values in this cloumn in test set are \t {df_test["Item_Fat_Content"].unique()}')

Now Unique values in this column in Train set are 	 ['Low Fat' 'Regular']
Now unique values in this cloumn in test set are 	 ['Low Fat' 'Regular']


In [48]:
#4 Dealing with missing values in categorical type column ie 'Outlet_size'

count = df_train['Outlet_Size'].value_counts().reset_index()
count.iplot(kind='bar',color='deepskyblue',x='index',y='Outlet_Size',
            title='High VS Medium VS Small',xTitle='Size',yTitle='Frequency')

In [49]:
# Removing missing values from medium in both training and test

df_train['Outlet_Size'].fillna(value='Medium',inplace=True)
df_test['Outlet_Size'].fillna(value='Medium',inplace=True)

In [61]:
#Prediction with regression models

#1. Importing machinelearning libraries

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler

In [51]:
#Creating feature matrix and target vector

x_train = df_train.iloc[:,:-1].values
y_train = df_train.iloc[:,-1].values

x_test = df_test.values

In [52]:
df_train.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,9.3,Low Fat,0.016047,Dairy,249.8092,Medium,Tier 1,Supermarket Type1,3735.138
1,5.92,Regular,0.019278,Soft Drinks,48.2692,Medium,Tier 3,Supermarket Type2,443.4228
2,17.5,Low Fat,0.01676,Meat,141.618,Medium,Tier 1,Supermarket Type1,2097.27
3,19.2,Regular,0.0,Fruits and Vegetables,182.095,Medium,Tier 3,Grocery Store,732.38
4,8.93,Low Fat,0.0,Household,53.8614,High,Tier 3,Supermarket Type1,994.7052


In [53]:
imputer = Imputer()
x_train[:,[0]] = imputer.fit_transform(x_train[:,[0]])
x_test[:,[0]] = imputer.fit_transform(x_test[:,[0]])

In [54]:
#2 Dealing with categorical values in features /cloumns 

labelencoder_x = LabelEncoder()
x_train[:,1] = labelencoder_x.fit_transform(x_train[:,1])
x_train[:,3] = labelencoder_x.fit_transform(x_train[:,3])
x_train[:,5] = labelencoder_x.fit_transform(x_train[:,5])
x_train[:,6] = labelencoder_x.fit_transform(x_train[:,6])
x_train[:,7] = labelencoder_x.fit_transform(x_train[:,7])

onehotencoder_x = OneHotEncoder(categorical_features=[3,5,6,7])
x_train = onehotencoder_x.fit_transform(x_train).toarray()

x_test[:,1] = labelencoder_x.fit_transform(x_test[:,1])
x_test[:,3] = labelencoder_x.fit_transform(x_test[:,3])
x_test[:,5] = labelencoder_x.fit_transform(x_test[:,5])
x_test[:,6] = labelencoder_x.fit_transform(x_test[:,6])
x_test[:,7] = labelencoder_x.fit_transform(x_test[:,7])


#need to be done when we have more than twocategorical values

onehotencoder_x = OneHotEncoder(categorical_features=[3,5,6,7])
x_test = onehotencoder_x.fit_transform(x_test).toarray()

In [55]:
#apply Feature scaling on feature matrix

sc_X = StandardScaler()
x_train = sc_X.fit_transform(x_train)
x_test = sc_X.fit_transform(x_test)

In [56]:
from sklearn.decomposition import PCA

pca = PCA(n_components = None)
x_train = pca.fit_transform(x_train)
x_test = pca.fit_transform(x_test)

explained_variance = pca.explained_variance_ratio_

explained_variance

array([1.07435489e-01, 7.06744630e-02, 5.70527808e-02, 5.08878951e-02,
       3.97957072e-02, 3.89091132e-02, 3.88234637e-02, 3.77363114e-02,
       3.69386094e-02, 3.66026144e-02, 3.64230241e-02, 3.61861880e-02,
       3.59329577e-02, 3.51077720e-02, 3.47520156e-02, 3.45387916e-02,
       3.43031207e-02, 3.41655851e-02, 3.40415465e-02, 3.33654807e-02,
       3.13104261e-02, 2.98462525e-02, 2.95960836e-02, 2.38043054e-02,
       1.93621535e-02, 2.40784947e-03, 1.77665175e-31, 1.94834419e-32,
       4.70453053e-33, 4.21768408e-33])

In [57]:
#taking n_components = 24

pca = PCA(n_components = 25)
x_train = pca.fit_transform(x_train)
x_test = pca.fit_transform(x_test)

In [58]:
y_train.dtype

dtype('float64')

In [59]:
#Apply lulti-Linear Regression Model ,Polynomial regression and random Forrest MOdel and compare their accuracy

# Multi-linear regression Model.
regressor_multi = LinearRegression()
regressor_multi.fit(x_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [62]:
# Let us check the accuray
accuracy = cross_val_score(estimator=regressor_multi, X=x_train, y=y_train,cv=10)
print(f"The accuracy of the Multi-linear Regressor Model is \t {accuracy.mean()}")
print(f"The deviation in the accuracy is \t {accuracy.std()}")

The accuracy of the Multi-linear Regressor Model is 	 0.5575764045856493
The deviation in the accuracy is 	 0.02120576291236604


In [None]:
# Random Forest Model.
regressor_random = RandomForestRegressor(n_estimators=100,)
regressor_random.fit(x_train,y_train)

# Let us check the accuray
accuracy = cross_val_score(estimator=regressor_random, X=x_train, y=y_train,cv=10)
print(f"The accuracy of the Random Forest Model is \t {accuracy.mean()}")
print(f"The deviation in the accuracy is \t {accuracy.std()}")

print("Here accuray is 53% with deviation of 3%.")

In [None]:

# Fitting polynomial regression to dataset
from sklearn.preprocessing import PolynomialFeatures
poly_reg=PolynomialFeatures(degree=4) #These 3 steps are to convert X matrix into X polynomial
x_poly=poly_reg.fit_transform(x_train) #matrix. 
regressor_poly=LinearRegression()
regressor_poly.fit(x_poly,y_train)

# Let us check the accuray
accuracy = cross_val_score(estimator=regressor_poly, X=x_train, y=y_train,cv=10)
print(f"The accuracy of the Polynomial Regression Model is \t {accuracy.mean()}")
print(f"The deviation in the accuracy is \t {accuracy.std()}")

print("Here accuracy is 55% with deviation of 2%")

#### observation:-
* As the accuracy of Multi-linear regression Model is the best one.
* Multi-linear Regression Model takes less time as compare to Random forest and Polynomial regression Models.
* We will choose Multi-linear regression Model.
* Here we are getting the accuracy of 55% and deviation of 2%, means in future if we mak eprediction on new values then we will get the accuracy in range 53% to 57%.
* We are getting low accuracy due to less quantity of data.

In [None]:
y_pred = regressor_multi.predict(x_test)

y_pred[:5]