In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

Installing new library

In [None]:
!pip install xgboost 

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn import metrics

In [None]:
# Read big_mart_train.csv into a pandas DataFrame named `data`.
data=pd.read_csv("big_mart_train.csv")
# Bare expression: the notebook renders the DataFrame as this cell's output.
data

In [None]:
# Preview the first five rows (five is the default row count for head()).
data.head()

# About the Columns:

1. Item_Identifier : A unique ID for each product; its prefix indicates the broad category, e.g. FDA15 is a food item and NCD19 is a non-consumable item.

2. Item_Weight : The weight of the item.

3. Item_Fat_content : Tells us about the fat content in that particular item.

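4. Item_Visibility : The share of the total display area in a store that is allocated to this particular product (how visible the product is on the shelves), not how much of it is bought.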

5. Item_Type : The specific category the item belongs to, e.g. among eatables, whether it is Soft Drinks or Meat.

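6. Item_MRP : The Maximum Retail Price (list price) of the item. This notebook treats it as USD, but the currency should be confirmed against the dataset description.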

7. Outlet_Identifier : represents different stores for the items.

8. Outlet_Establishment_year: tells us the year in which a particular outlet or store started.

9. Outlet_Size : The size of the store, e.g. High, Medium or Small.

10. Outlet_Location_Type : The tier of the city in which the outlet is located, ranging from Tier 1 (a very populated region) to Tier 3 (the least well-known region).

11. Outlet_Type : Tells us whether the outlet is a supermarket or a small grocery store.

12. Item_Outlet_sales : tells us what is the particular sales value of an item in that outlet.

In [None]:
# (number of rows, number of columns) of the loaded DataFrame.
data.shape

Getting some information about the dataset

In [None]:
# Print the column names, non-null counts and dtypes of the DataFrame.
data.info()

Categorical features:

 1. Item_Identifier
 2. Item_Fat_Content
 3. Item_Type
 4. Outlet_Identifier
 5. Outlet_Size                
 6. Outlet_Location_Type       
 7. Outlet_Type 
 

Checking for missing values :

In [None]:
# Number of missing values in each column (isna is the same check as isnull).
data.isna().sum()

Handling missing values :

Two columns have missing values. Outlet_Size is categorical, so we cannot take its mean; we will fill it with the mode instead. Item_Weight is numerical, so we will fill its missing values with the column mean.

In [None]:
# Mean of the Item_Weight column (missing entries are skipped by default).
data.loc[:, "Item_Weight"].mean()

In [None]:
# Replace the missing values of Item_Weight with the mean of that column.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on data["Item_Weight"]: that chained in-place call
# may operate on a temporary object and leave `data` unchanged (this is the
# behavior under pandas Copy-on-Write).
data["Item_Weight"] = data["Item_Weight"].fillna(data["Item_Weight"].mean())

In [None]:
# Re-count missing values per column after filling Item_Weight.
data.isna().sum()

Now , let's replace null values of categorical column Outlet_Size with mode.

In [None]:
# Most frequent Outlet_Size for every Outlet_Type. The result has a single row
# labelled "Outlet_Size" and one column per Outlet_Type value.
mode_of_outlet_size = data.pivot_table(
    values="Outlet_Size",
    columns="Outlet_Type",
    aggfunc=lambda sizes: sizes.mode()[0],  # first entry of mode() is taken as the mode
)
print(mode_of_outlet_size)

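The table above has a single row (Outlet_Size) and one column per Outlet_Type; each cell holds the most frequent Outlet_Size for that outlet type. We use Outlet_Type as the reference because it is the column most closely related to Outlet_Size. A lambda does the same job as a function written with def, but it is anonymous and written inline because it is only needed in this one place, whereas a def function is named so it can be reused many times.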

In [None]:
# Boolean mask over the rows: True where Outlet_Size is missing,
# False where a value is present.
missing_values = data["Outlet_Size"].isna()
print(missing_values)

Now we replace each missing Outlet_Size with the mode for that row's Outlet_Type. We again use a lambda function, a new one written inline because it is only needed here.

In [None]:
# Fill each missing Outlet_Size with the most frequent size for that row's Outlet_Type.
# mode_of_outlet_size has one row ("Outlet_Size") and one column per Outlet_Type,
# so .loc["Outlet_Size", outlet_type] yields the scalar mode for that type.
# The lambda must perform this lookup; returning the whole table for every row
# (as the earlier version did) does not fill the column with size labels.
data.loc[missing_values, "Outlet_Size"] = data.loc[missing_values, "Outlet_Type"].apply(
    lambda outlet_type: mode_of_outlet_size.loc["Outlet_Size", outlet_type]
)

In [None]:
# Re-count missing values per column after filling Outlet_Size.
data.isna().sum()

As we can see above, the missing values have now been filled in, using the mean for Item_Weight and the mode for Outlet_Size.

Data Analysis :

In [None]:
# Summary statistics of the DataFrame's columns.
data.describe()

Checking Distribution of values :

In [None]:
# Distribution of Item_Weight: density-scaled histogram with a KDE curve on top.
# histplot is the replacement for the deprecated distplot.
sns.histplot(data["Item_Weight"], kde=True, stat="density")

In [None]:
# Distribution of Item_Visibility: density-scaled histogram with a KDE curve on top.
# histplot is the replacement for the deprecated distplot.
sns.histplot(data["Item_Visibility"], kde=True, stat="density")

In [None]:
# Distribution of Item_MRP: density-scaled histogram with a KDE curve on top.
# histplot is the replacement for the deprecated distplot.
sns.histplot(data["Item_MRP"], kde=True, stat="density")

In [None]:
# Distribution of the target Item_Outlet_Sales: density-scaled histogram with a KDE curve.
# histplot is the replacement for the deprecated distplot.
sns.histplot(data["Item_Outlet_Sales"], kde=True, stat="density")

In [None]:
# Number of rows per Outlet_Establishment_Year.
# The column is named through x= because the first positional parameter of
# countplot is `data` in current seaborn, so a bare Series is not read as x.
sns.countplot(x="Outlet_Establishment_Year", data=data)

Analyzing the categorical values :

In [None]:
# Number of rows per Item_Fat_Content label.
# The column is named through x= because the first positional parameter of
# countplot is `data` in current seaborn, so a bare Series is not read as x.
sns.countplot(x="Item_Fat_Content", data=data)

Since "low fat" and "LF" mean the same as "Low Fat", and "reg" stands for "Regular", we can merge these labels to clean the data.

In [None]:
# Number of rows per Item_Type.
# The column is named through x= because the first positional parameter of
# countplot is `data` in current seaborn, so a bare Series is not read as x.
sns.countplot(x="Item_Type", data=data)

In [None]:
# The default figure is too congested for this many categories,
# so draw the same count plot on a much wider figure.
fig, ax = plt.subplots(figsize=(30, 6))
sns.countplot(x="Item_Type", data=data, ax=ax)
ax.set_title("Item_type")
plt.show()

In [None]:
# Number of rows per Outlet_Location_Type.
# The column is named through x= because the first positional parameter of
# countplot is `data` in current seaborn, so a bare Series is not read as x.
sns.countplot(x="Outlet_Location_Type", data=data)

In [None]:
# Number of rows per Outlet_Type.
# The column is named through x= because the first positional parameter of
# countplot is `data` in current seaborn, so a bare Series is not read as x.
sns.countplot(x="Outlet_Type", data=data)

Data Pre-processing :

In [None]:
# Show the first rows of the DataFrame before the preprocessing steps below.
data.head()

low fat ,LF are same as "Low Fat", and reg represents "Regular", we can combine them and clean data

In [None]:
# Count how often each Item_Fat_Content label occurs (before the labels are merged).
data["Item_Fat_Content"].value_counts()

In [None]:
# Map the alternative spellings of Item_Fat_Content onto the two canonical labels.
fat_content_aliases = {"low fat": "Low Fat", "LF": "Low Fat", "reg": "Regular"}
data.replace({"Item_Fat_Content": fat_content_aliases}, inplace=True)

In [None]:
# Count the Item_Fat_Content labels again to inspect the result of the replacement.
data["Item_Fat_Content"].value_counts()

In [None]:
# Create the LabelEncoder that is used below to turn the categorical
# columns into numerical values.
encode=LabelEncoder()

In [None]:
# Categorical columns, listed in the order in which they are encoded.
categorical_columns = [
    "Item_Identifier",
    "Item_Fat_Content",
    "Item_Type",
    "Outlet_Identifier",
    "Outlet_Size",
    "Outlet_Location_Type",
    "Outlet_Type",
]

# Re-fit the shared encoder on each column in turn and overwrite the column
# with the resulting integer codes.
for column_name in categorical_columns:
    data[column_name] = encode.fit_transform(data[column_name])

In [None]:
# Show the first rows again now that the categorical columns hold integer codes.
data.head()

Splitting features and target : 

In [None]:
# Feature matrix: every column except the target Item_Outlet_Sales.
x = data.drop(columns=["Item_Outlet_Sales"])

In [None]:
# Target: the Item_Outlet_Sales column.
y= data["Item_Outlet_Sales"]

In [None]:
# Display the feature matrix.
x

In [None]:
# Display the target values.
y

Splitting the data into training and test sets:

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)

In [None]:
# Shape of the training features.
x_train.shape

In [None]:
# Shape of the training target.
y_train.shape

In [None]:
# Shape of the test features.
x_test.shape

In [None]:
# Shape of the test target.
y_test.shape

Training the machine learning model using XGBRegressor :

In [None]:
# Create an XGBRegressor with its default hyperparameters.
regressor=XGBRegressor()

In [None]:
# Fit the regressor on the training features and training target.
regressor.fit(x_train,y_train)

Evaluation :

In [None]:
# Predictions on the same rows the model was trained on.
training_data_prediction=regressor.predict(x_train)

Calculating R2 score :

In [None]:
# R2 score of the model on the training rows.
r2_train = metrics.r2_score(y_true=y_train, y_pred=training_data_prediction)

In [None]:
# Display the training R2 score.
r2_train

In [None]:
# Predictions on the held-out test rows.
testing_data_prediction=regressor.predict(x_test)

In [None]:
# R2 score of the model on the held-out test rows.
r2_test = metrics.r2_score(y_true=y_test, y_pred=testing_data_prediction)

In [None]:
# Display the test R2 score.
r2_test