In [7]:
import pandas as pd
import numpy as np
import category_encoders as ce
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer

data = pd.read_csv(r'./big_mart_sales/train_v9rqX0R.csv')
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


### Variable Identification

In [8]:
num_cols = ['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Item_Outlet_Sales']
cat_cols = ['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']
ordinal_cols = ['Item_Fat_Content', 'Outlet_Location_Type', 'Outlet_Size', 'Outlet_Type'] #Outlet_Size
nominal_cols = ['Item_Identifier','Item_Type', 'Outlet_Identifier']
target_var = ['Item_Outlet_Sales']

### Data Preprocessing

In [9]:
# Data Preprocessing
# Solve type errors in Item_Fat_Content column
rectify_Item_Fat_Content_typo = {'LF': 'Low Fat', 'low fat': 'Low Fat', 'Low Fat': 'Low Fat', 'reg': 'Regular', 'Regular': 'Regular'}
data['Item_Fat_Content'] = data['Item_Fat_Content'].map(rectify_Item_Fat_Content_typo)

# Encode outlet size before imputing
data['Outlet_Size'] = data['Outlet_Size'].replace(dict(High=3, Medium=2, Small=1))

"""
Treat numerical missing value
"""
# Get unique weight of each product
unique_item_weight = data[~data['Item_Weight'].isnull()][['Item_Identifier', 'Item_Weight']].drop_duplicates()

# ASsign the unique weight obtained to the respective product 
data.drop(['Item_Weight'], axis=1, inplace=True)
data = pd.merge(data, unique_item_weight, on='Item_Identifier', how='left')

# Pending missing value
data = data[~data.Item_Weight.isnull()]

### Handle Categorical data

In [10]:
data[cat_cols] = data[cat_cols].astype('category')

ordinal_encoder = ce.OrdinalEncoder(cols=ordinal_cols)
data_ordinal_encoded = ordinal_encoder.fit_transform(data)

nominal_encoder = ce.BinaryEncoder(cols=nominal_cols)
data_encoded = nominal_encoder.fit_transform(data_ordinal_encoded)

### Treating missing values

In [11]:
knn = KNNImputer(n_neighbors=3)
data = knn.fit_transform(data_encoded[['Outlet_Size','Outlet_Location_Type','Outlet_Type']])
outlet_size_imputed = pd.DataFrame(data_encoded, columns=['Outlet_Size','Outlet_Location_Type','Outlet_Type'])

data_encoded.drop(['Outlet_Size','Outlet_Location_Type','Outlet_Type'], axis=1, inplace=True)
df = pd.concat([data_encoded, outlet_size_imputed], axis=1)

### Linear Regression

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [13]:
X = df.drop(target_var, axis=1)
y = df[target_var]
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [14]:
lr = LinearRegression()
reg = lr.fit(x_train, y_train)

In [21]:
reg.intercept_

array([-137986.47742908])

In [19]:
reg.score(x_train, y_train)

0.5635640137466742