In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [2]:
# Read the training and test splits from the notebook's working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [3]:
# Report the training set's (rows, columns) and preview its first rows.
print(train.shape)
train.head()

(8523, 12)


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [4]:
# Report the test set's (rows, columns) and preview its first rows.
# The test split has no Item_Outlet_Sales column; that is the value to predict.
print(test.shape)
test.head()

(5681, 11)


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDW58,20.75,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,FDW14,8.3,reg,0.038428,Dairy,87.3198,OUT017,2007,,Tier 2,Supermarket Type1
2,NCN55,14.6,Low Fat,0.099575,Others,241.7538,OUT010,1998,,Tier 3,Grocery Store
3,FDQ58,7.315,Low Fat,0.015388,Snack Foods,155.034,OUT017,2007,,Tier 2,Supermarket Type1
4,FDY38,,Regular,0.118599,Dairy,234.23,OUT027,1985,Medium,Tier 3,Supermarket Type3


In [5]:
# Number of training rows per product category, most frequent first.
train['Item_Type'].value_counts()

Fruits and Vegetables    1232
Snack Foods              1200
Household                 910
Frozen Foods              856
Dairy                     682
Canned                    649
Baking Goods              648
Health and Hygiene        520
Soft Drinks               445
Meat                      425
Breads                    251
Hard Drinks               214
Others                    169
Starchy Foods             148
Breakfast                 110
Seafood                    64
Name: Item_Type, dtype: int64

In [6]:
# Separate the predictors from the target, which is the last column
# (Item_Outlet_Sales) of the training frame.
# .copy() gives X_train its own data: later cells assign to its columns, and
# doing that on a bare slice of `train` triggers SettingWithCopyWarning and
# leaves it ambiguous whether `train` is modified as well.
X_train = train.iloc[:, :-1].copy()
Y_train = train.iloc[:, -1]

In [7]:
# Count missing values per feature column to see which ones need handling.
X_train.isnull().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
dtype: int64

In [8]:
# Column dtypes and non-null counts for the feature frame.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 11 columns):
Item_Identifier              8523 non-null object
Item_Weight                  7060 non-null float64
Item_Fat_Content             8523 non-null object
Item_Visibility              8523 non-null float64
Item_Type                    8523 non-null object
Item_MRP                     8523 non-null float64
Outlet_Identifier            8523 non-null object
Outlet_Establishment_Year    8523 non-null int64
Outlet_Size                  6113 non-null object
Outlet_Location_Type         8523 non-null object
Outlet_Type                  8523 non-null object
dtypes: float64(3), int64(1), object(7)
memory usage: 732.5+ KB


In [9]:
# Inspect the raw fat-content labels; the same category appears under several spellings.
X_train['Item_Fat_Content'].value_counts()

Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64

In [10]:
# Fill the missing Item_Weight entries with the column mean. The fitted imputer
# is kept in `imputer` so the same training mean can be reused on the test set.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
item_weight = X_train[['Item_Weight']].values  # 2-D (n_rows, 1) array for the imputer
X_train['Item_Weight'] = imputer.fit_transform(item_weight)

In [11]:
# Collapse the different spellings of the fat-content label into two canonical
# values, then show the resulting counts as a sanity check.
fat_content_labels = dict.fromkeys(['Low Fat', 'LF', 'low fat'], 'LOW')
fat_content_labels.update(dict.fromkeys(['Regular', 'reg'], 'REGULAR'))
X_train['Item_Fat_Content'] = X_train['Item_Fat_Content'].map(fat_content_labels)
X_train['Item_Fat_Content'].value_counts()

LOW        5517
REGULAR    3006
Name: Item_Fat_Content, dtype: int64

In [12]:
# Label counts after normalisation: only LOW and REGULAR should remain.
X_train['Item_Fat_Content'].value_counts()

LOW        5517
REGULAR    3006
Name: Item_Fat_Content, dtype: int64

In [13]:
# One encoder per categorical column. Each fitted encoder stays available under
# its own name (le1, le2, le3) so the identical string-to-integer mapping can be
# applied to the test set later.
le1, le2, le3 = LabelEncoder(), LabelEncoder(), LabelEncoder()

encoder_for_column = (
    (le1, 'Item_Fat_Content'),
    (le2, 'Outlet_Location_Type'),
    (le3, 'Outlet_Type'),
)
for encoder, column in encoder_for_column:
    X_train[column] = encoder.fit_transform(X_train[column])

In [14]:
# Remove the identifier columns plus Item_Type and Outlet_Size, which were not
# encoded above (Outlet_Size also contains missing values). Only numeric
# features remain afterwards.
# Reassigning the result instead of using inplace=True keeps the data lineage
# explicit and never writes into a frame that may still be linked to `train`.
X_train = X_train.drop(columns=['Item_Identifier', 'Item_Type', 'Outlet_Identifier', 'Outlet_Size'])
columns = X_train.columns
# A short preview is enough to verify the layout; avoid dumping 100 rows.
X_train.head(10)

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Location_Type,Outlet_Type
0,9.300000,0,0.016047,249.8092,1999,0,1
1,5.920000,1,0.019278,48.2692,2009,2,2
2,17.500000,0,0.016760,141.6180,1999,0,1
3,19.200000,1,0.000000,182.0950,1998,2,0
4,8.930000,0,0.000000,53.8614,1987,2,1
5,10.395000,1,0.000000,51.4008,2009,2,2
6,13.650000,1,0.012741,57.6588,1987,2,1
7,12.857645,0,0.127470,107.7622,1985,2,3
8,16.200000,1,0.016687,96.9726,2002,1,1
9,19.200000,1,0.094450,187.8214,2007,1,1


In [15]:
# One-hot encode the columns at positions 5 and 6 (Outlet_Location_Type and
# Outlet_Type) and pass the other columns through unchanged.
#
# OneHotEncoder(categorical_features=...) was deprecated in scikit-learn 0.20
# and removed in 0.22; ColumnTransformer is the supported way to encode a
# subset of columns. As with the old API, the encoded columns come first and
# the untouched columns are appended to their right.
#
# sparse_threshold=1.0 keeps the combined result sparse, so code that calls
# .toarray() on this transformer's output keeps working.
one_hot_encoder = ColumnTransformer(
    transformers=[('onehot', OneHotEncoder(categories='auto'), [5, 6])],
    remainder='passthrough',
    sparse_threshold=1.0,
)
X_train = one_hot_encoder.fit_transform(X_train).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [16]:
# Fit an ordinary least-squares model of sales on the encoded feature matrix.
lr = LinearRegression()
lr.fit(X=X_train, y=Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [17]:
# Prepare the test set using the transformers that were fitted on the training
# data. Nothing is re-fitted here: fitting the one-hot encoder on the test set
# would derive its categories from test data instead of reusing the encoding
# the model was trained with.
fat_content_labels = {
    'low fat' : 'LOW', 'LF' : 'LOW', 'Low Fat' : 'LOW', 'Regular' : 'REGULAR', 'reg' : 'REGULAR'
}

# drop() returns a new frame, so the frame loaded from test.csv is not mutated.
test_features = test.drop(columns=['Item_Identifier', 'Item_Type', 'Outlet_Identifier', 'Outlet_Size'])

# Fill missing weights with the mean learned from the training data.
item_weight = test_features['Item_Weight'].values.reshape((-1, 1))
test_features['Item_Weight'] = imputer.transform(item_weight).ravel()

# Same label normalisation and integer encoding as applied to the training data.
test_features['Item_Fat_Content'] = le1.transform(test_features['Item_Fat_Content'].map(fat_content_labels))
test_features['Outlet_Location_Type'] = le2.transform(test_features['Outlet_Location_Type'])
test_features['Outlet_Type'] = le3.transform(test_features['Outlet_Type'])

# `test` is rebound to the encoded array because the prediction cell reads the
# features under this name.
test = one_hot_encoder.transform(test_features).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [18]:
# `test` holds the encoded feature array at this point, so the raw rows are
# loaded again to get a frame the predictions can be attached to.
y_pred = lr.predict(test)
X_test = pd.read_csv('test.csv')

In [19]:
# Append the predicted sales as a new column, aligned row by row with the test frame.
X_test = X_test.assign(Prediction_Sales=pd.Series(y_pred))

In [20]:
# Preview the test rows together with the new Prediction_Sales column.
X_test.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Prediction_Sales
0,FDW58,20.75,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1,1799.9435
1,FDW14,8.3,reg,0.038428,Dairy,87.3198,OUT017,2007,,Tier 2,Supermarket Type1,1538.119642
2,NCN55,14.6,Low Fat,0.099575,Others,241.7538,OUT010,1998,,Tier 3,Grocery Store,1922.022393
3,FDQ58,7.315,Low Fat,0.015388,Snack Foods,155.034,OUT017,2007,,Tier 2,Supermarket Type1,2546.707898
4,FDY38,,Regular,0.118599,Dairy,234.23,OUT027,1985,Medium,Tier 3,Supermarket Type3,5180.145456
