In [4]:
import pandas as pd
import numpy as np

In [5]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
sample_data = pd.read_csv('SampleSubmission.csv')

In [6]:
train_data['source']='train'
test_data['source']='test'
data = pd.concat([train_data, test_data],ignore_index=True)
print (train_data.shape, test_data.shape, data.shape)

(4990, 14) (3532, 13) (8522, 14)


In [7]:
product_avg_weight = data.pivot_table(values='Product_Weight', index='Product_Identifier') #aggfunc is mean by default!

In [8]:
#filling the missing data
def impute_weight(cols):
    Weight = cols[0]
    Identifier = cols[1]
    
    if pd.isnull(Weight):
        return product_avg_weight['Product_Weight'][product_avg_weight.index == Identifier]
    else:
        return Weight

In [9]:
print ('Orignal missing: %d'% sum(data['Product_Weight'].isnull()))
data['Product_Weight'] = data[['Product_Weight','Product_Identifier']].apply(impute_weight, axis=1).astype(float)
print ('Final missing: %d'% sum(data['Product_Weight'].isnull()))

Orignal missing: 1463
Final missing: 4


In [10]:
#filling the remaining missing 4 values
data['Product_Weight'] = data['Product_Weight'].fillna(11.9)
sum(data['Product_Weight'].isnull())

0

In [11]:
supermarket_size_mode = data.pivot_table(values='Supermarket _Size', columns = 'Supermarket_Type',aggfunc=lambda x: x.mode())

In [12]:
def impute_size_mode(cols):
    Size = cols[0]
    Type = cols[1]
    if pd.isnull(Size):
        return supermarket_size_mode.loc['Supermarket _Size'][supermarket_size_mode.columns == Type][0]
    else:
        return Size

print ('Orignal #missing: %d'%sum(data['Supermarket _Size'].isnull()))
data['Supermarket _Size'] = data[['Supermarket _Size','Supermarket_Type']].apply(impute_size_mode,axis=1)
print ('Final #missing: %d'%sum(data['Supermarket _Size'].isnull()))

Orignal #missing: 2409
Final #missing: 0


In [13]:
visibility_item_avg = data.pivot_table(values='Product_Shelf_Visibility',index='Product_Identifier')

In [14]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

#New variable for outlet
data['Supermarket'] = le.fit_transform(data['Supermarket_Identifier'])

var_mod = ['Product_Fat_Content','Supermarket_Location_Type','Supermarket _Size','Supermarket_Type','Supermarket']
le = LabelEncoder()

for i in var_mod:
    data[i] = le.fit_transform(data[i])

In [15]:
#for one-hot encoding
datas = pd.get_dummies(data, columns = var_mod)

In [16]:
#Drop the columns which have been converted to different types:
datas.drop(['Product_Supermarket_Identifier','Product_Type',
           'Supermarket_Identifier','Product_Identifier'],axis=1,inplace=True)

In [17]:
train_data = datas.loc[datas['source']=="train"]
test_data = datas.loc[datas['source']=="test"]

In [18]:
#Drop unnecessary columns:
test_data.drop(['Product_Supermarket_Sales','source'],axis=1,inplace=True)
train_data.drop(['source'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [19]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3532 entries, 4990 to 8521
Data columns (total 27 columns):
Product_Price                  3532 non-null float64
Product_Shelf_Visibility       3532 non-null float64
Product_Weight                 3532 non-null float64
Supermarket_Opening_Year       3532 non-null int64
Product_Fat_Content_0          3532 non-null uint8
Product_Fat_Content_1          3532 non-null uint8
Product_Fat_Content_2          3532 non-null uint8
Supermarket_Location_Type_0    3532 non-null uint8
Supermarket_Location_Type_1    3532 non-null uint8
Supermarket_Location_Type_2    3532 non-null uint8
Supermarket _Size_0            3532 non-null uint8
Supermarket _Size_1            3532 non-null uint8
Supermarket _Size_2            3532 non-null uint8
Supermarket_Type_0             3532 non-null uint8
Supermarket_Type_1             3532 non-null uint8
Supermarket_Type_2             3532 non-null uint8
Supermarket_Type_3             3532 non-null uint8
Supermarket_0    

In [20]:
x = train_data.drop(['Product_Supermarket_Sales'],axis=1)
y = train_data['Product_Supermarket_Sales']

In [21]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 0)

In [22]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [23]:
y_predict = regressor.predict(x_test)

In [24]:
from sklearn.metrics import mean_squared_error
lin_mse = mean_squared_error(y_predict, y_test)
lin_rmse = np.sqrt(lin_mse)
print('Liner Regression RMSE: %.5f' % lin_rmse)

Liner Regression RMSE: 2937.29441


In [None]:
y_result = regressor.predict(test_data)

In [50]:
y_result

array([3449.10797602, 7293.97830302, 7281.14832975, ..., 9415.49137017,
       3641.96090606, 7217.13910736])

In [51]:
regressor.score(x_test,y_test)

0.5567462019326636

In [54]:
linear_submission = pd.DataFrame({'Product_Supermarket_Identifier':sample_data['Product_Supermarket_Identifier'],
                                  'Product_Supermarket_Sales': y_result},
                                 columns=['Product_Supermarket_Identifier','Product_Supermarket_Sales'])

#to csv
linear_submission.to_csv('linear_result.csv',index=False)