In [87]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

In [60]:
# Location of the input CSV, relative to the notebook's working directory.
filepath = '../input/final.csv'

# Load the whole file into one DataFrame; later cells slice it by date.
df = pd.read_csv(filepath_or_buffer=filepath)

### EDA (Light version)

In [61]:
# Show the last five rows of the loaded table.
df.tail(n=5)

Unnamed: 0,store,dayofweek,date,sales,customers,open,promo,lat,lon
1016090,1111,5,2020-07-29,5723,422,1,1,55.878577,37.706292
1016091,1112,5,2020-07-29,9626,767,1,1,55.791374,37.823187
1016092,1113,5,2020-07-29,7289,720,1,1,55.756921,37.824353
1016093,1114,5,2020-07-29,27508,3745,1,1,55.798922,37.822139
1016094,1115,5,2020-07-29,8680,538,1,1,55.582867,37.653329


In [62]:
# Print column names, non-null counts, dtypes and the memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1016095 entries, 0 to 1016094
Data columns (total 9 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   store      1016095 non-null  int64  
 1   dayofweek  1016095 non-null  int64  
 2   date       1016095 non-null  object 
 3   sales      1016095 non-null  int64  
 4   customers  1016095 non-null  int64  
 5   open       1016095 non-null  int64  
 6   promo      1016095 non-null  int64  
 7   lat        1016095 non-null  float64
 8   lon        1016095 non-null  float64
dtypes: float64(2), int64(6), object(1)
memory usage: 69.8+ MB


In [63]:
# Chronological hold-out: rows dated before the cutoff train the model,
# rows dated on or after it are kept for evaluation.
SPLIT_DATE = pd.Timestamp('2020-07-01')

# `date` is loaded as plain strings (object dtype). Parse it so the split is a
# real date comparison rather than a lexicographic string comparison, which is
# only correct while every value is zero-padded ISO-8601. `df` itself is left
# unchanged.
row_dates = pd.to_datetime(df['date'])

df_train = df[row_dates < SPLIT_DATE]
df_test = df[row_dates >= SPLIT_DATE]

In [64]:
# Show the last five rows of the training split.
df_train.tail(n=5)

Unnamed: 0,store,dayofweek,date,sales,customers,open,promo,lat,lon
983755,1111,4,2020-06-30,3945,329,1,1,55.878577,37.706292
983756,1112,4,2020-06-30,7447,577,1,1,55.791374,37.823187
983757,1113,4,2020-06-30,7932,793,1,1,55.756921,37.824353
983758,1114,4,2020-06-30,24898,3784,1,1,55.798922,37.822139
983759,1115,4,2020-06-30,6858,452,1,1,55.582867,37.653329


In [65]:
# Show the first five rows of the test split.
df_test.head(n=5)

Unnamed: 0,store,dayofweek,date,sales,customers,open,promo,lat,lon
983760,1,5,2020-07-01,4665,538,1,1,55.682829,37.416212
983761,2,5,2020-07-01,4838,1029,1,1,55.68641,37.415503
983762,3,5,2020-07-01,8149,742,1,1,55.885081,37.476452
983763,4,5,2020-07-01,11543,1395,1,1,55.711356,37.389027
983764,5,5,2020-07-01,4475,527,1,1,55.902867,37.609132


In [66]:
# Columns used as model inputs, in the order the fitted coefficients follow.
# NOTE(review): `store` is fed in as a plain number although it looks like an
# identifier — confirm this is intended.
features = ['store', 'dayofweek', 'customers', 'open', 'promo']

# Kept as a one-element list so that frame[target] yields a DataFrame
# rather than a Series.
target = ['sales']

In [67]:
# Separate each period into predictor columns (X) and the target column (y).
X_train, y_train = df_train[features], df_train[target]
X_test, y_test = df_test[features], df_test[target]

In [68]:
# Show the first five rows of the training predictors.
X_train.head(n=5)

Unnamed: 0,store,dayofweek,customers,open,promo
0,1,3,668,1,0
1,2,3,650,1,0
2,3,3,805,1,0
3,4,3,1429,1,0
4,5,3,577,1,0


In [69]:
# Show the first five rows of the training target.
y_train.head(n=5)

Unnamed: 0,sales
0,5530
1,4422
2,6823
3,9941
4,4253


### Model

In [86]:
# Linear regression estimator created with scikit-learn's default settings.
model = LinearRegression()

In [71]:
# Fit the regression on the training period only.
model.fit(X=X_train, y=y_train)

In [72]:
# The model was fit on a one-column DataFrame target, so coef_ is 2-D with one
# row per target column; row 0 holds one weight per entry in `features`.
coefs = model.coef_[0]

In [73]:
# Fitted bias term (an array, one entry per target column); displayed below.
intercept = model.intercept_
intercept

array([352.11911259])

In [74]:
# First training row, restricted to the model's input columns.
df_train[features].iloc[:1]

Unnamed: 0,store,dayofweek,customers,open,promo
0,1,3,668,1,0


In [75]:
# Reproduce the model's prediction for the first training row by hand:
# dot product of the row's feature values with the fitted weights, plus the
# intercept. The values are read from X_train instead of being retyped, so the
# check stays valid if the data or the feature list changes.
X_train.iloc[0].to_numpy() @ coefs + intercept

array([5918.71726263])

In [76]:
# The model's own prediction for the first training row, for comparison
# with the manual calculation.
model.predict(X_train.iloc[:1])

array([[5918.71726263]])

In [77]:
# The single input row that was passed to predict().
X_train.iloc[:1]

Unnamed: 0,store,dayofweek,customers,open,promo
0,1,3,668,1,0


In [78]:
# In-sample predictions for every training row. The result is a 2-D array with
# a single column because the model was fit on a one-column DataFrame target.
pred_train = model.predict(X_train)
pred_train

array([[ 5918.71726263],
       [ 5809.22276437],
       [ 6750.64047619],
       ...,
       [ 7725.64445989],
       [25894.73285244],
       [ 5653.89116239]])

In [79]:
# Actual training sales, shown for comparison with the predictions.
y_train

Unnamed: 0,sales
0,5530
1,4422
2,6823
3,9941
4,4253
...,...
983755,3945
983756,7447
983757,7932
983758,24898


In [80]:
# Put predictions next to the actual values. The 2-D prediction array is
# collapsed to 1-D; the frame takes its index from the `sales` Series.
train_preds = pd.DataFrame(
    {
        'pred': pred_train.ravel(),
        'sales': y_train['sales'],
    }
)
train_preds

Unnamed: 0,pred,sales
0,5918.717263,5530
1,5809.222764,4422
2,6750.640476,6823
3,10541.062966,9941
4,5365.321166,4253
...,...,...
983755,4907.314930,3945
983756,6413.673888,7447
983757,7725.644460,7932
983758,25894.732852,24898


In [81]:
# Predictions for the held-out test rows.
pred_test = model.predict(X_test)

In [82]:
# Display the test-period predictions (2-D array with a single column).
pred_test

array([[ 6289.00153871],
       [ 9271.49730015],
       [ 7527.9254348 ],
       ...,
       [ 7226.63866794],
       [25602.26472028],
       [ 6120.75266178]])

In [83]:
# Put test-period predictions next to the actual values. The 2-D prediction
# array is collapsed to 1-D; the frame takes its index from the `sales` Series.
test_preds = pd.DataFrame(
    {
        'pred': pred_test.ravel(),
        'sales': y_test['sales'],
    }
)
test_preds

Unnamed: 0,pred,sales
983760,6289.001539,4665
983761,9271.497300,4838
983762,7527.925435,8149
983763,11494.512399,11543
983764,6221.576406,4475
...,...,...
1016090,5416.698889,5723
1016091,7512.297641,9626
1016092,7226.638668,7289
1016093,25602.264720,27508


In [84]:
from sklearn.metrics import mean_squared_error, r2_score

In [85]:
# Compute every score first, then print them grouped by metric.
mse_train = mean_squared_error(y_train, pred_train)
mse_test = mean_squared_error(y_test, pred_test)
r2_train = r2_score(y_train, pred_train)
r2_test = r2_score(y_test, pred_test)

print(f"MSE train {mse_train}")
print(f"MSE test {mse_test}")
print()
print(f"R2 train {r2_train}")
print(f"R2 test {r2_test}")

MSE train 2188564.0929350276
MSE test 2308053.3515654523

R2 train 0.8526676817253495
R2 test 0.8242831203993978
