In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the monthly lag-feature table and the item catalogue from CSV files
# located in the current working directory.
train_lag, items = (
    pd.read_csv(csv_name)
    for csv_name in ('month_lag_grouped.csv', 'items.csv')
)

In [4]:
# Downcasts float64 columns to float32 and int64/int32 columns to the smallest
# of int16/int32 that can hold their values, to reduce the memory footprint.
def downcast_dtypes(df):
    """Shrink the numeric columns of ``df`` in place and return it.

    * ``float64`` columns are cast to ``float32``.
    * ``int64`` / ``int32`` columns are cast to ``int16`` when every value fits,
      otherwise to ``int32`` when every value fits, otherwise left unchanged.
      A blind cast to ``int16`` would silently wrap values outside
      [-32768, 32767], so the range is checked first.

    Columns of any other dtype are not touched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to downcast. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining / reassignment.
    """
    # Collect both column groups before mutating anything.
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols = [c for c in df if df[c].dtype in ["int64", "int32"]]

    if float_cols:
        df[float_cols] = df[float_cols].astype(np.float32)

    for col in int_cols:
        series = df[col]
        if series.empty:
            # Nothing can overflow; use the smallest target type.
            df[col] = series.astype(np.int16)
            continue
        lowest, highest = series.min(), series.max()
        for target in (np.int16, np.int32):
            bounds = np.iinfo(target)
            if bounds.min <= lowest and highest <= bounds.max:
                df[col] = series.astype(target)
                break
        # If neither target fits, the column keeps its original dtype.
    return df

In [5]:
# Shrink the numeric dtypes of both frames; each name is rebound to the
# frame returned by downcast_dtypes.
train_lag, items = map(downcast_dtypes, (train_lag, items))

In [6]:
# Display the downcast frame; the notebook renders a truncated view (first and last rows).
train_lag

Unnamed: 0,date_block_num,shop_id,item_category_id,item_id,item_price,item_cnt_day,mon_lag_1,mon_lag_2,mon_lag_3,mon_lag_4,mon_lag_5
0,0,0,2,5572,1322.0,10.0,,,,,
1,0,0,2,5573,560.0,1.0,,,,,
2,0,0,2,5575,806.0,4.0,,,,,
3,0,0,2,5576,2231.0,5.0,,,,,
4,0,0,2,5609,2381.0,1.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
1739017,33,59,79,17717,1250.0,4.0,1.0,1.0,4.0,2.0,1.0
1739018,33,59,79,17717,1999.0,1.0,1.0,1.0,4.0,2.0,1.0
1739019,33,59,83,22087,119.0,6.0,3.0,2.0,5.0,1.0,2.0
1739020,33,59,83,22088,119.0,2.0,1.0,7.0,7.0,4.0,3.0


In [7]:
# Move the target column item_cnt_day to the last position, so that positional
# slicing (all-but-last column = features, last column = target) picks it up.
# The insertion position is derived from the frame itself rather than
# hard-coded, so the target still ends up last if the column count changes.
train_cnt_2 = train_lag.pop('item_cnt_day')  # removes the column in place and returns it
train_lag.insert(len(train_lag.columns), 'item_cnt_day', train_cnt_2)

In [17]:
# Keep only the rows that have no missing value in any column
# (rows whose lag features are NaN are discarded).
train_lag = train_lag.dropna(axis=0, how='any')

In [18]:
# Positional split: every column except the last one is a feature (x),
# and the last column is the target (y). Both are extracted as NumPy arrays.
n_features = train_lag.shape[1] - 1
x = train_lag.iloc[:, :n_features].to_numpy()
y = train_lag.iloc[:, n_features].to_numpy()

In [19]:
# One-hot encode the categorical features.
# Each encoded column is expanded into indicator columns; all other
# columns are passed through unchanged.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Positions of the categorical columns inside x
# (presumably date_block_num, shop_id, item_category_id, item_id -- verify against train_lag.columns).
categorical_positions = [0, 1, 2, 3]
one_hot_step = ('encoder', OneHotEncoder(), categorical_positions)

ct = ColumnTransformer(transformers=[one_hot_step], remainder='passthrough')
x = ct.fit_transform(x)

In [20]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(x, y, test_size=0.2, random_state=0)
X_train, X_test, Y_train, Y_test = split_parts

In [21]:
# Fit a multiple linear regression model on the training split.
from sklearn.linear_model import LinearRegression

regressor = LinearRegression().fit(X_train, Y_train)

# Predictions for the held-out test split.
y_prediction = regressor.predict(X_test)

# Cell output: coefficient of determination (R squared) on the test split.
regressor.score(X_test, Y_test)

0.4808326655423856

In [None]:
# AutoML with H2O
# Prerequisite: download and install 64-bit Java.
import h2o
from h2o.automl import H2OAutoML

# Connect to (or start) an H2O instance and upload the pandas frame to it.
h2o.init()
train_f = h2o.H2OFrame(train_lag)

# 80/20 split with a fixed seed; the second part is passed to AutoML as the leaderboard frame.
splits = train_f.split_frame(ratios=[0.8], seed=1)
train_split, test_split = splits[0], splits[1]

# Run AutoML with a 1000 second runtime limit, predicting item_cnt_day.
aml = H2OAutoML(
    max_runtime_secs=1000,
    seed=1,
    project_name="sales_forecasting",
)
aml.train(
    y="item_cnt_day",
    training_frame=train_split,
    leaderboard_frame=test_split,
)
lb = aml.leaderboard

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,1 min 29 secs
H2O_cluster_timezone:,Africa/Harare
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.1.2
H2O_cluster_version_age:,18 days
H2O_cluster_name:,H2O_from_python_kishe_y3ru29
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.614 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


Parse progress: |█████████████████████████████████████████████████████████| 100%
AutoML progress: |█
15:25:53.468: AutoML: XGBoost is not available; skipping it.

██████████████████████████████████████Failed polling AutoML progress log: [WinError 32] The process cannot access the file because it is being used by another process: 'C:\\Users\\kishe\\AppData\\Local\\Temp\\tmptqumo5ov.csv'
██