In [1]:
import pandas as pd

# Load the raw training sales table from the project's data directory.
df = pd.read_csv(filepath_or_buffer="../../data/raw/sales_train.csv")

In [2]:
df.shape

(30490, 1547)

In [3]:
# Drop the id / dept_id / cat_id / state_id columns.
# Rebinding `df` (rather than mutating it with inplace=True) together with
# errors="ignore" keeps this cell re-runnable: a second execution is a no-op
# instead of raising KeyError for columns that are already gone.
df = df.drop(columns=["id", "dept_id", "cat_id", "state_id"], errors="ignore")

In [4]:
import sys

# Make the project's data helpers importable. The membership check stops
# repeated executions of this cell from piling duplicate entries onto sys.path.
SRC_DATA_DIR = '../../src/data'
if SRC_DATA_DIR not in sys.path:
    sys.path.append(SRC_DATA_DIR)

from filter_items import filter_byper

# filter_byper is a project helper whose source is not shown here;
# presumably it filters items by some percentage criterion — TODO confirm.
df_filtered = filter_byper(df)

In [5]:
# Check if all 'd_' columns have zeros, and keep items that were sold at least once
#df_filtered = df[df.iloc[:, 2:].sum(axis=1) > 0]

In [6]:
# Reshape wide -> long: one row per (item_id, store_id, d) with the value
# stored in 'units_sold'.
df_filtered = pd.melt(
    df_filtered,
    id_vars=['item_id', 'store_id'],
    var_name='d',
    value_name='units_sold',
)

In [7]:
df_filtered.head()

Unnamed: 0,item_id,store_id,d,units_sold
0,HOBBIES_1_001,CA_1,d_1,0
1,HOBBIES_1_002,CA_1,d_1,0
2,HOBBIES_1_004,CA_1,d_1,0
3,HOBBIES_1_005,CA_1,d_1,0
4,HOBBIES_1_006,CA_1,d_1,0


In [8]:
del df

In [9]:
df_filtered.shape

(35604805, 4)

In [10]:
# Merger is a project class resolved through the sys.path entry added earlier.
from merge import Merger
# The displayed result has item_id, store_id, date, sell_price and revenue
# columns, and fewer rows than the melted input (29,827,642 vs 35,604,805).
# NOTE(review): what merge_df joins against is not visible here — confirm in src/data/merge.py.
df_merged = Merger().merge_df(df_filtered)

In [11]:
df_merged.head()

Unnamed: 0,item_id,store_id,date,sell_price,revenue
0,HOBBIES_1_008,CA_1,2011-01-29,0.46,5.52
1,HOBBIES_1_008,CA_1,2011-01-30,0.46,6.9
2,HOBBIES_1_008,CA_1,2011-01-31,0.46,0.0
3,HOBBIES_1_008,CA_1,2011-02-01,0.46,0.0
4,HOBBIES_1_008,CA_1,2011-02-02,0.46,0.0


In [13]:
df_merged.shape

(29827642, 5)

In [12]:
del df_filtered

In [14]:
# get_date_features is a project helper resolved through the sys.path entry added earlier.
from get_date import get_date_features
# The displayed result carries year, month, day and weekday columns and no
# longer shows a date column.
# NOTE(review): whether df_merged itself is modified is not visible here — confirm in get_date.py.
df_cleaned = get_date_features(df_merged)

In [15]:
df_cleaned.head()

Unnamed: 0,item_id,store_id,sell_price,revenue,year,month,day,weekday
0,HOBBIES_1_008,CA_1,0.46,5.52,2011,1,29,5
1,HOBBIES_1_008,CA_1,0.46,6.9,2011,1,30,6
2,HOBBIES_1_008,CA_1,0.46,0.0,2011,1,31,0
3,HOBBIES_1_008,CA_1,0.46,0.0,2011,2,1,1
4,HOBBIES_1_008,CA_1,0.46,0.0,2011,2,2,2


In [16]:
df_cleaned["revenue"].value_counts()

0.00      16405658
2.98        245999
1.98        232301
3.98        178698
5.96        156733
            ...   
110.74           1
30.10            1
37.17            1
197.46           1
41.01            1
Name: revenue, Length: 9381, dtype: int64

In [17]:
# Keep only the rows whose revenue is non-zero.
has_revenue = df_cleaned['revenue'].ne(0)
df_cleaned = df_cleaned.loc[has_revenue]

In [18]:
df_cleaned.shape

(13421984, 8)

### Save the interim DataFrame to a pickle file (uses less memory)

In [19]:
# Persist the cleaned frame so later sessions can reload it directly.
df_cleaned.to_pickle(path="../../data/interim/predictive/train_cleaned.pkl")

### Load the processed DataFrame if the kernel restarts

In [1]:
import pandas as pd
# Restore the frame written by the to_pickle cell above, so the preprocessing
# cells do not have to be re-run after a kernel restart.
# Only unpickle files produced by this project: unpickling untrusted data can execute code.
df_cleaned = pd.read_pickle("../../data/interim/predictive/train_cleaned.pkl")

## Modelling

In [2]:
import sys
import warnings

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV,  KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
warnings.filterwarnings('ignore')

In [3]:
# Column groups handed to the model pipeline.
categorical_columns = ['item_id', 'store_id']
numerical_columns = ['sell_price', 'year', 'month', 'day', 'weekday']

# Target is 'revenue'; every other column is a feature.
y = df_cleaned['revenue']
X = df_cleaned.drop('revenue', axis='columns')

In [4]:
# Make the project's predictive-model helpers importable. The membership check
# stops repeated executions of this cell from adding duplicate sys.path entries.
SRC_MODELS_DIR = "../../src/models/predictive"
if SRC_MODELS_DIR not in sys.path:
    sys.path.append(SRC_MODELS_DIR)

from train_model import ModelPipeline

# Create an instance of the ModelPipeline class with the numerical and
# categorical column lists defined above.
builder = ModelPipeline(numerical_columns, categorical_columns)

In [7]:
# Hold out 20% of the rows (shuffled, fixed seed) for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

# Train the model pipeline on the training portion.
pipeline = builder.train(X_train, y_train)

In [8]:
pipeline

In [9]:
# Score the fitted pipeline on the held-out split.
y_pred = pipeline.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error on Test Set:", mse)

Mean Squared Error on Test Set: 142.858333166615


In [10]:
# Largest predicted revenue. np.max reduces the array in one vectorised call;
# the builtin max() would iterate over every prediction in Python.
np.max(y_pred)

63.862100417692886

In [12]:
# Single hand-built row used to demonstrate inference with the trained pipeline:
# one item in one store at a given price on a given date.
new_data = pd.DataFrame(
    {
        'item_id': ['HOBBIES_1_060'],
        'store_id': ['CA_1'],
        'sell_price': [30.98],
        'date': ['2012-05-19'],
    }
)

In [15]:
from predict_model import predict_df

In [16]:
# predict_df is a project helper (imported in the previous cell) called with the
# raw new_data row, which still has a 'date' column, and the fitted pipeline.
# NOTE(review): presumably it derives the date features before predicting — confirm in predict_model.py.
predicted_sales = predict_df(new_data, pipeline)
print(predicted_sales)

[45.70769671]


## Save model

In [48]:
from joblib import dump

# Serialise the fitted pipeline to the project's models directory.
dump(value=pipeline, filename='../../models/predictive/sgd_pipeline.joblib')

['../../models/predictive/sgd_pipeline.joblib']

## Appendix

### Using a Dask DataFrame to split the DataFrame into batches

In [None]:
import dask.dataframe as dd
# Convert the Pandas DataFrame to a Dask DataFrame
# (split into 4 partitions)
ddf = dd.from_pandas(df_cleaned, npartitions=4)

# Define categorical and numerical columns
# NOTE(review): this rebinds numerical_columns without 'sell_price', unlike the
# list defined in the Modelling section — confirm the omission is intended.
categorical_columns = ['item_id', 'store_id']
numerical_columns = ['year', 'month', 'day', 'weekday']

# Cast the listed numerical columns to float32.
ddf[numerical_columns] = ddf[numerical_columns].astype('float32')

# Splitting data into X (features) and y (target variable)
# NOTE(review): .compute() collects each result back into memory, so X and y
# are no longer partitioned here — confirm this achieves the intended batching.
# These assignments also overwrite the X / y defined in the Modelling section.
X = ddf.drop(columns=['revenue']).compute()
y = ddf['revenue'].compute()

In [41]:
# Define a custom transformer to extract date features
class DateFeaturesExtractor(BaseEstimator, TransformerMixin):
    """Replace a ``date`` column with ``year``, ``month``, ``day`` and ``weekday`` columns.

    The transformer is stateless: ``fit`` learns nothing, and ``transform``
    derives the calendar parts from the ``date`` column of the frame it is given.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self`` so the transformer can sit in a Pipeline."""
        return self

    def transform(self, X):
        """Return a new frame with calendar features in place of ``date``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a ``date`` column parseable by ``pd.to_datetime``.

        Returns
        -------
        pandas.DataFrame
            ``X`` without ``date`` and with ``year``, ``month``, ``day`` and
            ``weekday`` columns (``weekday`` as given by ``Series.dt.weekday``).

        The caller's frame is left untouched: the features are attached to a
        new frame rather than written into ``X``, so the caller's data is not
        changed and no SettingWithCopyWarning is raised when ``X`` is a slice.
        """
        dates = pd.to_datetime(X['date'])
        return X.drop(columns=['date']).assign(
            year=dates.dt.year,
            month=dates.dt.month,
            day=dates.dt.day,
            weekday=dates.dt.weekday,
        )

# Specify numerical and categorical columns
# (numerical_columns is not referenced again in this cell)
numerical_columns = ['year', 'month', 'day', 'weekday']
categorical_columns = ['item_id', 'store_id']

# Preprocessing pipeline for date features:
# expand 'date' into calendar parts, then standardise them
date_feature_transformer = Pipeline([
    ('date_features', DateFeaturesExtractor()),
    ('scaler', StandardScaler())
])

# Preprocessing pipeline for categorical features
# (one-hot encoding with the first category of each column dropped)
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(drop='first'))
])

# Combine transformers for date and categorical features
# The input frame must therefore contain 'date', 'item_id' and 'store_id';
# columns not listed here (e.g. a price column) are not passed to the model.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', date_feature_transformer, ['date']),
        ('cat', categorical_transformer, categorical_columns),
    ])

# Model pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', SGDRegressor())
])

# Separate features (X) and target (y)
# NOTE(review): `df` is deleted earlier in this notebook and no frame shown here
# has a 'volume of sales' column; the recorded run of this cell ended in
# "ValueError: A given column is not a column of the dataframe".
# Confirm which frame and target column were intended before relying on this cell.
X = df.drop(columns=['volume of sales'])
y = df['volume of sales']

# Fit the pipeline on your data
pipeline.fit(X, y)

# Now you can use the trained pipeline for prediction


ValueError: A given column is not a column of the dataframe

In [None]:
# Hyperparameter grid for grid search
param_grid = {
    'regressor__alpha': [0.0001, 0.001, 0.01, 0.1],
    'regressor__max_iter': [1000, 5000, 10000],
}

# Grid search with 5-fold cross-validation.
# Tune on the training split only: searching over the full X, y would let the
# rows later used as X_test influence the chosen hyperparameters, so the
# test-set error would no longer be an honest estimate.
grid_search = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# With the default refit=True, best_estimator_ has already been refitted on all
# the data passed to fit() (here X_train, y_train), so no second fit is needed.
final_pipeline = grid_search.best_estimator_

In [None]:
# Score the tuned pipeline on the held-out split.
y_pred = final_pipeline.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error on Test Set:", mse)