In [2]:
from pathlib import Path

import numpy as np
import pandas as pd

In [34]:
# Load the raw training set from the project's raw-data folder.
train_df = pd.read_csv(filepath_or_buffer="data/raw/train.csv")

## 1. Preprocessing for Random Forest
### 1.1. Handle Missing Values

In [35]:
# Drop every row whose `num_sold` value is missing. The result is reassigned
# instead of using `inplace=True`, which pandas discourages: it brings no
# performance benefit and prevents method chaining.
train_df = train_df.dropna(subset=["num_sold"])

# Per-column count of remaining missing values (displayed as the cell output).
train_df.isna().sum()

id          0
date        0
country     0
store       0
product     0
num_sold    0
dtype: int64

### 1.2. Date Column Formatting

In [36]:
# Convert the `date` column to a datetime dtype so the `.dt` accessor can be
# used on it during feature engineering.
train_df["date"] = train_df["date"].pipe(pd.to_datetime)

# Show the per-column dtypes to confirm the conversion.
train_df.dtypes

id                   int64
date        datetime64[ns]
country             object
store               object
product             object
num_sold           float64
dtype: object

## 2. Feature Engineering
### 2.1. For Random Forest

In [37]:
# Build the Random Forest feature frame in one chain:
#   1. one-hot encode the categorical columns as integer 0/1 indicators,
#   2. derive calendar features from the `date` column,
#   3. drop the `date` column itself once the features are extracted.
train_df = (
    pd.get_dummies(train_df, columns=["country", "store", "product"], dtype=int)
    .assign(
        day=lambda frame: frame["date"].dt.day,
        month=lambda frame: frame["date"].dt.month,
        year=lambda frame: frame["date"].dt.year,
        # Monday=0 ... Sunday=6, per pandas' `dayofweek` convention.
        day_of_week=lambda frame: frame["date"].dt.dayofweek,
    )
    .drop(columns=["date"])
)

In [38]:
# Display the transformed frame to inspect the one-hot and calendar columns.
train_df

Unnamed: 0,id,num_sold,country_Canada,country_Finland,country_Italy,country_Kenya,country_Norway,country_Singapore,store_Discount Stickers,store_Premium Sticker Mart,store_Stickers for Less,product_Holographic Goose,product_Kaggle,product_Kaggle Tiers,product_Kerneler,product_Kerneler Dark Mode,day,month,year,day_of_week
1,1,973.0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,1,1,2010,4
2,2,906.0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1,1,2010,4
3,3,423.0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,1,1,2010,4
4,4,491.0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,1,1,2010,4
5,5,300.0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,1,1,2010,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
230125,230125,466.0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,31,12,2016,5
230126,230126,2907.0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,31,12,2016,5
230127,230127,2299.0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,31,12,2016,5
230128,230128,1242.0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,31,12,2016,5


## 3. Export Preprocessed Data

In [39]:
# Write the preprocessed frame to disk without the index column.
# `DataFrame.to_csv` raises OSError when the parent directory is missing
# (e.g. on a fresh checkout), so make sure it exists before writing.
output_path = Path("data/processed/preprocessed_train_rf.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
train_df.to_csv(output_path, index=False)