In [46]:
import pandas as pd
import datetime
import numpy as np
from sklearn.preprocessing import LabelEncoder

## Reading the Data 

In [47]:
df = pd.read_csv('StockX-ML-Data.csv', parse_dates=True)

df.head()

Unnamed: 0,Order Date,Brand,Sneaker Name,Sale Price,Retail Price,Release Date,Shoe Size,Buyer Region,Percent Change,Days Since Release,Original Release
0,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-Low-V2-Beluga,1097,220,9/24/16,11.0,California,398.636364,1665,0
1,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Copper,685,220,11/23/16,11.0,California,211.363636,1605,0
2,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Green,690,220,11/23/16,11.0,California,213.636364,1605,0
3,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red,1075,220,11/23/16,11.5,Kentucky,388.636364,1605,0
4,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red-2017,828,220,2/11/17,11.0,Rhode Island,276.363636,1525,0


## Data Preprocessing

In [59]:
# Label encoding the states to have values from 0 - 49
# Instatniate LabelEncoder
label_encoder = LabelEncoder()

# LabelEncode
df['Buyer Region'] = label_encoder.fit_transform(df['Buyer Region']) # CA, NY, etc.
df['Color'] = label_encoder.fit_transform(df['Sneaker Name']) # Colorway
df['Brand'] = label_encoder.fit_transform(df['Brand']) # Yeezy, Nike


In [60]:
# Parsing data times and creating month column
df['Order Date'] = pd.to_datetime(df['Order Date'], infer_datetime_format=True)
df['Month'] = pd.DatetimeIndex(df['Order Date']).month
df['Year'] = pd.DatetimeIndex(df['Order Date']).year

# Inspecting newly created df
df.head()

Unnamed: 0,Order Date,Brand,Sneaker Name,Sale Price,Retail Price,Release Date,Shoe Size,Buyer Region,Percent Change,Days Since Release,Original Release,Color,Month,Year
0,2017-09-01,0,Adidas-Yeezy-Boost-350-Low-V2-Beluga,1097,220,9/24/16,11.0,4,398.636364,1665,0,5,9,2017
1,2017-09-01,0,Adidas-Yeezy-Boost-350-V2-Core-Black-Copper,685,220,11/23/16,11.0,4,211.363636,1605,0,8,9,2017
2,2017-09-01,0,Adidas-Yeezy-Boost-350-V2-Core-Black-Green,690,220,11/23/16,11.0,4,213.636364,1605,0,9,9,2017
3,2017-09-01,0,Adidas-Yeezy-Boost-350-V2-Core-Black-Red,1075,220,11/23/16,11.5,17,388.636364,1605,0,10,9,2017
4,2017-09-01,0,Adidas-Yeezy-Boost-350-V2-Core-Black-Red-2017,828,220,2/11/17,11.0,39,276.363636,1525,0,11,9,2017


Now that we have preprocessed the data - our next task is to create the matrices:

In [12]:
# Separating features and target output
X = np.array(df[["Order Date", "Brand", "Retail Price", "Release Date", "Shoe Size", "Buyer Region", "Percent Change", "Days Since Release", "Original Release", "Color"]])
Y = np.array(df["Sale Price"])

# Checking shape of matrices
print(X.shape)
print(Y.shape)

(99956, 10)
(99956,)


In [13]:
# Number of training examples
N = Y.shape[0]

ones = np.ones((N, 1))
X_1 = np.hstack((ones, X))

print(X_1)

[[1.0 '9/1/17' 0 ... 1665 0 5]
 [1.0 '9/1/17' 0 ... 1605 0 8]
 [1.0 '9/1/17' 0 ... 1605 0 9]
 ...
 [1.0 '2/13/19' 0 ... 842 0 49]
 [1.0 '2/13/19' 0 ... 842 0 49]
 [1.0 '2/13/19' 0 ... 842 0 49]]


## Data Visualization