In [32]:
import pandas as pd
import datetime
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn import svm

## Reading the Data 

In [9]:
df = pd.read_csv('StockX-ML-Data.csv', parse_dates=True)

df.head()

Unnamed: 0,Order Date,Brand,Sneaker Name,Sale Price,Retail Price,Release Date,Shoe Size,Buyer Region,Percent Change,Days Since Release,Original Release
0,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-Low-V2-Beluga,1097,220,9/24/16,11.0,California,398.636364,1665,0
1,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Copper,685,220,11/23/16,11.0,California,211.363636,1605,0
2,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Green,690,220,11/23/16,11.0,California,213.636364,1605,0
3,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red,1075,220,11/23/16,11.5,Kentucky,388.636364,1605,0
4,9/1/17,Yeezy,Adidas-Yeezy-Boost-350-V2-Core-Black-Red-2017,828,220,2/11/17,11.0,Rhode Island,276.363636,1525,0


## Data Preprocessing

In [10]:
# Label encoding the states to have values from 0 - 49
# Instatniate LabelEncoder
label_encoder = LabelEncoder()

# LabelEncode
df['Buyer Region'] = label_encoder.fit_transform(df['Buyer Region']) # CA, NY, etc.
df['Color'] = label_encoder.fit_transform(df['Sneaker Name']) # Colorway
df['Brand'] = label_encoder.fit_transform(df['Brand']) # Yeezy, Nike


In [20]:
# Parsing data times and creating month column
df['Order Date'] = pd.to_datetime(df['Order Date'], infer_datetime_format=True)
df['Month'] = pd.DatetimeIndex(df['Order Date']).month
df['Year'] = pd.DatetimeIndex(df['Order Date']).year

# Inspecting newly created 
# df.head()

Now that we have preprocessed the data - our next task is to create the matrices:

In [21]:
# Separating features and target output
X = np.array(df[["Order Date", "Brand", "Retail Price", "Release Date", "Shoe Size", "Buyer Region", "Percent Change", "Days Since Release", "Original Release", "Color"]])
Y = np.array(df["Sale Price"])

# Checking shape of matrices
print(X)
#print(Y.shape)

[[Timestamp('2017-09-01 00:00:00') 0 220 ... 1665 0 5]
 [Timestamp('2017-09-01 00:00:00') 0 220 ... 1605 0 8]
 [Timestamp('2017-09-01 00:00:00') 0 220 ... 1605 0 9]
 ...
 [Timestamp('2019-02-13 00:00:00') 0 220 ... 842 0 49]
 [Timestamp('2019-02-13 00:00:00') 0 220 ... 842 0 49]
 [Timestamp('2019-02-13 00:00:00') 0 220 ... 842 0 49]]


In [22]:
# Number of training examples
N = Y.shape[0]

ones = np.ones((N, 1))
X_1 = np.hstack((ones, X))

print(X_1)

[[1.0 Timestamp('2017-09-01 00:00:00') 0 ... 1665 0 5]
 [1.0 Timestamp('2017-09-01 00:00:00') 0 ... 1605 0 8]
 [1.0 Timestamp('2017-09-01 00:00:00') 0 ... 1605 0 9]
 ...
 [1.0 Timestamp('2019-02-13 00:00:00') 0 ... 842 0 49]
 [1.0 Timestamp('2019-02-13 00:00:00') 0 ... 842 0 49]
 [1.0 Timestamp('2019-02-13 00:00:00') 0 ... 842 0 49]]


## Data Visualization

## Linear Regression
#### Given a shoe's information, can we predict its resale price?

In [23]:
# Retrieving feature data and target variable
X = df[['Brand', 'Sale Price', 'Retail Price', 'Shoe Size', 'Buyer Region', 'Percent Change', 'Days Since Release', 'Original Release', 'Color', 'Month', 'Year']]  

y = df['Sale Price']

# Splitting data into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y)

## Linear Regression with Lasso Regularization

In [30]:
acc_train_linreg = []
acc_test_linreg = []
c_linreg = []

def linregression_model(c, X_train, y_train, X_test, y_test):
    linear_regression = LinearRegression()
    linear_regression.fit(X_train, y_train.ravel())
    y_hat_train = linear_regression.predict(X_train)
    acc_train = np.mean(y_train == y_hat_train)
    acc_train_linreg.append(acc_train)
    print("C = ", c)
    print("Accuracy on training data: %f" % acc_train)
    y_hat_test = linear_regression.predict(X_test)
    acc_test = np.mean(y_hat_test == y_test)
    acc_test_linreg.append(acc_test)
    print("Accuracy on testing data: %f" % acc_test)
    print()

In [31]:
c_vals = [0.0001, 0.001, 0.01, 0.1, 1, 10]

for c in c_vals:
    linregression_model(c, X_train, y_train, X_test, y_test)

C =  0.0001
Accuracy on training data: 0.027479
Accuracy on testing data: 0.028012

C =  0.001
Accuracy on training data: 0.027479
Accuracy on testing data: 0.028012

C =  0.01
Accuracy on training data: 0.027479
Accuracy on testing data: 0.028012

C =  0.1
Accuracy on training data: 0.027479
Accuracy on testing data: 0.028012

C =  1
Accuracy on training data: 0.027479
Accuracy on testing data: 0.028012

C =  10
Accuracy on training data: 0.027479
Accuracy on testing data: 0.028012



## Logistic Regression
#### Given a shoe, can we predict if its resale price doubled in price (with respect to its original retail price)?


In [108]:
# Retrieving feature data and target variable
X = df[['Brand', 'Sale Price', 'Retail Price', 'Shoe Size', 'Buyer Region', 'Percent Change', 'Days Since Release', 'Original Release', 'Color', 'Month', 'Year']]  

y = df['Sale Price']

# Splitting data into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [109]:
# Instantiating a LogisticRegression instance
logistic_regression = LogisticRegression()

logistic_regression.fit( X_train, y_train )

y_hat_train = logistic_regression.predict( X_train )

print(y_hat_train)
print('Hi')



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

## Support Vector Machine