In [1]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.feature_extraction import DictVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import accuracy_score


In [23]:
orders = pd.read_csv('instacart/orders.csv')
products = pd.read_csv('instacart/products.csv')
order_products_prior = pd.read_csv('instacart/order_products__prior.csv')

In [29]:
print(orders.shape)

print(list(orders))
ordersSet = orders[orders['eval_set'].str.contains("prior")]
print(ordersSet.shape)

(3421083, 7)
['order_id', 'user_id', 'eval_set', 'order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order']
(3214874, 7)


In [30]:
twoColOrders = ordersSet.filter(["order_id", "user_id"], axis=1)
print(twoColOrders.shape)

(3214874, 2)


In [31]:
threeColPriors = order_products_prior.filter(["order_id", "product_id", "reordered"], axis =1)
print(threeColPriors.shape)

(32434489, 3)


In [45]:
#df.groupby('id').agg(lambda x: x.tolist())
# & (orders['user_id'] < 20000)
userProduct = threeColPriors.set_index('order_id').join(twoColOrders.set_index('order_id'))
print(list(userProduct))
print(userProduct.shape)

['product_id', 'reordered', 'user_id']
(32434489, 3)


In [73]:
totalSet = userProduct[userProduct['user_id'] <= 50000]
print(totalSet.shape)

(7882635, 3)


In [74]:
totalSet.iloc[0:10]

Unnamed: 0_level_0,product_id,reordered,user_id
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6,40462,0,22352
6,15873,0,22352
6,41897,0,22352
8,23423,1,3107
13,17330,0,45082
13,27407,0,45082
13,35419,0,45082
13,196,0,45082
13,44635,0,45082
13,26878,0,45082


In [90]:
totalSet = totalSet.groupby('user_id').agg({
    "product_id": lambda x: x.tolist(),
    "reordered": 'mean'

})

In [91]:
temp = totalSet[totalSet['reordered'] >= .5]

In [92]:
print(temp.shape)

(19961, 2)


In [93]:
totalSet.shape

(50000, 2)

In [94]:
totalSet['reordered'] = np.where(totalSet['reordered'] >= .5, 1, totalSet['reordered'])
totalSet['reordered'] = np.where(totalSet['reordered'] < .5, 0, totalSet['reordered'])

In [95]:
totalSet.iloc[0:10]

Unnamed: 0_level_0,product_id,reordered
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"[[196, 12427, 10258, 25133, 10326, 17122, 4178...",1.0
2,"[[49451, 32792, 32139, 34688, 36735, 37646, 22...",0.0
3,"[[38596, 21903, 248, 40604, 8021, 17668, 21137...",1.0
4,"[[22199, 25146, 1200, 17769, 43704, 37646, 118...",0.0
5,"[[27344, 24535, 43693, 40706, 16168, 21413, 13...",0.0
6,"[[38293, 20323, 40992, 21903, 45007, 11068, 10...",0.0
7,"[[13198, 42803, 8277, 37602, 40852, 4920, 4945...",1.0
8,"[[21903, 28985, 6473, 39110, 39812, 20920, 320...",0.0
9,"[[27973, 481, 27966, 33754, 4957, 13351, 40571...",0.0
10,"[[46979, 47380, 20995, 43014, 15011, 27156, 13...",0.0


In [96]:
trainSet = totalSet.iloc[0:40000]
testSet = totalSet.iloc[40000:50000]
print(trainSet.shape)
print(testSet.shape)

(40000, 2)
(10000, 2)


In [97]:
print(type(trainSet.iloc[1][0]))
list = trainSet.iloc[1][0]
print(type(list[0]))

<class 'list'>
<class 'list'>


In [152]:
def features(productList):
    featureList = []
    for item in productList[0]:
        featureList.append(str(item))
    return Counter(featureList)

In [153]:
features(xtrainRaw[1])

Counter({'196': 10,
         '12427': 10,
         '10258': 9,
         '25133': 8,
         '10326': 1,
         '17122': 1,
         '41787': 1,
         '13176': 2,
         '30450': 1,
         '13032': 3,
         '26405': 2,
         '49235': 2,
         '46149': 3,
         '26088': 2,
         '14084': 1,
         '39657': 1,
         '38928': 1,
         '35951': 1})

In [154]:
xtrainRaw = trainSet.iloc[0:, 0]
ytrain = trainSet.iloc[0:, 1]

xtestRaw = testSet.iloc[0:, 0]
ytest = testSet.iloc[0:, 1]

vect = DictVectorizer()
xtrain = vect.fit_transform(features(row) for row in xtrainRaw)
xtest = vect.transform(features(row) for row in xtestRaw)

In [155]:
type(xtrain)
print(xtrain.shape)
print(xtest.shape)

(40000, 45719)
(10000, 45719)


In [156]:
from sklearn.svm import SVC

clist = [.0001, .001, .01, .1, 1.0, 10, 100, 1000]
for c in clist:
    classifier = SVC(kernel='linear', C=c, random_state=123)
    classifier.fit(xtrain[:1000], ytrain[:1000])

    print("Training accuracy: %0.6f for c of %.6f" % (accuracy_score(ytrain[:1000], classifier.predict(xtrain[:1000])), c))
    print("Test accuracy: %0.6f" % accuracy_score(ytest, classifier.predict(xtest)))

Training accuracy: 0.715000 for c of 0.000100
Test accuracy: 0.715500
Training accuracy: 0.846000 for c of 0.001000
Test accuracy: 0.763800
Training accuracy: 0.955000 for c of 0.010000
Test accuracy: 0.761400
Training accuracy: 0.999000 for c of 0.100000
Test accuracy: 0.741500
Training accuracy: 1.000000 for c of 1.000000
Test accuracy: 0.740200
Training accuracy: 1.000000 for c of 10.000000
Test accuracy: 0.740200
Training accuracy: 1.000000 for c of 100.000000
Test accuracy: 0.740200
Training accuracy: 1.000000 for c of 1000.000000
Test accuracy: 0.740200


In [157]:
clist = [.0001, .001, .01, .1, 1.0, 10, 100, 1000]
for c in clist:
    classifier = SVC(kernel='rbf', C=c, random_state=123)
    classifier.fit(xtrain[:1000], ytrain[:1000])

    print("Training accuracy: %0.6f for c of %.6f" % (accuracy_score(ytrain[:1000], classifier.predict(xtrain[:1000])), c))
    print("Test accuracy: %0.6f" % accuracy_score(ytest, classifier.predict(xtest)))

Training accuracy: 0.590000 for c of 0.000100
Test accuracy: 0.596700
Training accuracy: 0.590000 for c of 0.001000
Test accuracy: 0.596700
Training accuracy: 0.590000 for c of 0.010000
Test accuracy: 0.596700
Training accuracy: 0.594000 for c of 0.100000
Test accuracy: 0.599600
Training accuracy: 0.691000 for c of 1.000000
Test accuracy: 0.702200
Training accuracy: 0.812000 for c of 10.000000
Test accuracy: 0.788300
Training accuracy: 0.921000 for c of 100.000000
Test accuracy: 0.828700
Training accuracy: 0.993000 for c of 1000.000000
Test accuracy: 0.828000


In [158]:
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(alpha=1)
clf.fit(xtrain, ytrain)
print("Test accuracy: %0.6f" % accuracy_score(ytest, classifier.predict(xtest)))

Test accuracy: 0.828000


In [159]:
print("Test accuracy: %0.6f" % accuracy_score(ytest, clf.predict(xtest)))

Test accuracy: 0.842000


In [168]:
from sklearn.ensemble import RandomForestClassifier

depths = [5, 10 , 15, 20, 25, 30, 35, 40]

for depth in depths:
    RF = RandomForestClassifier(n_estimators=100, max_depth=depth, random_state=0)
    RF.fit(xtrain, ytrain)
    print("Test accuracy: %0.6f with a depth of %d" % (accuracy_score(ytest, RF.predict(xtest)), depth))

Test accuracy: 0.663800 with a depth of 5
Test accuracy: 0.715100 with a depth of 10
Test accuracy: 0.742900 with a depth of 15
Test accuracy: 0.763100 with a depth of 20
Test accuracy: 0.780300 with a depth of 25
Test accuracy: 0.790000 with a depth of 30
Test accuracy: 0.797900 with a depth of 35
Test accuracy: 0.804600 with a depth of 40


In [169]:
estimators = [50, 100 , 120, 150, 180, 200, 225, 250]

for estimator in estimators:
    RF = RandomForestClassifier(n_estimators=estimator, max_depth=40, random_state=0)
    RF.fit(xtrain, ytrain)
    print("Test accuracy: %0.6f with a estimator of %d" % (accuracy_score(ytest, RF.predict(xtest)), estimator))

Test accuracy: 0.804600 with a estimator of 50
Test accuracy: 0.804600 with a estimator of 100
Test accuracy: 0.803700 with a estimator of 120
Test accuracy: 0.804100 with a estimator of 150
Test accuracy: 0.802400 with a estimator of 180
Test accuracy: 0.804300 with a estimator of 200
Test accuracy: 0.804800 with a estimator of 225
Test accuracy: 0.804900 with a estimator of 250


In [170]:
classifier = SVC(kernel='rbf', C=100, random_state=123)
classifier.fit(xtrain, ytrain)
print("Test accuracy: %0.6f" % accuracy_score(ytest, classifier.predict(xtest)))

Test accuracy: 0.885300


In [171]:
classifier = SVC(kernel='linear', C=.001, random_state=123)
classifier.fit(xtrain[:1000], ytrain[:1000])
print("Test accuracy: %0.6f" % accuracy_score(ytest, classifier.predict(xtest)))

Test accuracy: 0.763800
