In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, silhouette_score
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier

In [3]:
full_df = pd.read_csv('../dataset/cleaned/combined_cleansed.csv')

## Get Validation Set

In [3]:
train_ordered_products = pd.read_csv('../dataset/order_products__train.csv')
orders = pd.read_csv('../dataset/orders.csv')
products = pd.read_csv('../dataset/products.csv')

In [4]:
orders = orders[orders['eval_set'] == 'train']

In [6]:
validation_set = pd.merge(orders, train_ordered_products, how = 'left', on = 'order_id')
validation_set = pd.merge(validation_set, products, how = 'left', on = 'product_id')

In [24]:
validation_set.to_csv('../dataset/cleaned/validation.csv')

In [8]:
a = list(validation_set['user_id'].unique())

In [9]:
b = list(full_df['user_id'].unique())

## Groupby

In [4]:
collaborative_df = full_df.groupby(['user_id', 'product_name', 'product_id'])['product_id'].agg('count').to_frame('purchase_count').reset_index()

In [15]:
collaborative_df[collaborative_df['user_id'] == 8]

Unnamed: 0,user_id,product_name,product_id,purchase_count
273,8,Bag of Lemons,6473,1
274,8,"Baked Beans, Vegetarian",33640,1
275,8,Blueberry Pint,15143,1
276,8,Cane Sugar,49533,1
277,8,Carrots,17794,3
278,8,Extra Sharp White Cheddar,32030,2
279,8,Garlic,34358,2
280,8,Green Beans,14992,2
281,8,Italian Extra Virgin Olive Oil,10644,1
282,8,Low Sodium Beef Broth,6141,1


In [25]:
user_group = full_df.groupby('user_id')

In [27]:
# get first entry of the groups

user_group.first()

Unnamed: 0_level_0,order_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,department,aisle
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,2539329,prior,1,2,8,15.0,196,1,0,Soda,77,7,beverages,soft drinks
2,2168274,prior,1,2,11,10.0,32792,1,0,Chipotle Beef & Pork Realstick,23,19,snacks,popcorn jerky
3,1374495,prior,1,1,14,9.0,9387,1,0,Granny Smith Apples,24,4,produce,fresh fruits
4,3343014,prior,1,6,11,19.0,36606,1,0,Sprouted Multi-Grain Bread,112,3,bakery,bread
5,2717275,prior,1,3,12,11.0,15349,1,0,Organic Raw Agave Nectar,29,13,pantry,honeys syrups nectars
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206205,969311,prior,1,4,12,30.0,27845,1,0,Organic Whole Milk,84,16,dairy eggs,milk
206206,3189322,prior,1,3,18,3.0,13817,1,0,"\""Im Pei-nut Butter\"" Double Chocolate Cookie ...",37,1,frozen,ice cream ice
206207,2166133,prior,1,6,19,1.0,47766,1,0,Organic Avocado,24,4,produce,fresh fruits
206208,2227043,prior,1,1,15,8.0,34213,1,0,Great White Bread,112,3,bakery,bread


In [28]:
# get everything from user 1

user_group.get_group(1)

Unnamed: 0,order_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,department,aisle
0,2539329,prior,1,2,8,,196,1,0,Soda,77,7,beverages,soft drinks
1,2539329,prior,1,2,8,,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91,16,dairy eggs,soy lactosefree
2,2539329,prior,1,2,8,,12427,3,0,Original Beef Jerky,23,19,snacks,popcorn jerky
3,2539329,prior,1,2,8,,26088,4,0,Aged White Cheddar Popcorn,23,19,snacks,popcorn jerky
4,2539329,prior,1,2,8,,26405,5,0,XL Pick-A-Size Paper Towel Rolls,54,17,household,paper goods
5,2398795,prior,2,3,7,15.0,196,1,1,Soda,77,7,beverages,soft drinks
6,2398795,prior,2,3,7,15.0,10258,2,0,Pistachios,117,19,snacks,nuts seeds dried fruit
7,2398795,prior,2,3,7,15.0,12427,3,1,Original Beef Jerky,23,19,snacks,popcorn jerky
8,2398795,prior,2,3,7,15.0,13176,4,0,Bag of Organic Bananas,24,4,produce,fresh fruits
9,2398795,prior,2,3,7,15.0,26088,5,1,Aged White Cheddar Popcorn,23,19,snacks,popcorn jerky


In [31]:
test = full_df.groupby('user_id')['product_name']

In [34]:
test = pd.DataFrame(test, columns = ['user', 'product_bought'])

In [36]:
test.to_csv('test.csv')

In [38]:
test.head()

Unnamed: 0,0,1
0,1,0 Soda ...
1,2,59 Chipotle Beef & Pork Realstic...
2,3,254 Granny Smit...
3,4,342 Sprouted Multi-G...
4,5,360 Organic Raw Agave ...
