# NNMF with all the data

In [1]:
#imports

import tarfile
import pandas as pd
import pyarrow.parquet as pq
import numpy as np

In [2]:
dataset_all = pq.ParquetDataset("bigd/M148/hitdata7days")

In [3]:
columns = ['cookieid','productlist','ordernumber']
data_all = dataset_all.read(columns=columns).combine_chunks()

In [4]:
data_all_pd = data_all.to_pandas()

In [5]:
data_all_pd.head()

Unnamed: 0,cookieid,productlist,ordernumber,visitday
0,18976392225474675141320040725069689144,,,10
1,22458651700155932098567309246136578371,;NRWP5;;;;eVar1=Kitchen|eVar2=Kitchen:Kitchen ...,,10
2,81302138040689021608603450882312197152,,,10
3,59091259204920318791245315839306482106,,,10
4,53802118384124243215211591740223846690,,,10


In [6]:
data_all_pd.shape

(42730149, 4)

## Subset by users who ordered something 

In [7]:
len(data_all_pd["ordernumber"].unique()) - 1

117886

In [8]:
data_orders = data_all_pd[data_all_pd.ordernumber.notnull()]
len(data_orders)

118049

In [9]:
len(data_orders["cookieid"].unique())

103138

In [10]:
# Number of users who placed more than one order
118049 - 103138

14911

In [11]:
data_orders.head(10)

Unnamed: 0,cookieid,productlist,ordernumber,visitday
321,52882151623272742032948034078925051832,Video Games|Nintendo Switch;NRXJK;1;65.99;;,ORD0116474530,10
555,79755164678756084974501978337153484560,Toys|Pretend Play & Dress up|Play Tents;NHC0V;...,ORD0116455911,10
721,5443955122341888424003495602170301473,Bed & Bath|Bedding|Comforters;NOPW4;1;142.99;;...,ORD0116475324,10
1034,15567279697543469666445773480724187157,"Personal Care|Fragrances;NQ9WP;1;94.99;;,Perso...",ORD0116479100,10
1196,71667249622638493801005527041676393041,"Shoes|Men's Shoes|Sandals;NL31C;1;49.99;;,Pers...",ORD0116463219,10
1412,61042077885887970553598772945168024836,Toys|Bikes & Riding Toys|Bikes & Trikes;NQ5QK;...,ORD0116467305,10
1690,50173036520586681086289374017581049714,Kitchen|Kitchen Appliances|Mixers;NQ271;1;530....,ORD0116458048,10
1740,75806096112973408034688549910937655265,Toys|Action Figures & Playsets|Playsets;NRFAL;...,ORD0116468364,10
1773,69355983623503946334291902547906312305,Home|Patio & Garden|Garden & Lawn Care|Trimmer...,ORD0116470167,10
2300,32066281582833015954444801797185030134,"Bed & Bath|Bedding|Comforters;NQ8ST;1;71.99;;,...",ORD0116467351,10


In [12]:
## Fix the index
data_orders = data_orders.reset_index()
data_orders = data_orders.drop('index', axis=1)

## Go through productlist column and extract our sections of products 

In [13]:
import re
products = []
## CHANGED for JUPYTER 
for i in range(0,len(data_orders)): 
    extracted_string = str(data_orders["productlist"][i])
    first_product = re.search('([\w &]*)', extracted_string)
    additional_products = re.findall(',([\w &]*)\|', extracted_string)
    products.append(first_product[0])
    products.extend(additional_products)

In [14]:
products_np = np.array(products)
products_unique = np.unique(products_np)

## Create a data frame with Unique_Products as rows and cookie IDS as columns 

In [15]:
cookies_unique = data_orders["cookieid"].unique()

In [16]:
df_matrix = pd.DataFrame(0, index = products_unique, columns=cookies_unique)
df_matrix.head(10)

Unnamed: 0,52882151623272742032948034078925051832,79755164678756084974501978337153484560,5443955122341888424003495602170301473,15567279697543469666445773480724187157,71667249622638493801005527041676393041,61042077885887970553598772945168024836,50173036520586681086289374017581049714,75806096112973408034688549910937655265,69355983623503946334291902547906312305,32066281582833015954444801797185030134,...,57179438968080530363688548149629368028,89438258878661526742816006335978355240,61091189551588009577815950000410715623,29320100099378218248063544989477770254,21333409446351308826399880882943475377,3842196529366479506449823684258936355,59050303452684471876646873770423470660,27518133002633969172902995216203005732,8320986103372012575860537784589312884,15496251200612831365233286454677036970
,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Crafts & Holidays,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Crafts & Sewing,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Music & Books,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Tea & Espresso,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Trains & RC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Automotive,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Baby,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bed & Bath,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Clothing,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Fill in our data frame 

In [17]:
for i in range(0,len(cookies_unique)):
    
    per_user_df = data_orders[data_orders["cookieid"] == cookies_unique[i]]
    
    if (len(per_user_df) > 1):
        per_user_df_index = per_user_df.reset_index()
        per_user_df_index = per_user_df_index.drop('index', axis=1)
        per_user_df_index

        for j in range(0, len(per_user_df)):
            extracted_string = str(per_user_df_index["productlist"][j])
            first_product = re.search('([\w &]*)', extracted_string)[0]
            additional_products = re.findall(',([\w &]*)\|', extracted_string)
            df_matrix.loc[first_product, cookies_unique[i]] = df_matrix.loc[first_product, cookies_unique[i]] + 1
            
            if(len(additional_products)>0):
                for k in range(0, len(additional_products)):
                    df_matrix.loc[additional_products[k], cookies_unique[i]] = df_matrix.loc[additional_products[k], cookies_unique[i]] + 1

                  
                  
    
    else: 
        extracted_string = str(data_orders["productlist"][i])
        first_product = re.search('([\w &]*)', extracted_string)[0]
        additional_products = re.findall(',([\w &]*)\|', extracted_string)
        df_matrix.loc[first_product, cookies_unique[i]] = df_matrix.loc[first_product, cookies_unique[i]] + 1
        
        if(len(additional_products)>0):
                for k in range(0, len(additional_products)):
                    df_matrix.loc[additional_products[k], cookies_unique[i]] = df_matrix.loc[additional_products[k], cookies_unique[i]] + 1
        
        

In [18]:
df_matrix.head(5)

Unnamed: 0,52882151623272742032948034078925051832,79755164678756084974501978337153484560,5443955122341888424003495602170301473,15567279697543469666445773480724187157,71667249622638493801005527041676393041,61042077885887970553598772945168024836,50173036520586681086289374017581049714,75806096112973408034688549910937655265,69355983623503946334291902547906312305,32066281582833015954444801797185030134,...,57179438968080530363688548149629368028,89438258878661526742816006335978355240,61091189551588009577815950000410715623,29320100099378218248063544989477770254,21333409446351308826399880882943475377,3842196529366479506449823684258936355,59050303452684471876646873770423470660,27518133002633969172902995216203005732,8320986103372012575860537784589312884,15496251200612831365233286454677036970
,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Crafts & Holidays,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Crafts & Sewing,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Music & Books,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Tea & Espresso,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Write df_matrix to csv

In [19]:
df_matrix.to_csv('df_matrix2.csv')

# Bring in NNMF 

In [20]:
## Converting our data frame to a matrix 

df_matrix.to_numpy()
A = np.asmatrix(df_matrix)

I randomly choose 5 categories but please feel free to play around with having more

In [22]:
## Doing NNMF 
import plotly.graph_objects as go
from sklearn.decomposition import NMF
category_number = 5

nmf = NMF(category_number) ## Setting R equal to 4
nmf.fit(A)



NMF(n_components=5)

In [23]:
H = pd.DataFrame(np.round(nmf.components_,category_number), columns=df_matrix.columns)
H.index = ['Category1', 'Category2','Category3','Category4','Category5' ]
# H

In [24]:
H

Unnamed: 0,52882151623272742032948034078925051832,79755164678756084974501978337153484560,5443955122341888424003495602170301473,15567279697543469666445773480724187157,71667249622638493801005527041676393041,61042077885887970553598772945168024836,50173036520586681086289374017581049714,75806096112973408034688549910937655265,69355983623503946334291902547906312305,32066281582833015954444801797185030134,...,57179438968080530363688548149629368028,89438258878661526742816006335978355240,61091189551588009577815950000410715623,29320100099378218248063544989477770254,21333409446351308826399880882943475377,3842196529366479506449823684258936355,59050303452684471876646873770423470660,27518133002633969172902995216203005732,8320986103372012575860537784589312884,15496251200612831365233286454677036970
Category1,0.00189,0.09305,0.0,0.23124,0.0,0.09362,0.0022,0.14653,0.09613,0.00016,...,0.00038,0.00043,0.0,5e-05,0.0,0.00113,0.0,0.00176,0.09362,0.00189
Category2,0.0,0.0,0.0,0.0,0.0,0.0,0.00078,0.00083,0.15746,9e-05,...,0.00014,0.00023,0.03893,0.00014,0.11679,0.00215,0.03893,0.07783,0.0,0.0
Category3,0.00114,0.0,0.23993,0.0,0.0,0.0,0.07333,0.01831,0.01276,0.06909,...,0.00162,0.00893,0.0,0.00039,0.0,0.08043,0.0,0.00068,0.0,0.00114
Category4,0.00416,0.0,0.0,0.20239,0.05002,0.0,0.01118,0.01314,0.01112,0.00304,...,0.00073,0.00439,0.0,0.00042,0.0,0.00821,0.0,0.00374,0.0,0.00416
Category5,0.00556,0.05154,0.0,0.09275,0.05056,0.0,0.01816,0.08128,0.01729,0.00707,...,0.00099,0.00954,0.0,0.00021,0.0,0.01499,0.0,0.00497,0.0,0.00556


In [25]:
W = pd.DataFrame(np.round(nmf.transform(A),category_number), columns=H.index)
W.index = df_matrix.index
#W



In [26]:
W

Unnamed: 0,Category1,Category2,Category3,Category4,Category5
,0.00665,0.00932,0.00751,0.01292,0.01128
Crafts & Holidays,0.10176,0.11348,0.13241,0.16516,0.14336
Crafts & Sewing,0.04425,0.03981,0.03008,0.06651,0.03366
Music & Books,0.13193,0.04754,0.02186,0.05424,0.06242
Tea & Espresso,0.00997,0.00314,0.07473,0.05423,0.08523
Trains & RC,1.67953,0.03071,0.0,0.0,0.0
Automotive,0.02695,0.09231,0.11165,0.15763,0.08148
Baby,0.83523,0.11241,0.16681,0.20665,0.20017
Bed & Bath,0.0,0.0,16.05187,0.0,0.0
Clothing,0.0,25.66124,0.0,0.0,0.0


In [27]:
reconstructed = pd.DataFrame(np.round(np.dot(W,H),category_number), columns=df_matrix.columns)
reconstructed.index = df_matrix.index
reconstructed

Unnamed: 0,52882151623272742032948034078925051832,79755164678756084974501978337153484560,5443955122341888424003495602170301473,15567279697543469666445773480724187157,71667249622638493801005527041676393041,61042077885887970553598772945168024836,50173036520586681086289374017581049714,75806096112973408034688549910937655265,69355983623503946334291902547906312305,32066281582833015954444801797185030134,...,57179438968080530363688548149629368028,89438258878661526742816006335978355240,61091189551588009577815950000410715623,29320100099378218248063544989477770254,21333409446351308826399880882943475377,3842196529366479506449823684258936355,59050303452684471876646873770423470660,27518133002633969172902995216203005732,8320986103372012575860537784589312884,15496251200612831365233286454677036970
,0.00014,0.0012,0.0018,0.0052,0.00122,0.00062,0.00092,0.00221,0.00254,0.00064,...,4e-05,0.00024,0.00036,1e-05,0.00109,0.00091,0.00036,0.00085,0.00062,0.00014
Crafts & Holidays,0.00183,0.01686,0.03177,0.07025,0.01551,0.00953,0.01447,0.03125,0.03366,0.01069,...,0.00053,0.00334,0.00442,0.00017,0.01325,0.01451,0.00442,0.01043,0.00953,0.00183
Crafts & Sewing,0.00058,0.00585,0.00722,0.02682,0.00503,0.00414,0.00369,0.01068,0.01223,0.00253,...,0.00015,0.00091,0.00155,5e-05,0.00465,0.00361,0.00155,0.00361,0.00414,0.00058
Music & Books,0.00085,0.01549,0.00524,0.04727,0.00587,0.01235,0.00367,0.02556,0.02213,0.00214,...,0.00019,0.0011,0.00185,6e-05,0.00555,0.00339,0.00185,0.00446,0.01235,0.00085
Tea & Espresso,0.0008,0.00532,0.01793,0.02119,0.00702,0.00093,0.00766,0.01047,0.00448,0.00593,...,0.00025,0.00172,0.00012,7e-05,0.00037,0.00775,0.00012,0.00094,0.00093,0.0008
Trains & RC,0.00317,0.15628,0.0,0.38837,0.0,0.15724,0.00372,0.24613,0.16629,0.00027,...,0.00064,0.00073,0.0012,9e-05,0.00359,0.00196,0.0012,0.00535,0.15724,0.00317
Automotive,0.00129,0.00671,0.02679,0.04569,0.012,0.00252,0.01156,0.01476,0.02171,0.00878,...,0.0004,0.0025,0.00359,0.00014,0.01078,0.01172,0.00359,0.0083,0.00252,0.00129
Baby,0.00374,0.08803,0.04002,0.25353,0.02046,0.07819,0.0201,0.14452,0.10588,0.01371,...,0.00095,0.00469,0.00438,0.00025,0.01313,0.0193,0.00438,0.0121,0.07819,0.00374
Bed & Bath,0.0183,0.0,3.85133,0.0,0.0,0.0,1.17708,0.29391,0.20482,1.10902,...,0.026,0.14334,0.0,0.00626,0.0,1.29105,0.0,0.01092,0.0,0.0183
Clothing,0.0,0.0,0.0,0.0,0.0,0.0,0.02002,0.0213,4.04062,0.00231,...,0.00359,0.0059,0.99899,0.00359,2.99698,0.05517,0.99899,1.99721,0.0,0.0
