# Manual Feature Selection for Prediction

In [1]:
#imports

import tarfile
import pandas as pd
import pyarrow.parquet as pq
import numpy as np

## Load data for all days

In [2]:
# Open the Parquet dataset stored under bigd/M148/hitdata7days.
# The path is relative, so it resolves against the kernel's current working directory.
dataset_all = pq.ParquetDataset("bigd/M148/hitdata7days")

In [3]:
# Read only the columns this analysis needs, then combine each column's chunks.
# NOTE(review): the frames displayed later also carry a `visitday` column that is
# not requested here — presumably a partition key of the dataset; TODO confirm.
columns = [
    "cookieid",
    "ordernumber",
    "devicetype",
]
data_all = dataset_all.read(columns=columns).combine_chunks()

In [4]:
# Convert the Arrow table into a pandas DataFrame for the rest of the analysis.
data_all_pd = data_all.to_pandas()

In [5]:
# Preview the last 10 rows of the full frame.
data_all_pd.tail(10)

Unnamed: 0,cookieid,ordernumber,devicetype,visitday
42730139,66766296197672086254571863561089794102,,Tablet,16
42730140,42336954984504414921774913107250050258,,Mobile,16
42730141,17693983337648543312395385556212515037,,Desktop,16
42730142,85153173066658247329001202340424923203,,Mobile,16
42730143,81128382723058624884928245544453957460,,Desktop,16
42730144,83103452230070488727514707142695403239,,Mobile,16
42730145,78978313603212467237685845152760013485,,Tablet,16
42730146,38462509364520503943541087382094206104,,Mobile,16
42730147,53686880511144008613364683438838426866,,Mobile,16
42730148,15214929409965487617343822276678408192,,Tablet,16


In [6]:
# (rows, columns) of the full frame.
data_all_pd.shape

(42730149, 4)

In [7]:
# Number of distinct device types; dropna=False counts a missing value as its own
# category, matching what len(<series>.unique()) would report.
data_all_pd["devicetype"].nunique(dropna=False)

4

### Subset to rows where the visitor placed an order

In [8]:
# Keep only the rows that carry an order number, then report how many there are.
data_orders = data_all_pd.loc[data_all_pd["ordernumber"].notna()]
data_orders.shape[0]

118049

In [9]:
# Number of distinct cookie IDs among the order rows (missing values, if any,
# are counted as one category).
data_orders["cookieid"].nunique(dropna=False)

103138

In [10]:
# Surplus order rows: rows carrying an order number minus distinct ordering cookies.
# Computed from the live frame rather than from hand-copied literals so it cannot
# drift out of sync with the data.
#
# NOTE: this is NOT the number of visitors with more than one order. It counts the
# extra rows beyond the first for each cookie, so it is only an upper bound: one
# visitor can contribute several extra rows, and the same order number can appear
# on more than one row. For the actual count of repeat customers use
#     data_orders.groupby("cookieid")["ordernumber"].nunique().gt(1).sum()
len(data_orders) - data_orders["cookieid"].nunique(dropna=False)

14911

In [11]:
# Number of distinct order numbers among the order rows.
data_orders["ordernumber"].nunique(dropna=False)

117886

In [10]:
# Preview the first 10 rows that have an order number.
data_orders.head(10)

Unnamed: 0,cookieid,ordernumber,devicetype,visitday
321,52882151623272742032948034078925051832,ORD0116474530,Mobile,10
555,79755164678756084974501978337153484560,ORD0116455911,Desktop,10
721,5443955122341888424003495602170301473,ORD0116475324,Mobile,10
1034,15567279697543469666445773480724187157,ORD0116479100,Mobile,10
1196,71667249622638493801005527041676393041,ORD0116463219,Mobile,10
1412,61042077885887970553598772945168024836,ORD0116467305,Desktop,10
1690,50173036520586681086289374017581049714,ORD0116458048,Desktop,10
1740,75806096112973408034688549910937655265,ORD0116468364,Mobile,10
1773,69355983623503946334291902547906312305,ORD0116470167,Tablet,10
2300,32066281582833015954444801797185030134,ORD0116467351,Tablet,10


In [11]:
# Distinct cookie IDs that appear on at least one row with an order number
# (missing values, if any, are counted as one category).
data_orders["cookieid"].nunique(dropna=False)

103138

### Subset by unique cookie IDs (among rows that already have an order number)

In [16]:
# Distinct cookie IDs among the order rows, in order of first appearance.
cookies_unique = pd.unique(data_orders["cookieid"])

In [15]:
# Display the array of distinct cookie IDs.
cookies_unique

array(['52882151623272742032948034078925051832',
       '79755164678756084974501978337153484560',
       '5443955122341888424003495602170301473', ...,
       '27518133002633969172902995216203005732',
       '8320986103372012575860537784589312884',
       '15496251200612831365233286454677036970'], dtype=object)

In [13]:
# Number of distinct cookie IDs held in cookies_unique.
len(cookies_unique)

103138

In [26]:
# Select the order rows whose cookie ID appears in cookies_unique.
# NOTE(review): cookies_unique was derived from data_orders itself, so this mask
# keeps every row — the result has the same rows as data_orders, not one row per
# cookie. De-duplication per cookie has to be done separately (drop_duplicates).
data_cookie_unique = data_orders[data_orders["cookieid"].isin(cookies_unique)]

In [27]:
# Row count after the isin filter.
len(data_cookie_unique)

118049

In [28]:
# Preview the first 10 rows of the filtered frame.
data_cookie_unique.head(10)

Unnamed: 0,cookieid,ordernumber,devicetype,visitday
321,52882151623272742032948034078925051832,ORD0116474530,Mobile,10
555,79755164678756084974501978337153484560,ORD0116455911,Desktop,10
721,5443955122341888424003495602170301473,ORD0116475324,Mobile,10
1034,15567279697543469666445773480724187157,ORD0116479100,Mobile,10
1196,71667249622638493801005527041676393041,ORD0116463219,Mobile,10
1412,61042077885887970553598772945168024836,ORD0116467305,Desktop,10
1690,50173036520586681086289374017581049714,ORD0116458048,Desktop,10
1740,75806096112973408034688549910937655265,ORD0116468364,Mobile,10
1773,69355983623503946334291902547906312305,ORD0116470167,Tablet,10
2300,32066281582833015954444801797185030134,ORD0116467351,Tablet,10


In [29]:
# Row count of data_orders, for comparison with len(data_cookie_unique).
len(data_orders)

118049

In [30]:
# Narrow the order rows down to the cookie ID and its device type.
cookie_device = data_orders.loc[:, ["cookieid", "devicetype"]]

In [31]:
# (rows, columns) of the cookie/device frame; duplicates per cookie are still present.
cookie_device.shape

(118049, 2)

In [32]:
# Preview the first 10 cookie/device rows.
cookie_device.head(10)

Unnamed: 0,cookieid,devicetype
321,52882151623272742032948034078925051832,Mobile
555,79755164678756084974501978337153484560,Desktop
721,5443955122341888424003495602170301473,Mobile
1034,15567279697543469666445773480724187157,Mobile
1196,71667249622638493801005527041676393041,Mobile
1412,61042077885887970553598772945168024836,Desktop
1690,50173036520586681086289374017581049714,Desktop
1740,75806096112973408034688549910937655265,Mobile
1773,69355983623503946334291902547906312305,Tablet
2300,32066281582833015954444801797185030134,Tablet


In [45]:
# Preview the last 10 cookie/device rows.
cookie_device.tail(10)

Unnamed: 0,cookieid,devicetype
42727659,30885578089350772155912397845751825711,Mobile
42727875,29320100099378218248063544989477770254,Tablet
42728988,39289138522088170321707915526287389050,Mobile
42729043,21333409446351308826399880882943475377,Desktop
42729070,3842196529366479506449823684258936355,Mobile
42729225,59050303452684471876646873770423470660,Mobile
42729404,61208706497661156832678050724331608138,Mobile
42729476,27518133002633969172902995216203005732,Mobile
42729497,8320986103372012575860537784589312884,Mobile
42730027,15496251200612831365233286454677036970,Tablet


## Drop duplicate cookie IDs, keeping only the first row for each

In [34]:
# Reduce to one row per cookie ID by discarding every repeat after the first
# occurrence (in the frame's current row order), then show the resulting shape.
# NOTE(review): if a cookie appears with more than one device type, only the
# device on its first row survives.
cookie_device2 = cookie_device[~cookie_device.duplicated(subset="cookieid", keep="first")]
cookie_device2.shape

(103138, 2)

### Write the dataframe to a CSV file

In [35]:
# The export is currently disabled; uncomment the line below to write the file.
# NOTE(review): the earlier draft passed float_format = 'Callable', which appears
# to be the parameter's type name copied from the pandas docs rather than a usable
# format. It is dropped here; both exported columns hold strings, so no float
# formatting is needed.
#cookie_device2.to_csv('cookie_device.csv')