# Step 1: Gather Data

### *Import Libraries*

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
%matplotlib inline  

import warnings
warnings.filterwarnings('ignore')

### *Get data*

In [2]:
# save filepath to variable for easier access
ecommerce_file_path = 'train.csv'
test_file_path = 'test.csv'
submission_file_path = 'sample_submission.csv'

# read the data and store data in DataFrame titled train_data
train_data = pd.read_csv(ecommerce_file_path)
test_data = pd.read_csv(test_file_path)
submission_data = pd.read_csv(submission_file_path)

In [3]:
train_data.shape, test_data.shape, submission_data.shape

((10500, 5), (4500, 4), (4500, 2))

Train data has:
- 10,500 rows (observations)
- 5 columns (features)

Test data has:
- 4,500 rows (observations)
- 4 columns (features)

Submission data has:
- 4,500 rows 
- 2 columns 

In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10500 entries, 0 to 10499
Data columns (total 5 columns):
session_id     10500 non-null object
startTime      10500 non-null object
endTime        10500 non-null object
ProductList    10500 non-null object
gender         10500 non-null object
dtypes: object(5)
memory usage: 410.3+ KB


In [5]:
train_data.describe()

Unnamed: 0,session_id,startTime,endTime,ProductList,gender
count,10500,10500,10500,10500,10500
unique,10500,8815,8803,9402,2
top,u10728,17/12/14 12:29,14/11/14 16:15,A00002/B00002/C00007/D00266/,female
freq,1,5,5,25,8192


In [6]:
train_data.head()

Unnamed: 0,session_id,startTime,endTime,ProductList,gender
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female
1,u10253,16/12/14 14:35,16/12/14 14:41,A00001/B00009/C00031/D29404/;A00001/B00009/C00...,male
2,u19037,01/12/14 15:58,01/12/14 15:58,A00002/B00001/C00020/D16944/,female
3,u14556,23/11/14 2:57,23/11/14 3:00,A00002/B00004/C00018/D10284/;A00002/B00004/C00...,female
4,u24295,17/12/14 16:44,17/12/14 16:46,A00001/B00001/C00012/D30805/;A00001/B00001/C00...,male


## Exploratory Data Analysis

### TRAINING DATA - PreProcessing

#### Splitting Product List
Product list contains list of products viewed by the user in the given session and it also contains the category, sub category, sub-sub category and the product all encoded and separated with a slash symbol. Each consecutive product is separated with a semicolon.
- product
- category
- sub_category
- sub_sub_category


In [7]:
train_data['ProductList'][0]

'A00002/B00003/C00006/D28435/;A00002/B00003/C00006/D02554/;A00002/B00003/C00006/D28436/;A00002/B00003/C00006/D28437/'

In [8]:
train_data['ProductListCount'] = train_data['ProductList'].apply(lambda x: len(x.split(";")))
train_data.head()

Unnamed: 0,session_id,startTime,endTime,ProductList,gender,ProductListCount
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female,4
1,u10253,16/12/14 14:35,16/12/14 14:41,A00001/B00009/C00031/D29404/;A00001/B00009/C00...,male,7
2,u19037,01/12/14 15:58,01/12/14 15:58,A00002/B00001/C00020/D16944/,female,1
3,u14556,23/11/14 2:57,23/11/14 3:00,A00002/B00004/C00018/D10284/;A00002/B00004/C00...,female,3
4,u24295,17/12/14 16:44,17/12/14 16:46,A00001/B00001/C00012/D30805/;A00001/B00001/C00...,male,2


In [9]:
# Create product list - Separating each product to new row (;)
# Extract name from the string 
prod = train_data['ProductList'].str.split(';')
train_data = train_data.reindex(train_data.index.repeat(prod.apply(len)))
train_data['product_data'] = np.hstack(prod)
train_data.head()


Unnamed: 0,session_id,startTime,endTime,ProductList,gender,ProductListCount,product_data
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female,4,A00002/B00003/C00006/D28435/
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female,4,A00002/B00003/C00006/D02554/
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female,4,A00002/B00003/C00006/D28436/
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female,4,A00002/B00003/C00006/D28437/
1,u10253,16/12/14 14:35,16/12/14 14:41,A00001/B00009/C00031/D29404/;A00001/B00009/C00...,male,7,A00001/B00009/C00031/D29404/


In [10]:
#product_data list are separated by forward slash (/). They need to be split into product, category, category, sub_sub_category
#added other category to see if there is more
train_data[['product','category','sub_category','sub_sub_category', 'other']] = train_data.product_data.str.split('[/]', expand=True)
train_data.head()


Unnamed: 0,session_id,startTime,endTime,ProductList,gender,ProductListCount,product_data,product,category,sub_category,sub_sub_category,other
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female,4,A00002/B00003/C00006/D28435/,A00002,B00003,C00006,D28435,
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female,4,A00002/B00003/C00006/D02554/,A00002,B00003,C00006,D02554,
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female,4,A00002/B00003/C00006/D28436/,A00002,B00003,C00006,D28436,
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female,4,A00002/B00003/C00006/D28437/,A00002,B00003,C00006,D28437,
1,u10253,16/12/14 14:35,16/12/14 14:41,A00001/B00009/C00031/D29404/;A00001/B00009/C00...,male,7,A00001/B00009/C00031/D29404/,A00001,B00009,C00031,D29404,


In [11]:
#Drop original productlist and product_data columns
train_data.drop(['ProductList', 'product_data'], axis=1, inplace=True)

In [12]:
#Display data
train_data.head()

Unnamed: 0,session_id,startTime,endTime,gender,ProductListCount,product,category,sub_category,sub_sub_category,other
0,u16159,15/12/14 18:11,15/12/14 18:12,female,4,A00002,B00003,C00006,D28435,
0,u16159,15/12/14 18:11,15/12/14 18:12,female,4,A00002,B00003,C00006,D02554,
0,u16159,15/12/14 18:11,15/12/14 18:12,female,4,A00002,B00003,C00006,D28436,
0,u16159,15/12/14 18:11,15/12/14 18:12,female,4,A00002,B00003,C00006,D28437,
1,u10253,16/12/14 14:35,16/12/14 14:41,male,7,A00001,B00009,C00031,D29404,


In [20]:
#check if we need the other column
train_data.isin([' '])

Unnamed: 0,session_id,startTime,endTime,gender,ProductListCount,product,category,sub_category,sub_sub_category,other
0,False,False,False,False,False,False,False,False,False,False
0,False,False,False,False,False,False,False,False,False,False
0,False,False,False,False,False,False,False,False,False,False
0,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
10498,False,False,False,False,False,False,False,False,False,False
10499,False,False,False,False,False,False,False,False,False,False
10499,False,False,False,False,False,False,False,False,False,False
10499,False,False,False,False,False,False,False,False,False,False


### Categorical Features
Transform using dummy variables so sklearn can understand them.

In [None]:
train_data.info()