In [1]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
'''
FEATURES:
Overall action count/ratio
    - 
Overall day count
Monthly action count/ratio
    - Per brand, number of actions in a month / total actions (across all instances) in that month
Penetration (Popularity / Buys)
    - Number of Buys
    - Related Brand Popularity: Among the brands, split into tiers of high popularity vs low popularity
Monthly Aggregation
    - Per brand, number of actions in a month
    - Per brand, average action_type in a month
    - Std. deviation for number of clicks 
    - Per brand, action count by gender in a month
Double 11 Features
Latest One-Week
Repeat Buyer Features
Age Related
Gender Related
'''

'\nFEATURES:\nOverall action count/ratio\nOverall day count\nMonthly action couint/ratio\nPenetration\nMonthly Aggregation\nDouble 11 Features\nLatest One-Week\nRepeat Buyer Features\nAge Related\nGender Related\n'

In [2]:
df = pd.read_csv("./use_data/expanded_training.csv")
df

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type,age_range,gender,label
0,379824,198,656,145,3462.0,1111,0,5.0,1.0,0
1,379824,198,656,145,3462.0,1111,0,5.0,1.0,0
2,379824,198,656,145,3462.0,1111,2,5.0,1.0,0
3,379824,198,656,145,3462.0,1110,0,5.0,1.0,0
4,379824,198,656,145,3462.0,1110,0,5.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...
380,122632,175,1181,4760,247.0,1109,0,3.0,0.0,0
381,122632,175,1181,4760,247.0,1108,0,3.0,0.0,0
382,122632,175,1181,4760,247.0,1108,0,3.0,0.0,0
383,95362,253,962,3263,626.0,1111,0,0.0,0.0,0


In [3]:
df_brand = df[['brand_id']]
df_brand

Unnamed: 0,brand_id
0,3462.0
1,3462.0
2,3462.0
3,3462.0
4,3462.0
...,...
380,247.0
381,247.0
382,247.0
383,626.0


In [4]:
# Unique Brand Count
unique_brand_count = len(pd.unique(df_brand['brand_id']))
print(f'Unique Brand Count: {unique_brand_count}')

brand_occurrences = df_brand['brand_id'].value_counts()
print(brand_occurrences)

Unique Brand Count: 27
247.0     105
683.0      85
3462.0     36
1246.0     30
6208.0     21
5380.0     19
626.0      18
7924.0     12
4631.0      9
2276.0      7
2350.0      6
5946.0      6
7892.0      5
5491.0      4
7936.0      4
1097.0      3
1446.0      2
6230.0      2
5738.0      2
1905.0      2
3931.0      1
777.0       1
7989.0      1
3654.0      1
6590.0      1
8040.0      1
7371.0      1
Name: brand_id, dtype: int64


In [11]:
df['month'] = df['time_stamp'] // 100
df['day'] = df['time_stamp'] % 100
df

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type,age_range,gender,label,month,day
0,379824,198,656,145,3462.0,1111,0,5.0,1.0,0,11,11
1,379824,198,656,145,3462.0,1111,0,5.0,1.0,0,11,11
2,379824,198,656,145,3462.0,1111,2,5.0,1.0,0,11,11
3,379824,198,656,145,3462.0,1110,0,5.0,1.0,0,11,10
4,379824,198,656,145,3462.0,1110,0,5.0,1.0,0,11,10
...,...,...,...,...,...,...,...,...,...,...,...,...
380,122632,175,1181,4760,247.0,1109,0,3.0,0.0,0,11,9
381,122632,175,1181,4760,247.0,1108,0,3.0,0.0,0,11,8
382,122632,175,1181,4760,247.0,1108,0,3.0,0.0,0,11,8
383,95362,253,962,3263,626.0,1111,0,0.0,0.0,0,11,11


In [5]:
# Split Month Day into Separate Columns
df_brand['month'] = df['time_stamp'] // 100
df_brand['day'] = df['time_stamp'] % 100
df_brand

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brand['month'] = df['time_stamp'] // 100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brand['day'] = df['time_stamp'] % 100


Unnamed: 0,brand_id,month,day
0,3462.0,11,11
1,3462.0,11,11
2,3462.0,11,11
3,3462.0,11,10
4,3462.0,11,10
...,...,...,...
380,247.0,11,9
381,247.0,11,8
382,247.0,11,8
383,626.0,11,11


Aggregation

Common Aggregate Functions:
- Average
- Count
    > Action Count
- Maximum
- Median
- Minimum
- Mode
- Range
- Sum
- StdDeviation
- NaNMean

In [6]:
# Monthly Action Count

In [7]:
# Monthly Brand Action Counts
df_brand['brand_monthly_action_count'] = df_brand.groupby(['brand_id', 'month']).transform('size')
df_brand

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brand['brand_monthly_action_count'] = df_brand.groupby(['brand_id', 'month']).transform('size')


Unnamed: 0,brand_id,month,day,brand_monthly_action_count
0,3462.0,11,11,36
1,3462.0,11,11,36
2,3462.0,11,11,36
3,3462.0,11,10,36
4,3462.0,11,10,36
...,...,...,...,...
380,247.0,11,9,104
381,247.0,11,8,104
382,247.0,11,8,104
383,626.0,11,11,18


In [8]:
# Total Action Count in said month
df_brand['month_total_action_count'] = df_brand.groupby(['month']).transform('size')
df_brand


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brand['month_total_action_count'] = df_brand.groupby(['month']).transform('size')


Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count
0,3462.0,11,11,36,362
1,3462.0,11,11,36,362
2,3462.0,11,11,36,362
3,3462.0,11,10,36,362
4,3462.0,11,10,36,362
...,...,...,...,...,...
380,247.0,11,9,104,362
381,247.0,11,8,104,362
382,247.0,11,8,104,362
383,626.0,11,11,18,362


In [10]:
# Monthly Action Count / Ratio (Count/Ratio Type)
df_brand['monthly_action_count_ratio'] = df_brand.groupby(['brand_id', 'month'])['brand_monthly_action_count'].transform(lambda x: x/ df_brand['month_total_action_count'])
df_brand


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brand['monthly_action_count_ratio'] = df_brand.groupby(['brand_id', 'month'])['brand_monthly_action_count'].transform(lambda x: x/ df_brand['month_total_action_count'])


Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio
0,3462.0,11,11,36,362,0.099448
1,3462.0,11,11,36,362,0.099448
2,3462.0,11,11,36,362,0.099448
3,3462.0,11,10,36,362,0.099448
4,3462.0,11,10,36,362,0.099448
...,...,...,...,...,...,...
380,247.0,11,9,104,362,0.287293
381,247.0,11,8,104,362,0.287293
382,247.0,11,8,104,362,0.287293
383,626.0,11,11,18,362,0.049724


In [14]:
# Per Brand Mean Action Type (Treats Action Type not really Discrete Label)
df_brand['monthly_mean_action_type'] = df.groupby(['brand_id', 'month'])['action_type'].transform('mean')
df_brand

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brand['monthly_mean_action_type'] = df.groupby(['brand_id', 'month'])['action_type'].transform('mean')


Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type
0,3462.0,11,11,36,362,0.099448,0.361111
1,3462.0,11,11,36,362,0.099448,0.361111
2,3462.0,11,11,36,362,0.099448,0.361111
3,3462.0,11,10,36,362,0.099448,0.361111
4,3462.0,11,10,36,362,0.099448,0.361111
...,...,...,...,...,...,...,...
380,247.0,11,9,104,362,0.287293,0.355769
381,247.0,11,8,104,362,0.287293,0.355769
382,247.0,11,8,104,362,0.287293,0.355769
383,626.0,11,11,18,362,0.049724,0.111111


In [20]:
# Monthly Gender Interaction Count
df_brand['monthly_male_action_count'] = df[(df.gender == 1)].groupby(['brand_id', 'month']).transform('size')
df_brand['monthly_female_action_count'] = df[(df.gender == 0)].groupby(['brand_id', 'month']).transform('size')
df_brand['monthly_male_action_count'] = df_brand['monthly_male_action_count'].fillna(0)
df_brand['monthly_female_action_count'] = df_brand['monthly_female_action_count'].fillna(0)
df_brand

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brand['monthly_male_action_count'] = df[(df.gender == 1)].groupby(['brand_id', 'month']).transform('size')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brand['monthly_female_action_count'] = df[(df.gender == 0)].groupby(['brand_id', 'month']).transform('size')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,monthly_male_action_count,monthly_female_action_count
0,3462.0,11,11,36,362,0.099448,0.361111,9.0,0.0
1,3462.0,11,11,36,362,0.099448,0.361111,9.0,0.0
2,3462.0,11,11,36,362,0.099448,0.361111,9.0,0.0
3,3462.0,11,10,36,362,0.099448,0.361111,9.0,0.0
4,3462.0,11,10,36,362,0.099448,0.361111,9.0,0.0
...,...,...,...,...,...,...,...,...,...
380,247.0,11,9,104,362,0.287293,0.355769,0.0,33.0
381,247.0,11,8,104,362,0.287293,0.355769,0.0,33.0
382,247.0,11,8,104,362,0.287293,0.355769,0.0,33.0
383,626.0,11,11,18,362,0.049724,0.111111,0.0,18.0
