In [1]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
'''
FEATURES:
Overall action count/ratio
    - 
Overall day count
Monthly action count/ratio
    - Per brand, number of actions in a month / total actions (across all instances) in that month
Penetration (Popularity / Buys)
    - Number of Buys
    - Related Brand Popularity: Among the brands, split into tiers of high popularity vs low popularity
Monthly Aggregation
    - Per brand, number of actions in a month
    - Per brand, average action_type in a month
    - Std. deviation for number of clicks 
    - Per brand, action count by gender in a month
Double 11 Features
Latest One-Week
Repeat Buyer Features
Age Related
Gender Related
'''

'\nFEATURES:\nOverall action count/ratio\n    - \nOverall day count\nMonthly action count/ratio\n    - Per brand, number of actions in a month / total actions (across all instances) in that month\nPenetration (Popularity / Buys)\n    - Number of Buys\n    - Related Brand Popularity: Among the brands, split into tiers of high popularity vs low popularity\nMonthly Aggregation\n    - Per brand, number of actions in a month\n    - Per brand, average action_type in a month\n    - Std. deviation for number of clicks \n    - Per brand, action count by gender in a month\nDouble 11 Features\nLatest One-Week\nRepeat Buyer Features\nAge Related\nGender Related\n'

In [2]:
df = pd.read_csv("./use_data/expanded_testing.csv")
df

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type,age_range,gender,prob
0,149002,282,885,3791,1500.0,1111,0,5.0,2.0,
1,149002,282,885,3791,1500.0,1107,0,5.0,2.0,
2,109881,230,629,1,1662.0,820,0,4.0,0.0,
3,109881,230,629,1,1662.0,820,0,4.0,0.0,
4,109881,230,629,1,1662.0,820,0,4.0,0.0,
...,...,...,...,...,...,...,...,...,...,...
6544,73105,259,614,637,2603.0,1111,0,2.0,0.0,
6545,403421,175,1181,4760,247.0,1111,0,3.0,1.0,
6546,116738,186,267,1200,2276.0,1110,0,2.0,1.0,
6547,116738,186,267,1200,2276.0,1101,0,2.0,1.0,


In [3]:
df_brand = df[['brand_id']]
df_brand

Unnamed: 0,brand_id
0,1500.0
1,1500.0
2,1662.0
3,1662.0
4,1662.0
...,...
6544,2603.0
6545,247.0
6546,2276.0
6547,2276.0


In [4]:
# Unique Brand Count
unique_brand_count = len(pd.unique(df_brand['brand_id']))
print(f'Unique Brand Count: {unique_brand_count}')

brand_occurrences = df_brand['brand_id'].value_counts()
print(brand_occurrences)

Unique Brand Count: 151
3650.0    1335
2603.0     885
683.0      413
247.0      398
6742.0     310
          ... 
1532.0       1
7651.0       1
5506.0       1
7139.0       1
6781.0       1
Name: brand_id, Length: 150, dtype: int64


In [5]:
df['month'] = df['time_stamp'] // 100
df['day'] = df['time_stamp'] % 100
df

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type,age_range,gender,prob,month,day
0,149002,282,885,3791,1500.0,1111,0,5.0,2.0,,11,11
1,149002,282,885,3791,1500.0,1107,0,5.0,2.0,,11,7
2,109881,230,629,1,1662.0,820,0,4.0,0.0,,8,20
3,109881,230,629,1,1662.0,820,0,4.0,0.0,,8,20
4,109881,230,629,1,1662.0,820,0,4.0,0.0,,8,20
...,...,...,...,...,...,...,...,...,...,...,...,...
6544,73105,259,614,637,2603.0,1111,0,2.0,0.0,,11,11
6545,403421,175,1181,4760,247.0,1111,0,3.0,1.0,,11,11
6546,116738,186,267,1200,2276.0,1110,0,2.0,1.0,,11,10
6547,116738,186,267,1200,2276.0,1101,0,2.0,1.0,,11,1


In [6]:
# Split Month Day into Separate Columns
df_brand['month'] = df['time_stamp'] // 100
df_brand['day'] = df['time_stamp'] % 100
df_brand

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brand['month'] = df['time_stamp'] // 100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brand['day'] = df['time_stamp'] % 100


Unnamed: 0,brand_id,month,day
0,1500.0,11,11
1,1500.0,11,7
2,1662.0,8,20
3,1662.0,8,20
4,1662.0,8,20
...,...,...,...
6544,2603.0,11,11
6545,247.0,11,11
6546,2276.0,11,10
6547,2276.0,11,1


Aggregation

Common Aggregate Functions:
- Average
- Count
    > Action Count
- Maximum
- Median
- Minimum
- Mode
- Range
- Sum
- StdDeviation
- NaNMean

In [7]:
# Monthly Action Count

In [8]:
# Monthly Brand Action Counts
df_brand['brand_monthly_action_count'] = df_brand.groupby(['brand_id', 'month']).transform('size')
df_brand

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brand['brand_monthly_action_count'] = df_brand.groupby(['brand_id', 'month']).transform('size')


Unnamed: 0,brand_id,month,day,brand_monthly_action_count
0,1500.0,11,11,3.0
1,1500.0,11,7,3.0
2,1662.0,8,20,10.0
3,1662.0,8,20,10.0
4,1662.0,8,20,10.0
...,...,...,...,...
6544,2603.0,11,11,225.0
6545,247.0,11,11,342.0
6546,2276.0,11,10,13.0
6547,2276.0,11,1,13.0


In [9]:
# Total Action Count in said month
df_brand['month_total_action_count'] = df_brand.groupby(['month']).transform('size')
df_brand


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brand['month_total_action_count'] = df_brand.groupby(['month']).transform('size')


Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count
0,1500.0,11,11,3.0,2292
1,1500.0,11,7,3.0,2292
2,1662.0,8,20,10.0,704
3,1662.0,8,20,10.0,704
4,1662.0,8,20,10.0,704
...,...,...,...,...,...
6544,2603.0,11,11,225.0,2292
6545,247.0,11,11,342.0,2292
6546,2276.0,11,10,13.0,2292
6547,2276.0,11,1,13.0,2292


In [10]:
# Monthly Action Count / Ratio (Count/Ratio Type)
df_brand['monthly_action_count_ratio'] = df_brand.groupby(['brand_id', 'month'])['brand_monthly_action_count'].transform(lambda x: x/ df_brand['month_total_action_count'])
df_brand


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brand['monthly_action_count_ratio'] = df_brand.groupby(['brand_id', 'month'])['brand_monthly_action_count'].transform(lambda x: x/ df_brand['month_total_action_count'])


Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio
0,1500.0,11,11,3.0,2292,0.001309
1,1500.0,11,7,3.0,2292,0.001309
2,1662.0,8,20,10.0,704,0.014205
3,1662.0,8,20,10.0,704,0.014205
4,1662.0,8,20,10.0,704,0.014205
...,...,...,...,...,...,...
6544,2603.0,11,11,225.0,2292,0.098168
6545,247.0,11,11,342.0,2292,0.149215
6546,2276.0,11,10,13.0,2292,0.005672
6547,2276.0,11,1,13.0,2292,0.005672


In [11]:
# Per Brand Mean Action Type (Treats Action Type not really Discrete Label)
df_brand['monthly_mean_action_type'] = df.groupby(['brand_id', 'month'])['action_type'].transform('mean')
df_brand

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brand['monthly_mean_action_type'] = df.groupby(['brand_id', 'month'])['action_type'].transform('mean')


Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type
0,1500.0,11,11,3.0,2292,0.001309,0.000000
1,1500.0,11,7,3.0,2292,0.001309,0.000000
2,1662.0,8,20,10.0,704,0.014205,0.300000
3,1662.0,8,20,10.0,704,0.014205,0.300000
4,1662.0,8,20,10.0,704,0.014205,0.300000
...,...,...,...,...,...,...,...
6544,2603.0,11,11,225.0,2292,0.098168,0.528889
6545,247.0,11,11,342.0,2292,0.149215,0.324561
6546,2276.0,11,10,13.0,2292,0.005672,0.307692
6547,2276.0,11,1,13.0,2292,0.005672,0.307692



# Gender Interaction Count per Brand

In [12]:
# For Other Columns, modify the groupby condition
male_counts = df.groupby(['brand_id'])['gender'].apply(lambda x: (x == 1).sum())
female_counts = df.groupby(['brand_id'])['gender'].apply(lambda x: (x == 0).sum())
unknown_gender_count = df.groupby(['brand_id'])['gender'].apply(lambda x: (x == 2).sum())
gender_total_brand_counts = pd.DataFrame({'brand_male_count': male_counts, 'brand_female_count': female_counts, 'brand_unknown_gender_count': unknown_gender_count})
gender_total_brand_counts


Unnamed: 0_level_0,brand_male_count,brand_female_count,brand_unknown_gender_count
brand_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
68.0,0,14,4
69.0,3,22,0
99.0,6,17,0
136.0,2,0,0
178.0,0,1,0
...,...,...,...
7989.0,3,5,0
8040.0,0,2,0
8122.0,0,6,2
8150.0,0,10,0


In [13]:
df_brand = df_brand.join(gender_total_brand_counts, on='brand_id')
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count
0,1500.0,11,11,3.0,2292,0.001309,0.000000,1.0,11.0,2.0
1,1500.0,11,7,3.0,2292,0.001309,0.000000,1.0,11.0,2.0
2,1662.0,8,20,10.0,704,0.014205,0.300000,6.0,36.0,3.0
3,1662.0,8,20,10.0,704,0.014205,0.300000,6.0,36.0,3.0
4,1662.0,8,20,10.0,704,0.014205,0.300000,6.0,36.0,3.0
...,...,...,...,...,...,...,...,...,...,...
6544,2603.0,11,11,225.0,2292,0.098168,0.528889,170.0,689.0,14.0
6545,247.0,11,11,342.0,2292,0.149215,0.324561,197.0,182.0,15.0
6546,2276.0,11,10,13.0,2292,0.005672,0.307692,2.0,11.0,0.0
6547,2276.0,11,1,13.0,2292,0.005672,0.307692,2.0,11.0,0.0


# Monthly Gender Action Count per Brand

In [14]:
monthly_male_counts = df.groupby(['brand_id', 'month'])['gender'].apply(lambda x: (x == 1).sum())
monthly_female_counts = df.groupby(['brand_id', 'month'])['gender'].apply(lambda x: (x == 0).sum())
monthly_unknown_gender_count = df.groupby(['brand_id', 'month'])['gender'].apply(lambda x: (x == 2).sum())
monthly_gender_total_brand_counts = pd.DataFrame({'month_brand_male_count': monthly_male_counts, 'month_brand_female_count': monthly_female_counts, 'month_brand_unknown_gender_count': monthly_unknown_gender_count})
monthly_gender_total_brand_counts

Unnamed: 0_level_0,Unnamed: 1_level_0,month_brand_male_count,month_brand_female_count,month_brand_unknown_gender_count
brand_id,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
68.0,6,0,13,3
68.0,7,0,1,1
69.0,7,0,17,0
69.0,8,0,2,0
69.0,9,1,1,0
...,...,...,...,...
8150.0,7,0,3,0
8150.0,8,0,2,0
8150.0,9,0,1,0
8238.0,6,2,57,0


In [15]:
df_brand = df_brand.join(monthly_gender_total_brand_counts, on=['brand_id', 'month'])
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,month_brand_male_count,month_brand_female_count,month_brand_unknown_gender_count
0,1500.0,11,11,3.0,2292,0.001309,0.000000,1.0,11.0,2.0,0.0,1.0,2.0
1,1500.0,11,7,3.0,2292,0.001309,0.000000,1.0,11.0,2.0,0.0,1.0,2.0
2,1662.0,8,20,10.0,704,0.014205,0.300000,6.0,36.0,3.0,1.0,9.0,0.0
3,1662.0,8,20,10.0,704,0.014205,0.300000,6.0,36.0,3.0,1.0,9.0,0.0
4,1662.0,8,20,10.0,704,0.014205,0.300000,6.0,36.0,3.0,1.0,9.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6544,2603.0,11,11,225.0,2292,0.098168,0.528889,170.0,689.0,14.0,46.0,170.0,1.0
6545,247.0,11,11,342.0,2292,0.149215,0.324561,197.0,182.0,15.0,173.0,154.0,12.0
6546,2276.0,11,10,13.0,2292,0.005672,0.307692,2.0,11.0,0.0,2.0,11.0,0.0
6547,2276.0,11,1,13.0,2292,0.005672,0.307692,2.0,11.0,0.0,2.0,11.0,0.0


# Brand Gender Count Ratio

In [16]:
gender_counts = df_brand[['brand_male_count', 'brand_female_count', 'brand_unknown_gender_count']].sum(axis=1)
male_ratio = df_brand.apply(lambda row: row['brand_male_count'] / gender_counts[row.name], axis=1)
female_ratio = df_brand.apply(lambda row: row['brand_female_count'] / gender_counts[row.name], axis=1)
unknown_gender_ratio = df_brand.apply(lambda row: row['brand_unknown_gender_count'] / gender_counts[row.name], axis=1)
df_brand['brand_male_count_ratio'] = male_ratio
df_brand['brand_female_count_ratio'] = female_ratio
df_brand['brand_unknown_count_ratio'] = unknown_gender_ratio
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,month_brand_male_count,month_brand_female_count,month_brand_unknown_gender_count,brand_male_count_ratio,brand_female_count_ratio,brand_unknown_count_ratio
0,1500.0,11,11,3.0,2292,0.001309,0.000000,1.0,11.0,2.0,0.0,1.0,2.0,0.071429,0.785714,0.142857
1,1500.0,11,7,3.0,2292,0.001309,0.000000,1.0,11.0,2.0,0.0,1.0,2.0,0.071429,0.785714,0.142857
2,1662.0,8,20,10.0,704,0.014205,0.300000,6.0,36.0,3.0,1.0,9.0,0.0,0.133333,0.800000,0.066667
3,1662.0,8,20,10.0,704,0.014205,0.300000,6.0,36.0,3.0,1.0,9.0,0.0,0.133333,0.800000,0.066667
4,1662.0,8,20,10.0,704,0.014205,0.300000,6.0,36.0,3.0,1.0,9.0,0.0,0.133333,0.800000,0.066667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6544,2603.0,11,11,225.0,2292,0.098168,0.528889,170.0,689.0,14.0,46.0,170.0,1.0,0.194731,0.789233,0.016037
6545,247.0,11,11,342.0,2292,0.149215,0.324561,197.0,182.0,15.0,173.0,154.0,12.0,0.500000,0.461929,0.038071
6546,2276.0,11,10,13.0,2292,0.005672,0.307692,2.0,11.0,0.0,2.0,11.0,0.0,0.153846,0.846154,0.000000
6547,2276.0,11,1,13.0,2292,0.005672,0.307692,2.0,11.0,0.0,2.0,11.0,0.0,0.153846,0.846154,0.000000


# Brand Monthly Gender Count Ratio

In [17]:
df_brand['brand_monthly_male_count_ratio'] = df_brand['month_brand_male_count'] / df_brand['brand_monthly_action_count']
df_brand['brand_monthly_female_count_ratio'] = df_brand['month_brand_female_count'] / df_brand['brand_monthly_action_count']
df_brand['brand_monthly_unknown_count_ratio'] = df_brand['month_brand_unknown_gender_count'] / df_brand['brand_monthly_action_count']
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,month_brand_male_count,month_brand_female_count,month_brand_unknown_gender_count,brand_male_count_ratio,brand_female_count_ratio,brand_unknown_count_ratio,brand_monthly_male_count_ratio,brand_monthly_female_count_ratio,brand_monthly_unknown_count_ratio
0,1500.0,11,11,3.0,2292,0.001309,0.000000,1.0,11.0,2.0,0.0,1.0,2.0,0.071429,0.785714,0.142857,0.000000,0.333333,0.666667
1,1500.0,11,7,3.0,2292,0.001309,0.000000,1.0,11.0,2.0,0.0,1.0,2.0,0.071429,0.785714,0.142857,0.000000,0.333333,0.666667
2,1662.0,8,20,10.0,704,0.014205,0.300000,6.0,36.0,3.0,1.0,9.0,0.0,0.133333,0.800000,0.066667,0.100000,0.900000,0.000000
3,1662.0,8,20,10.0,704,0.014205,0.300000,6.0,36.0,3.0,1.0,9.0,0.0,0.133333,0.800000,0.066667,0.100000,0.900000,0.000000
4,1662.0,8,20,10.0,704,0.014205,0.300000,6.0,36.0,3.0,1.0,9.0,0.0,0.133333,0.800000,0.066667,0.100000,0.900000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6544,2603.0,11,11,225.0,2292,0.098168,0.528889,170.0,689.0,14.0,46.0,170.0,1.0,0.194731,0.789233,0.016037,0.204444,0.755556,0.004444
6545,247.0,11,11,342.0,2292,0.149215,0.324561,197.0,182.0,15.0,173.0,154.0,12.0,0.500000,0.461929,0.038071,0.505848,0.450292,0.035088
6546,2276.0,11,10,13.0,2292,0.005672,0.307692,2.0,11.0,0.0,2.0,11.0,0.0,0.153846,0.846154,0.000000,0.153846,0.846154,0.000000
6547,2276.0,11,1,13.0,2292,0.005672,0.307692,2.0,11.0,0.0,2.0,11.0,0.0,0.153846,0.846154,0.000000,0.153846,0.846154,0.000000


# All Above Confirmed

# Penetration

In [18]:
df

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type,age_range,gender,prob,month,day
0,149002,282,885,3791,1500.0,1111,0,5.0,2.0,,11,11
1,149002,282,885,3791,1500.0,1107,0,5.0,2.0,,11,7
2,109881,230,629,1,1662.0,820,0,4.0,0.0,,8,20
3,109881,230,629,1,1662.0,820,0,4.0,0.0,,8,20
4,109881,230,629,1,1662.0,820,0,4.0,0.0,,8,20
...,...,...,...,...,...,...,...,...,...,...,...,...
6544,73105,259,614,637,2603.0,1111,0,2.0,0.0,,11,11
6545,403421,175,1181,4760,247.0,1111,0,3.0,1.0,,11,11
6546,116738,186,267,1200,2276.0,1110,0,2.0,1.0,,11,10
6547,116738,186,267,1200,2276.0,1101,0,2.0,1.0,,11,1


In [19]:
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,month_brand_male_count,month_brand_female_count,month_brand_unknown_gender_count,brand_male_count_ratio,brand_female_count_ratio,brand_unknown_count_ratio,brand_monthly_male_count_ratio,brand_monthly_female_count_ratio,brand_monthly_unknown_count_ratio
0,1500.0,11,11,3.0,2292,0.001309,0.000000,1.0,11.0,2.0,0.0,1.0,2.0,0.071429,0.785714,0.142857,0.000000,0.333333,0.666667
1,1500.0,11,7,3.0,2292,0.001309,0.000000,1.0,11.0,2.0,0.0,1.0,2.0,0.071429,0.785714,0.142857,0.000000,0.333333,0.666667
2,1662.0,8,20,10.0,704,0.014205,0.300000,6.0,36.0,3.0,1.0,9.0,0.0,0.133333,0.800000,0.066667,0.100000,0.900000,0.000000
3,1662.0,8,20,10.0,704,0.014205,0.300000,6.0,36.0,3.0,1.0,9.0,0.0,0.133333,0.800000,0.066667,0.100000,0.900000,0.000000
4,1662.0,8,20,10.0,704,0.014205,0.300000,6.0,36.0,3.0,1.0,9.0,0.0,0.133333,0.800000,0.066667,0.100000,0.900000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6544,2603.0,11,11,225.0,2292,0.098168,0.528889,170.0,689.0,14.0,46.0,170.0,1.0,0.194731,0.789233,0.016037,0.204444,0.755556,0.004444
6545,247.0,11,11,342.0,2292,0.149215,0.324561,197.0,182.0,15.0,173.0,154.0,12.0,0.500000,0.461929,0.038071,0.505848,0.450292,0.035088
6546,2276.0,11,10,13.0,2292,0.005672,0.307692,2.0,11.0,0.0,2.0,11.0,0.0,0.153846,0.846154,0.000000,0.153846,0.846154,0.000000
6547,2276.0,11,1,13.0,2292,0.005672,0.307692,2.0,11.0,0.0,2.0,11.0,0.0,0.153846,0.846154,0.000000,0.153846,0.846154,0.000000


# Number of Buys Per Brand

In [20]:
brand_buy = pd.DataFrame({'brand_buys': df.groupby(['brand_id'])['action_type'].apply(lambda x: (x == 2).sum())})
df_brand = df_brand.join(brand_buy, on=['brand_id'])
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,month_brand_male_count,month_brand_female_count,month_brand_unknown_gender_count,brand_male_count_ratio,brand_female_count_ratio,brand_unknown_count_ratio,brand_monthly_male_count_ratio,brand_monthly_female_count_ratio,brand_monthly_unknown_count_ratio,brand_buys
0,1500.0,11,11,3.0,2292,0.001309,0.000000,1.0,11.0,2.0,0.0,1.0,2.0,0.071429,0.785714,0.142857,0.000000,0.333333,0.666667,0.0
1,1500.0,11,7,3.0,2292,0.001309,0.000000,1.0,11.0,2.0,0.0,1.0,2.0,0.071429,0.785714,0.142857,0.000000,0.333333,0.666667,0.0
2,1662.0,8,20,10.0,704,0.014205,0.300000,6.0,36.0,3.0,1.0,9.0,0.0,0.133333,0.800000,0.066667,0.100000,0.900000,0.000000,0.0
3,1662.0,8,20,10.0,704,0.014205,0.300000,6.0,36.0,3.0,1.0,9.0,0.0,0.133333,0.800000,0.066667,0.100000,0.900000,0.000000,0.0
4,1662.0,8,20,10.0,704,0.014205,0.300000,6.0,36.0,3.0,1.0,9.0,0.0,0.133333,0.800000,0.066667,0.100000,0.900000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6544,2603.0,11,11,225.0,2292,0.098168,0.528889,170.0,689.0,14.0,46.0,170.0,1.0,0.194731,0.789233,0.016037,0.204444,0.755556,0.004444,144.0
6545,247.0,11,11,342.0,2292,0.149215,0.324561,197.0,182.0,15.0,173.0,154.0,12.0,0.500000,0.461929,0.038071,0.505848,0.450292,0.035088,35.0
6546,2276.0,11,10,13.0,2292,0.005672,0.307692,2.0,11.0,0.0,2.0,11.0,0.0,0.153846,0.846154,0.000000,0.153846,0.846154,0.000000,2.0
6547,2276.0,11,1,13.0,2292,0.005672,0.307692,2.0,11.0,0.0,2.0,11.0,0.0,0.153846,0.846154,0.000000,0.153846,0.846154,0.000000,2.0


# Buy Ratio per Brand (Brand Buy Count / Brand Total Actions)

In [21]:
df_brand['brand_buy_ratio'] = df_brand['brand_buys'] / df_brand.groupby(['brand_id']).transform('size')
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,...,month_brand_female_count,month_brand_unknown_gender_count,brand_male_count_ratio,brand_female_count_ratio,brand_unknown_count_ratio,brand_monthly_male_count_ratio,brand_monthly_female_count_ratio,brand_monthly_unknown_count_ratio,brand_buys,brand_buy_ratio
0,1500.0,11,11,3.0,2292,0.001309,0.000000,1.0,11.0,2.0,...,1.0,2.0,0.071429,0.785714,0.142857,0.000000,0.333333,0.666667,0.0,0.000000
1,1500.0,11,7,3.0,2292,0.001309,0.000000,1.0,11.0,2.0,...,1.0,2.0,0.071429,0.785714,0.142857,0.000000,0.333333,0.666667,0.0,0.000000
2,1662.0,8,20,10.0,704,0.014205,0.300000,6.0,36.0,3.0,...,9.0,0.0,0.133333,0.800000,0.066667,0.100000,0.900000,0.000000,0.0,0.000000
3,1662.0,8,20,10.0,704,0.014205,0.300000,6.0,36.0,3.0,...,9.0,0.0,0.133333,0.800000,0.066667,0.100000,0.900000,0.000000,0.0,0.000000
4,1662.0,8,20,10.0,704,0.014205,0.300000,6.0,36.0,3.0,...,9.0,0.0,0.133333,0.800000,0.066667,0.100000,0.900000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6544,2603.0,11,11,225.0,2292,0.098168,0.528889,170.0,689.0,14.0,...,170.0,1.0,0.194731,0.789233,0.016037,0.204444,0.755556,0.004444,144.0,0.162712
6545,247.0,11,11,342.0,2292,0.149215,0.324561,197.0,182.0,15.0,...,154.0,12.0,0.500000,0.461929,0.038071,0.505848,0.450292,0.035088,35.0,0.087940
6546,2276.0,11,10,13.0,2292,0.005672,0.307692,2.0,11.0,0.0,...,11.0,0.0,0.153846,0.846154,0.000000,0.153846,0.846154,0.000000,2.0,0.153846
6547,2276.0,11,1,13.0,2292,0.005672,0.307692,2.0,11.0,0.0,...,11.0,0.0,0.153846,0.846154,0.000000,0.153846,0.846154,0.000000,2.0,0.153846


# Gender-Buy Count per Brand

In [22]:
brand_male_buy = df[df.action_type == 2].groupby(['brand_id'])['gender'].apply(lambda x: (x == 1).sum()) 
brand_female_buy = df[df.action_type == 2].groupby(['brand_id'])['gender'].apply(lambda x: (x == 0).sum())
brand_unknown_buy = df[df.action_type == 2].groupby(['brand_id'])['gender'].apply(lambda x: (x == 2).sum())
gender_buys = pd.DataFrame({'brand_male_buy_count': brand_male_buy, 'brand_female_buy_count': brand_female_buy, 'brand_unknown_buy_count': brand_unknown_buy})
df_brand = df_brand.join(gender_buys, on=['brand_id'])
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,...,brand_female_count_ratio,brand_unknown_count_ratio,brand_monthly_male_count_ratio,brand_monthly_female_count_ratio,brand_monthly_unknown_count_ratio,brand_buys,brand_buy_ratio,brand_male_buy_count,brand_female_buy_count,brand_unknown_buy_count
0,1500.0,11,11,3.0,2292,0.001309,0.000000,1.0,11.0,2.0,...,0.785714,0.142857,0.000000,0.333333,0.666667,0.0,0.000000,,,
1,1500.0,11,7,3.0,2292,0.001309,0.000000,1.0,11.0,2.0,...,0.785714,0.142857,0.000000,0.333333,0.666667,0.0,0.000000,,,
2,1662.0,8,20,10.0,704,0.014205,0.300000,6.0,36.0,3.0,...,0.800000,0.066667,0.100000,0.900000,0.000000,0.0,0.000000,,,
3,1662.0,8,20,10.0,704,0.014205,0.300000,6.0,36.0,3.0,...,0.800000,0.066667,0.100000,0.900000,0.000000,0.0,0.000000,,,
4,1662.0,8,20,10.0,704,0.014205,0.300000,6.0,36.0,3.0,...,0.800000,0.066667,0.100000,0.900000,0.000000,0.0,0.000000,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6544,2603.0,11,11,225.0,2292,0.098168,0.528889,170.0,689.0,14.0,...,0.789233,0.016037,0.204444,0.755556,0.004444,144.0,0.162712,33.0,108.0,1.0
6545,247.0,11,11,342.0,2292,0.149215,0.324561,197.0,182.0,15.0,...,0.461929,0.038071,0.505848,0.450292,0.035088,35.0,0.087940,17.0,16.0,2.0
6546,2276.0,11,10,13.0,2292,0.005672,0.307692,2.0,11.0,0.0,...,0.846154,0.000000,0.153846,0.846154,0.000000,2.0,0.153846,0.0,2.0,0.0
6547,2276.0,11,1,13.0,2292,0.005672,0.307692,2.0,11.0,0.0,...,0.846154,0.000000,0.153846,0.846154,0.000000,2.0,0.153846,0.0,2.0,0.0


# Gender-Buy Ratio per Brand (Buy Count of a Gender / Total Buy Count for that Brand)

In [23]:
df_brand['brand_male_buy_ratio'] = df_brand['brand_male_buy_count'] / df_brand['brand_buys']
df_brand['brand_female_buy_ratio'] = df_brand['brand_female_buy_count'] / df_brand['brand_buys']
df_brand['brand_unknown_buy_ratio'] = df_brand['brand_unknown_buy_count'] / df_brand['brand_buys']
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,...,brand_monthly_female_count_ratio,brand_monthly_unknown_count_ratio,brand_buys,brand_buy_ratio,brand_male_buy_count,brand_female_buy_count,brand_unknown_buy_count,brand_male_buy_ratio,brand_female_buy_ratio,brand_unknown_buy_ratio
0,1500.0,11,11,3.0,2292,0.001309,0.000000,1.0,11.0,2.0,...,0.333333,0.666667,0.0,0.000000,,,,,,
1,1500.0,11,7,3.0,2292,0.001309,0.000000,1.0,11.0,2.0,...,0.333333,0.666667,0.0,0.000000,,,,,,
2,1662.0,8,20,10.0,704,0.014205,0.300000,6.0,36.0,3.0,...,0.900000,0.000000,0.0,0.000000,,,,,,
3,1662.0,8,20,10.0,704,0.014205,0.300000,6.0,36.0,3.0,...,0.900000,0.000000,0.0,0.000000,,,,,,
4,1662.0,8,20,10.0,704,0.014205,0.300000,6.0,36.0,3.0,...,0.900000,0.000000,0.0,0.000000,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6544,2603.0,11,11,225.0,2292,0.098168,0.528889,170.0,689.0,14.0,...,0.755556,0.004444,144.0,0.162712,33.0,108.0,1.0,0.229167,0.750000,0.006944
6545,247.0,11,11,342.0,2292,0.149215,0.324561,197.0,182.0,15.0,...,0.450292,0.035088,35.0,0.087940,17.0,16.0,2.0,0.485714,0.457143,0.057143
6546,2276.0,11,10,13.0,2292,0.005672,0.307692,2.0,11.0,0.0,...,0.846154,0.000000,2.0,0.153846,0.0,2.0,0.0,0.000000,1.000000,0.000000
6547,2276.0,11,1,13.0,2292,0.005672,0.307692,2.0,11.0,0.0,...,0.846154,0.000000,2.0,0.153846,0.0,2.0,0.0,0.000000,1.000000,0.000000


# Resolving NaNs: When brand_buys is 0, math results in some NaNs for ratios 

In [24]:
# Visualize rows with NaNs
nan_mask = df_brand.isna().any(axis=1)
nan_rows = df_brand[nan_mask]
nan_rows

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,...,brand_monthly_female_count_ratio,brand_monthly_unknown_count_ratio,brand_buys,brand_buy_ratio,brand_male_buy_count,brand_female_buy_count,brand_unknown_buy_count,brand_male_buy_ratio,brand_female_buy_ratio,brand_unknown_buy_ratio
0,1500.0,11,11,3.0,2292,0.001309,0.000000,1.0,11.0,2.0,...,0.333333,0.666667,0.0,0.0,,,,,,
1,1500.0,11,7,3.0,2292,0.001309,0.000000,1.0,11.0,2.0,...,0.333333,0.666667,0.0,0.0,,,,,,
2,1662.0,8,20,10.0,704,0.014205,0.300000,6.0,36.0,3.0,...,0.900000,0.000000,0.0,0.0,,,,,,
3,1662.0,8,20,10.0,704,0.014205,0.300000,6.0,36.0,3.0,...,0.900000,0.000000,0.0,0.0,,,,,,
4,1662.0,8,20,10.0,704,0.014205,0.300000,6.0,36.0,3.0,...,0.900000,0.000000,0.0,0.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6521,3650.0,10,5,99.0,615,0.160976,0.181818,531.0,741.0,56.0,...,0.525253,0.010101,0.0,0.0,,,,,,
6525,913.0,9,17,7.0,615,0.011382,0.428571,1.0,13.0,0.0,...,1.000000,0.000000,0.0,0.0,,,,,,
6526,6455.0,5,26,12.0,368,0.032609,0.000000,3.0,37.0,0.0,...,1.000000,0.000000,0.0,0.0,,,,,,
6527,3650.0,9,14,103.0,615,0.167480,0.203883,531.0,741.0,56.0,...,0.485437,0.038835,0.0,0.0,,,,,,


In [25]:
# If Brand_Buys is 0, the previous method generates NaNs. Set these NaNs to 0
condition = df_brand['brand_buys'] == 0
df_brand.loc[condition, ['brand_male_buy_count', 'brand_female_buy_count', 'brand_unknown_buy_count', 'brand_male_buy_ratio', 'brand_female_buy_ratio', 'brand_unknown_buy_ratio']] = 0
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,...,brand_monthly_female_count_ratio,brand_monthly_unknown_count_ratio,brand_buys,brand_buy_ratio,brand_male_buy_count,brand_female_buy_count,brand_unknown_buy_count,brand_male_buy_ratio,brand_female_buy_ratio,brand_unknown_buy_ratio
0,1500.0,11,11,3.0,2292,0.001309,0.000000,1.0,11.0,2.0,...,0.333333,0.666667,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000
1,1500.0,11,7,3.0,2292,0.001309,0.000000,1.0,11.0,2.0,...,0.333333,0.666667,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000
2,1662.0,8,20,10.0,704,0.014205,0.300000,6.0,36.0,3.0,...,0.900000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000
3,1662.0,8,20,10.0,704,0.014205,0.300000,6.0,36.0,3.0,...,0.900000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000
4,1662.0,8,20,10.0,704,0.014205,0.300000,6.0,36.0,3.0,...,0.900000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6544,2603.0,11,11,225.0,2292,0.098168,0.528889,170.0,689.0,14.0,...,0.755556,0.004444,144.0,0.162712,33.0,108.0,1.0,0.229167,0.750000,0.006944
6545,247.0,11,11,342.0,2292,0.149215,0.324561,197.0,182.0,15.0,...,0.450292,0.035088,35.0,0.087940,17.0,16.0,2.0,0.485714,0.457143,0.057143
6546,2276.0,11,10,13.0,2292,0.005672,0.307692,2.0,11.0,0.0,...,0.846154,0.000000,2.0,0.153846,0.0,2.0,0.0,0.000000,1.000000,0.000000
6547,2276.0,11,1,13.0,2292,0.005672,0.307692,2.0,11.0,0.0,...,0.846154,0.000000,2.0,0.153846,0.0,2.0,0.0,0.000000,1.000000,0.000000


# Check for unaccounted NaNs

In [26]:
print(sum(df_brand.isna().sum()))

72


# Age-Related

# AgeGroup Counts per Brand

In [27]:
# For Other Columns, modify the groupby condition
ageGroup_0_counts = df.groupby(['brand_id'])['age_range'].apply(lambda x: (x == 0).sum())
ageGroup_1_counts = df.groupby(['brand_id'])['age_range'].apply(lambda x: (x == 1).sum())
ageGroup_2_counts = df.groupby(['brand_id'])['age_range'].apply(lambda x: (x == 2).sum())
ageGroup_3_counts = df.groupby(['brand_id'])['age_range'].apply(lambda x: (x == 3).sum())
ageGroup_4_counts = df.groupby(['brand_id'])['age_range'].apply(lambda x: (x == 4).sum())
ageGroup_5_counts = df.groupby(['brand_id'])['age_range'].apply(lambda x: (x == 5).sum())
ageGroup_6_counts = df.groupby(['brand_id'])['age_range'].apply(lambda x: (x == 6).sum())
ageGroup_7_counts = df.groupby(['brand_id'])['age_range'].apply(lambda x: (x == 7).sum())
ageGroup_8_counts = df.groupby(['brand_id'])['age_range'].apply(lambda x: (x == 8).sum())
gender_total_brand_counts = pd.DataFrame({'brand_ageGroup_1_counts': ageGroup_1_counts,'brand_ageGroup_2_counts': ageGroup_2_counts,'brand_ageGroup_3_counts': ageGroup_3_counts,'brand_ageGroup_4_counts': ageGroup_4_counts,'brand_ageGroup_5_counts': ageGroup_5_counts,
                                          'brand_ageGroup_6_counts': ageGroup_6_counts,'brand_ageGroup_7_counts': ageGroup_7_counts,'brand_ageGroup_8_counts': ageGroup_8_counts, 'brand_ageGroup_0_counts': ageGroup_0_counts})
df_brand = df_brand.join(gender_total_brand_counts, on='brand_id')
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,...,brand_unknown_buy_ratio,brand_ageGroup_1_counts,brand_ageGroup_2_counts,brand_ageGroup_3_counts,brand_ageGroup_4_counts,brand_ageGroup_5_counts,brand_ageGroup_6_counts,brand_ageGroup_7_counts,brand_ageGroup_8_counts,brand_ageGroup_0_counts
0,1500.0,11,11,3.0,2292,0.001309,0.000000,1.0,11.0,2.0,...,0.000000,0.0,2.0,4.0,1.0,3.0,0.0,4.0,0.0,0.0
1,1500.0,11,7,3.0,2292,0.001309,0.000000,1.0,11.0,2.0,...,0.000000,0.0,2.0,4.0,1.0,3.0,0.0,4.0,0.0,0.0
2,1662.0,8,20,10.0,704,0.014205,0.300000,6.0,36.0,3.0,...,0.000000,0.0,0.0,8.0,26.0,5.0,0.0,0.0,0.0,6.0
3,1662.0,8,20,10.0,704,0.014205,0.300000,6.0,36.0,3.0,...,0.000000,0.0,0.0,8.0,26.0,5.0,0.0,0.0,0.0,6.0
4,1662.0,8,20,10.0,704,0.014205,0.300000,6.0,36.0,3.0,...,0.000000,0.0,0.0,8.0,26.0,5.0,0.0,0.0,0.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6544,2603.0,11,11,225.0,2292,0.098168,0.528889,170.0,689.0,14.0,...,0.006944,0.0,278.0,276.0,84.0,42.0,37.0,6.0,0.0,155.0
6545,247.0,11,11,342.0,2292,0.149215,0.324561,197.0,182.0,15.0,...,0.057143,0.0,88.0,94.0,71.0,25.0,15.0,2.0,2.0,100.0
6546,2276.0,11,10,13.0,2292,0.005672,0.307692,2.0,11.0,0.0,...,0.000000,0.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,5.0
6547,2276.0,11,1,13.0,2292,0.005672,0.307692,2.0,11.0,0.0,...,0.000000,0.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,5.0


In [28]:
#FEATURE 1: OVERALL ACTION COUNT/RATIO
actions = {}
for index, row in df.iterrows():
    if row['brand_id'] not in actions:
        # there may be multiple entries, so default with an empty list
        actions[row["brand_id"]] = [] 
    actions[row["brand_id"]].append(row["action_type"])

KeyError: nan

In [29]:
actions

{1500.0: [0.0, 0.0],
 1662.0: [0.0, 0.0, 0.0, 0.0],
 6885.0: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 3650.0: [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  3.0,
  0.0,
  0.0,
  0.0,
  0.0,
  3.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  3.0,
  0.0,
  3.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  3.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 5270.0: [2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0,
  2.0],
 3

In [30]:
action_count = {}
for key in actions.keys():
    action_count[key] = len(actions[key])
action_count

{1500.0: 2,
 1662.0: 4,
 6885.0: 7,
 3650.0: 108,
 5270.0: 18,
 3080.0: 2,
 99.0: 2,
 5192.0: 4,
 6742.0: 23,
 7924.0: 2,
 1446.0: 7,
 4533.0: 2,
 683.0: 27,
 5717.0: 3,
 69.0: 1,
 3462.0: 8,
 4120.0: 6,
 5360.0: 3,
 5067.0: 45,
 7892.0: 5,
 913.0: 3,
 2603.0: 12,
 944.0: 8,
 247.0: 21,
 8040.0: 1,
 2018.0: 7,
 1097.0: 4,
 1128.0: 1,
 928.0: 4,
 6086.0: 1,
 2541.0: 3,
 3228.0: 1,
 6040.0: 2,
 5890.0: 2,
 5394.0: 4,
 1654.0: 1,
 5588.0: 1,
 5999.0: 3,
 3969.0: 1,
 5230.0: 9,
 5380.0: 14,
 5491.0: 2,
 2350.0: 3,
 2735.0: 7,
 2169.0: 4,
 4385.0: 1,
 4073.0: 2,
 1768.0: 3,
 4583.0: 1,
 136.0: 1,
 68.0: 1,
 6740.0: 1,
 8122.0: 1,
 516.0: 1,
 7936.0: 2,
 4655.0: 1,
 5204.0: 2,
 7537.0: 1,
 6823.0: 2,
 5022.0: 1,
 5478.0: 1,
 5506.0: 1,
 6443.0: 1,
 nan: 0}

In [31]:
total_action_count = sum(action_count.values())
total_action_count

422

In [32]:
action_count_ratio = {}
for key in action_count:
    action_count_ratio[key] = action_count[key]/total_action_count
action_count_ratio

{1500.0: 0.004739336492890996,
 1662.0: 0.009478672985781991,
 6885.0: 0.016587677725118485,
 3650.0: 0.2559241706161137,
 5270.0: 0.04265402843601896,
 3080.0: 0.004739336492890996,
 99.0: 0.004739336492890996,
 5192.0: 0.009478672985781991,
 6742.0: 0.054502369668246446,
 7924.0: 0.004739336492890996,
 1446.0: 0.016587677725118485,
 4533.0: 0.004739336492890996,
 683.0: 0.06398104265402843,
 5717.0: 0.0071090047393364926,
 69.0: 0.002369668246445498,
 3462.0: 0.018957345971563982,
 4120.0: 0.014218009478672985,
 5360.0: 0.0071090047393364926,
 5067.0: 0.1066350710900474,
 7892.0: 0.011848341232227487,
 913.0: 0.0071090047393364926,
 2603.0: 0.02843601895734597,
 944.0: 0.018957345971563982,
 247.0: 0.04976303317535545,
 8040.0: 0.002369668246445498,
 2018.0: 0.016587677725118485,
 1097.0: 0.009478672985781991,
 1128.0: 0.002369668246445498,
 928.0: 0.009478672985781991,
 6086.0: 0.002369668246445498,
 2541.0: 0.0071090047393364926,
 3228.0: 0.002369668246445498,
 6040.0: 0.0047393364

In [33]:
df_brand['brand action count'] = -1
df_brand['brand action count ratio'] = -1
for index,row in df_brand.iterrows():
        df_brand.at[index, 'brand action count'] = action_count[row['brand_id']]
        df_brand.at[index, 'brand action count ratio'] = action_count_ratio[row['brand_id']]

df_brand

KeyError: nan

In [None]:
#FEATURE 2: OVERALL DAY COUNT
days = {}

# go through user_seller sub-dataframe and split entries by item_id:seller_id
for index, row in df.iterrows():
    if row['brand_id'] not in days:
        # there may be multiple entries, so default with an empty list
        days[row["brand_id"]] = [] 
    days[row["brand_id"]].append(row["time_stamp"])

In [None]:
for k,v in days.items():
    days[k] = list(set(v)) # make each value list unique

days

{3462.0: [1105.0, 1109.0, 1110.0, 1111.0],
 247.0: [1026.0,
  1101.0,
  1103.0,
  1106.0,
  1107.0,
  1108.0,
  1109.0,
  1110.0,
  1111.0],
 5380.0: [1103.0, 1104.0, 1105.0, 1107.0, 1108.0, 1109.0, 1111.0],
 683.0: [1030.0,
  617.0,
  522.0,
  524.0,
  529.0,
  1105.0,
  1107.0,
  1108.0,
  1109.0,
  1110.0,
  1111.0,
  603.0],
 2350.0: [1105.0, 1108.0, 1109.0, 1111.0],
 6208.0: [1105.0, 1109.0, 1111.0],
 1905.0: [520.0, 1110.0],
 6230.0: [1111.0],
 7924.0: [1108.0, 1109.0, 1111.0],
 5491.0: [1108.0, 1022.0, 1103.0],
 626.0: [1110.0, 1111.0],
 777.0: [1111.0],
 7936.0: [1111.0],
 1097.0: [1110.0, 1111.0],
 3931.0: [1111.0],
 1446.0: [528.0, 601.0],
 1246.0: [1103.0, 1105.0, 1106.0, 1108.0, 1109.0, 1110.0, 1111.0],
 7892.0: [801.0, 804.0, 806.0],
 5946.0: [1111.0],
 4631.0: [1026.0, 1101.0, 1110.0, 1111.0],
 7989.0: [627.0],
 5738.0: [1110.0],
 2276.0: [1110.0, 1111.0],
 3654.0: [1110.0],
 6590.0: [1001.0],
 8040.0: [1111.0],
 7371.0: [1109.0]}

In [None]:
day_count = {}
for key in days.keys():
    day_count[key] = len(days[key])
day_count

{3462.0: 4,
 247.0: 9,
 5380.0: 7,
 683.0: 12,
 2350.0: 4,
 6208.0: 3,
 1905.0: 2,
 6230.0: 1,
 7924.0: 3,
 5491.0: 3,
 626.0: 2,
 777.0: 1,
 7936.0: 1,
 1097.0: 2,
 3931.0: 1,
 1446.0: 2,
 1246.0: 7,
 7892.0: 3,
 5946.0: 1,
 4631.0: 4,
 7989.0: 1,
 5738.0: 1,
 2276.0: 2,
 3654.0: 1,
 6590.0: 1,
 8040.0: 1,
 7371.0: 1}

In [None]:
df_brand['brand day count'] = -1
for index,row in df_brand.iterrows():
        df_brand.at[index, 'brand day count'] = day_count[row['brand_id']]

df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,...,brand_ageGroup_3_counts,brand_ageGroup_4_counts,brand_ageGroup_5_counts,brand_ageGroup_6_counts,brand_ageGroup_7_counts,brand_ageGroup_8_counts,brand_ageGroup_0_counts,brand action count,brand action count ratio,brand day count
0,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,1,10,7,8,0,0,8,36,0.093506,4
1,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,1,10,7,8,0,0,8,36,0.093506,4
2,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,1,10,7,8,0,0,8,36,0.093506,4
3,3462.0,11,10,36,362,0.099448,0.361111,9,25,2,...,1,10,7,8,0,0,8,36,0.093506,4
4,3462.0,11,10,36,362,0.099448,0.361111,9,25,2,...,1,10,7,8,0,0,8,36,0.093506,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,247.0,11,9,104,362,0.287293,0.355769,72,33,0,...,24,35,1,3,0,0,17,105,0.272727,9
381,247.0,11,8,104,362,0.287293,0.355769,72,33,0,...,24,35,1,3,0,0,17,105,0.272727,9
382,247.0,11,8,104,362,0.287293,0.355769,72,33,0,...,24,35,1,3,0,0,17,105,0.272727,9
383,626.0,11,11,18,362,0.049724,0.111111,0,18,0,...,0,6,0,0,0,0,2,18,0.046753,2


In [None]:
#DOUBLE 11 FEATURES
df_11 = df[df['time_stamp']==1111]
df_11

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type,age_range,gender,label,month,day
0,379824,198,656,145,3462.0,1111,0,5.0,1.0,0,11,11
1,379824,198,656,145,3462.0,1111,0,5.0,1.0,0,11,11
2,379824,198,656,145,3462.0,1111,2,5.0,1.0,0,11,11
9,141307,175,1181,4760,247.0,1111,0,4.0,1.0,0,11,11
10,141307,175,1181,4760,247.0,1111,0,4.0,1.0,0,11,11
...,...,...,...,...,...,...,...,...,...,...,...,...
375,289079,279,898,3323,683.0,1111,0,4.0,1.0,1,11,11
377,403117,175,1181,4760,247.0,1111,0,2.0,1.0,0,11,11
378,36385,219,349,1943,6208.0,1111,2,0.0,0.0,0,11,11
379,36385,219,349,1943,6208.0,1111,0,0.0,0.0,0,11,11


In [None]:
actions_11 = {}

# go through user_seller sub-dataframe and split entries by item_id:seller_id
for index, row in df_11.iterrows():
    if row['brand_id'] not in actions_11:
        # there may be multiple entries, so default with an empty list
        actions_11[row["brand_id"]] = [] 
    actions_11[row["brand_id"]].append(row["action_type"])

actions_11

{3462.0: [0.0,
  0.0,
  2.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  3.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0],
 247.0: [0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  2.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  2.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0],
 5380.0: [2.0, 0.0, 0.0, 2.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0],
 683.0: [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,


In [None]:
action_count_11 = {}
for key in actions_11.keys():
    action_count_11[key] = len(actions_11[key])
action_count_11

{3462.0: 30,
 247.0: 67,
 5380.0: 11,
 683.0: 46,
 6208.0: 17,
 6230.0: 2,
 7924.0: 10,
 626.0: 13,
 777.0: 1,
 7936.0: 4,
 3931.0: 1,
 1246.0: 12,
 5946.0: 6,
 4631.0: 3,
 1097.0: 1,
 2350.0: 1,
 2276.0: 2,
 8040.0: 1}

In [None]:
total_action_count_11 = sum(action_count_11.values())
total_action_count_11

228

In [None]:
action_count_ratio_11 = {}
for key in action_count_11:
    action_count_ratio_11[key] = action_count_11[key]/total_action_count_11
action_count_ratio_11

{3462.0: 0.13157894736842105,
 247.0: 0.29385964912280704,
 5380.0: 0.04824561403508772,
 683.0: 0.20175438596491227,
 6208.0: 0.07456140350877193,
 6230.0: 0.008771929824561403,
 7924.0: 0.043859649122807015,
 626.0: 0.05701754385964912,
 777.0: 0.0043859649122807015,
 7936.0: 0.017543859649122806,
 3931.0: 0.0043859649122807015,
 1246.0: 0.05263157894736842,
 5946.0: 0.02631578947368421,
 4631.0: 0.013157894736842105,
 1097.0: 0.0043859649122807015,
 2350.0: 0.0043859649122807015,
 2276.0: 0.008771929824561403,
 8040.0: 0.0043859649122807015}

In [None]:
df_brand['brand 1111 action count'] = 0
df_brand['brand 1111 action count ratio'] = 0
for index,row in df_brand.iterrows():
        if row['brand_id'] in action_count_11.keys():
            df_brand.at[index, 'brand 1111 action count'] = action_count_11[row['brand_id']]
        if row['brand_id'] in action_count_ratio_11.keys():
            df_brand.at[index, 'brand 1111 action count ratio'] = action_count_ratio_11[row['brand_id']]

df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,...,brand_ageGroup_5_counts,brand_ageGroup_6_counts,brand_ageGroup_7_counts,brand_ageGroup_8_counts,brand_ageGroup_0_counts,brand action count,brand action count ratio,brand day count,brand 1111 action count,brand 1111 action count ratio
0,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,7,8,0,0,8,36,0.093506,4,30,0.131579
1,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,7,8,0,0,8,36,0.093506,4,30,0.131579
2,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,7,8,0,0,8,36,0.093506,4,30,0.131579
3,3462.0,11,10,36,362,0.099448,0.361111,9,25,2,...,7,8,0,0,8,36,0.093506,4,30,0.131579
4,3462.0,11,10,36,362,0.099448,0.361111,9,25,2,...,7,8,0,0,8,36,0.093506,4,30,0.131579
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,247.0,11,9,104,362,0.287293,0.355769,72,33,0,...,1,3,0,0,17,105,0.272727,9,67,0.293860
381,247.0,11,8,104,362,0.287293,0.355769,72,33,0,...,1,3,0,0,17,105,0.272727,9,67,0.293860
382,247.0,11,8,104,362,0.287293,0.355769,72,33,0,...,1,3,0,0,17,105,0.272727,9,67,0.293860
383,626.0,11,11,18,362,0.049724,0.111111,0,18,0,...,0,0,0,0,2,18,0.046753,2,13,0.057018


In [None]:
activity_ratio_1111 = {}
df_brand['brand 1111 activity ratio'] = 0
for index,row in df_brand.iterrows():
    if df_brand.at[index,'brand 1111 action count'] != 0:
        df_brand.at[index, 'brand 1111 activity ratio'] = (df_brand.at[index,'brand 1111 action count']/df_brand.at[index,'brand action count'])
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,...,brand_ageGroup_6_counts,brand_ageGroup_7_counts,brand_ageGroup_8_counts,brand_ageGroup_0_counts,brand action count,brand action count ratio,brand day count,brand 1111 action count,brand 1111 action count ratio,brand 1111 activity ratio
0,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,8,0,0,8,36,0.093506,4,30,0.131579,0.833333
1,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,8,0,0,8,36,0.093506,4,30,0.131579,0.833333
2,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,8,0,0,8,36,0.093506,4,30,0.131579,0.833333
3,3462.0,11,10,36,362,0.099448,0.361111,9,25,2,...,8,0,0,8,36,0.093506,4,30,0.131579,0.833333
4,3462.0,11,10,36,362,0.099448,0.361111,9,25,2,...,8,0,0,8,36,0.093506,4,30,0.131579,0.833333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,247.0,11,9,104,362,0.287293,0.355769,72,33,0,...,3,0,0,17,105,0.272727,9,67,0.293860,0.638095
381,247.0,11,8,104,362,0.287293,0.355769,72,33,0,...,3,0,0,17,105,0.272727,9,67,0.293860,0.638095
382,247.0,11,8,104,362,0.287293,0.355769,72,33,0,...,3,0,0,17,105,0.272727,9,67,0.293860,0.638095
383,626.0,11,11,18,362,0.049724,0.111111,0,18,0,...,0,0,0,2,18,0.046753,2,13,0.057018,0.722222


In [None]:
#FEATURE 10: LATEST ONE-WEEK
latest_week = 1104
df_latest_week = df[df['time_stamp']>=latest_week]
df_latest_week

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type,age_range,gender,label,month,day
0,379824,198,656,145,3462.0,1111,0,5.0,1.0,0,11,11
1,379824,198,656,145,3462.0,1111,0,5.0,1.0,0,11,11
2,379824,198,656,145,3462.0,1111,2,5.0,1.0,0,11,11
3,379824,198,656,145,3462.0,1110,0,5.0,1.0,0,11,10
4,379824,198,656,145,3462.0,1110,0,5.0,1.0,0,11,10
...,...,...,...,...,...,...,...,...,...,...,...,...
380,122632,175,1181,4760,247.0,1109,0,3.0,0.0,0,11,9
381,122632,175,1181,4760,247.0,1108,0,3.0,0.0,0,11,8
382,122632,175,1181,4760,247.0,1108,0,3.0,0.0,0,11,8
383,95362,253,962,3263,626.0,1111,0,0.0,0.0,0,11,11


In [None]:
actions_last_week = {}

# go through user_seller sub-dataframe and split entries by item_id:seller_id
for index, row in df_latest_week.iterrows():
    if row['brand_id'] not in actions_last_week:
        # there may be multiple entries, so default with an empty list
        actions_last_week[row["brand_id"]] = [] 
    actions_last_week[row["brand_id"]].append(row["action_type"])
    
#for k,v in actions_last_week.items():
#    actions_last_week[k] = list(set(v)) # make each value list unique
actions_last_week

{3462.0: [0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  3.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 247.0: [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  2.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  3.0,
  3.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  3.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 5380.0: [0.0,
  0.0,
  

In [None]:
action_count_last_week = {}
for key in actions_last_week.keys():
    action_count_last_week[key] = len(actions_last_week[key])
action_count_last_week

{3462.0: 36,
 247.0: 101,
 5380.0: 18,
 683.0: 75,
 2350.0: 6,
 6208.0: 21,
 6230.0: 2,
 7924.0: 12,
 626.0: 18,
 777.0: 1,
 7936.0: 4,
 1097.0: 3,
 3931.0: 1,
 1246.0: 29,
 1905.0: 1,
 5946.0: 6,
 4631.0: 7,
 5738.0: 2,
 2276.0: 7,
 3654.0: 1,
 5491.0: 2,
 8040.0: 1,
 7371.0: 1}

In [None]:
total_action_count_last_week = sum(action_count_last_week.values())
total_action_count_last_week


355

In [None]:
action_count_ratio_last_week = {}
for key in action_count_last_week:
    action_count_ratio_last_week[key] = action_count_last_week[key]/total_action_count_last_week
action_count_ratio_last_week

{3462.0: 0.10140845070422536,
 247.0: 0.28450704225352114,
 5380.0: 0.05070422535211268,
 683.0: 0.2112676056338028,
 2350.0: 0.016901408450704224,
 6208.0: 0.059154929577464786,
 6230.0: 0.005633802816901409,
 7924.0: 0.03380281690140845,
 626.0: 0.05070422535211268,
 777.0: 0.0028169014084507044,
 7936.0: 0.011267605633802818,
 1097.0: 0.008450704225352112,
 3931.0: 0.0028169014084507044,
 1246.0: 0.08169014084507042,
 1905.0: 0.0028169014084507044,
 5946.0: 0.016901408450704224,
 4631.0: 0.01971830985915493,
 5738.0: 0.005633802816901409,
 2276.0: 0.01971830985915493,
 3654.0: 0.0028169014084507044,
 5491.0: 0.005633802816901409,
 8040.0: 0.0028169014084507044,
 7371.0: 0.0028169014084507044}

In [None]:
df_brand['brand last week action count'] = 0
df_brand['brand last week action count ratio'] = 0
for index,row in df_brand.iterrows():
        if row['brand_id'] in action_count_last_week.keys():
            df_brand.at[index, 'brand last week action count'] = action_count_last_week[row['brand_id']]
        if row['brand_id'] in action_count_ratio_last_week.keys():
            df_brand.at[index, 'brand last week action count ratio'] = action_count_ratio_last_week[row['brand_id']]

df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,...,brand_ageGroup_8_counts,brand_ageGroup_0_counts,brand action count,brand action count ratio,brand day count,brand 1111 action count,brand 1111 action count ratio,brand 1111 activity ratio,brand last week action count,brand last week action count ratio
0,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,0,8,36,0.093506,4,30,0.131579,0.833333,36,0.101408
1,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,0,8,36,0.093506,4,30,0.131579,0.833333,36,0.101408
2,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,0,8,36,0.093506,4,30,0.131579,0.833333,36,0.101408
3,3462.0,11,10,36,362,0.099448,0.361111,9,25,2,...,0,8,36,0.093506,4,30,0.131579,0.833333,36,0.101408
4,3462.0,11,10,36,362,0.099448,0.361111,9,25,2,...,0,8,36,0.093506,4,30,0.131579,0.833333,36,0.101408
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,247.0,11,9,104,362,0.287293,0.355769,72,33,0,...,0,17,105,0.272727,9,67,0.293860,0.638095,101,0.284507
381,247.0,11,8,104,362,0.287293,0.355769,72,33,0,...,0,17,105,0.272727,9,67,0.293860,0.638095,101,0.284507
382,247.0,11,8,104,362,0.287293,0.355769,72,33,0,...,0,17,105,0.272727,9,67,0.293860,0.638095,101,0.284507
383,626.0,11,11,18,362,0.049724,0.111111,0,18,0,...,0,2,18,0.046753,2,13,0.057018,0.722222,18,0.050704


In [None]:
activity_ratio_last_week = {}
df_brand['brand last week activity ratio'] = 0
for index,row in df_brand.iterrows():
    if df_brand.at[index,'brand last week action count'] != 0:
        df_brand.at[index, 'brand last week activity ratio'] = (df_brand.at[index,'brand last week action count']/df_brand.at[index,'brand action count'])
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,...,brand_ageGroup_0_counts,brand action count,brand action count ratio,brand day count,brand 1111 action count,brand 1111 action count ratio,brand 1111 activity ratio,brand last week action count,brand last week action count ratio,brand last week activity ratio
0,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,8,36,0.093506,4,30,0.131579,0.833333,36,0.101408,1.000000
1,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,8,36,0.093506,4,30,0.131579,0.833333,36,0.101408,1.000000
2,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,8,36,0.093506,4,30,0.131579,0.833333,36,0.101408,1.000000
3,3462.0,11,10,36,362,0.099448,0.361111,9,25,2,...,8,36,0.093506,4,30,0.131579,0.833333,36,0.101408,1.000000
4,3462.0,11,10,36,362,0.099448,0.361111,9,25,2,...,8,36,0.093506,4,30,0.131579,0.833333,36,0.101408,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,247.0,11,9,104,362,0.287293,0.355769,72,33,0,...,17,105,0.272727,9,67,0.293860,0.638095,101,0.284507,0.961905
381,247.0,11,8,104,362,0.287293,0.355769,72,33,0,...,17,105,0.272727,9,67,0.293860,0.638095,101,0.284507,0.961905
382,247.0,11,8,104,362,0.287293,0.355769,72,33,0,...,17,105,0.272727,9,67,0.293860,0.638095,101,0.284507,0.961905
383,626.0,11,11,18,362,0.049724,0.111111,0,18,0,...,2,18,0.046753,2,13,0.057018,0.722222,18,0.050704,1.000000


In [None]:
#FEATURE 11: LATEST MONTH
latest_month = 1011
df_latest_month = df[df['time_stamp']>=latest_month]
df_latest_month

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type,age_range,gender,label,month,day
0,379824,198,656,145,3462.0,1111,0,5.0,1.0,0,11,11
1,379824,198,656,145,3462.0,1111,0,5.0,1.0,0,11,11
2,379824,198,656,145,3462.0,1111,2,5.0,1.0,0,11,11
3,379824,198,656,145,3462.0,1110,0,5.0,1.0,0,11,10
4,379824,198,656,145,3462.0,1110,0,5.0,1.0,0,11,10
...,...,...,...,...,...,...,...,...,...,...,...,...
380,122632,175,1181,4760,247.0,1109,0,3.0,0.0,0,11,9
381,122632,175,1181,4760,247.0,1108,0,3.0,0.0,0,11,8
382,122632,175,1181,4760,247.0,1108,0,3.0,0.0,0,11,8
383,95362,253,962,3263,626.0,1111,0,0.0,0.0,0,11,11


In [None]:
actions_last_month = {}

# go through user_seller sub-dataframe and split entries by user_id:seller_id
for index, row in df_latest_month.iterrows():
    if row['brand_id'] not in actions_last_month:
        # there may be multiple entries, so default with an empty list
        actions_last_month[row["brand_id"]] = [] 
    actions_last_month[row["brand_id"]].append(row["action_type"])
    
#for k,v in actions_last_month.items():
#    actions_last_month[k] = list(set(v)) # make each value list unique
actions_last_month

{3462.0: [0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  3.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 247.0: [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  2.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  3.0,
  3.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  3.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.

In [None]:
action_count_last_month = {}
for key in actions_last_month.keys():
    action_count_last_month[key] = len(actions_last_month[key])
action_count_last_month

{3462.0: 36,
 247.0: 105,
 5380.0: 19,
 683.0: 76,
 2350.0: 6,
 6208.0: 21,
 6230.0: 2,
 7924.0: 12,
 5491.0: 4,
 626.0: 18,
 777.0: 1,
 7936.0: 4,
 1097.0: 3,
 3931.0: 1,
 1246.0: 30,
 1905.0: 1,
 5946.0: 6,
 4631.0: 9,
 5738.0: 2,
 2276.0: 7,
 3654.0: 1,
 8040.0: 1,
 7371.0: 1}

In [None]:
total_action_count_last_month = sum(action_count_last_month.values())
total_action_count_last_month

366

In [None]:
action_count_ratio_last_month = {}
for key in action_count_last_month:
    action_count_ratio_last_month[key] = action_count_last_month[key]/total_action_count_last_month
action_count_ratio_last_month

{3462.0: 0.09836065573770492,
 247.0: 0.28688524590163933,
 5380.0: 0.05191256830601093,
 683.0: 0.20765027322404372,
 2350.0: 0.01639344262295082,
 6208.0: 0.05737704918032787,
 6230.0: 0.00546448087431694,
 7924.0: 0.03278688524590164,
 5491.0: 0.01092896174863388,
 626.0: 0.04918032786885246,
 777.0: 0.00273224043715847,
 7936.0: 0.01092896174863388,
 1097.0: 0.00819672131147541,
 3931.0: 0.00273224043715847,
 1246.0: 0.08196721311475409,
 1905.0: 0.00273224043715847,
 5946.0: 0.01639344262295082,
 4631.0: 0.02459016393442623,
 5738.0: 0.00546448087431694,
 2276.0: 0.01912568306010929,
 3654.0: 0.00273224043715847,
 8040.0: 0.00273224043715847,
 7371.0: 0.00273224043715847}

In [None]:
df_brand['last month action count'] = 0
df_brand['last month action count ratio'] = 0
for index,row in df_brand.iterrows():
        if row['brand_id'] in action_count_last_month.keys():
            df_brand.at[index, 'last month action count'] = action_count_last_month[row['brand_id']]
        if row['brand_id'] in action_count_ratio_last_month.keys():
            df_brand.at[index, 'last month action count ratio'] = action_count_ratio_last_month[row['brand_id']]

df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,...,brand action count ratio,brand day count,brand 1111 action count,brand 1111 action count ratio,brand 1111 activity ratio,brand last week action count,brand last week action count ratio,brand last week activity ratio,last month action count,last month action count ratio
0,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,0.093506,4,30,0.131579,0.833333,36,0.101408,1.000000,36,0.098361
1,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,0.093506,4,30,0.131579,0.833333,36,0.101408,1.000000,36,0.098361
2,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,0.093506,4,30,0.131579,0.833333,36,0.101408,1.000000,36,0.098361
3,3462.0,11,10,36,362,0.099448,0.361111,9,25,2,...,0.093506,4,30,0.131579,0.833333,36,0.101408,1.000000,36,0.098361
4,3462.0,11,10,36,362,0.099448,0.361111,9,25,2,...,0.093506,4,30,0.131579,0.833333,36,0.101408,1.000000,36,0.098361
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,247.0,11,9,104,362,0.287293,0.355769,72,33,0,...,0.272727,9,67,0.293860,0.638095,101,0.284507,0.961905,105,0.286885
381,247.0,11,8,104,362,0.287293,0.355769,72,33,0,...,0.272727,9,67,0.293860,0.638095,101,0.284507,0.961905,105,0.286885
382,247.0,11,8,104,362,0.287293,0.355769,72,33,0,...,0.272727,9,67,0.293860,0.638095,101,0.284507,0.961905,105,0.286885
383,626.0,11,11,18,362,0.049724,0.111111,0,18,0,...,0.046753,2,13,0.057018,0.722222,18,0.050704,1.000000,18,0.049180


In [None]:
activity_ratio_last_month = {}
df_brand['last month activity ratio'] = 0
for index,row in df_brand.iterrows():
    if df_brand.at[index,'last month action count'] != 0:
        df_brand.at[index, 'last month activity ratio'] = (df_brand.at[index,'last month action count']/df_brand.at[index,'brand action count'])
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,...,brand day count,brand 1111 action count,brand 1111 action count ratio,brand 1111 activity ratio,brand last week action count,brand last week action count ratio,brand last week activity ratio,last month action count,last month action count ratio,last month activity ratio
0,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,4,30,0.131579,0.833333,36,0.101408,1.000000,36,0.098361,1.0
1,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,4,30,0.131579,0.833333,36,0.101408,1.000000,36,0.098361,1.0
2,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,4,30,0.131579,0.833333,36,0.101408,1.000000,36,0.098361,1.0
3,3462.0,11,10,36,362,0.099448,0.361111,9,25,2,...,4,30,0.131579,0.833333,36,0.101408,1.000000,36,0.098361,1.0
4,3462.0,11,10,36,362,0.099448,0.361111,9,25,2,...,4,30,0.131579,0.833333,36,0.101408,1.000000,36,0.098361,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,247.0,11,9,104,362,0.287293,0.355769,72,33,0,...,9,67,0.293860,0.638095,101,0.284507,0.961905,105,0.286885,1.0
381,247.0,11,8,104,362,0.287293,0.355769,72,33,0,...,9,67,0.293860,0.638095,101,0.284507,0.961905,105,0.286885,1.0
382,247.0,11,8,104,362,0.287293,0.355769,72,33,0,...,9,67,0.293860,0.638095,101,0.284507,0.961905,105,0.286885,1.0
383,626.0,11,11,18,362,0.049724,0.111111,0,18,0,...,2,13,0.057018,0.722222,18,0.050704,1.000000,18,0.049180,1.0


In [None]:
print("TOTAL ACTIONS: ", total_action_count)
print("TOTAL ACTIONS LATEST MONTH: ",total_action_count_last_month)
print("TOTAL ACTIONS LATEST WEEK: ",total_action_count_last_week)
print("TOTAL ACTIONS 1111: ",total_action_count_11)

TOTAL ACTIONS:  385
TOTAL ACTIONS LATEST MONTH:  366
TOTAL ACTIONS LATEST WEEK:  355
TOTAL ACTIONS 1111:  228


In [None]:
#FEATURE 18: AGE RELATED FEATURES
brand_age = {}

# go through user_seller sub-dataframe and split entries by user_id:seller_id
for index, row in df.iterrows():
    if row['brand_id'] not in brand_age:
        # there may be multiple entries, so default with an empty list
        brand_age[row["brand_id"]] = [] 
    brand_age[row["brand_id"]].append(row["age_range"])

brand_age_avg = {}
for row in brand_age.items():
    brand_age_avg[row[0]] = np.nansum(row[1])/len(row[1])
sorted(brand_age_avg.items())

[(247.0, 2.7142857142857144),
 (626.0, 2.4444444444444446),
 (683.0, 2.8705882352941177),
 (777.0, 4.0),
 (1097.0, 1.3333333333333333),
 (1246.0, 2.4),
 (1446.0, 2.0),
 (1905.0, 4.5),
 (2276.0, 1.1428571428571428),
 (2350.0, 3.0),
 (3462.0, 3.611111111111111),
 (3654.0, 0.0),
 (3931.0, 3.0),
 (4631.0, 0.5555555555555556),
 (5380.0, 4.0),
 (5491.0, 4.0),
 (5738.0, 3.0),
 (5946.0, 4.0),
 (6208.0, 3.238095238095238),
 (6230.0, 2.0),
 (6590.0, 6.0),
 (7371.0, 2.0),
 (7892.0, 3.2),
 (7924.0, 2.3333333333333335),
 (7936.0, 5.0),
 (7989.0, 3.0),
 (8040.0, 0.0)]

In [None]:
total_brand_average_age = sum(brand_age_avg.values())/len(brand_age_avg.keys())
print("Average age total: ",total_brand_average_age)

Average age total:  2.790503855863333


In [None]:
df_brand['average age'] = 0
for index,row in df_brand.iterrows():
        if row['brand_id'] in brand_age_avg.keys():
            df_brand.at[index, 'average age'] = brand_age_avg[row['brand_id']]
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,...,brand 1111 action count,brand 1111 action count ratio,brand 1111 activity ratio,brand last week action count,brand last week action count ratio,brand last week activity ratio,last month action count,last month action count ratio,last month activity ratio,average age
0,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,30,0.131579,0.833333,36,0.101408,1.000000,36,0.098361,1.0,3.611111
1,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,30,0.131579,0.833333,36,0.101408,1.000000,36,0.098361,1.0,3.611111
2,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,30,0.131579,0.833333,36,0.101408,1.000000,36,0.098361,1.0,3.611111
3,3462.0,11,10,36,362,0.099448,0.361111,9,25,2,...,30,0.131579,0.833333,36,0.101408,1.000000,36,0.098361,1.0,3.611111
4,3462.0,11,10,36,362,0.099448,0.361111,9,25,2,...,30,0.131579,0.833333,36,0.101408,1.000000,36,0.098361,1.0,3.611111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,247.0,11,9,104,362,0.287293,0.355769,72,33,0,...,67,0.293860,0.638095,101,0.284507,0.961905,105,0.286885,1.0,2.714286
381,247.0,11,8,104,362,0.287293,0.355769,72,33,0,...,67,0.293860,0.638095,101,0.284507,0.961905,105,0.286885,1.0,2.714286
382,247.0,11,8,104,362,0.287293,0.355769,72,33,0,...,67,0.293860,0.638095,101,0.284507,0.961905,105,0.286885,1.0,2.714286
383,626.0,11,11,18,362,0.049724,0.111111,0,18,0,...,13,0.057018,0.722222,18,0.050704,1.000000,18,0.049180,1.0,2.444444


In [None]:
#FEATURE 19: GENDER FEATURES
brand_gender = {}

# go through user_seller sub-dataframe and split entries by user_id:seller_id
for index, row in df.iterrows():
    if row['brand_id'] not in brand_gender:
        # there may be multiple entries, so default with an empty list
        brand_gender[row["brand_id"]] = [] 
    brand_gender[row["brand_id"]].append(row["gender"])

brand_gender_avg = {}
for row in brand_gender.items():
    brand_gender_avg[row[0]] = np.nansum(row[1])/len(row[1])
sorted(brand_gender_avg.items())

[(247.0, 0.6857142857142857),
 (626.0, 0.0),
 (683.0, 0.49411764705882355),
 (777.0, 0.0),
 (1097.0, 0.0),
 (1246.0, 0.6333333333333333),
 (1446.0, 0.0),
 (1905.0, 1.0),
 (2276.0, 0.0),
 (2350.0, 0.0),
 (3462.0, 0.3611111111111111),
 (3654.0, 1.0),
 (3931.0, 0.0),
 (4631.0, 0.7777777777777778),
 (5380.0, 0.0),
 (5491.0, 0.0),
 (5738.0, 1.0),
 (5946.0, 0.0),
 (6208.0, 0.6190476190476191),
 (6230.0, 1.0),
 (6590.0, 1.0),
 (7371.0, 0.0),
 (7892.0, 1.0),
 (7924.0, 0.16666666666666666),
 (7936.0, 0.0),
 (7989.0, 0.0),
 (8040.0, 0.0)]

In [None]:
total_brand_average_gender = sum(brand_gender_avg.values())/len(brand_gender_avg.keys())
print("Average gender total: ",total_brand_average_gender)

Average gender total:  0.36065809039665253


In [None]:
df_brand['average gender'] = 0
for index,row in df_brand.iterrows():
        if row['brand_id'] in brand_gender_avg.keys():
            df_brand.at[index, 'average gender'] = brand_gender_avg[row['brand_id']]
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,...,brand 1111 action count ratio,brand 1111 activity ratio,brand last week action count,brand last week action count ratio,brand last week activity ratio,last month action count,last month action count ratio,last month activity ratio,average age,average gender
0,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,0.131579,0.833333,36,0.101408,1.000000,36,0.098361,1.0,3.611111,0.361111
1,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,0.131579,0.833333,36,0.101408,1.000000,36,0.098361,1.0,3.611111,0.361111
2,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,0.131579,0.833333,36,0.101408,1.000000,36,0.098361,1.0,3.611111,0.361111
3,3462.0,11,10,36,362,0.099448,0.361111,9,25,2,...,0.131579,0.833333,36,0.101408,1.000000,36,0.098361,1.0,3.611111,0.361111
4,3462.0,11,10,36,362,0.099448,0.361111,9,25,2,...,0.131579,0.833333,36,0.101408,1.000000,36,0.098361,1.0,3.611111,0.361111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,247.0,11,9,104,362,0.287293,0.355769,72,33,0,...,0.293860,0.638095,101,0.284507,0.961905,105,0.286885,1.0,2.714286,0.685714
381,247.0,11,8,104,362,0.287293,0.355769,72,33,0,...,0.293860,0.638095,101,0.284507,0.961905,105,0.286885,1.0,2.714286,0.685714
382,247.0,11,8,104,362,0.287293,0.355769,72,33,0,...,0.293860,0.638095,101,0.284507,0.961905,105,0.286885,1.0,2.714286,0.685714
383,626.0,11,11,18,362,0.049724,0.111111,0,18,0,...,0.057018,0.722222,18,0.050704,1.000000,18,0.049180,1.0,2.444444,0.000000


In [None]:

df_brand.to_csv('test_brand_profile.csv',index=False)