In [9]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
'''
FEATURES:
Overall action count/ratio
    - 
Overall day count
Monthly action count/ratio
    - Per brand, number of actions in a month / total actions (across all instances) in that month
Penetration (Popularity / Buys)
    - Number of Buys
    - Related Brand Popularity: Among the brands, split into tiers of high popularity vs low popularity
Monthly Aggregation
    - Per brand, number of actions in a month
    - Per brand, average action_type in a month
    - Std. deviation for number of clicks 
    - Per brand, action count by gender in a month
Double 11 Features
Latest One-Week
Repeat Buyer Features
Age Related
Gender Related
'''

'\nFEATURES:\nOverall action count/ratio\n    - \nOverall day count\nMonthly action count/ratio\n    - Per brand, number of actions in a month / total actions (across all instances) in that month\nPenetration (Popularity / Buys)\n    - Number of Buys\n    - Related Brand Popularity: Among the brands, split into tiers of high popularity vs low popularity\nMonthly Aggregation\n    - Per brand, number of actions in a month\n    - Per brand, average action_type in a month\n    - Std. deviation for number of clicks \n    - Per brand, action count by gender in a month\nDouble 11 Features\nLatest One-Week\nRepeat Buyer Features\nAge Related\nGender Related\n'

In [10]:
df = pd.read_csv("./use_data/expanded_training.csv")
df

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type,age_range,gender,label
0,379824,198,656,145,3462.0,1111,0,5.0,1.0,0
1,379824,198,656,145,3462.0,1111,0,5.0,1.0,0
2,379824,198,656,145,3462.0,1111,2,5.0,1.0,0
3,379824,198,656,145,3462.0,1110,0,5.0,1.0,0
4,379824,198,656,145,3462.0,1110,0,5.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...
380,122632,175,1181,4760,247.0,1109,0,3.0,0.0,0
381,122632,175,1181,4760,247.0,1108,0,3.0,0.0,0
382,122632,175,1181,4760,247.0,1108,0,3.0,0.0,0
383,95362,253,962,3263,626.0,1111,0,0.0,0.0,0


In [11]:
df_brand = df[['brand_id']]
df_brand

Unnamed: 0,brand_id
0,3462.0
1,3462.0
2,3462.0
3,3462.0
4,3462.0
...,...
380,247.0
381,247.0
382,247.0
383,626.0


In [4]:
# Unique Brand Count
unique_brand_count = len(pd.unique(df_brand['brand_id']))
print(f'Unique Brand Count: {unique_brand_count}')

brand_occurrences = df_brand['brand_id'].value_counts()
print(brand_occurrences)

Unique Brand Count: 27
247.0     105
683.0      85
3462.0     36
1246.0     30
6208.0     21
5380.0     19
626.0      18
7924.0     12
4631.0      9
2276.0      7
2350.0      6
5946.0      6
7892.0      5
5491.0      4
7936.0      4
1097.0      3
1446.0      2
6230.0      2
5738.0      2
1905.0      2
3931.0      1
777.0       1
7989.0      1
3654.0      1
6590.0      1
8040.0      1
7371.0      1
Name: brand_id, dtype: int64


In [5]:
df['month'] = df['time_stamp'] // 100
df['day'] = df['time_stamp'] % 100
df

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type,age_range,gender,label,month,day
0,379824,198,656,145,3462.0,1111,0,5.0,1.0,0,11,11
1,379824,198,656,145,3462.0,1111,0,5.0,1.0,0,11,11
2,379824,198,656,145,3462.0,1111,2,5.0,1.0,0,11,11
3,379824,198,656,145,3462.0,1110,0,5.0,1.0,0,11,10
4,379824,198,656,145,3462.0,1110,0,5.0,1.0,0,11,10
...,...,...,...,...,...,...,...,...,...,...,...,...
380,122632,175,1181,4760,247.0,1109,0,3.0,0.0,0,11,9
381,122632,175,1181,4760,247.0,1108,0,3.0,0.0,0,11,8
382,122632,175,1181,4760,247.0,1108,0,3.0,0.0,0,11,8
383,95362,253,962,3263,626.0,1111,0,0.0,0.0,0,11,11


In [6]:
# Split Month Day into Separate Columns
df_brand['month'] = df['time_stamp'] // 100
df_brand['day'] = df['time_stamp'] % 100
df_brand

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brand['month'] = df['time_stamp'] // 100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brand['day'] = df['time_stamp'] % 100


Unnamed: 0,brand_id,month,day
0,3462.0,11,11
1,3462.0,11,11
2,3462.0,11,11
3,3462.0,11,10
4,3462.0,11,10
...,...,...,...
380,247.0,11,9
381,247.0,11,8
382,247.0,11,8
383,626.0,11,11


Aggregation

Common Aggregate Functions:
- Average
- Count
    > Action Count
- Maximum
- Median
- Minimum
- Mode
- Range
- Sum
- StdDeviation
- NaNMean

In [7]:
# Monthly Action Count

In [8]:
# Monthly Brand Action Counts
df_brand['brand_monthly_action_count'] = df_brand.groupby(['brand_id', 'month']).transform('size')
df_brand

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brand['brand_monthly_action_count'] = df_brand.groupby(['brand_id', 'month']).transform('size')


Unnamed: 0,brand_id,month,day,brand_monthly_action_count
0,3462.0,11,11,36
1,3462.0,11,11,36
2,3462.0,11,11,36
3,3462.0,11,10,36
4,3462.0,11,10,36
...,...,...,...,...
380,247.0,11,9,104
381,247.0,11,8,104
382,247.0,11,8,104
383,626.0,11,11,18


In [9]:
# Total Action Count in said month
df_brand['month_total_action_count'] = df_brand.groupby(['month']).transform('size')
df_brand


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brand['month_total_action_count'] = df_brand.groupby(['month']).transform('size')


Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count
0,3462.0,11,11,36,362
1,3462.0,11,11,36,362
2,3462.0,11,11,36,362
3,3462.0,11,10,36,362
4,3462.0,11,10,36,362
...,...,...,...,...,...
380,247.0,11,9,104,362
381,247.0,11,8,104,362
382,247.0,11,8,104,362
383,626.0,11,11,18,362


In [10]:
# Monthly Action Count / Ratio (Count/Ratio Type)
df_brand['monthly_action_count_ratio'] = df_brand.groupby(['brand_id', 'month'])['brand_monthly_action_count'].transform(lambda x: x/ df_brand['month_total_action_count'])
df_brand


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brand['monthly_action_count_ratio'] = df_brand.groupby(['brand_id', 'month'])['brand_monthly_action_count'].transform(lambda x: x/ df_brand['month_total_action_count'])


Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio
0,3462.0,11,11,36,362,0.099448
1,3462.0,11,11,36,362,0.099448
2,3462.0,11,11,36,362,0.099448
3,3462.0,11,10,36,362,0.099448
4,3462.0,11,10,36,362,0.099448
...,...,...,...,...,...,...
380,247.0,11,9,104,362,0.287293
381,247.0,11,8,104,362,0.287293
382,247.0,11,8,104,362,0.287293
383,626.0,11,11,18,362,0.049724


In [11]:
# Per Brand Mean Action Type (Treats Action Type not really Discrete Label)
df_brand['monthly_mean_action_type'] = df.groupby(['brand_id', 'month'])['action_type'].transform('mean')
df_brand

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brand['monthly_mean_action_type'] = df.groupby(['brand_id', 'month'])['action_type'].transform('mean')


Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type
0,3462.0,11,11,36,362,0.099448,0.361111
1,3462.0,11,11,36,362,0.099448,0.361111
2,3462.0,11,11,36,362,0.099448,0.361111
3,3462.0,11,10,36,362,0.099448,0.361111
4,3462.0,11,10,36,362,0.099448,0.361111
...,...,...,...,...,...,...,...
380,247.0,11,9,104,362,0.287293,0.355769
381,247.0,11,8,104,362,0.287293,0.355769
382,247.0,11,8,104,362,0.287293,0.355769
383,626.0,11,11,18,362,0.049724,0.111111



# Gender Interaction Count per Brand

In [12]:
# For Other Columns, modify the groupby condition
male_counts = df.groupby(['brand_id'])['gender'].apply(lambda x: (x == 1).sum())
female_counts = df.groupby(['brand_id'])['gender'].apply(lambda x: (x == 0).sum())
unknown_gender_count = df.groupby(['brand_id'])['gender'].apply(lambda x: (x == 2).sum())
gender_total_brand_counts = pd.DataFrame({'brand_male_count': male_counts, 'brand_female_count': female_counts, 'brand_unknown_gender_count': unknown_gender_count})
gender_total_brand_counts


Unnamed: 0_level_0,brand_male_count,brand_female_count,brand_unknown_gender_count
brand_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
247.0,72,33,0
626.0,0,18,0
683.0,40,44,1
777.0,0,1,0
1097.0,0,3,0
1246.0,19,11,0
1446.0,0,2,0
1905.0,0,1,1
2276.0,0,7,0
2350.0,0,6,0


In [13]:
df_brand = df_brand.join(gender_total_brand_counts, on='brand_id')
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count
0,3462.0,11,11,36,362,0.099448,0.361111,9,25,2
1,3462.0,11,11,36,362,0.099448,0.361111,9,25,2
2,3462.0,11,11,36,362,0.099448,0.361111,9,25,2
3,3462.0,11,10,36,362,0.099448,0.361111,9,25,2
4,3462.0,11,10,36,362,0.099448,0.361111,9,25,2
...,...,...,...,...,...,...,...,...,...,...
380,247.0,11,9,104,362,0.287293,0.355769,72,33,0
381,247.0,11,8,104,362,0.287293,0.355769,72,33,0
382,247.0,11,8,104,362,0.287293,0.355769,72,33,0
383,626.0,11,11,18,362,0.049724,0.111111,0,18,0


# Monthly Gender Action Count per Brand

In [14]:
monthly_male_counts = df.groupby(['brand_id', 'month'])['gender'].apply(lambda x: (x == 1).sum())
monthly_female_counts = df.groupby(['brand_id', 'month'])['gender'].apply(lambda x: (x == 0).sum())
monthly_unknown_gender_count = df.groupby(['brand_id', 'month'])['gender'].apply(lambda x: (x == 2).sum())
monthly_gender_total_brand_counts = pd.DataFrame({'month_brand_male_count': monthly_male_counts, 'month_brand_female_count': monthly_female_counts, 'month_brand_unknown_gender_count': monthly_unknown_gender_count})
monthly_gender_total_brand_counts

Unnamed: 0_level_0,Unnamed: 1_level_0,month_brand_male_count,month_brand_female_count,month_brand_unknown_gender_count
brand_id,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
247.0,10,1,0,0
247.0,11,71,33,0
626.0,11,0,18,0
683.0,5,6,0,0
683.0,6,2,1,0
683.0,10,0,1,0
683.0,11,32,42,1
777.0,11,0,1,0
1097.0,11,0,3,0
1246.0,11,19,11,0


In [15]:
df_brand = df_brand.join(monthly_gender_total_brand_counts, on=['brand_id', 'month'])
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,month_brand_male_count,month_brand_female_count,month_brand_unknown_gender_count
0,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,9,25,2
1,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,9,25,2
2,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,9,25,2
3,3462.0,11,10,36,362,0.099448,0.361111,9,25,2,9,25,2
4,3462.0,11,10,36,362,0.099448,0.361111,9,25,2,9,25,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,247.0,11,9,104,362,0.287293,0.355769,72,33,0,71,33,0
381,247.0,11,8,104,362,0.287293,0.355769,72,33,0,71,33,0
382,247.0,11,8,104,362,0.287293,0.355769,72,33,0,71,33,0
383,626.0,11,11,18,362,0.049724,0.111111,0,18,0,0,18,0


# Brand Gender Count Ratio

In [16]:
gender_counts = df_brand[['brand_male_count', 'brand_female_count', 'brand_unknown_gender_count']].sum(axis=1)
male_ratio = df_brand.apply(lambda row: row['brand_male_count'] / gender_counts[row.name], axis=1)
female_ratio = df_brand.apply(lambda row: row['brand_female_count'] / gender_counts[row.name], axis=1)
unknown_gender_ratio = df_brand.apply(lambda row: row['brand_unknown_gender_count'] / gender_counts[row.name], axis=1)
df_brand['brand_male_count_ratio'] = male_ratio
df_brand['brand_female_count_ratio'] = female_ratio
df_brand['brand_unknown_count_ratio'] = unknown_gender_ratio
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,month_brand_male_count,month_brand_female_count,month_brand_unknown_gender_count,brand_male_count_ratio,brand_female_count_ratio,brand_unknown_count_ratio
0,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,9,25,2,0.250000,0.694444,0.055556
1,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,9,25,2,0.250000,0.694444,0.055556
2,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,9,25,2,0.250000,0.694444,0.055556
3,3462.0,11,10,36,362,0.099448,0.361111,9,25,2,9,25,2,0.250000,0.694444,0.055556
4,3462.0,11,10,36,362,0.099448,0.361111,9,25,2,9,25,2,0.250000,0.694444,0.055556
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,247.0,11,9,104,362,0.287293,0.355769,72,33,0,71,33,0,0.685714,0.314286,0.000000
381,247.0,11,8,104,362,0.287293,0.355769,72,33,0,71,33,0,0.685714,0.314286,0.000000
382,247.0,11,8,104,362,0.287293,0.355769,72,33,0,71,33,0,0.685714,0.314286,0.000000
383,626.0,11,11,18,362,0.049724,0.111111,0,18,0,0,18,0,0.000000,1.000000,0.000000


# Brand Monthly Gender Count Ratio

In [17]:
df_brand['brand_monthly_male_count_ratio'] = df_brand['month_brand_male_count'] / df_brand['brand_monthly_action_count']
df_brand['brand_monthly_female_count_ratio'] = df_brand['month_brand_female_count'] / df_brand['brand_monthly_action_count']
df_brand['brand_monthly_unknown_count_ratio'] = df_brand['month_brand_unknown_gender_count'] / df_brand['brand_monthly_action_count']
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,month_brand_male_count,month_brand_female_count,month_brand_unknown_gender_count,brand_male_count_ratio,brand_female_count_ratio,brand_unknown_count_ratio,brand_monthly_male_count_ratio,brand_monthly_female_count_ratio,brand_monthly_unknown_count_ratio
0,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,9,25,2,0.250000,0.694444,0.055556,0.250000,0.694444,0.055556
1,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,9,25,2,0.250000,0.694444,0.055556,0.250000,0.694444,0.055556
2,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,9,25,2,0.250000,0.694444,0.055556,0.250000,0.694444,0.055556
3,3462.0,11,10,36,362,0.099448,0.361111,9,25,2,9,25,2,0.250000,0.694444,0.055556,0.250000,0.694444,0.055556
4,3462.0,11,10,36,362,0.099448,0.361111,9,25,2,9,25,2,0.250000,0.694444,0.055556,0.250000,0.694444,0.055556
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,247.0,11,9,104,362,0.287293,0.355769,72,33,0,71,33,0,0.685714,0.314286,0.000000,0.682692,0.317308,0.000000
381,247.0,11,8,104,362,0.287293,0.355769,72,33,0,71,33,0,0.685714,0.314286,0.000000,0.682692,0.317308,0.000000
382,247.0,11,8,104,362,0.287293,0.355769,72,33,0,71,33,0,0.685714,0.314286,0.000000,0.682692,0.317308,0.000000
383,626.0,11,11,18,362,0.049724,0.111111,0,18,0,0,18,0,0.000000,1.000000,0.000000,0.000000,1.000000,0.000000


# All Above Confirmed

# Penetration

In [18]:
df

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type,age_range,gender,label,month,day
0,379824,198,656,145,3462.0,1111,0,5.0,1.0,0,11,11
1,379824,198,656,145,3462.0,1111,0,5.0,1.0,0,11,11
2,379824,198,656,145,3462.0,1111,2,5.0,1.0,0,11,11
3,379824,198,656,145,3462.0,1110,0,5.0,1.0,0,11,10
4,379824,198,656,145,3462.0,1110,0,5.0,1.0,0,11,10
...,...,...,...,...,...,...,...,...,...,...,...,...
380,122632,175,1181,4760,247.0,1109,0,3.0,0.0,0,11,9
381,122632,175,1181,4760,247.0,1108,0,3.0,0.0,0,11,8
382,122632,175,1181,4760,247.0,1108,0,3.0,0.0,0,11,8
383,95362,253,962,3263,626.0,1111,0,0.0,0.0,0,11,11


In [19]:
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,month_brand_male_count,month_brand_female_count,month_brand_unknown_gender_count,brand_male_count_ratio,brand_female_count_ratio,brand_unknown_count_ratio,brand_monthly_male_count_ratio,brand_monthly_female_count_ratio,brand_monthly_unknown_count_ratio
0,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,9,25,2,0.250000,0.694444,0.055556,0.250000,0.694444,0.055556
1,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,9,25,2,0.250000,0.694444,0.055556,0.250000,0.694444,0.055556
2,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,9,25,2,0.250000,0.694444,0.055556,0.250000,0.694444,0.055556
3,3462.0,11,10,36,362,0.099448,0.361111,9,25,2,9,25,2,0.250000,0.694444,0.055556,0.250000,0.694444,0.055556
4,3462.0,11,10,36,362,0.099448,0.361111,9,25,2,9,25,2,0.250000,0.694444,0.055556,0.250000,0.694444,0.055556
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,247.0,11,9,104,362,0.287293,0.355769,72,33,0,71,33,0,0.685714,0.314286,0.000000,0.682692,0.317308,0.000000
381,247.0,11,8,104,362,0.287293,0.355769,72,33,0,71,33,0,0.685714,0.314286,0.000000,0.682692,0.317308,0.000000
382,247.0,11,8,104,362,0.287293,0.355769,72,33,0,71,33,0,0.685714,0.314286,0.000000,0.682692,0.317308,0.000000
383,626.0,11,11,18,362,0.049724,0.111111,0,18,0,0,18,0,0.000000,1.000000,0.000000,0.000000,1.000000,0.000000


# Number of Buys Per Brand

In [20]:
brand_buy = pd.DataFrame({'brand_buys': df.groupby(['brand_id'])['action_type'].apply(lambda x: (x == 2).sum())})
df_brand = df_brand.join(brand_buy, on=['brand_id'])
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,month_brand_male_count,month_brand_female_count,month_brand_unknown_gender_count,brand_male_count_ratio,brand_female_count_ratio,brand_unknown_count_ratio,brand_monthly_male_count_ratio,brand_monthly_female_count_ratio,brand_monthly_unknown_count_ratio,brand_buys
0,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,9,25,2,0.250000,0.694444,0.055556,0.250000,0.694444,0.055556,5
1,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,9,25,2,0.250000,0.694444,0.055556,0.250000,0.694444,0.055556,5
2,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,9,25,2,0.250000,0.694444,0.055556,0.250000,0.694444,0.055556,5
3,3462.0,11,10,36,362,0.099448,0.361111,9,25,2,9,25,2,0.250000,0.694444,0.055556,0.250000,0.694444,0.055556,5
4,3462.0,11,10,36,362,0.099448,0.361111,9,25,2,9,25,2,0.250000,0.694444,0.055556,0.250000,0.694444,0.055556,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,247.0,11,9,104,362,0.287293,0.355769,72,33,0,71,33,0,0.685714,0.314286,0.000000,0.682692,0.317308,0.000000,14
381,247.0,11,8,104,362,0.287293,0.355769,72,33,0,71,33,0,0.685714,0.314286,0.000000,0.682692,0.317308,0.000000,14
382,247.0,11,8,104,362,0.287293,0.355769,72,33,0,71,33,0,0.685714,0.314286,0.000000,0.682692,0.317308,0.000000,14
383,626.0,11,11,18,362,0.049724,0.111111,0,18,0,0,18,0,0.000000,1.000000,0.000000,0.000000,1.000000,0.000000,1


# Buy Ratio per Brand (Brand Buy Count / Brand Total Actions)

In [21]:
df_brand['brand_buy_ratio'] = df_brand['brand_buys'] / df_brand.groupby(['brand_id']).transform('size')
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,...,month_brand_female_count,month_brand_unknown_gender_count,brand_male_count_ratio,brand_female_count_ratio,brand_unknown_count_ratio,brand_monthly_male_count_ratio,brand_monthly_female_count_ratio,brand_monthly_unknown_count_ratio,brand_buys,brand_buy_ratio
0,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,25,2,0.250000,0.694444,0.055556,0.250000,0.694444,0.055556,5,0.138889
1,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,25,2,0.250000,0.694444,0.055556,0.250000,0.694444,0.055556,5,0.138889
2,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,25,2,0.250000,0.694444,0.055556,0.250000,0.694444,0.055556,5,0.138889
3,3462.0,11,10,36,362,0.099448,0.361111,9,25,2,...,25,2,0.250000,0.694444,0.055556,0.250000,0.694444,0.055556,5,0.138889
4,3462.0,11,10,36,362,0.099448,0.361111,9,25,2,...,25,2,0.250000,0.694444,0.055556,0.250000,0.694444,0.055556,5,0.138889
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,247.0,11,9,104,362,0.287293,0.355769,72,33,0,...,33,0,0.685714,0.314286,0.000000,0.682692,0.317308,0.000000,14,0.133333
381,247.0,11,8,104,362,0.287293,0.355769,72,33,0,...,33,0,0.685714,0.314286,0.000000,0.682692,0.317308,0.000000,14,0.133333
382,247.0,11,8,104,362,0.287293,0.355769,72,33,0,...,33,0,0.685714,0.314286,0.000000,0.682692,0.317308,0.000000,14,0.133333
383,626.0,11,11,18,362,0.049724,0.111111,0,18,0,...,18,0,0.000000,1.000000,0.000000,0.000000,1.000000,0.000000,1,0.055556


# Gender-Buy Count per Brand

In [22]:
brand_male_buy = df[df.action_type == 2].groupby(['brand_id'])['gender'].apply(lambda x: (x == 1).sum()) 
brand_female_buy = df[df.action_type == 2].groupby(['brand_id'])['gender'].apply(lambda x: (x == 0).sum())
brand_unknown_buy = df[df.action_type == 2].groupby(['brand_id'])['gender'].apply(lambda x: (x == 2).sum())
gender_buys = pd.DataFrame({'brand_male_buy_count': brand_male_buy, 'brand_female_buy_count': brand_female_buy, 'brand_unknown_buy_count': brand_unknown_buy})
df_brand = df_brand.join(gender_buys, on=['brand_id'])
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,...,brand_female_count_ratio,brand_unknown_count_ratio,brand_monthly_male_count_ratio,brand_monthly_female_count_ratio,brand_monthly_unknown_count_ratio,brand_buys,brand_buy_ratio,brand_male_buy_count,brand_female_buy_count,brand_unknown_buy_count
0,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,0.694444,0.055556,0.250000,0.694444,0.055556,5,0.138889,1.0,4.0,0.0
1,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,0.694444,0.055556,0.250000,0.694444,0.055556,5,0.138889,1.0,4.0,0.0
2,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,0.694444,0.055556,0.250000,0.694444,0.055556,5,0.138889,1.0,4.0,0.0
3,3462.0,11,10,36,362,0.099448,0.361111,9,25,2,...,0.694444,0.055556,0.250000,0.694444,0.055556,5,0.138889,1.0,4.0,0.0
4,3462.0,11,10,36,362,0.099448,0.361111,9,25,2,...,0.694444,0.055556,0.250000,0.694444,0.055556,5,0.138889,1.0,4.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,247.0,11,9,104,362,0.287293,0.355769,72,33,0,...,0.314286,0.000000,0.682692,0.317308,0.000000,14,0.133333,8.0,6.0,0.0
381,247.0,11,8,104,362,0.287293,0.355769,72,33,0,...,0.314286,0.000000,0.682692,0.317308,0.000000,14,0.133333,8.0,6.0,0.0
382,247.0,11,8,104,362,0.287293,0.355769,72,33,0,...,0.314286,0.000000,0.682692,0.317308,0.000000,14,0.133333,8.0,6.0,0.0
383,626.0,11,11,18,362,0.049724,0.111111,0,18,0,...,1.000000,0.000000,0.000000,1.000000,0.000000,1,0.055556,0.0,1.0,0.0


# Gender-Buy Ratio per Brand (Buy Count of a Gender / Total Buy Count for that Brand)

In [23]:
df_brand['brand_male_buy_ratio'] = df_brand['brand_male_buy_count'] / df_brand['brand_buys']
df_brand['brand_female_buy_ratio'] = df_brand['brand_female_buy_count'] / df_brand['brand_buys']
df_brand['brand_unknown_buy_ratio'] = df_brand['brand_unknown_buy_count'] / df_brand['brand_buys']
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,...,brand_monthly_female_count_ratio,brand_monthly_unknown_count_ratio,brand_buys,brand_buy_ratio,brand_male_buy_count,brand_female_buy_count,brand_unknown_buy_count,brand_male_buy_ratio,brand_female_buy_ratio,brand_unknown_buy_ratio
0,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,0.694444,0.055556,5,0.138889,1.0,4.0,0.0,0.200000,0.800000,0.0
1,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,0.694444,0.055556,5,0.138889,1.0,4.0,0.0,0.200000,0.800000,0.0
2,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,0.694444,0.055556,5,0.138889,1.0,4.0,0.0,0.200000,0.800000,0.0
3,3462.0,11,10,36,362,0.099448,0.361111,9,25,2,...,0.694444,0.055556,5,0.138889,1.0,4.0,0.0,0.200000,0.800000,0.0
4,3462.0,11,10,36,362,0.099448,0.361111,9,25,2,...,0.694444,0.055556,5,0.138889,1.0,4.0,0.0,0.200000,0.800000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,247.0,11,9,104,362,0.287293,0.355769,72,33,0,...,0.317308,0.000000,14,0.133333,8.0,6.0,0.0,0.571429,0.428571,0.0
381,247.0,11,8,104,362,0.287293,0.355769,72,33,0,...,0.317308,0.000000,14,0.133333,8.0,6.0,0.0,0.571429,0.428571,0.0
382,247.0,11,8,104,362,0.287293,0.355769,72,33,0,...,0.317308,0.000000,14,0.133333,8.0,6.0,0.0,0.571429,0.428571,0.0
383,626.0,11,11,18,362,0.049724,0.111111,0,18,0,...,1.000000,0.000000,1,0.055556,0.0,1.0,0.0,0.000000,1.000000,0.0


# Resolving NaNs: When brand_buys is 0, math results in some NaNs for ratios 

In [24]:
# Visualize rows with NaNs
nan_mask = df_brand.isna().any(axis=1)
nan_rows = df_brand[nan_mask]
nan_rows

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,...,brand_monthly_female_count_ratio,brand_monthly_unknown_count_ratio,brand_buys,brand_buy_ratio,brand_male_buy_count,brand_female_buy_count,brand_unknown_buy_count,brand_male_buy_ratio,brand_female_buy_ratio,brand_unknown_buy_ratio
35,2350.0,11,8,6,362,0.016575,0.0,0,6,0,...,1.0,0.0,0,0.0,,,,,,
36,2350.0,11,8,6,362,0.016575,0.0,0,6,0,...,1.0,0.0,0,0.0,,,,,,
39,2350.0,11,5,6,362,0.016575,0.0,0,6,0,...,1.0,0.0,0,0.0,,,,,,
48,1905.0,5,20,1,8,0.125,0.0,0,1,1,...,1.0,0.0,0,0.0,,,,,,
62,5491.0,11,3,3,362,0.008287,0.0,0,4,0,...,1.0,0.0,0,0.0,,,,,,
127,777.0,11,11,1,362,0.002762,0.0,0,1,0,...,1.0,0.0,0,0.0,,,,,,
139,1097.0,11,10,3,362,0.008287,0.0,0,3,0,...,1.0,0.0,0,0.0,,,,,,
143,3931.0,11,11,1,362,0.002762,0.0,0,1,0,...,1.0,0.0,0,0.0,,,,,,
165,1446.0,5,28,1,8,0.125,0.0,0,2,0,...,1.0,0.0,0,0.0,,,,,,
192,7892.0,8,4,5,5,1.0,0.0,5,0,0,...,0.0,0.0,0,0.0,,,,,,


In [25]:
# If Brand_Buys is 0, the previous method generates NaNs. Set these NaNs to 0
condition = df_brand['brand_buys'] == 0
df_brand.loc[condition, ['brand_male_buy_count', 'brand_female_buy_count', 'brand_unknown_buy_count', 'brand_male_buy_ratio', 'brand_female_buy_ratio', 'brand_unknown_buy_ratio']] = 0
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,...,brand_monthly_female_count_ratio,brand_monthly_unknown_count_ratio,brand_buys,brand_buy_ratio,brand_male_buy_count,brand_female_buy_count,brand_unknown_buy_count,brand_male_buy_ratio,brand_female_buy_ratio,brand_unknown_buy_ratio
0,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,0.694444,0.055556,5,0.138889,1.0,4.0,0.0,0.200000,0.800000,0.0
1,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,0.694444,0.055556,5,0.138889,1.0,4.0,0.0,0.200000,0.800000,0.0
2,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,0.694444,0.055556,5,0.138889,1.0,4.0,0.0,0.200000,0.800000,0.0
3,3462.0,11,10,36,362,0.099448,0.361111,9,25,2,...,0.694444,0.055556,5,0.138889,1.0,4.0,0.0,0.200000,0.800000,0.0
4,3462.0,11,10,36,362,0.099448,0.361111,9,25,2,...,0.694444,0.055556,5,0.138889,1.0,4.0,0.0,0.200000,0.800000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,247.0,11,9,104,362,0.287293,0.355769,72,33,0,...,0.317308,0.000000,14,0.133333,8.0,6.0,0.0,0.571429,0.428571,0.0
381,247.0,11,8,104,362,0.287293,0.355769,72,33,0,...,0.317308,0.000000,14,0.133333,8.0,6.0,0.0,0.571429,0.428571,0.0
382,247.0,11,8,104,362,0.287293,0.355769,72,33,0,...,0.317308,0.000000,14,0.133333,8.0,6.0,0.0,0.571429,0.428571,0.0
383,626.0,11,11,18,362,0.049724,0.111111,0,18,0,...,1.000000,0.000000,1,0.055556,0.0,1.0,0.0,0.000000,1.000000,0.0


# Check for unaccounted NaNs

In [29]:
print(sum(df_brand.isna().sum()))

0

# Age-Related

# AgeGroup Counts per Brand

In [27]:
# For Other Columns, modify the groupby condition
ageGroup_0_counts = df.groupby(['brand_id'])['age_range'].apply(lambda x: (x == 0).sum())
ageGroup_1_counts = df.groupby(['brand_id'])['age_range'].apply(lambda x: (x == 1).sum())
ageGroup_2_counts = df.groupby(['brand_id'])['age_range'].apply(lambda x: (x == 2).sum())
ageGroup_3_counts = df.groupby(['brand_id'])['age_range'].apply(lambda x: (x == 3).sum())
ageGroup_4_counts = df.groupby(['brand_id'])['age_range'].apply(lambda x: (x == 4).sum())
ageGroup_5_counts = df.groupby(['brand_id'])['age_range'].apply(lambda x: (x == 5).sum())
ageGroup_6_counts = df.groupby(['brand_id'])['age_range'].apply(lambda x: (x == 6).sum())
ageGroup_7_counts = df.groupby(['brand_id'])['age_range'].apply(lambda x: (x == 7).sum())
ageGroup_8_counts = df.groupby(['brand_id'])['age_range'].apply(lambda x: (x == 8).sum())
gender_total_brand_counts = pd.DataFrame({'ageGroup_1_counts': ageGroup_1_counts,'ageGroup_2_counts': ageGroup_2_counts,'ageGroup_3_counts': ageGroup_3_counts,'ageGroup_4_counts': ageGroup_4_counts,'ageGroup_5_counts': ageGroup_5_counts,
                                          'ageGroup_6_counts': ageGroup_6_counts,'ageGroup_7_counts': ageGroup_7_counts,'ageGroup_8_counts': ageGroup_8_counts, 'ageGroup_0_counts': ageGroup_0_counts})
df_brand = df_brand.join(gender_total_brand_counts, on='brand_id')
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,...,brand_unknown_buy_ratio,ageGroup_1_counts,ageGroup_2_counts,ageGroup_3_counts,ageGroup_4_counts,ageGroup_5_counts,ageGroup_6_counts,ageGroup_7_counts,ageGroup_8_counts,ageGroup_0_counts
0,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,0.0,0,2,1,10,7,8,0,0,8
1,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,0.0,0,2,1,10,7,8,0,0,8
2,3462.0,11,11,36,362,0.099448,0.361111,9,25,2,...,0.0,0,2,1,10,7,8,0,0,8
3,3462.0,11,10,36,362,0.099448,0.361111,9,25,2,...,0.0,0,2,1,10,7,8,0,0,8
4,3462.0,11,10,36,362,0.099448,0.361111,9,25,2,...,0.0,0,2,1,10,7,8,0,0,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,247.0,11,9,104,362,0.287293,0.355769,72,33,0,...,0.0,0,25,24,35,1,3,0,0,17
381,247.0,11,8,104,362,0.287293,0.355769,72,33,0,...,0.0,0,25,24,35,1,3,0,0,17
382,247.0,11,8,104,362,0.287293,0.355769,72,33,0,...,0.0,0,25,24,35,1,3,0,0,17
383,626.0,11,11,18,362,0.049724,0.111111,0,18,0,...,0.0,0,10,0,6,0,0,0,0,2


In [33]:
#FEATURE 1: OVERALL ACTION COUNT/RATIO
actions = {}
for index, row in df.iterrows():
    if row['brand_id'] not in actions:
        # there may be multiple entries, so default with an empty list
        actions[row["brand_id"]] = [] 
    actions[row["brand_id"]].append(row["action_type"])

In [34]:
actions

{3462.0: [0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  3.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 247.0: [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  2.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  3.0,
  3.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  3.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.

In [35]:
action_count = {}
for key in actions.keys():
    action_count[key] = len(actions[key])
action_count

{3462.0: 36,
 247.0: 105,
 5380.0: 19,
 683.0: 85,
 2350.0: 6,
 6208.0: 21,
 1905.0: 2,
 6230.0: 2,
 7924.0: 12,
 5491.0: 4,
 626.0: 18,
 777.0: 1,
 7936.0: 4,
 1097.0: 3,
 3931.0: 1,
 1446.0: 2,
 1246.0: 30,
 7892.0: 5,
 5946.0: 6,
 4631.0: 9,
 7989.0: 1,
 5738.0: 2,
 2276.0: 7,
 3654.0: 1,
 6590.0: 1,
 8040.0: 1,
 7371.0: 1}

In [36]:
total_action_count = sum(action_count.values())
total_action_count

385

In [37]:
action_count_ratio = {}
for key in action_count:
    action_count_ratio[key] = action_count[key]/total_action_count
action_count_ratio

{3462.0: 0.09350649350649351,
 247.0: 0.2727272727272727,
 5380.0: 0.04935064935064935,
 683.0: 0.22077922077922077,
 2350.0: 0.015584415584415584,
 6208.0: 0.05454545454545454,
 1905.0: 0.005194805194805195,
 6230.0: 0.005194805194805195,
 7924.0: 0.03116883116883117,
 5491.0: 0.01038961038961039,
 626.0: 0.046753246753246755,
 777.0: 0.0025974025974025974,
 7936.0: 0.01038961038961039,
 1097.0: 0.007792207792207792,
 3931.0: 0.0025974025974025974,
 1446.0: 0.005194805194805195,
 1246.0: 0.07792207792207792,
 7892.0: 0.012987012987012988,
 5946.0: 0.015584415584415584,
 4631.0: 0.023376623376623377,
 7989.0: 0.0025974025974025974,
 5738.0: 0.005194805194805195,
 2276.0: 0.01818181818181818,
 3654.0: 0.0025974025974025974,
 6590.0: 0.0025974025974025974,
 8040.0: 0.0025974025974025974,
 7371.0: 0.0025974025974025974}

In [38]:
df_brand['brand action count'] = -1
df_brand['brand action count ratio'] = -1
for index,row in df_brand.iterrows():
        df_brand.at[index, 'brand action count'] = action_count[row['brand_id']]
        df_brand.at[index, 'brand action count ratio'] = action_count_ratio[row['brand_id']]

df_brand

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brand['brand action count'] = -1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brand['brand action count ratio'] = -1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brand.at[index, 'brand action count ratio'] = action_count_ratio[row['brand_id']]


Unnamed: 0,brand_id,brand action count,brand action count ratio,brand day count,item 1111 action count,item 1111 action count ratio,brand 1111 action count,brand 1111 action count ratio,brand 1111 activity ratio
0,3462.0,36,0.093506,4,0,0,0,0,0
1,3462.0,36,0.093506,4,0,0,0,0,0
2,3462.0,36,0.093506,4,0,0,0,0,0
3,3462.0,36,0.093506,4,0,0,0,0,0
4,3462.0,36,0.093506,4,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
380,247.0,105,0.272727,9,0,0,0,0,0
381,247.0,105,0.272727,9,0,0,0,0,0
382,247.0,105,0.272727,9,0,0,0,0,0
383,626.0,18,0.046753,2,0,0,0,0,0


In [39]:
#FEATURE 2: OVERALL DAY COUNT
days = {}

# go through user_seller sub-dataframe and split entries by item_id:seller_id
for index, row in df.iterrows():
    if row['brand_id'] not in days:
        # there may be multiple entries, so default with an empty list
        days[row["brand_id"]] = [] 
    days[row["brand_id"]].append(row["time_stamp"])

In [40]:
for k,v in days.items():
    days[k] = list(set(v)) # make each value list unique

days

{3462.0: [1105.0, 1109.0, 1110.0, 1111.0],
 247.0: [1026.0,
  1101.0,
  1103.0,
  1106.0,
  1107.0,
  1108.0,
  1109.0,
  1110.0,
  1111.0],
 5380.0: [1103.0, 1104.0, 1105.0, 1107.0, 1108.0, 1109.0, 1111.0],
 683.0: [1030.0,
  617.0,
  522.0,
  524.0,
  529.0,
  1105.0,
  1107.0,
  1108.0,
  1109.0,
  1110.0,
  1111.0,
  603.0],
 2350.0: [1105.0, 1108.0, 1109.0, 1111.0],
 6208.0: [1105.0, 1109.0, 1111.0],
 1905.0: [520.0, 1110.0],
 6230.0: [1111.0],
 7924.0: [1108.0, 1109.0, 1111.0],
 5491.0: [1108.0, 1022.0, 1103.0],
 626.0: [1110.0, 1111.0],
 777.0: [1111.0],
 7936.0: [1111.0],
 1097.0: [1110.0, 1111.0],
 3931.0: [1111.0],
 1446.0: [528.0, 601.0],
 1246.0: [1103.0, 1105.0, 1106.0, 1108.0, 1109.0, 1110.0, 1111.0],
 7892.0: [801.0, 804.0, 806.0],
 5946.0: [1111.0],
 4631.0: [1026.0, 1101.0, 1110.0, 1111.0],
 7989.0: [627.0],
 5738.0: [1110.0],
 2276.0: [1110.0, 1111.0],
 3654.0: [1110.0],
 6590.0: [1001.0],
 8040.0: [1111.0],
 7371.0: [1109.0]}

In [41]:
day_count = {}
for key in days.keys():
    day_count[key] = len(days[key])
day_count

{3462.0: 4,
 247.0: 9,
 5380.0: 7,
 683.0: 12,
 2350.0: 4,
 6208.0: 3,
 1905.0: 2,
 6230.0: 1,
 7924.0: 3,
 5491.0: 3,
 626.0: 2,
 777.0: 1,
 7936.0: 1,
 1097.0: 2,
 3931.0: 1,
 1446.0: 2,
 1246.0: 7,
 7892.0: 3,
 5946.0: 1,
 4631.0: 4,
 7989.0: 1,
 5738.0: 1,
 2276.0: 2,
 3654.0: 1,
 6590.0: 1,
 8040.0: 1,
 7371.0: 1}

In [42]:
df_brand['brand day count'] = -1
for index,row in df_brand.iterrows():
        df_brand.at[index, 'brand day count'] = day_count[row['brand_id']]

df_brand

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brand['brand day count'] = -1


Unnamed: 0,brand_id,brand action count,brand action count ratio,brand day count,item 1111 action count,item 1111 action count ratio,brand 1111 action count,brand 1111 action count ratio,brand 1111 activity ratio
0,3462.0,36,0.093506,4,0,0,0,0,0
1,3462.0,36,0.093506,4,0,0,0,0,0
2,3462.0,36,0.093506,4,0,0,0,0,0
3,3462.0,36,0.093506,4,0,0,0,0,0
4,3462.0,36,0.093506,4,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
380,247.0,105,0.272727,9,0,0,0,0,0
381,247.0,105,0.272727,9,0,0,0,0,0
382,247.0,105,0.272727,9,0,0,0,0,0
383,626.0,18,0.046753,2,0,0,0,0,0


In [43]:
#DOUBLE 11 FEATURES
df_11 = df[df['time_stamp']==1111]
df_11

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type,age_range,gender,label
0,379824,198,656,145,3462.0,1111,0,5.0,1.0,0
1,379824,198,656,145,3462.0,1111,0,5.0,1.0,0
2,379824,198,656,145,3462.0,1111,2,5.0,1.0,0
9,141307,175,1181,4760,247.0,1111,0,4.0,1.0,0
10,141307,175,1181,4760,247.0,1111,0,4.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...
375,289079,279,898,3323,683.0,1111,0,4.0,1.0,1
377,403117,175,1181,4760,247.0,1111,0,2.0,1.0,0
378,36385,219,349,1943,6208.0,1111,2,0.0,0.0,0
379,36385,219,349,1943,6208.0,1111,0,0.0,0.0,0


In [44]:
actions_11 = {}

# go through user_seller sub-dataframe and split entries by item_id:seller_id
for index, row in df_11.iterrows():
    if row['brand_id'] not in actions_11:
        # there may be multiple entries, so default with an empty list
        actions_11[row["brand_id"]] = [] 
    actions_11[row["brand_id"]].append(row["action_type"])

actions_11

{3462.0: [0.0,
  0.0,
  2.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  3.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0],
 247.0: [0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  2.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  2.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0],
 5380.0: [2.0, 0.0, 0.0, 2.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0],
 683.0: [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,


In [45]:
action_count_11 = {}
for key in actions_11.keys():
    action_count_11[key] = len(actions_11[key])
action_count_11

{3462.0: 30,
 247.0: 67,
 5380.0: 11,
 683.0: 46,
 6208.0: 17,
 6230.0: 2,
 7924.0: 10,
 626.0: 13,
 777.0: 1,
 7936.0: 4,
 3931.0: 1,
 1246.0: 12,
 5946.0: 6,
 4631.0: 3,
 1097.0: 1,
 2350.0: 1,
 2276.0: 2,
 8040.0: 1}

In [46]:
total_action_count_11 = sum(action_count_11.values())
total_action_count_11

228

In [47]:
action_count_ratio_11 = {}
for key in action_count_11:
    action_count_ratio_11[key] = action_count_11[key]/total_action_count_11
action_count_ratio_11

{3462.0: 0.13157894736842105,
 247.0: 0.29385964912280704,
 5380.0: 0.04824561403508772,
 683.0: 0.20175438596491227,
 6208.0: 0.07456140350877193,
 6230.0: 0.008771929824561403,
 7924.0: 0.043859649122807015,
 626.0: 0.05701754385964912,
 777.0: 0.0043859649122807015,
 7936.0: 0.017543859649122806,
 3931.0: 0.0043859649122807015,
 1246.0: 0.05263157894736842,
 5946.0: 0.02631578947368421,
 4631.0: 0.013157894736842105,
 1097.0: 0.0043859649122807015,
 2350.0: 0.0043859649122807015,
 2276.0: 0.008771929824561403,
 8040.0: 0.0043859649122807015}

In [48]:
df_brand['brand 1111 action count'] = 0
df_brand['brand 1111 action count ratio'] = 0
for index,row in df_brand.iterrows():
        if row['brand_id'] in action_count_11.keys():
            df_brand.at[index, 'brand 1111 action count'] = action_count_11[row['brand_id']]
        if row['brand_id'] in action_count_ratio_11.keys():
            df_brand.at[index, 'brand 1111 action count ratio'] = action_count_ratio_11[row['brand_id']]

df_brand

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brand['brand 1111 action count'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brand['brand 1111 action count ratio'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brand.at[index, 'brand 1111 action count ratio'] = action_count_ratio_11[row['brand_id']]


Unnamed: 0,brand_id,brand action count,brand action count ratio,brand day count,item 1111 action count,item 1111 action count ratio,brand 1111 action count,brand 1111 action count ratio,brand 1111 activity ratio
0,3462.0,36,0.093506,4,0,0,30,0.131579,0
1,3462.0,36,0.093506,4,0,0,30,0.131579,0
2,3462.0,36,0.093506,4,0,0,30,0.131579,0
3,3462.0,36,0.093506,4,0,0,30,0.131579,0
4,3462.0,36,0.093506,4,0,0,30,0.131579,0
...,...,...,...,...,...,...,...,...,...
380,247.0,105,0.272727,9,0,0,67,0.293860,0
381,247.0,105,0.272727,9,0,0,67,0.293860,0
382,247.0,105,0.272727,9,0,0,67,0.293860,0
383,626.0,18,0.046753,2,0,0,13,0.057018,0


In [51]:
activity_ratio_1111 = {}
df_brand['brand 1111 activity ratio'] = 0
for index,row in df_brand.iterrows():
    if df_brand.at[index,'brand 1111 action count'] != 0:
        df_brand.at[index, 'brand 1111 activity ratio'] = (df_brand.at[index,'brand 1111 action count']/df_brand.at[index,'brand action count'])
df_brand

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brand['brand 1111 activity ratio'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brand.at[index, 'brand 1111 activity ratio'] = (df_brand.at[index,'brand 1111 action count']/df_brand.at[index,'brand action count'])


Unnamed: 0,brand_id,brand action count,brand action count ratio,brand day count,brand 1111 action count,brand 1111 action count ratio,brand 1111 activity ratio
0,3462.0,36,0.093506,4,30,0.131579,0.833333
1,3462.0,36,0.093506,4,30,0.131579,0.833333
2,3462.0,36,0.093506,4,30,0.131579,0.833333
3,3462.0,36,0.093506,4,30,0.131579,0.833333
4,3462.0,36,0.093506,4,30,0.131579,0.833333
...,...,...,...,...,...,...,...
380,247.0,105,0.272727,9,67,0.293860,0.638095
381,247.0,105,0.272727,9,67,0.293860,0.638095
382,247.0,105,0.272727,9,67,0.293860,0.638095
383,626.0,18,0.046753,2,13,0.057018,0.722222


In [52]:
#FEATURE 10: LATEST ONE-WEEK
latest_week = 1104
df_latest_week = df[df['time_stamp']>=latest_week]
df_latest_week

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type,age_range,gender,label
0,379824,198,656,145,3462.0,1111,0,5.0,1.0,0
1,379824,198,656,145,3462.0,1111,0,5.0,1.0,0
2,379824,198,656,145,3462.0,1111,2,5.0,1.0,0
3,379824,198,656,145,3462.0,1110,0,5.0,1.0,0
4,379824,198,656,145,3462.0,1110,0,5.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...
380,122632,175,1181,4760,247.0,1109,0,3.0,0.0,0
381,122632,175,1181,4760,247.0,1108,0,3.0,0.0,0
382,122632,175,1181,4760,247.0,1108,0,3.0,0.0,0
383,95362,253,962,3263,626.0,1111,0,0.0,0.0,0


In [53]:
actions_last_week = {}

# go through user_seller sub-dataframe and split entries by item_id:seller_id
for index, row in df_latest_week.iterrows():
    if row['brand_id'] not in actions_last_week:
        # there may be multiple entries, so default with an empty list
        actions_last_week[row["brand_id"]] = [] 
    actions_last_week[row["brand_id"]].append(row["action_type"])
    
#for k,v in actions_last_week.items():
#    actions_last_week[k] = list(set(v)) # make each value list unique
actions_last_week

{3462.0: [0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  3.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 247.0: [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  2.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  3.0,
  3.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  3.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 5380.0: [0.0,
  0.0,
  

In [54]:
action_count_last_week = {}
for key in actions_last_week.keys():
    action_count_last_week[key] = len(actions_last_week[key])
action_count_last_week

{3462.0: 36,
 247.0: 101,
 5380.0: 18,
 683.0: 75,
 2350.0: 6,
 6208.0: 21,
 6230.0: 2,
 7924.0: 12,
 626.0: 18,
 777.0: 1,
 7936.0: 4,
 1097.0: 3,
 3931.0: 1,
 1246.0: 29,
 1905.0: 1,
 5946.0: 6,
 4631.0: 7,
 5738.0: 2,
 2276.0: 7,
 3654.0: 1,
 5491.0: 2,
 8040.0: 1,
 7371.0: 1}

In [55]:
total_action_count_last_week = sum(action_count_last_week.values())
total_action_count_last_week


355

In [56]:
action_count_ratio_last_week = {}
for key in action_count_last_week:
    action_count_ratio_last_week[key] = action_count_last_week[key]/total_action_count_last_week
action_count_ratio_last_week

{3462.0: 0.10140845070422536,
 247.0: 0.28450704225352114,
 5380.0: 0.05070422535211268,
 683.0: 0.2112676056338028,
 2350.0: 0.016901408450704224,
 6208.0: 0.059154929577464786,
 6230.0: 0.005633802816901409,
 7924.0: 0.03380281690140845,
 626.0: 0.05070422535211268,
 777.0: 0.0028169014084507044,
 7936.0: 0.011267605633802818,
 1097.0: 0.008450704225352112,
 3931.0: 0.0028169014084507044,
 1246.0: 0.08169014084507042,
 1905.0: 0.0028169014084507044,
 5946.0: 0.016901408450704224,
 4631.0: 0.01971830985915493,
 5738.0: 0.005633802816901409,
 2276.0: 0.01971830985915493,
 3654.0: 0.0028169014084507044,
 5491.0: 0.005633802816901409,
 8040.0: 0.0028169014084507044,
 7371.0: 0.0028169014084507044}

In [60]:
df_brand['brand last week action count'] = 0
df_brand['brand last week action count ratio'] = 0
for index,row in df_brand.iterrows():
        if row['brand_id'] in action_count_last_week.keys():
            df_brand.at[index, 'brand last week action count'] = action_count_last_week[row['brand_id']]
        if row['brand_id'] in action_count_ratio_last_week.keys():
            df_brand.at[index, 'brand last week action count ratio'] = action_count_ratio_last_week[row['brand_id']]

df_brand

Unnamed: 0,brand_id,brand action count,brand action count ratio,brand day count,brand 1111 action count,brand 1111 action count ratio,brand 1111 activity ratio,item last week action count,item last week action count ratio,brand last week activity ratio,brand last week action count,brand last week action count ratio
0,3462.0,36,0.093506,4,30,0.131579,0.833333,36,0.101408,0,36,0.101408
1,3462.0,36,0.093506,4,30,0.131579,0.833333,36,0.101408,0,36,0.101408
2,3462.0,36,0.093506,4,30,0.131579,0.833333,36,0.101408,0,36,0.101408
3,3462.0,36,0.093506,4,30,0.131579,0.833333,36,0.101408,0,36,0.101408
4,3462.0,36,0.093506,4,30,0.131579,0.833333,36,0.101408,0,36,0.101408
...,...,...,...,...,...,...,...,...,...,...,...,...
380,247.0,105,0.272727,9,67,0.293860,0.638095,101,0.284507,0,101,0.284507
381,247.0,105,0.272727,9,67,0.293860,0.638095,101,0.284507,0,101,0.284507
382,247.0,105,0.272727,9,67,0.293860,0.638095,101,0.284507,0,101,0.284507
383,626.0,18,0.046753,2,13,0.057018,0.722222,18,0.050704,0,18,0.050704


In [63]:
activity_ratio_last_week = {}
df_brand['brand last week activity ratio'] = 0
for index,row in df_brand.iterrows():
    if df_brand.at[index,'brand last week action count'] != 0:
        df_brand.at[index, 'brand last week activity ratio'] = (df_brand.at[index,'brand last week action count']/df_brand.at[index,'brand action count'])
df_brand

Unnamed: 0,brand_id,brand action count,brand action count ratio,brand day count,brand 1111 action count,brand 1111 action count ratio,brand 1111 activity ratio,brand last week activity ratio,brand last week action count,brand last week action count ratio
0,3462.0,36,0.093506,4,30,0.131579,0.833333,1.000000,36,0.101408
1,3462.0,36,0.093506,4,30,0.131579,0.833333,1.000000,36,0.101408
2,3462.0,36,0.093506,4,30,0.131579,0.833333,1.000000,36,0.101408
3,3462.0,36,0.093506,4,30,0.131579,0.833333,1.000000,36,0.101408
4,3462.0,36,0.093506,4,30,0.131579,0.833333,1.000000,36,0.101408
...,...,...,...,...,...,...,...,...,...,...
380,247.0,105,0.272727,9,67,0.293860,0.638095,0.961905,101,0.284507
381,247.0,105,0.272727,9,67,0.293860,0.638095,0.961905,101,0.284507
382,247.0,105,0.272727,9,67,0.293860,0.638095,0.961905,101,0.284507
383,626.0,18,0.046753,2,13,0.057018,0.722222,1.000000,18,0.050704


In [72]:
#FEATURE 11: LATEST MONTH
latest_month = 1011
df_latest_month = df[df['time_stamp']>=latest_month]
df_latest_month

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type,age_range,gender,label
0,379824,198,656,145,3462.0,1111,0,5.0,1.0,0
1,379824,198,656,145,3462.0,1111,0,5.0,1.0,0
2,379824,198,656,145,3462.0,1111,2,5.0,1.0,0
3,379824,198,656,145,3462.0,1110,0,5.0,1.0,0
4,379824,198,656,145,3462.0,1110,0,5.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...
380,122632,175,1181,4760,247.0,1109,0,3.0,0.0,0
381,122632,175,1181,4760,247.0,1108,0,3.0,0.0,0
382,122632,175,1181,4760,247.0,1108,0,3.0,0.0,0
383,95362,253,962,3263,626.0,1111,0,0.0,0.0,0


In [80]:
actions_last_month = {}

# go through user_seller sub-dataframe and split entries by user_id:seller_id
for index, row in df_latest_month.iterrows():
    if row['brand_id'] not in actions_last_month:
        # there may be multiple entries, so default with an empty list
        actions_last_month[row["brand_id"]] = [] 
    actions_last_month[row["brand_id"]].append(row["action_type"])
    
#for k,v in actions_last_month.items():
#    actions_last_month[k] = list(set(v)) # make each value list unique
actions_last_month

{3462.0: [0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  3.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 247.0: [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  2.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  3.0,
  3.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  3.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.

In [81]:
action_count_last_month = {}
for key in actions_last_month.keys():
    action_count_last_month[key] = len(actions_last_month[key])
action_count_last_month

{3462.0: 36,
 247.0: 105,
 5380.0: 19,
 683.0: 76,
 2350.0: 6,
 6208.0: 21,
 6230.0: 2,
 7924.0: 12,
 5491.0: 4,
 626.0: 18,
 777.0: 1,
 7936.0: 4,
 1097.0: 3,
 3931.0: 1,
 1246.0: 30,
 1905.0: 1,
 5946.0: 6,
 4631.0: 9,
 5738.0: 2,
 2276.0: 7,
 3654.0: 1,
 8040.0: 1,
 7371.0: 1}

In [82]:
total_action_count_last_month = sum(action_count_last_month.values())
total_action_count_last_month

366

In [83]:
action_count_ratio_last_month = {}
for key in action_count_last_month:
    action_count_ratio_last_month[key] = action_count_last_month[key]/total_action_count_last_month
action_count_ratio_last_month

{3462.0: 0.09836065573770492,
 247.0: 0.28688524590163933,
 5380.0: 0.05191256830601093,
 683.0: 0.20765027322404372,
 2350.0: 0.01639344262295082,
 6208.0: 0.05737704918032787,
 6230.0: 0.00546448087431694,
 7924.0: 0.03278688524590164,
 5491.0: 0.01092896174863388,
 626.0: 0.04918032786885246,
 777.0: 0.00273224043715847,
 7936.0: 0.01092896174863388,
 1097.0: 0.00819672131147541,
 3931.0: 0.00273224043715847,
 1246.0: 0.08196721311475409,
 1905.0: 0.00273224043715847,
 5946.0: 0.01639344262295082,
 4631.0: 0.02459016393442623,
 5738.0: 0.00546448087431694,
 2276.0: 0.01912568306010929,
 3654.0: 0.00273224043715847,
 8040.0: 0.00273224043715847,
 7371.0: 0.00273224043715847}

In [84]:
df_brand['last month action count'] = 0
df_brand['last month action count ratio'] = 0
for index,row in df_brand.iterrows():
        if row['brand_id'] in action_count_last_month.keys():
            df_brand.at[index, 'last month action count'] = action_count_last_month[row['brand_id']]
        if row['brand_id'] in action_count_ratio_last_month.keys():
            df_brand.at[index, 'last month action count ratio'] = action_count_ratio_last_month[row['brand_id']]

df_brand

Unnamed: 0,brand_id,brand action count,brand action count ratio,brand day count,brand 1111 action count,brand 1111 action count ratio,brand 1111 activity ratio,brand last week activity ratio,brand last week action count,brand last week action count ratio,last month action count,last month action count ratio
0,3462.0,36,0.093506,4,30,0.131579,0.833333,1.000000,36,0.101408,36,0.098361
1,3462.0,36,0.093506,4,30,0.131579,0.833333,1.000000,36,0.101408,36,0.098361
2,3462.0,36,0.093506,4,30,0.131579,0.833333,1.000000,36,0.101408,36,0.098361
3,3462.0,36,0.093506,4,30,0.131579,0.833333,1.000000,36,0.101408,36,0.098361
4,3462.0,36,0.093506,4,30,0.131579,0.833333,1.000000,36,0.101408,36,0.098361
...,...,...,...,...,...,...,...,...,...,...,...,...
380,247.0,105,0.272727,9,67,0.293860,0.638095,0.961905,101,0.284507,105,0.286885
381,247.0,105,0.272727,9,67,0.293860,0.638095,0.961905,101,0.284507,105,0.286885
382,247.0,105,0.272727,9,67,0.293860,0.638095,0.961905,101,0.284507,105,0.286885
383,626.0,18,0.046753,2,13,0.057018,0.722222,1.000000,18,0.050704,18,0.049180


In [89]:
activity_ratio_last_month = {}
df_brand['last month activity ratio'] = 0
for index,row in df_brand.iterrows():
    if df_brand.at[index,'last month action count'] != 0:
        df_brand.at[index, 'last month activity ratio'] = (df_brand.at[index,'last month action count']/df_brand.at[index,'brand action count'])
df_brand

Unnamed: 0,brand_id,brand action count,brand action count ratio,brand day count,brand 1111 action count,brand 1111 action count ratio,brand 1111 activity ratio,brand last week activity ratio,brand last week action count,brand last week action count ratio,last month action count,last month action count ratio,last month activity ratio
0,3462.0,36,0.093506,4,30,0.131579,0.833333,1.000000,36,0.101408,36,0.098361,1.0
1,3462.0,36,0.093506,4,30,0.131579,0.833333,1.000000,36,0.101408,36,0.098361,1.0
2,3462.0,36,0.093506,4,30,0.131579,0.833333,1.000000,36,0.101408,36,0.098361,1.0
3,3462.0,36,0.093506,4,30,0.131579,0.833333,1.000000,36,0.101408,36,0.098361,1.0
4,3462.0,36,0.093506,4,30,0.131579,0.833333,1.000000,36,0.101408,36,0.098361,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,247.0,105,0.272727,9,67,0.293860,0.638095,0.961905,101,0.284507,105,0.286885,1.0
381,247.0,105,0.272727,9,67,0.293860,0.638095,0.961905,101,0.284507,105,0.286885,1.0
382,247.0,105,0.272727,9,67,0.293860,0.638095,0.961905,101,0.284507,105,0.286885,1.0
383,626.0,18,0.046753,2,13,0.057018,0.722222,1.000000,18,0.050704,18,0.049180,1.0


In [90]:
print("TOTAL ACTIONS: ", total_action_count)
print("TOTAL ACTIONS LATEST MONTH: ",total_action_count_last_month)
print("TOTAL ACTIONS LATEST WEEK: ",total_action_count_last_week)
print("TOTAL ACTIONS 1111: ",total_action_count_11)

TOTAL ACTIONS:  385
TOTAL ACTIONS LATEST MONTH:  366
TOTAL ACTIONS LATEST WEEK:  355
TOTAL ACTIONS 1111:  228


In [92]:
#FEATURE 18: AGE RELATED FEATURES
brand_age = {}

# go through user_seller sub-dataframe and split entries by user_id:seller_id
for index, row in df.iterrows():
    if row['brand_id'] not in brand_age:
        # there may be multiple entries, so default with an empty list
        brand_age[row["brand_id"]] = [] 
    brand_age[row["brand_id"]].append(row["age_range"])

brand_age_avg = {}
for row in brand_age.items():
    brand_age_avg[row[0]] = np.nansum(row[1])/len(row[1])
sorted(brand_age_avg.items())

[(247.0, 2.7142857142857144),
 (626.0, 2.4444444444444446),
 (683.0, 2.8705882352941177),
 (777.0, 4.0),
 (1097.0, 1.3333333333333333),
 (1246.0, 2.4),
 (1446.0, 2.0),
 (1905.0, 4.5),
 (2276.0, 1.1428571428571428),
 (2350.0, 3.0),
 (3462.0, 3.611111111111111),
 (3654.0, 0.0),
 (3931.0, 3.0),
 (4631.0, 0.5555555555555556),
 (5380.0, 4.0),
 (5491.0, 4.0),
 (5738.0, 3.0),
 (5946.0, 4.0),
 (6208.0, 3.238095238095238),
 (6230.0, 2.0),
 (6590.0, 6.0),
 (7371.0, 2.0),
 (7892.0, 3.2),
 (7924.0, 2.3333333333333335),
 (7936.0, 5.0),
 (7989.0, 3.0),
 (8040.0, 0.0)]

In [93]:
total_brand_average_age = sum(brand_age_avg.values())/len(brand_age_avg.keys())
print("Average age total: ",total_brand_average_age)

Average age total:  2.790503855863333


In [95]:
df_brand['average age'] = 0
for index,row in df_brand.iterrows():
        if row['brand_id'] in brand_age_avg.keys():
            df_brand.at[index, 'average age'] = brand_age_avg[row['brand_id']]
df_brand

Unnamed: 0,brand_id,brand action count,brand action count ratio,brand day count,brand 1111 action count,brand 1111 action count ratio,brand 1111 activity ratio,brand last week activity ratio,brand last week action count,brand last week action count ratio,last month action count,last month action count ratio,last month activity ratio,average age
0,3462.0,36,0.093506,4,30,0.131579,0.833333,1.000000,36,0.101408,36,0.098361,1.0,3.611111
1,3462.0,36,0.093506,4,30,0.131579,0.833333,1.000000,36,0.101408,36,0.098361,1.0,3.611111
2,3462.0,36,0.093506,4,30,0.131579,0.833333,1.000000,36,0.101408,36,0.098361,1.0,3.611111
3,3462.0,36,0.093506,4,30,0.131579,0.833333,1.000000,36,0.101408,36,0.098361,1.0,3.611111
4,3462.0,36,0.093506,4,30,0.131579,0.833333,1.000000,36,0.101408,36,0.098361,1.0,3.611111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,247.0,105,0.272727,9,67,0.293860,0.638095,0.961905,101,0.284507,105,0.286885,1.0,2.714286
381,247.0,105,0.272727,9,67,0.293860,0.638095,0.961905,101,0.284507,105,0.286885,1.0,2.714286
382,247.0,105,0.272727,9,67,0.293860,0.638095,0.961905,101,0.284507,105,0.286885,1.0,2.714286
383,626.0,18,0.046753,2,13,0.057018,0.722222,1.000000,18,0.050704,18,0.049180,1.0,2.444444


In [96]:
#FEATURE 19: GENDER FEATURES
brand_gender = {}

# go through user_seller sub-dataframe and split entries by user_id:seller_id
for index, row in df.iterrows():
    if row['brand_id'] not in brand_gender:
        # there may be multiple entries, so default with an empty list
        brand_gender[row["brand_id"]] = [] 
    brand_gender[row["brand_id"]].append(row["gender"])

brand_gender_avg = {}
for row in brand_gender.items():
    brand_gender_avg[row[0]] = np.nansum(row[1])/len(row[1])
sorted(brand_gender_avg.items())

[(247.0, 0.6857142857142857),
 (626.0, 0.0),
 (683.0, 0.49411764705882355),
 (777.0, 0.0),
 (1097.0, 0.0),
 (1246.0, 0.6333333333333333),
 (1446.0, 0.0),
 (1905.0, 1.0),
 (2276.0, 0.0),
 (2350.0, 0.0),
 (3462.0, 0.3611111111111111),
 (3654.0, 1.0),
 (3931.0, 0.0),
 (4631.0, 0.7777777777777778),
 (5380.0, 0.0),
 (5491.0, 0.0),
 (5738.0, 1.0),
 (5946.0, 0.0),
 (6208.0, 0.6190476190476191),
 (6230.0, 1.0),
 (6590.0, 1.0),
 (7371.0, 0.0),
 (7892.0, 1.0),
 (7924.0, 0.16666666666666666),
 (7936.0, 0.0),
 (7989.0, 0.0),
 (8040.0, 0.0)]

In [97]:
total_brand_average_gender = sum(brand_gender_avg.values())/len(brand_gender_avg.keys())
print("Average gender total: ",total_brand_average_gender)

Average gender total:  0.36065809039665253


In [99]:
df_brand['average gender'] = 0
for index,row in df_brand.iterrows():
        if row['brand_id'] in brand_gender_avg.keys():
            df_brand.at[index, 'average gender'] = brand_gender_avg[row['brand_id']]
df_brand

Unnamed: 0,brand_id,brand action count,brand action count ratio,brand day count,brand 1111 action count,brand 1111 action count ratio,brand 1111 activity ratio,brand last week activity ratio,brand last week action count,brand last week action count ratio,last month action count,last month action count ratio,last month activity ratio,average age,average gender
0,3462.0,36,0.093506,4,30,0.131579,0.833333,1.000000,36,0.101408,36,0.098361,1.0,3.611111,0.361111
1,3462.0,36,0.093506,4,30,0.131579,0.833333,1.000000,36,0.101408,36,0.098361,1.0,3.611111,0.361111
2,3462.0,36,0.093506,4,30,0.131579,0.833333,1.000000,36,0.101408,36,0.098361,1.0,3.611111,0.361111
3,3462.0,36,0.093506,4,30,0.131579,0.833333,1.000000,36,0.101408,36,0.098361,1.0,3.611111,0.361111
4,3462.0,36,0.093506,4,30,0.131579,0.833333,1.000000,36,0.101408,36,0.098361,1.0,3.611111,0.361111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,247.0,105,0.272727,9,67,0.293860,0.638095,0.961905,101,0.284507,105,0.286885,1.0,2.714286,0.685714
381,247.0,105,0.272727,9,67,0.293860,0.638095,0.961905,101,0.284507,105,0.286885,1.0,2.714286,0.685714
382,247.0,105,0.272727,9,67,0.293860,0.638095,0.961905,101,0.284507,105,0.286885,1.0,2.714286,0.685714
383,626.0,18,0.046753,2,13,0.057018,0.722222,1.000000,18,0.050704,18,0.049180,1.0,2.444444,0.000000
