In [1]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
'''
FEATURES:
Overall action count/ratio
    - 
Overall day count
Monthly action count/ratio
    - Per brand, number of actions in a month / total actions (across all instances) in that month
Penetration (Popularity / Buys)
    - Number of Buys
    - Related Brand Popularity: Among the brands, split into tiers of high popularity vs low popularity
Monthly Aggregation
    - Per brand, number of actions in a month
    - Per brand, average action_type in a month
    - Std. deviation for number of clicks 
    - Per brand, action count by gender in a month
Double 11 Features
Latest One-Week
Repeat Buyer Features
Age Related
Gender Related
'''

'\nFEATURES:\nOverall action count/ratio\n    - \nOverall day count\nMonthly action count/ratio\n    - Per brand, number of actions in a month / total actions (across all instances) in that month\nPenetration (Popularity / Buys)\n    - Number of Buys\n    - Related Brand Popularity: Among the brands, split into tiers of high popularity vs low popularity\nMonthly Aggregation\n    - Per brand, number of actions in a month\n    - Per brand, average action_type in a month\n    - Std. deviation for number of clicks \n    - Per brand, action count by gender in a month\nDouble 11 Features\nLatest One-Week\nRepeat Buyer Features\nAge Related\nGender Related\n'

In [2]:
df = pd.read_csv("./use_data/expanded_testing.csv")
df

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type,age_range,gender,prob
0,149002,282,885,3791,1500.0,1111,0,5.0,2.0,
1,149002,282,885,3791,1500.0,1107,0,5.0,2.0,
2,109881,230,629,1,1662.0,820,0,4.0,0.0,
3,109881,230,629,1,1662.0,820,0,4.0,0.0,
4,109881,230,629,1,1662.0,820,0,4.0,0.0,
...,...,...,...,...,...,...,...,...,...,...
6544,73105,259,614,637,2603.0,1111,0,2.0,0.0,
6545,403421,175,1181,4760,247.0,1111,0,3.0,1.0,
6546,116738,186,267,1200,2276.0,1110,0,2.0,1.0,
6547,116738,186,267,1200,2276.0,1101,0,2.0,1.0,


In [5]:
nan_count = df.isna().sum(axis=1)
nan_count


0       1
1       1
2       1
3       1
4       1
       ..
6544    1
6545    1
6546    1
6547    1
6548    1
Length: 6546, dtype: int64

In [4]:
df['gender'] = df['gender'].fillna(2.0)
df = df[df['brand_id'].notna()]

In [6]:
print(sum(df.isna().sum()))

6574


In [9]:
df.columns[df.isna().any()].tolist()

['prob']

In [8]:
df['age_range'] = df['age_range'].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['age_range'] = df['age_range'].fillna(0)


In [10]:
df_brand = df[['brand_id']]
df_brand

Unnamed: 0,brand_id
0,1500.0
1,1500.0
2,1662.0
3,1662.0
4,1662.0
...,...
6544,2603.0
6545,247.0
6546,2276.0
6547,2276.0


In [11]:
# Unique Brand Count
unique_brand_count = len(pd.unique(df_brand['brand_id']))
print(f'Unique Brand Count: {unique_brand_count}')

brand_occurrences = df_brand['brand_id'].value_counts()
print(brand_occurrences)

Unique Brand Count: 150
3650.0    1335
2603.0     885
683.0      413
247.0      398
6742.0     310
          ... 
1532.0       1
7651.0       1
5506.0       1
7139.0       1
6781.0       1
Name: brand_id, Length: 150, dtype: int64


In [12]:
df['month'] = df['time_stamp'] // 100
df['day'] = df['time_stamp'] % 100
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['month'] = df['time_stamp'] // 100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['day'] = df['time_stamp'] % 100


Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type,age_range,gender,prob,month,day
0,149002,282,885,3791,1500.0,1111,0,5.0,2.0,,11,11
1,149002,282,885,3791,1500.0,1107,0,5.0,2.0,,11,7
2,109881,230,629,1,1662.0,820,0,4.0,0.0,,8,20
3,109881,230,629,1,1662.0,820,0,4.0,0.0,,8,20
4,109881,230,629,1,1662.0,820,0,4.0,0.0,,8,20
...,...,...,...,...,...,...,...,...,...,...,...,...
6544,73105,259,614,637,2603.0,1111,0,2.0,0.0,,11,11
6545,403421,175,1181,4760,247.0,1111,0,3.0,1.0,,11,11
6546,116738,186,267,1200,2276.0,1110,0,2.0,1.0,,11,10
6547,116738,186,267,1200,2276.0,1101,0,2.0,1.0,,11,1


In [13]:
# Split Month Day into Separate Columns
df_brand['month'] = df['time_stamp'] // 100
df_brand['day'] = df['time_stamp'] % 100
df_brand

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brand['month'] = df['time_stamp'] // 100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brand['day'] = df['time_stamp'] % 100


Unnamed: 0,brand_id,month,day
0,1500.0,11,11
1,1500.0,11,7
2,1662.0,8,20
3,1662.0,8,20
4,1662.0,8,20
...,...,...,...
6544,2603.0,11,11
6545,247.0,11,11
6546,2276.0,11,10
6547,2276.0,11,1


Aggregation

Common Aggregate Functions:
- Average
- Count
    > Action Count
- Maximum
- Median
- Minimum
- Mode
- Range
- Sum
- StdDeviation
- NaNMean

In [14]:
# Monthly Action Count

In [15]:
# Monthly Brand Action Counts
df_brand['brand_monthly_action_count'] = df_brand.groupby(['brand_id', 'month']).transform('size')
df_brand

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brand['brand_monthly_action_count'] = df_brand.groupby(['brand_id', 'month']).transform('size')


Unnamed: 0,brand_id,month,day,brand_monthly_action_count
0,1500.0,11,11,3
1,1500.0,11,7,3
2,1662.0,8,20,10
3,1662.0,8,20,10
4,1662.0,8,20,10
...,...,...,...,...
6544,2603.0,11,11,225
6545,247.0,11,11,342
6546,2276.0,11,10,13
6547,2276.0,11,1,13


In [16]:
# Total Action Count in said month
df_brand['month_total_action_count'] = df_brand.groupby(['month']).transform('size')
df_brand


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brand['month_total_action_count'] = df_brand.groupby(['month']).transform('size')


Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count
0,1500.0,11,11,3,2291
1,1500.0,11,7,3,2291
2,1662.0,8,20,10,704
3,1662.0,8,20,10,704
4,1662.0,8,20,10,704
...,...,...,...,...,...
6544,2603.0,11,11,225,2291
6545,247.0,11,11,342,2291
6546,2276.0,11,10,13,2291
6547,2276.0,11,1,13,2291


In [17]:
# Monthly Action Count / Ratio (Count/Ratio Type)
df_brand['monthly_action_count_ratio'] = df_brand.groupby(['brand_id', 'month'])['brand_monthly_action_count'].transform(lambda x: x/ df_brand['month_total_action_count'])
df_brand


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brand['monthly_action_count_ratio'] = df_brand.groupby(['brand_id', 'month'])['brand_monthly_action_count'].transform(lambda x: x/ df_brand['month_total_action_count'])


Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio
0,1500.0,11,11,3,2291,0.001309
1,1500.0,11,7,3,2291,0.001309
2,1662.0,8,20,10,704,0.014205
3,1662.0,8,20,10,704,0.014205
4,1662.0,8,20,10,704,0.014205
...,...,...,...,...,...,...
6544,2603.0,11,11,225,2291,0.098210
6545,247.0,11,11,342,2291,0.149280
6546,2276.0,11,10,13,2291,0.005674
6547,2276.0,11,1,13,2291,0.005674


In [18]:
# Per Brand Mean Action Type (Treats Action Type not really Discrete Label)
df_brand['monthly_mean_action_type'] = df.groupby(['brand_id', 'month'])['action_type'].transform('mean')
df_brand

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brand['monthly_mean_action_type'] = df.groupby(['brand_id', 'month'])['action_type'].transform('mean')


Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type
0,1500.0,11,11,3,2291,0.001309,0.000000
1,1500.0,11,7,3,2291,0.001309,0.000000
2,1662.0,8,20,10,704,0.014205,0.300000
3,1662.0,8,20,10,704,0.014205,0.300000
4,1662.0,8,20,10,704,0.014205,0.300000
...,...,...,...,...,...,...,...
6544,2603.0,11,11,225,2291,0.098210,0.528889
6545,247.0,11,11,342,2291,0.149280,0.324561
6546,2276.0,11,10,13,2291,0.005674,0.307692
6547,2276.0,11,1,13,2291,0.005674,0.307692



# Gender Interaction Count per Brand

In [19]:
# For Other Columns, modify the groupby condition
male_counts = df.groupby(['brand_id'])['gender'].apply(lambda x: (x == 1).sum())
female_counts = df.groupby(['brand_id'])['gender'].apply(lambda x: (x == 0).sum())
unknown_gender_count = df.groupby(['brand_id'])['gender'].apply(lambda x: (x == 2).sum())
gender_total_brand_counts = pd.DataFrame({'brand_male_count': male_counts, 'brand_female_count': female_counts, 'brand_unknown_gender_count': unknown_gender_count})
gender_total_brand_counts


Unnamed: 0_level_0,brand_male_count,brand_female_count,brand_unknown_gender_count
brand_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
68.0,0,14,4
69.0,3,22,0
99.0,6,17,0
136.0,2,0,0
178.0,0,1,0
...,...,...,...
7989.0,3,5,0
8040.0,0,2,0
8122.0,0,6,2
8150.0,0,10,0


In [20]:
df_brand = df_brand.join(gender_total_brand_counts, on='brand_id')
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count
0,1500.0,11,11,3,2291,0.001309,0.000000,1,11,2
1,1500.0,11,7,3,2291,0.001309,0.000000,1,11,2
2,1662.0,8,20,10,704,0.014205,0.300000,6,36,3
3,1662.0,8,20,10,704,0.014205,0.300000,6,36,3
4,1662.0,8,20,10,704,0.014205,0.300000,6,36,3
...,...,...,...,...,...,...,...,...,...,...
6544,2603.0,11,11,225,2291,0.098210,0.528889,170,689,26
6545,247.0,11,11,342,2291,0.149280,0.324561,197,182,19
6546,2276.0,11,10,13,2291,0.005674,0.307692,2,11,0
6547,2276.0,11,1,13,2291,0.005674,0.307692,2,11,0


# Monthly Gender Action Count per Brand

In [21]:
monthly_male_counts = df.groupby(['brand_id', 'month'])['gender'].apply(lambda x: (x == 1).sum())
monthly_female_counts = df.groupby(['brand_id', 'month'])['gender'].apply(lambda x: (x == 0).sum())
monthly_unknown_gender_count = df.groupby(['brand_id', 'month'])['gender'].apply(lambda x: (x == 2).sum())
monthly_gender_total_brand_counts = pd.DataFrame({'month_brand_male_count': monthly_male_counts, 'month_brand_female_count': monthly_female_counts, 'month_brand_unknown_gender_count': monthly_unknown_gender_count})
monthly_gender_total_brand_counts

Unnamed: 0_level_0,Unnamed: 1_level_0,month_brand_male_count,month_brand_female_count,month_brand_unknown_gender_count
brand_id,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
68.0,6,0,13,3
68.0,7,0,1,1
69.0,7,0,17,0
69.0,8,0,2,0
69.0,9,1,1,0
...,...,...,...,...
8150.0,7,0,3,0
8150.0,8,0,2,0
8150.0,9,0,1,0
8238.0,6,2,57,0


In [22]:
df_brand = df_brand.join(monthly_gender_total_brand_counts, on=['brand_id', 'month'])
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,month_brand_male_count,month_brand_female_count,month_brand_unknown_gender_count
0,1500.0,11,11,3,2291,0.001309,0.000000,1,11,2,0,1,2
1,1500.0,11,7,3,2291,0.001309,0.000000,1,11,2,0,1,2
2,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,1,9,0
3,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,1,9,0
4,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,1,9,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6544,2603.0,11,11,225,2291,0.098210,0.528889,170,689,26,46,170,9
6545,247.0,11,11,342,2291,0.149280,0.324561,197,182,19,173,154,15
6546,2276.0,11,10,13,2291,0.005674,0.307692,2,11,0,2,11,0
6547,2276.0,11,1,13,2291,0.005674,0.307692,2,11,0,2,11,0


# Brand Gender Count Ratio

In [23]:
gender_counts = df_brand[['brand_male_count', 'brand_female_count', 'brand_unknown_gender_count']].sum(axis=1)
male_ratio = df_brand.apply(lambda row: row['brand_male_count'] / gender_counts[row.name], axis=1)
female_ratio = df_brand.apply(lambda row: row['brand_female_count'] / gender_counts[row.name], axis=1)
unknown_gender_ratio = df_brand.apply(lambda row: row['brand_unknown_gender_count'] / gender_counts[row.name], axis=1)
df_brand['brand_male_count_ratio'] = male_ratio
df_brand['brand_female_count_ratio'] = female_ratio
df_brand['brand_unknown_count_ratio'] = unknown_gender_ratio
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,month_brand_male_count,month_brand_female_count,month_brand_unknown_gender_count,brand_male_count_ratio,brand_female_count_ratio,brand_unknown_count_ratio
0,1500.0,11,11,3,2291,0.001309,0.000000,1,11,2,0,1,2,0.071429,0.785714,0.142857
1,1500.0,11,7,3,2291,0.001309,0.000000,1,11,2,0,1,2,0.071429,0.785714,0.142857
2,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,1,9,0,0.133333,0.800000,0.066667
3,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,1,9,0,0.133333,0.800000,0.066667
4,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,1,9,0,0.133333,0.800000,0.066667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6544,2603.0,11,11,225,2291,0.098210,0.528889,170,689,26,46,170,9,0.192090,0.778531,0.029379
6545,247.0,11,11,342,2291,0.149280,0.324561,197,182,19,173,154,15,0.494975,0.457286,0.047739
6546,2276.0,11,10,13,2291,0.005674,0.307692,2,11,0,2,11,0,0.153846,0.846154,0.000000
6547,2276.0,11,1,13,2291,0.005674,0.307692,2,11,0,2,11,0,0.153846,0.846154,0.000000


# Brand Monthly Gender Count Ratio

In [24]:
df_brand['brand_monthly_male_count_ratio'] = df_brand['month_brand_male_count'] / df_brand['brand_monthly_action_count']
df_brand['brand_monthly_female_count_ratio'] = df_brand['month_brand_female_count'] / df_brand['brand_monthly_action_count']
df_brand['brand_monthly_unknown_count_ratio'] = df_brand['month_brand_unknown_gender_count'] / df_brand['brand_monthly_action_count']
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,month_brand_male_count,month_brand_female_count,month_brand_unknown_gender_count,brand_male_count_ratio,brand_female_count_ratio,brand_unknown_count_ratio,brand_monthly_male_count_ratio,brand_monthly_female_count_ratio,brand_monthly_unknown_count_ratio
0,1500.0,11,11,3,2291,0.001309,0.000000,1,11,2,0,1,2,0.071429,0.785714,0.142857,0.000000,0.333333,0.666667
1,1500.0,11,7,3,2291,0.001309,0.000000,1,11,2,0,1,2,0.071429,0.785714,0.142857,0.000000,0.333333,0.666667
2,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,1,9,0,0.133333,0.800000,0.066667,0.100000,0.900000,0.000000
3,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,1,9,0,0.133333,0.800000,0.066667,0.100000,0.900000,0.000000
4,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,1,9,0,0.133333,0.800000,0.066667,0.100000,0.900000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6544,2603.0,11,11,225,2291,0.098210,0.528889,170,689,26,46,170,9,0.192090,0.778531,0.029379,0.204444,0.755556,0.040000
6545,247.0,11,11,342,2291,0.149280,0.324561,197,182,19,173,154,15,0.494975,0.457286,0.047739,0.505848,0.450292,0.043860
6546,2276.0,11,10,13,2291,0.005674,0.307692,2,11,0,2,11,0,0.153846,0.846154,0.000000,0.153846,0.846154,0.000000
6547,2276.0,11,1,13,2291,0.005674,0.307692,2,11,0,2,11,0,0.153846,0.846154,0.000000,0.153846,0.846154,0.000000


# All Above Confirmed

# Penetration

In [25]:
df

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type,age_range,gender,prob,month,day
0,149002,282,885,3791,1500.0,1111,0,5.0,2.0,,11,11
1,149002,282,885,3791,1500.0,1107,0,5.0,2.0,,11,7
2,109881,230,629,1,1662.0,820,0,4.0,0.0,,8,20
3,109881,230,629,1,1662.0,820,0,4.0,0.0,,8,20
4,109881,230,629,1,1662.0,820,0,4.0,0.0,,8,20
...,...,...,...,...,...,...,...,...,...,...,...,...
6544,73105,259,614,637,2603.0,1111,0,2.0,0.0,,11,11
6545,403421,175,1181,4760,247.0,1111,0,3.0,1.0,,11,11
6546,116738,186,267,1200,2276.0,1110,0,2.0,1.0,,11,10
6547,116738,186,267,1200,2276.0,1101,0,2.0,1.0,,11,1


In [26]:
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,month_brand_male_count,month_brand_female_count,month_brand_unknown_gender_count,brand_male_count_ratio,brand_female_count_ratio,brand_unknown_count_ratio,brand_monthly_male_count_ratio,brand_monthly_female_count_ratio,brand_monthly_unknown_count_ratio
0,1500.0,11,11,3,2291,0.001309,0.000000,1,11,2,0,1,2,0.071429,0.785714,0.142857,0.000000,0.333333,0.666667
1,1500.0,11,7,3,2291,0.001309,0.000000,1,11,2,0,1,2,0.071429,0.785714,0.142857,0.000000,0.333333,0.666667
2,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,1,9,0,0.133333,0.800000,0.066667,0.100000,0.900000,0.000000
3,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,1,9,0,0.133333,0.800000,0.066667,0.100000,0.900000,0.000000
4,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,1,9,0,0.133333,0.800000,0.066667,0.100000,0.900000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6544,2603.0,11,11,225,2291,0.098210,0.528889,170,689,26,46,170,9,0.192090,0.778531,0.029379,0.204444,0.755556,0.040000
6545,247.0,11,11,342,2291,0.149280,0.324561,197,182,19,173,154,15,0.494975,0.457286,0.047739,0.505848,0.450292,0.043860
6546,2276.0,11,10,13,2291,0.005674,0.307692,2,11,0,2,11,0,0.153846,0.846154,0.000000,0.153846,0.846154,0.000000
6547,2276.0,11,1,13,2291,0.005674,0.307692,2,11,0,2,11,0,0.153846,0.846154,0.000000,0.153846,0.846154,0.000000


# Number of Buys Per Brand

In [27]:
brand_buy = pd.DataFrame({'brand_buys': df.groupby(['brand_id'])['action_type'].apply(lambda x: (x == 2).sum())})
df_brand = df_brand.join(brand_buy, on=['brand_id'])
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,month_brand_male_count,month_brand_female_count,month_brand_unknown_gender_count,brand_male_count_ratio,brand_female_count_ratio,brand_unknown_count_ratio,brand_monthly_male_count_ratio,brand_monthly_female_count_ratio,brand_monthly_unknown_count_ratio,brand_buys
0,1500.0,11,11,3,2291,0.001309,0.000000,1,11,2,0,1,2,0.071429,0.785714,0.142857,0.000000,0.333333,0.666667,0
1,1500.0,11,7,3,2291,0.001309,0.000000,1,11,2,0,1,2,0.071429,0.785714,0.142857,0.000000,0.333333,0.666667,0
2,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,1,9,0,0.133333,0.800000,0.066667,0.100000,0.900000,0.000000,0
3,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,1,9,0,0.133333,0.800000,0.066667,0.100000,0.900000,0.000000,0
4,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,1,9,0,0.133333,0.800000,0.066667,0.100000,0.900000,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6544,2603.0,11,11,225,2291,0.098210,0.528889,170,689,26,46,170,9,0.192090,0.778531,0.029379,0.204444,0.755556,0.040000,144
6545,247.0,11,11,342,2291,0.149280,0.324561,197,182,19,173,154,15,0.494975,0.457286,0.047739,0.505848,0.450292,0.043860,35
6546,2276.0,11,10,13,2291,0.005674,0.307692,2,11,0,2,11,0,0.153846,0.846154,0.000000,0.153846,0.846154,0.000000,2
6547,2276.0,11,1,13,2291,0.005674,0.307692,2,11,0,2,11,0,0.153846,0.846154,0.000000,0.153846,0.846154,0.000000,2


# Buy Ratio per Brand (Brand Buy Count / Brand Total Actions)

In [28]:
df_brand['brand_buy_ratio'] = df_brand['brand_buys'] / df_brand.groupby(['brand_id']).transform('size')
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,...,month_brand_female_count,month_brand_unknown_gender_count,brand_male_count_ratio,brand_female_count_ratio,brand_unknown_count_ratio,brand_monthly_male_count_ratio,brand_monthly_female_count_ratio,brand_monthly_unknown_count_ratio,brand_buys,brand_buy_ratio
0,1500.0,11,11,3,2291,0.001309,0.000000,1,11,2,...,1,2,0.071429,0.785714,0.142857,0.000000,0.333333,0.666667,0,0.000000
1,1500.0,11,7,3,2291,0.001309,0.000000,1,11,2,...,1,2,0.071429,0.785714,0.142857,0.000000,0.333333,0.666667,0,0.000000
2,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,9,0,0.133333,0.800000,0.066667,0.100000,0.900000,0.000000,0,0.000000
3,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,9,0,0.133333,0.800000,0.066667,0.100000,0.900000,0.000000,0,0.000000
4,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,9,0,0.133333,0.800000,0.066667,0.100000,0.900000,0.000000,0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6544,2603.0,11,11,225,2291,0.098210,0.528889,170,689,26,...,170,9,0.192090,0.778531,0.029379,0.204444,0.755556,0.040000,144,0.162712
6545,247.0,11,11,342,2291,0.149280,0.324561,197,182,19,...,154,15,0.494975,0.457286,0.047739,0.505848,0.450292,0.043860,35,0.087940
6546,2276.0,11,10,13,2291,0.005674,0.307692,2,11,0,...,11,0,0.153846,0.846154,0.000000,0.153846,0.846154,0.000000,2,0.153846
6547,2276.0,11,1,13,2291,0.005674,0.307692,2,11,0,...,11,0,0.153846,0.846154,0.000000,0.153846,0.846154,0.000000,2,0.153846


# Gender-Buy Count per Brand

In [29]:
brand_male_buy = df[df.action_type == 2].groupby(['brand_id'])['gender'].apply(lambda x: (x == 1).sum()) 
brand_female_buy = df[df.action_type == 2].groupby(['brand_id'])['gender'].apply(lambda x: (x == 0).sum())
brand_unknown_buy = df[df.action_type == 2].groupby(['brand_id'])['gender'].apply(lambda x: (x == 2).sum())
gender_buys = pd.DataFrame({'brand_male_buy_count': brand_male_buy, 'brand_female_buy_count': brand_female_buy, 'brand_unknown_buy_count': brand_unknown_buy})
df_brand = df_brand.join(gender_buys, on=['brand_id'])
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,...,brand_female_count_ratio,brand_unknown_count_ratio,brand_monthly_male_count_ratio,brand_monthly_female_count_ratio,brand_monthly_unknown_count_ratio,brand_buys,brand_buy_ratio,brand_male_buy_count,brand_female_buy_count,brand_unknown_buy_count
0,1500.0,11,11,3,2291,0.001309,0.000000,1,11,2,...,0.785714,0.142857,0.000000,0.333333,0.666667,0,0.000000,,,
1,1500.0,11,7,3,2291,0.001309,0.000000,1,11,2,...,0.785714,0.142857,0.000000,0.333333,0.666667,0,0.000000,,,
2,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,0.800000,0.066667,0.100000,0.900000,0.000000,0,0.000000,,,
3,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,0.800000,0.066667,0.100000,0.900000,0.000000,0,0.000000,,,
4,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,0.800000,0.066667,0.100000,0.900000,0.000000,0,0.000000,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6544,2603.0,11,11,225,2291,0.098210,0.528889,170,689,26,...,0.778531,0.029379,0.204444,0.755556,0.040000,144,0.162712,33.0,108.0,3.0
6545,247.0,11,11,342,2291,0.149280,0.324561,197,182,19,...,0.457286,0.047739,0.505848,0.450292,0.043860,35,0.087940,17.0,16.0,2.0
6546,2276.0,11,10,13,2291,0.005674,0.307692,2,11,0,...,0.846154,0.000000,0.153846,0.846154,0.000000,2,0.153846,0.0,2.0,0.0
6547,2276.0,11,1,13,2291,0.005674,0.307692,2,11,0,...,0.846154,0.000000,0.153846,0.846154,0.000000,2,0.153846,0.0,2.0,0.0


# Gender-Buy Ratio per Brand (Buy Count of a Gender / Total Buy Count for that Brand)

In [30]:
df_brand['brand_male_buy_ratio'] = df_brand['brand_male_buy_count'] / df_brand['brand_buys']
df_brand['brand_female_buy_ratio'] = df_brand['brand_female_buy_count'] / df_brand['brand_buys']
df_brand['brand_unknown_buy_ratio'] = df_brand['brand_unknown_buy_count'] / df_brand['brand_buys']
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,...,brand_monthly_female_count_ratio,brand_monthly_unknown_count_ratio,brand_buys,brand_buy_ratio,brand_male_buy_count,brand_female_buy_count,brand_unknown_buy_count,brand_male_buy_ratio,brand_female_buy_ratio,brand_unknown_buy_ratio
0,1500.0,11,11,3,2291,0.001309,0.000000,1,11,2,...,0.333333,0.666667,0,0.000000,,,,,,
1,1500.0,11,7,3,2291,0.001309,0.000000,1,11,2,...,0.333333,0.666667,0,0.000000,,,,,,
2,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,0.900000,0.000000,0,0.000000,,,,,,
3,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,0.900000,0.000000,0,0.000000,,,,,,
4,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,0.900000,0.000000,0,0.000000,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6544,2603.0,11,11,225,2291,0.098210,0.528889,170,689,26,...,0.755556,0.040000,144,0.162712,33.0,108.0,3.0,0.229167,0.750000,0.020833
6545,247.0,11,11,342,2291,0.149280,0.324561,197,182,19,...,0.450292,0.043860,35,0.087940,17.0,16.0,2.0,0.485714,0.457143,0.057143
6546,2276.0,11,10,13,2291,0.005674,0.307692,2,11,0,...,0.846154,0.000000,2,0.153846,0.0,2.0,0.0,0.000000,1.000000,0.000000
6547,2276.0,11,1,13,2291,0.005674,0.307692,2,11,0,...,0.846154,0.000000,2,0.153846,0.0,2.0,0.0,0.000000,1.000000,0.000000


# Resolving NaNs: When brand_buys is 0, math results in some NaNs for ratios 

In [31]:
# Visualize rows with NaNs
nan_mask = df_brand.isna().any(axis=1)
nan_rows = df_brand[nan_mask]
nan_rows

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,...,brand_monthly_female_count_ratio,brand_monthly_unknown_count_ratio,brand_buys,brand_buy_ratio,brand_male_buy_count,brand_female_buy_count,brand_unknown_buy_count,brand_male_buy_ratio,brand_female_buy_ratio,brand_unknown_buy_ratio
0,1500.0,11,11,3,2291,0.001309,0.000000,1,11,2,...,0.333333,0.666667,0,0.0,,,,,,
1,1500.0,11,7,3,2291,0.001309,0.000000,1,11,2,...,0.333333,0.666667,0,0.0,,,,,,
2,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,0.900000,0.000000,0,0.0,,,,,,
3,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,0.900000,0.000000,0,0.0,,,,,,
4,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,0.900000,0.000000,0,0.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6521,3650.0,10,5,99,615,0.160976,0.181818,531,741,63,...,0.525253,0.060606,0,0.0,,,,,,
6525,913.0,9,17,7,615,0.011382,0.428571,1,13,0,...,1.000000,0.000000,0,0.0,,,,,,
6526,6455.0,5,26,12,367,0.032698,0.000000,3,37,0,...,1.000000,0.000000,0,0.0,,,,,,
6527,3650.0,9,14,103,615,0.167480,0.203883,531,741,63,...,0.485437,0.038835,0,0.0,,,,,,


In [32]:
# If Brand_Buys is 0, the previous method generates NaNs. Set these NaNs to 0
condition = df_brand['brand_buys'] == 0
df_brand.loc[condition, ['brand_male_buy_count', 'brand_female_buy_count', 'brand_unknown_buy_count', 'brand_male_buy_ratio', 'brand_female_buy_ratio', 'brand_unknown_buy_ratio']] = 0
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,...,brand_monthly_female_count_ratio,brand_monthly_unknown_count_ratio,brand_buys,brand_buy_ratio,brand_male_buy_count,brand_female_buy_count,brand_unknown_buy_count,brand_male_buy_ratio,brand_female_buy_ratio,brand_unknown_buy_ratio
0,1500.0,11,11,3,2291,0.001309,0.000000,1,11,2,...,0.333333,0.666667,0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000
1,1500.0,11,7,3,2291,0.001309,0.000000,1,11,2,...,0.333333,0.666667,0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000
2,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,0.900000,0.000000,0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000
3,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,0.900000,0.000000,0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000
4,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,0.900000,0.000000,0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6544,2603.0,11,11,225,2291,0.098210,0.528889,170,689,26,...,0.755556,0.040000,144,0.162712,33.0,108.0,3.0,0.229167,0.750000,0.020833
6545,247.0,11,11,342,2291,0.149280,0.324561,197,182,19,...,0.450292,0.043860,35,0.087940,17.0,16.0,2.0,0.485714,0.457143,0.057143
6546,2276.0,11,10,13,2291,0.005674,0.307692,2,11,0,...,0.846154,0.000000,2,0.153846,0.0,2.0,0.0,0.000000,1.000000,0.000000
6547,2276.0,11,1,13,2291,0.005674,0.307692,2,11,0,...,0.846154,0.000000,2,0.153846,0.0,2.0,0.0,0.000000,1.000000,0.000000


# Check for unaccounted NaNs

In [33]:
print(sum(df_brand.isna().sum()))

0


# Age-Related

# AgeGroup Counts per Brand

In [34]:
# For Other Columns, modify the groupby condition
ageGroup_0_counts = df.groupby(['brand_id'])['age_range'].apply(lambda x: (x == 0).sum())
ageGroup_1_counts = df.groupby(['brand_id'])['age_range'].apply(lambda x: (x == 1).sum())
ageGroup_2_counts = df.groupby(['brand_id'])['age_range'].apply(lambda x: (x == 2).sum())
ageGroup_3_counts = df.groupby(['brand_id'])['age_range'].apply(lambda x: (x == 3).sum())
ageGroup_4_counts = df.groupby(['brand_id'])['age_range'].apply(lambda x: (x == 4).sum())
ageGroup_5_counts = df.groupby(['brand_id'])['age_range'].apply(lambda x: (x == 5).sum())
ageGroup_6_counts = df.groupby(['brand_id'])['age_range'].apply(lambda x: (x == 6).sum())
ageGroup_7_counts = df.groupby(['brand_id'])['age_range'].apply(lambda x: (x == 7).sum())
ageGroup_8_counts = df.groupby(['brand_id'])['age_range'].apply(lambda x: (x == 8).sum())
gender_total_brand_counts = pd.DataFrame({'brand_ageGroup_1_counts': ageGroup_1_counts,'brand_ageGroup_2_counts': ageGroup_2_counts,'brand_ageGroup_3_counts': ageGroup_3_counts,'brand_ageGroup_4_counts': ageGroup_4_counts,'brand_ageGroup_5_counts': ageGroup_5_counts,
                                          'brand_ageGroup_6_counts': ageGroup_6_counts,'brand_ageGroup_7_counts': ageGroup_7_counts,'brand_ageGroup_8_counts': ageGroup_8_counts, 'brand_ageGroup_0_counts': ageGroup_0_counts})
df_brand = df_brand.join(gender_total_brand_counts, on='brand_id')
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,...,brand_unknown_buy_ratio,brand_ageGroup_1_counts,brand_ageGroup_2_counts,brand_ageGroup_3_counts,brand_ageGroup_4_counts,brand_ageGroup_5_counts,brand_ageGroup_6_counts,brand_ageGroup_7_counts,brand_ageGroup_8_counts,brand_ageGroup_0_counts
0,1500.0,11,11,3,2291,0.001309,0.000000,1,11,2,...,0.000000,0,2,4,1,3,0,4,0,0
1,1500.0,11,7,3,2291,0.001309,0.000000,1,11,2,...,0.000000,0,2,4,1,3,0,4,0,0
2,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,0.000000,0,0,8,26,5,0,0,0,6
3,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,0.000000,0,0,8,26,5,0,0,0,6
4,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,0.000000,0,0,8,26,5,0,0,0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6544,2603.0,11,11,225,2291,0.098210,0.528889,170,689,26,...,0.020833,0,278,276,84,42,37,6,0,162
6545,247.0,11,11,342,2291,0.149280,0.324561,197,182,19,...,0.057143,0,88,94,71,25,15,2,2,101
6546,2276.0,11,10,13,2291,0.005674,0.307692,2,11,0,...,0.000000,0,2,2,2,2,0,0,0,5
6547,2276.0,11,1,13,2291,0.005674,0.307692,2,11,0,...,0.000000,0,2,2,2,2,0,0,0,5


In [35]:
#FEATURE 1: OVERALL ACTION COUNT/RATIO
actions = {}
for index, row in df.iterrows():
    if row['brand_id'] not in actions:
        # there may be multiple entries, so default with an empty list
        actions[row["brand_id"]] = [] 
    actions[row["brand_id"]].append(row["action_type"])

In [36]:
actions

{1500.0: [0.0,
  0.0,
  0.0,
  3.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 1662.0: [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  3.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  3.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 6885.0: [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  3.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  3.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  3.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  3.0,
  0.0,
  0.0,
  3.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
 

In [37]:
action_count = {}
for key in actions.keys():
    action_count[key] = len(actions[key])
action_count

{1500.0: 14,
 1662.0: 45,
 6885.0: 93,
 3650.0: 1335,
 5270.0: 188,
 3080.0: 66,
 99.0: 23,
 5192.0: 26,
 6742.0: 310,
 7924.0: 58,
 1446.0: 124,
 4533.0: 20,
 683.0: 413,
 5717.0: 30,
 69.0: 25,
 3462.0: 111,
 4120.0: 237,
 5360.0: 30,
 5067.0: 268,
 7892.0: 52,
 913.0: 14,
 2603.0: 885,
 944.0: 12,
 247.0: 398,
 8040.0: 2,
 2018.0: 134,
 1097.0: 75,
 1128.0: 5,
 928.0: 23,
 6086.0: 3,
 2541.0: 40,
 3228.0: 30,
 6040.0: 61,
 5890.0: 72,
 5394.0: 12,
 1654.0: 2,
 5588.0: 12,
 5999.0: 84,
 3969.0: 20,
 5230.0: 23,
 5380.0: 20,
 5491.0: 24,
 2350.0: 42,
 2735.0: 73,
 2169.0: 54,
 4385.0: 10,
 4073.0: 33,
 1768.0: 4,
 4583.0: 20,
 136.0: 2,
 68.0: 18,
 6740.0: 15,
 8122.0: 8,
 516.0: 13,
 7936.0: 30,
 4655.0: 7,
 5204.0: 3,
 7537.0: 3,
 6823.0: 21,
 5022.0: 3,
 5478.0: 4,
 5506.0: 1,
 6443.0: 36,
 6772.0: 5,
 3700.0: 28,
 2116.0: 12,
 8150.0: 10,
 4179.0: 9,
 6183.0: 9,
 6936.0: 8,
 7087.0: 4,
 3312.0: 13,
 8238.0: 60,
 6455.0: 40,
 7529.0: 2,
 1859.0: 7,
 6208.0: 31,
 5946.0: 27,
 1905.0

In [38]:
total_action_count = sum(action_count.values())
total_action_count

6546

In [39]:
action_count_ratio = {}
for key in action_count:
    action_count_ratio[key] = action_count[key]/total_action_count
action_count_ratio

{1500.0: 0.0021387106630003055,
 1662.0: 0.0068744271310724105,
 6885.0: 0.014207149404216315,
 3650.0: 0.20394133822181484,
 5270.0: 0.02871982890314696,
 3080.0: 0.01008249312557287,
 99.0: 0.003513596089214788,
 5192.0: 0.003971891231286282,
 6742.0: 0.04735716468072105,
 7924.0: 0.008860372746715552,
 1446.0: 0.018942865872288422,
 4533.0: 0.0030553009471432934,
 683.0: 0.06309196455850902,
 5717.0: 0.00458295142071494,
 69.0: 0.003819126183929117,
 3462.0: 0.01695692025664528,
 4120.0: 0.03620531622364803,
 5360.0: 0.00458295142071494,
 5067.0: 0.040941032691720135,
 7892.0: 0.007943782462572564,
 913.0: 0.0021387106630003055,
 2603.0: 0.13519706691109074,
 944.0: 0.0018331805682859762,
 247.0: 0.06080048884815154,
 8040.0: 0.00030553009471432935,
 2018.0: 0.020470516345860067,
 1097.0: 0.011457378551787351,
 1128.0: 0.0007638252367858234,
 928.0: 0.003513596089214788,
 6086.0: 0.00045829514207149406,
 2541.0: 0.006110601894286587,
 3228.0: 0.00458295142071494,
 6040.0: 0.00931866

In [40]:
df_brand['brand action count'] = -1
df_brand['brand action count ratio'] = -1
for index,row in df_brand.iterrows():
        df_brand.at[index, 'brand action count'] = action_count[row['brand_id']]
        df_brand.at[index, 'brand action count ratio'] = action_count_ratio[row['brand_id']]

df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,...,brand_ageGroup_2_counts,brand_ageGroup_3_counts,brand_ageGroup_4_counts,brand_ageGroup_5_counts,brand_ageGroup_6_counts,brand_ageGroup_7_counts,brand_ageGroup_8_counts,brand_ageGroup_0_counts,brand action count,brand action count ratio
0,1500.0,11,11,3,2291,0.001309,0.000000,1,11,2,...,2,4,1,3,0,4,0,0,14,0.002139
1,1500.0,11,7,3,2291,0.001309,0.000000,1,11,2,...,2,4,1,3,0,4,0,0,14,0.002139
2,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,0,8,26,5,0,0,0,6,45,0.006874
3,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,0,8,26,5,0,0,0,6,45,0.006874
4,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,0,8,26,5,0,0,0,6,45,0.006874
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6544,2603.0,11,11,225,2291,0.098210,0.528889,170,689,26,...,278,276,84,42,37,6,0,162,885,0.135197
6545,247.0,11,11,342,2291,0.149280,0.324561,197,182,19,...,88,94,71,25,15,2,2,101,398,0.060800
6546,2276.0,11,10,13,2291,0.005674,0.307692,2,11,0,...,2,2,2,2,0,0,0,5,13,0.001986
6547,2276.0,11,1,13,2291,0.005674,0.307692,2,11,0,...,2,2,2,2,0,0,0,5,13,0.001986


In [41]:
#FEATURE 2: OVERALL DAY COUNT
days = {}

# go through user_seller sub-dataframe and split entries by item_id:seller_id
for index, row in df.iterrows():
    if row['brand_id'] not in days:
        # there may be multiple entries, so default with an empty list
        days[row["brand_id"]] = [] 
    days[row["brand_id"]].append(row["time_stamp"])

In [42]:
for k,v in days.items():
    days[k] = list(set(v)) # make each value list unique

days

{1500.0: [708.0,
  906.0,
  1008.0,
  531.0,
  1107.0,
  1011.0,
  1108.0,
  1111.0,
  603.0,
  604.0,
  605.0],
 1662.0: [516.0,
  903.0,
  904.0,
  905.0,
  906.0,
  523.0,
  909.0,
  528.0,
  912.0,
  914.0,
  915.0,
  531.0,
  917.0,
  920.0,
  921.0,
  923.0,
  803.0,
  819.0,
  820.0,
  828.0,
  830.0,
  702.0,
  704.0,
  705.0,
  706.0,
  707.0,
  709.0,
  710.0,
  711.0,
  715.0,
  718.0,
  728.0,
  730.0,
  614.0,
  617.0,
  1017.0],
 6885.0: [1101.0,
  1102.0,
  1103.0,
  1104.0,
  1105.0,
  1107.0,
  1109.0,
  1110.0,
  1111.0],
 3650.0: [1024.0,
  1025.0,
  1026.0,
  1027.0,
  1028.0,
  1030.0,
  1031.0,
  521.0,
  522.0,
  523.0,
  525.0,
  526.0,
  528.0,
  530.0,
  531.0,
  1102.0,
  1103.0,
  1104.0,
  1105.0,
  1106.0,
  1107.0,
  1108.0,
  1109.0,
  1110.0,
  1111.0,
  601.0,
  602.0,
  603.0,
  604.0,
  605.0,
  607.0,
  609.0,
  610.0,
  611.0,
  612.0,
  613.0,
  614.0,
  615.0,
  616.0,
  617.0,
  618.0,
  619.0,
  620.0,
  621.0,
  622.0,
  623.0,
  624.0,
  625.

In [43]:
day_count = {}
for key in days.keys():
    day_count[key] = len(days[key])
day_count

{1500.0: 11,
 1662.0: 36,
 6885.0: 9,
 3650.0: 158,
 5270.0: 82,
 3080.0: 22,
 99.0: 6,
 5192.0: 14,
 6742.0: 117,
 7924.0: 9,
 1446.0: 45,
 4533.0: 13,
 683.0: 57,
 5717.0: 16,
 69.0: 10,
 3462.0: 11,
 4120.0: 30,
 5360.0: 6,
 5067.0: 67,
 7892.0: 27,
 913.0: 8,
 2603.0: 149,
 944.0: 6,
 247.0: 36,
 8040.0: 2,
 2018.0: 71,
 1097.0: 28,
 1128.0: 5,
 928.0: 15,
 6086.0: 3,
 2541.0: 13,
 3228.0: 19,
 6040.0: 7,
 5890.0: 33,
 5394.0: 8,
 1654.0: 1,
 5588.0: 9,
 5999.0: 45,
 3969.0: 12,
 5230.0: 11,
 5380.0: 8,
 5491.0: 18,
 2350.0: 8,
 2735.0: 42,
 2169.0: 21,
 4385.0: 7,
 4073.0: 16,
 1768.0: 2,
 4583.0: 10,
 136.0: 2,
 68.0: 8,
 6740.0: 10,
 8122.0: 6,
 516.0: 10,
 7936.0: 7,
 4655.0: 6,
 5204.0: 2,
 7537.0: 3,
 6823.0: 17,
 5022.0: 3,
 5478.0: 4,
 5506.0: 1,
 6443.0: 6,
 6772.0: 4,
 3700.0: 6,
 2116.0: 2,
 8150.0: 7,
 4179.0: 6,
 6183.0: 4,
 6936.0: 6,
 7087.0: 4,
 3312.0: 10,
 8238.0: 4,
 6455.0: 28,
 7529.0: 2,
 1859.0: 5,
 6208.0: 6,
 5946.0: 5,
 1905.0: 16,
 1246.0: 8,
 6230.0: 7,


In [44]:
df_brand['brand day count'] = -1
for index,row in df_brand.iterrows():
        df_brand.at[index, 'brand day count'] = day_count[row['brand_id']]

df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,...,brand_ageGroup_3_counts,brand_ageGroup_4_counts,brand_ageGroup_5_counts,brand_ageGroup_6_counts,brand_ageGroup_7_counts,brand_ageGroup_8_counts,brand_ageGroup_0_counts,brand action count,brand action count ratio,brand day count
0,1500.0,11,11,3,2291,0.001309,0.000000,1,11,2,...,4,1,3,0,4,0,0,14,0.002139,11
1,1500.0,11,7,3,2291,0.001309,0.000000,1,11,2,...,4,1,3,0,4,0,0,14,0.002139,11
2,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,8,26,5,0,0,0,6,45,0.006874,36
3,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,8,26,5,0,0,0,6,45,0.006874,36
4,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,8,26,5,0,0,0,6,45,0.006874,36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6544,2603.0,11,11,225,2291,0.098210,0.528889,170,689,26,...,276,84,42,37,6,0,162,885,0.135197,149
6545,247.0,11,11,342,2291,0.149280,0.324561,197,182,19,...,94,71,25,15,2,2,101,398,0.060800,36
6546,2276.0,11,10,13,2291,0.005674,0.307692,2,11,0,...,2,2,2,0,0,0,5,13,0.001986,4
6547,2276.0,11,1,13,2291,0.005674,0.307692,2,11,0,...,2,2,2,0,0,0,5,13,0.001986,4


In [45]:
#DOUBLE 11 FEATURES
df_11 = df[df['time_stamp']==1111]
df_11

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type,age_range,gender,prob,month,day
0,149002,282,885,3791,1500.0,1111,0,5.0,2.0,,11,11
5,142526,193,656,2781,6885.0,1111,0,6.0,0.0,,11,11
11,46520,257,1238,158,99.0,1111,0,4.0,0.0,,11,11
22,14747,279,898,3323,683.0,1111,0,6.0,1.0,,11,11
31,379824,198,656,145,3462.0,1111,0,5.0,1.0,,11,11
...,...,...,...,...,...,...,...,...,...,...,...,...
6529,107147,259,614,637,2603.0,1111,0,0.0,1.0,,11,11
6538,127336,259,614,637,2603.0,1111,0,0.0,0.0,,11,11
6544,73105,259,614,637,2603.0,1111,0,2.0,0.0,,11,11
6545,403421,175,1181,4760,247.0,1111,0,3.0,1.0,,11,11


In [46]:
actions_11 = {}

# go through user_seller sub-dataframe and split entries by item_id:seller_id
for index, row in df_11.iterrows():
    if row['brand_id'] not in actions_11:
        # there may be multiple entries, so default with an empty list
        actions_11[row["brand_id"]] = [] 
    actions_11[row["brand_id"]].append(row["action_type"])

actions_11

{1500.0: [0.0],
 6885.0: [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  3.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  3.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  3.0,
  0.0,
  0.0,
  3.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0],
 99.0: [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 683.0: [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  

In [47]:
action_count_11 = {}
for key in actions_11.keys():
    action_count_11[key] = len(actions_11[key])
action_count_11

{1500.0: 1,
 6885.0: 69,
 99.0: 17,
 683.0: 163,
 3462.0: 66,
 5360.0: 8,
 5067.0: 62,
 913.0: 5,
 247.0: 234,
 4120.0: 96,
 928.0: 4,
 1654.0: 2,
 3650.0: 19,
 5380.0: 11,
 2603.0: 102,
 2350.0: 11,
 6040.0: 3,
 1097.0: 19,
 7936.0: 17,
 2169.0: 11,
 7924.0: 49,
 3700.0: 19,
 2116.0: 6,
 6208.0: 21,
 1246.0: 39,
 3080.0: 11,
 4655.0: 2,
 6230.0: 2,
 6810.0: 4,
 6443.0: 26,
 5579.0: 2,
 6762.0: 1,
 626.0: 15,
 2276.0: 4,
 5014.0: 2,
 5946.0: 21,
 7087.0: 1,
 777.0: 2,
 3007.0: 2,
 3931.0: 2,
 4583.0: 9,
 192.0: 2,
 5394.0: 3,
 3107.0: 13,
 5204.0: 1,
 4631.0: 5,
 3544.0: 1,
 5192.0: 4,
 2735.0: 1,
 7627.0: 1,
 4533.0: 8,
 1905.0: 1,
 6823.0: 1,
 4273.0: 5,
 7379.0: 2,
 4385.0: 1,
 4160.0: 1,
 5270.0: 3,
 8040.0: 1,
 2856.0: 2,
 5717.0: 1}

In [48]:
total_action_count_11 = sum(action_count_11.values())
total_action_count_11

1217

In [49]:
action_count_ratio_11 = {}
for key in action_count_11:
    action_count_ratio_11[key] = action_count_11[key]/total_action_count_11
action_count_ratio_11

{1500.0: 0.0008216926869350862,
 6885.0: 0.056696795398520954,
 99.0: 0.013968775677896467,
 683.0: 0.13393590797041907,
 3462.0: 0.054231717337715694,
 5360.0: 0.00657354149548069,
 5067.0: 0.05094494658997535,
 913.0: 0.004108463434675432,
 247.0: 0.1922760887428102,
 4120.0: 0.07888249794576828,
 928.0: 0.003286770747740345,
 1654.0: 0.0016433853738701725,
 3650.0: 0.015612161051766639,
 5380.0: 0.009038619556285949,
 2603.0: 0.0838126540673788,
 2350.0: 0.009038619556285949,
 6040.0: 0.0024650780608052587,
 1097.0: 0.015612161051766639,
 7936.0: 0.013968775677896467,
 2169.0: 0.009038619556285949,
 7924.0: 0.04026294165981923,
 3700.0: 0.015612161051766639,
 2116.0: 0.0049301561216105174,
 6208.0: 0.01725554642563681,
 1246.0: 0.03204601479046836,
 3080.0: 0.009038619556285949,
 4655.0: 0.0016433853738701725,
 6230.0: 0.0016433853738701725,
 6810.0: 0.003286770747740345,
 6443.0: 0.021364009860312245,
 5579.0: 0.0016433853738701725,
 6762.0: 0.0008216926869350862,
 626.0: 0.0123253

In [50]:
df_brand['brand 1111 action count'] = 0
df_brand['brand 1111 action count ratio'] = 0
for index,row in df_brand.iterrows():
        if row['brand_id'] in action_count_11.keys():
            df_brand.at[index, 'brand 1111 action count'] = action_count_11[row['brand_id']]
        if row['brand_id'] in action_count_ratio_11.keys():
            df_brand.at[index, 'brand 1111 action count ratio'] = action_count_ratio_11[row['brand_id']]

df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,...,brand_ageGroup_5_counts,brand_ageGroup_6_counts,brand_ageGroup_7_counts,brand_ageGroup_8_counts,brand_ageGroup_0_counts,brand action count,brand action count ratio,brand day count,brand 1111 action count,brand 1111 action count ratio
0,1500.0,11,11,3,2291,0.001309,0.000000,1,11,2,...,3,0,4,0,0,14,0.002139,11,1,0.000822
1,1500.0,11,7,3,2291,0.001309,0.000000,1,11,2,...,3,0,4,0,0,14,0.002139,11,1,0.000822
2,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,5,0,0,0,6,45,0.006874,36,0,0.000000
3,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,5,0,0,0,6,45,0.006874,36,0,0.000000
4,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,5,0,0,0,6,45,0.006874,36,0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6544,2603.0,11,11,225,2291,0.098210,0.528889,170,689,26,...,42,37,6,0,162,885,0.135197,149,102,0.083813
6545,247.0,11,11,342,2291,0.149280,0.324561,197,182,19,...,25,15,2,2,101,398,0.060800,36,234,0.192276
6546,2276.0,11,10,13,2291,0.005674,0.307692,2,11,0,...,2,0,0,0,5,13,0.001986,4,4,0.003287
6547,2276.0,11,1,13,2291,0.005674,0.307692,2,11,0,...,2,0,0,0,5,13,0.001986,4,4,0.003287


In [51]:
activity_ratio_1111 = {}
df_brand['brand 1111 activity ratio'] = 0
for index,row in df_brand.iterrows():
    if df_brand.at[index,'brand 1111 action count'] != 0:
        df_brand.at[index, 'brand 1111 activity ratio'] = (df_brand.at[index,'brand 1111 action count']/df_brand.at[index,'brand action count'])
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,...,brand_ageGroup_6_counts,brand_ageGroup_7_counts,brand_ageGroup_8_counts,brand_ageGroup_0_counts,brand action count,brand action count ratio,brand day count,brand 1111 action count,brand 1111 action count ratio,brand 1111 activity ratio
0,1500.0,11,11,3,2291,0.001309,0.000000,1,11,2,...,0,4,0,0,14,0.002139,11,1,0.000822,0.071429
1,1500.0,11,7,3,2291,0.001309,0.000000,1,11,2,...,0,4,0,0,14,0.002139,11,1,0.000822,0.071429
2,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,0,0,0,6,45,0.006874,36,0,0.000000,0.000000
3,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,0,0,0,6,45,0.006874,36,0,0.000000,0.000000
4,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,0,0,0,6,45,0.006874,36,0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6544,2603.0,11,11,225,2291,0.098210,0.528889,170,689,26,...,37,6,0,162,885,0.135197,149,102,0.083813,0.115254
6545,247.0,11,11,342,2291,0.149280,0.324561,197,182,19,...,15,2,2,101,398,0.060800,36,234,0.192276,0.587940
6546,2276.0,11,10,13,2291,0.005674,0.307692,2,11,0,...,0,0,0,5,13,0.001986,4,4,0.003287,0.307692
6547,2276.0,11,1,13,2291,0.005674,0.307692,2,11,0,...,0,0,0,5,13,0.001986,4,4,0.003287,0.307692


In [52]:
#FEATURE 10: LATEST ONE-WEEK
latest_week = 1104
df_latest_week = df[df['time_stamp']>=latest_week]
df_latest_week

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type,age_range,gender,prob,month,day
0,149002,282,885,3791,1500.0,1111,0,5.0,2.0,,11,11
1,149002,282,885,3791,1500.0,1107,0,5.0,2.0,,11,7
5,142526,193,656,2781,6885.0,1111,0,6.0,0.0,,11,11
11,46520,257,1238,158,99.0,1111,0,4.0,0.0,,11,11
22,14747,279,898,3323,683.0,1111,0,6.0,1.0,,11,11
...,...,...,...,...,...,...,...,...,...,...,...,...
6538,127336,259,614,637,2603.0,1111,0,0.0,0.0,,11,11
6544,73105,259,614,637,2603.0,1111,0,2.0,0.0,,11,11
6545,403421,175,1181,4760,247.0,1111,0,3.0,1.0,,11,11
6546,116738,186,267,1200,2276.0,1110,0,2.0,1.0,,11,10


In [53]:
actions_last_week = {}

# go through user_seller sub-dataframe and split entries by item_id:seller_id
for index, row in df_latest_week.iterrows():
    if row['brand_id'] not in actions_last_week:
        # there may be multiple entries, so default with an empty list
        actions_last_week[row["brand_id"]] = [] 
    actions_last_week[row["brand_id"]].append(row["action_type"])
    
#for k,v in actions_last_week.items():
#    actions_last_week[k] = list(set(v)) # make each value list unique
actions_last_week

{1500.0: [0.0, 0.0, 0.0],
 6885.0: [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  3.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  3.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  3.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  3.0,
  0.0,
  0.0,
  3.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  3.0,
  0.0],
 99.0: [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 683.0: [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,

In [54]:
action_count_last_week = {}
for key in actions_last_week.keys():
    action_count_last_week[key] = len(actions_last_week[key])
action_count_last_week

{1500.0: 3,
 6885.0: 90,
 99.0: 21,
 683.0: 289,
 69.0: 3,
 3462.0: 101,
 5360.0: 30,
 5067.0: 135,
 913.0: 5,
 247.0: 330,
 8040.0: 2,
 4120.0: 174,
 928.0: 5,
 6040.0: 61,
 5394.0: 7,
 1654.0: 2,
 3650.0: 64,
 5380.0: 18,
 2603.0: 190,
 2350.0: 42,
 4583.0: 12,
 136.0: 1,
 1097.0: 36,
 7936.0: 29,
 2169.0: 17,
 7924.0: 51,
 6443.0: 35,
 3700.0: 27,
 2116.0: 12,
 6208.0: 25,
 1246.0: 69,
 3080.0: 13,
 4655.0: 2,
 6230.0: 2,
 6810.0: 9,
 5270.0: 7,
 4073.0: 1,
 5579.0: 2,
 6762.0: 1,
 626.0: 23,
 5717.0: 4,
 5772.0: 3,
 5703.0: 2,
 1905.0: 4,
 2276.0: 12,
 5014.0: 5,
 7627.0: 5,
 4631.0: 16,
 5946.0: 24,
 7087.0: 1,
 5936.0: 1,
 777.0: 2,
 3007.0: 2,
 6416.0: 2,
 3931.0: 2,
 5192.0: 6,
 4205.0: 1,
 4273.0: 9,
 192.0: 4,
 3107.0: 17,
 2856.0: 5,
 5204.0: 1,
 3544.0: 1,
 2735.0: 1,
 5738.0: 2,
 4533.0: 9,
 6823.0: 2,
 2541.0: 2,
 3654.0: 1,
 5491.0: 4,
 7379.0: 6,
 4385.0: 1,
 4160.0: 1,
 7371.0: 1}

In [55]:
total_action_count_last_week = sum(action_count_last_week.values())
total_action_count_last_week


2105

In [56]:
action_count_ratio_last_week = {}
for key in action_count_last_week:
    action_count_ratio_last_week[key] = action_count_last_week[key]/total_action_count_last_week
action_count_ratio_last_week

{1500.0: 0.0014251781472684087,
 6885.0: 0.04275534441805225,
 99.0: 0.009976247030878859,
 683.0: 0.13729216152019003,
 69.0: 0.0014251781472684087,
 3462.0: 0.047980997624703085,
 5360.0: 0.014251781472684086,
 5067.0: 0.06413301662707839,
 913.0: 0.0023752969121140144,
 247.0: 0.15676959619952494,
 8040.0: 0.0009501187648456057,
 4120.0: 0.0826603325415677,
 928.0: 0.0023752969121140144,
 6040.0: 0.028978622327790974,
 5394.0: 0.00332541567695962,
 1654.0: 0.0009501187648456057,
 3650.0: 0.030403800475059382,
 5380.0: 0.00855106888361045,
 2603.0: 0.09026128266033254,
 2350.0: 0.019952494061757718,
 4583.0: 0.005700712589073635,
 136.0: 0.00047505938242280285,
 1097.0: 0.0171021377672209,
 7936.0: 0.013776722090261283,
 2169.0: 0.008076009501187649,
 7924.0: 0.024228028503562947,
 6443.0: 0.0166270783847981,
 3700.0: 0.012826603325415678,
 2116.0: 0.005700712589073635,
 6208.0: 0.011876484560570071,
 1246.0: 0.0327790973871734,
 3080.0: 0.006175771971496437,
 4655.0: 0.0009501187648

In [57]:
df_brand['brand last week action count'] = 0
df_brand['brand last week action count ratio'] = 0
for index,row in df_brand.iterrows():
        if row['brand_id'] in action_count_last_week.keys():
            df_brand.at[index, 'brand last week action count'] = action_count_last_week[row['brand_id']]
        if row['brand_id'] in action_count_ratio_last_week.keys():
            df_brand.at[index, 'brand last week action count ratio'] = action_count_ratio_last_week[row['brand_id']]

df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,...,brand_ageGroup_8_counts,brand_ageGroup_0_counts,brand action count,brand action count ratio,brand day count,brand 1111 action count,brand 1111 action count ratio,brand 1111 activity ratio,brand last week action count,brand last week action count ratio
0,1500.0,11,11,3,2291,0.001309,0.000000,1,11,2,...,0,0,14,0.002139,11,1,0.000822,0.071429,3,0.001425
1,1500.0,11,7,3,2291,0.001309,0.000000,1,11,2,...,0,0,14,0.002139,11,1,0.000822,0.071429,3,0.001425
2,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,0,6,45,0.006874,36,0,0.000000,0.000000,0,0.000000
3,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,0,6,45,0.006874,36,0,0.000000,0.000000,0,0.000000
4,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,0,6,45,0.006874,36,0,0.000000,0.000000,0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6544,2603.0,11,11,225,2291,0.098210,0.528889,170,689,26,...,0,162,885,0.135197,149,102,0.083813,0.115254,190,0.090261
6545,247.0,11,11,342,2291,0.149280,0.324561,197,182,19,...,2,101,398,0.060800,36,234,0.192276,0.587940,330,0.156770
6546,2276.0,11,10,13,2291,0.005674,0.307692,2,11,0,...,0,5,13,0.001986,4,4,0.003287,0.307692,12,0.005701
6547,2276.0,11,1,13,2291,0.005674,0.307692,2,11,0,...,0,5,13,0.001986,4,4,0.003287,0.307692,12,0.005701


In [58]:
activity_ratio_last_week = {}
df_brand['brand last week activity ratio'] = 0
for index,row in df_brand.iterrows():
    if df_brand.at[index,'brand last week action count'] != 0:
        df_brand.at[index, 'brand last week activity ratio'] = (df_brand.at[index,'brand last week action count']/df_brand.at[index,'brand action count'])
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,...,brand_ageGroup_0_counts,brand action count,brand action count ratio,brand day count,brand 1111 action count,brand 1111 action count ratio,brand 1111 activity ratio,brand last week action count,brand last week action count ratio,brand last week activity ratio
0,1500.0,11,11,3,2291,0.001309,0.000000,1,11,2,...,0,14,0.002139,11,1,0.000822,0.071429,3,0.001425,0.214286
1,1500.0,11,7,3,2291,0.001309,0.000000,1,11,2,...,0,14,0.002139,11,1,0.000822,0.071429,3,0.001425,0.214286
2,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,6,45,0.006874,36,0,0.000000,0.000000,0,0.000000,0.000000
3,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,6,45,0.006874,36,0,0.000000,0.000000,0,0.000000,0.000000
4,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,6,45,0.006874,36,0,0.000000,0.000000,0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6544,2603.0,11,11,225,2291,0.098210,0.528889,170,689,26,...,162,885,0.135197,149,102,0.083813,0.115254,190,0.090261,0.214689
6545,247.0,11,11,342,2291,0.149280,0.324561,197,182,19,...,101,398,0.060800,36,234,0.192276,0.587940,330,0.156770,0.829146
6546,2276.0,11,10,13,2291,0.005674,0.307692,2,11,0,...,5,13,0.001986,4,4,0.003287,0.307692,12,0.005701,0.923077
6547,2276.0,11,1,13,2291,0.005674,0.307692,2,11,0,...,5,13,0.001986,4,4,0.003287,0.307692,12,0.005701,0.923077


In [59]:
#FEATURE 11: LATEST MONTH
latest_month = 1011
df_latest_month = df[df['time_stamp']>=latest_month]
df_latest_month

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type,age_range,gender,prob,month,day
0,149002,282,885,3791,1500.0,1111,0,5.0,2.0,,11,11
1,149002,282,885,3791,1500.0,1107,0,5.0,2.0,,11,7
5,142526,193,656,2781,6885.0,1111,0,6.0,0.0,,11,11
10,359170,217,1326,3096,3080.0,1021,3,3.0,0.0,,10,21
11,46520,257,1238,158,99.0,1111,0,4.0,0.0,,11,11
...,...,...,...,...,...,...,...,...,...,...,...,...
6544,73105,259,614,637,2603.0,1111,0,2.0,0.0,,11,11
6545,403421,175,1181,4760,247.0,1111,0,3.0,1.0,,11,11
6546,116738,186,267,1200,2276.0,1110,0,2.0,1.0,,11,10
6547,116738,186,267,1200,2276.0,1101,0,2.0,1.0,,11,1


In [60]:
actions_last_month = {}

# go through user_seller sub-dataframe and split entries by user_id:seller_id
for index, row in df_latest_month.iterrows():
    if row['brand_id'] not in actions_last_month:
        # there may be multiple entries, so default with an empty list
        actions_last_month[row["brand_id"]] = [] 
    actions_last_month[row["brand_id"]].append(row["action_type"])
    
#for k,v in actions_last_month.items():
#    actions_last_month[k] = list(set(v)) # make each value list unique
actions_last_month

{1500.0: [0.0, 0.0, 0.0, 0.0],
 6885.0: [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  3.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  3.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  3.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  3.0,
  0.0,
  0.0,
  3.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  3.0,
  0.0],
 3080.0: [3.0,
  0.0,
  0.0,
  0.0,
  0.0,
  3.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  3.0,
  0.0,
  0.0,
  3.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  3.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 99.0: [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,


In [61]:
action_count_last_month = {}
for key in actions_last_month.keys():
    action_count_last_month[key] = len(actions_last_month[key])
action_count_last_month

{1500.0: 4,
 6885.0: 93,
 3080.0: 27,
 99.0: 23,
 5270.0: 22,
 3650.0: 133,
 683.0: 297,
 6742.0: 9,
 5717.0: 26,
 69.0: 4,
 3462.0: 111,
 4120.0: 227,
 5360.0: 30,
 5067.0: 165,
 913.0: 6,
 247.0: 390,
 8040.0: 2,
 928.0: 9,
 2541.0: 35,
 6040.0: 61,
 5394.0: 12,
 4533.0: 11,
 1654.0: 2,
 5380.0: 20,
 2603.0: 376,
 2350.0: 42,
 4583.0: 18,
 136.0: 1,
 7936.0: 30,
 5204.0: 3,
 1097.0: 43,
 5491.0: 10,
 2169.0: 23,
 7924.0: 51,
 5506.0: 1,
 6443.0: 36,
 3700.0: 28,
 2116.0: 12,
 1128.0: 1,
 6936.0: 4,
 6208.0: 25,
 1246.0: 72,
 4655.0: 2,
 6230.0: 4,
 4073.0: 20,
 6810.0: 15,
 5936.0: 2,
 1662.0: 1,
 5579.0: 2,
 6762.0: 1,
 5703.0: 9,
 626.0: 23,
 2018.0: 3,
 4631.0: 21,
 5772.0: 3,
 5738.0: 4,
 5890.0: 1,
 1905.0: 4,
 2276.0: 13,
 5014.0: 5,
 7627.0: 6,
 7371.0: 5,
 5946.0: 25,
 7087.0: 1,
 777.0: 2,
 2735.0: 4,
 3007.0: 2,
 192.0: 5,
 3931.0: 5,
 6416.0: 3,
 5192.0: 8,
 4205.0: 1,
 4273.0: 9,
 3181.0: 5,
 3107.0: 17,
 6590.0: 1,
 6046.0: 1,
 2856.0: 5,
 3544.0: 1,
 7379.0: 8,
 2772.0:

In [62]:
total_action_count_last_month = sum(action_count_last_month.values())
total_action_count_last_month

2752

In [63]:
action_count_ratio_last_month = {}
for key in action_count_last_month:
    action_count_ratio_last_month[key] = action_count_last_month[key]/total_action_count_last_month
action_count_ratio_last_month

{1500.0: 0.0014534883720930232,
 6885.0: 0.03379360465116279,
 3080.0: 0.009811046511627907,
 99.0: 0.008357558139534883,
 5270.0: 0.007994186046511628,
 3650.0: 0.048328488372093026,
 683.0: 0.10792151162790697,
 6742.0: 0.0032703488372093025,
 5717.0: 0.00944767441860465,
 69.0: 0.0014534883720930232,
 3462.0: 0.0403343023255814,
 4120.0: 0.08248546511627906,
 5360.0: 0.010901162790697675,
 5067.0: 0.05995639534883721,
 913.0: 0.002180232558139535,
 247.0: 0.14171511627906977,
 8040.0: 0.0007267441860465116,
 928.0: 0.0032703488372093025,
 2541.0: 0.012718023255813954,
 6040.0: 0.022165697674418606,
 5394.0: 0.00436046511627907,
 4533.0: 0.003997093023255814,
 1654.0: 0.0007267441860465116,
 5380.0: 0.007267441860465116,
 2603.0: 0.13662790697674418,
 2350.0: 0.015261627906976744,
 4583.0: 0.006540697674418605,
 136.0: 0.0003633720930232558,
 7936.0: 0.010901162790697675,
 5204.0: 0.0010901162790697674,
 1097.0: 0.015625,
 5491.0: 0.003633720930232558,
 2169.0: 0.008357558139534883,


In [64]:
df_brand['last month action count'] = 0
df_brand['last month action count ratio'] = 0
for index,row in df_brand.iterrows():
        if row['brand_id'] in action_count_last_month.keys():
            df_brand.at[index, 'last month action count'] = action_count_last_month[row['brand_id']]
        if row['brand_id'] in action_count_ratio_last_month.keys():
            df_brand.at[index, 'last month action count ratio'] = action_count_ratio_last_month[row['brand_id']]

df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,...,brand action count ratio,brand day count,brand 1111 action count,brand 1111 action count ratio,brand 1111 activity ratio,brand last week action count,brand last week action count ratio,brand last week activity ratio,last month action count,last month action count ratio
0,1500.0,11,11,3,2291,0.001309,0.000000,1,11,2,...,0.002139,11,1,0.000822,0.071429,3,0.001425,0.214286,4,0.001453
1,1500.0,11,7,3,2291,0.001309,0.000000,1,11,2,...,0.002139,11,1,0.000822,0.071429,3,0.001425,0.214286,4,0.001453
2,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,0.006874,36,0,0.000000,0.000000,0,0.000000,0.000000,1,0.000363
3,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,0.006874,36,0,0.000000,0.000000,0,0.000000,0.000000,1,0.000363
4,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,0.006874,36,0,0.000000,0.000000,0,0.000000,0.000000,1,0.000363
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6544,2603.0,11,11,225,2291,0.098210,0.528889,170,689,26,...,0.135197,149,102,0.083813,0.115254,190,0.090261,0.214689,376,0.136628
6545,247.0,11,11,342,2291,0.149280,0.324561,197,182,19,...,0.060800,36,234,0.192276,0.587940,330,0.156770,0.829146,390,0.141715
6546,2276.0,11,10,13,2291,0.005674,0.307692,2,11,0,...,0.001986,4,4,0.003287,0.307692,12,0.005701,0.923077,13,0.004724
6547,2276.0,11,1,13,2291,0.005674,0.307692,2,11,0,...,0.001986,4,4,0.003287,0.307692,12,0.005701,0.923077,13,0.004724


In [65]:
activity_ratio_last_month = {}
df_brand['last month activity ratio'] = 0
for index,row in df_brand.iterrows():
    if df_brand.at[index,'last month action count'] != 0:
        df_brand.at[index, 'last month activity ratio'] = (df_brand.at[index,'last month action count']/df_brand.at[index,'brand action count'])
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,...,brand day count,brand 1111 action count,brand 1111 action count ratio,brand 1111 activity ratio,brand last week action count,brand last week action count ratio,brand last week activity ratio,last month action count,last month action count ratio,last month activity ratio
0,1500.0,11,11,3,2291,0.001309,0.000000,1,11,2,...,11,1,0.000822,0.071429,3,0.001425,0.214286,4,0.001453,0.285714
1,1500.0,11,7,3,2291,0.001309,0.000000,1,11,2,...,11,1,0.000822,0.071429,3,0.001425,0.214286,4,0.001453,0.285714
2,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,36,0,0.000000,0.000000,0,0.000000,0.000000,1,0.000363,0.022222
3,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,36,0,0.000000,0.000000,0,0.000000,0.000000,1,0.000363,0.022222
4,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,36,0,0.000000,0.000000,0,0.000000,0.000000,1,0.000363,0.022222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6544,2603.0,11,11,225,2291,0.098210,0.528889,170,689,26,...,149,102,0.083813,0.115254,190,0.090261,0.214689,376,0.136628,0.424859
6545,247.0,11,11,342,2291,0.149280,0.324561,197,182,19,...,36,234,0.192276,0.587940,330,0.156770,0.829146,390,0.141715,0.979899
6546,2276.0,11,10,13,2291,0.005674,0.307692,2,11,0,...,4,4,0.003287,0.307692,12,0.005701,0.923077,13,0.004724,1.000000
6547,2276.0,11,1,13,2291,0.005674,0.307692,2,11,0,...,4,4,0.003287,0.307692,12,0.005701,0.923077,13,0.004724,1.000000


In [66]:
print("TOTAL ACTIONS: ", total_action_count)
print("TOTAL ACTIONS LATEST MONTH: ",total_action_count_last_month)
print("TOTAL ACTIONS LATEST WEEK: ",total_action_count_last_week)
print("TOTAL ACTIONS 1111: ",total_action_count_11)

TOTAL ACTIONS:  6546
TOTAL ACTIONS LATEST MONTH:  2752
TOTAL ACTIONS LATEST WEEK:  2105
TOTAL ACTIONS 1111:  1217


In [67]:
#FEATURE 18: AGE RELATED FEATURES
brand_age = {}

# go through user_seller sub-dataframe and split entries by user_id:seller_id
for index, row in df.iterrows():
    if row['brand_id'] not in brand_age:
        # there may be multiple entries, so default with an empty list
        brand_age[row["brand_id"]] = [] 
    brand_age[row["brand_id"]].append(row["age_range"])

brand_age_avg = {}
for row in brand_age.items():
    brand_age_avg[row[0]] = np.nansum(row[1])/len(row[1])
sorted(brand_age_avg.items())

[(68.0, 3.5555555555555554),
 (69.0, 3.92),
 (99.0, 3.5652173913043477),
 (136.0, 3.5),
 (178.0, 3.0),
 (184.0, 3.0),
 (192.0, 4.818181818181818),
 (247.0, 2.479899497487437),
 (516.0, 3.1538461538461537),
 (626.0, 2.5217391304347827),
 (683.0, 3.0484261501210654),
 (777.0, 4.428571428571429),
 (913.0, 3.4285714285714284),
 (928.0, 3.217391304347826),
 (944.0, 3.5833333333333335),
 (979.0, 2.0),
 (1097.0, 2.6533333333333333),
 (1128.0, 2.2),
 (1214.0, 2.0),
 (1246.0, 2.138888888888889),
 (1446.0, 3.782258064516129),
 (1457.0, 0.0),
 (1500.0, 4.5),
 (1532.0, 4.0),
 (1567.0, 2.0),
 (1654.0, 3.5),
 (1662.0, 3.4),
 (1768.0, 5.25),
 (1772.0, 4.5),
 (1859.0, 1.7142857142857142),
 (1905.0, 3.3043478260869565),
 (2009.0, 3.0),
 (2018.0, 2.574626865671642),
 (2116.0, 4.083333333333333),
 (2169.0, 5.462962962962963),
 (2276.0, 2.1538461538461537),
 (2350.0, 3.4047619047619047),
 (2357.0, 0.9),
 (2368.0, 4.538461538461538),
 (2370.0, 6.0),
 (2416.0, 4.0),
 (2541.0, 3.775),
 (2603.0, 2.47909604519

In [68]:
total_brand_average_age = sum(brand_age_avg.values())/len(brand_age_avg.keys())
print("Average age total: ",total_brand_average_age)

Average age total:  3.0809195407033743


In [69]:
df_brand['average age'] = 0
for index,row in df_brand.iterrows():
        if row['brand_id'] in brand_age_avg.keys():
            df_brand.at[index, 'average age'] = brand_age_avg[row['brand_id']]
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,...,brand 1111 action count,brand 1111 action count ratio,brand 1111 activity ratio,brand last week action count,brand last week action count ratio,brand last week activity ratio,last month action count,last month action count ratio,last month activity ratio,average age
0,1500.0,11,11,3,2291,0.001309,0.000000,1,11,2,...,1,0.000822,0.071429,3,0.001425,0.214286,4,0.001453,0.285714,4.500000
1,1500.0,11,7,3,2291,0.001309,0.000000,1,11,2,...,1,0.000822,0.071429,3,0.001425,0.214286,4,0.001453,0.285714,4.500000
2,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,0,0.000000,0.000000,0,0.000000,0.000000,1,0.000363,0.022222,3.400000
3,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,0,0.000000,0.000000,0,0.000000,0.000000,1,0.000363,0.022222,3.400000
4,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,0,0.000000,0.000000,0,0.000000,0.000000,1,0.000363,0.022222,3.400000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6544,2603.0,11,11,225,2291,0.098210,0.528889,170,689,26,...,102,0.083813,0.115254,190,0.090261,0.214689,376,0.136628,0.424859,2.479096
6545,247.0,11,11,342,2291,0.149280,0.324561,197,182,19,...,234,0.192276,0.587940,330,0.156770,0.829146,390,0.141715,0.979899,2.479899
6546,2276.0,11,10,13,2291,0.005674,0.307692,2,11,0,...,4,0.003287,0.307692,12,0.005701,0.923077,13,0.004724,1.000000,2.153846
6547,2276.0,11,1,13,2291,0.005674,0.307692,2,11,0,...,4,0.003287,0.307692,12,0.005701,0.923077,13,0.004724,1.000000,2.153846


In [70]:
#FEATURE 19: GENDER FEATURES
brand_gender = {}

# go through user_seller sub-dataframe and split entries by user_id:seller_id
for index, row in df.iterrows():
    if row['brand_id'] not in brand_gender:
        # there may be multiple entries, so default with an empty list
        brand_gender[row["brand_id"]] = [] 
    brand_gender[row["brand_id"]].append(row["gender"])

brand_gender_avg = {}
for row in brand_gender.items():
    brand_gender_avg[row[0]] = np.nansum(row[1])/len(row[1])
sorted(brand_gender_avg.items())

[(68.0, 0.4444444444444444),
 (69.0, 0.12),
 (99.0, 0.2608695652173913),
 (136.0, 1.0),
 (178.0, 0.0),
 (184.0, 0.0),
 (192.0, 0.8181818181818182),
 (247.0, 0.5904522613065326),
 (516.0, 0.0),
 (626.0, 0.0),
 (683.0, 0.6198547215496368),
 (777.0, 0.42857142857142855),
 (913.0, 0.07142857142857142),
 (928.0, 0.30434782608695654),
 (944.0, 0.6666666666666666),
 (979.0, 0.0),
 (1097.0, 0.26666666666666666),
 (1128.0, 0.0),
 (1214.0, 0.0),
 (1246.0, 0.625),
 (1446.0, 0.14516129032258066),
 (1457.0, 0.0),
 (1500.0, 0.35714285714285715),
 (1532.0, 0.0),
 (1567.0, 0.3333333333333333),
 (1654.0, 0.0),
 (1662.0, 0.26666666666666666),
 (1768.0, 1.0),
 (1772.0, 0.0),
 (1859.0, 0.0),
 (1905.0, 0.2608695652173913),
 (2009.0, 0.0),
 (2018.0, 0.14925373134328357),
 (2116.0, 0.3333333333333333),
 (2169.0, 0.6666666666666666),
 (2276.0, 0.15384615384615385),
 (2350.0, 0.09523809523809523),
 (2357.0, 0.1),
 (2368.0, 0.0),
 (2370.0, 0.0),
 (2416.0, 1.0),
 (2541.0, 0.25),
 (2603.0, 0.25084745762711863),
 

In [71]:
total_brand_average_gender = sum(brand_gender_avg.values())/len(brand_gender_avg.keys())
print("Average gender total: ",total_brand_average_gender)

Average gender total:  0.34177214149512847


In [72]:
df_brand['average gender'] = 0
for index,row in df_brand.iterrows():
        if row['brand_id'] in brand_gender_avg.keys():
            df_brand.at[index, 'average gender'] = brand_gender_avg[row['brand_id']]
df_brand

Unnamed: 0,brand_id,month,day,brand_monthly_action_count,month_total_action_count,monthly_action_count_ratio,monthly_mean_action_type,brand_male_count,brand_female_count,brand_unknown_gender_count,...,brand 1111 action count ratio,brand 1111 activity ratio,brand last week action count,brand last week action count ratio,brand last week activity ratio,last month action count,last month action count ratio,last month activity ratio,average age,average gender
0,1500.0,11,11,3,2291,0.001309,0.000000,1,11,2,...,0.000822,0.071429,3,0.001425,0.214286,4,0.001453,0.285714,4.500000,0.357143
1,1500.0,11,7,3,2291,0.001309,0.000000,1,11,2,...,0.000822,0.071429,3,0.001425,0.214286,4,0.001453,0.285714,4.500000,0.357143
2,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,0.000000,0.000000,0,0.000000,0.000000,1,0.000363,0.022222,3.400000,0.266667
3,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,0.000000,0.000000,0,0.000000,0.000000,1,0.000363,0.022222,3.400000,0.266667
4,1662.0,8,20,10,704,0.014205,0.300000,6,36,3,...,0.000000,0.000000,0,0.000000,0.000000,1,0.000363,0.022222,3.400000,0.266667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6544,2603.0,11,11,225,2291,0.098210,0.528889,170,689,26,...,0.083813,0.115254,190,0.090261,0.214689,376,0.136628,0.424859,2.479096,0.250847
6545,247.0,11,11,342,2291,0.149280,0.324561,197,182,19,...,0.192276,0.587940,330,0.156770,0.829146,390,0.141715,0.979899,2.479899,0.590452
6546,2276.0,11,10,13,2291,0.005674,0.307692,2,11,0,...,0.003287,0.307692,12,0.005701,0.923077,13,0.004724,1.000000,2.153846,0.153846
6547,2276.0,11,1,13,2291,0.005674,0.307692,2,11,0,...,0.003287,0.307692,12,0.005701,0.923077,13,0.004724,1.000000,2.153846,0.153846


In [73]:

df_brand.to_csv('test_brand_profile.csv',index=False)