# Data Preparation

In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from scipy import stats

In [2]:
# Locations of the raw input CSV files
brand_data_file = 'Data/brand_data.csv'
user_data_file = 'Data/user_data.csv'
interaction_data_file = 'Data/interaction_data.csv'

# Read each CSV into its own DataFrame (brand, user, interaction — in that order)
brand_dataframe, user_dataframe, interaction_dataframe = (
    pd.read_csv(csv_path)
    for csv_path in (brand_data_file, user_data_file, interaction_data_file)
)

In [3]:
# Attach brand features, then user features, to every interaction row.
# Both joins are left joins, so every interaction row is kept. Column names
# that collide in the second merge receive the _BRAND / _USER suffixes.
interaction_preperation_dataframe = (
    interaction_dataframe
    .merge(brand_dataframe, how='left', on='BRAND_ID')
    .merge(user_dataframe, how='left', on='USER_ID', suffixes=('_BRAND', '_USER'))
)

In [4]:
# Box-Cox transform the raw interaction counts; scipy also returns the fitted lambda
raw_interactions = interaction_preperation_dataframe['TOTAL_INTERACTIONS']
total_interactions_boxcox, fitted_lambda = stats.boxcox(raw_interactions)
interaction_preperation_dataframe['TOTAL_INTERACTIONS_BOXCOX'] = total_interactions_boxcox

# Min-max scale the Box-Cox values into a new column
normalized_column = 'TOTAL_INTERACTIONS_BOXCOX_NORMALIZED'
scaler = MinMaxScaler()
interaction_preperation_dataframe[normalized_column] = scaler.fit_transform(
    interaction_preperation_dataframe[['TOTAL_INTERACTIONS_BOXCOX']]
)

# Min-max scaling maps the smallest value to exactly 0; lift those rows to the
# smallest strictly positive value so that no normalised score is zero
normalized_values = interaction_preperation_dataframe[normalized_column]
min_non_zero_value = normalized_values[normalized_values > 0].min()
interaction_preperation_dataframe.loc[normalized_values == 0, normalized_column] = min_non_zero_value


In [5]:
# We're weighting after normalisation because of the skewness of the data.
# Every condition that holds for a row adds its weight to TOTAL_WEIGHT.
# Start from a float column so fractional weights can be added without an
# int -> float upcast on assignment.
interaction_preperation_dataframe['TOTAL_WEIGHT'] = 0.0

# Working copy of the normalised score; the accumulated weight is applied to this column
interaction_preperation_dataframe['TOTAL_INTERACTIONS_BOXCOX_NORMALIZED_WEIGHTED'] = interaction_preperation_dataframe['TOTAL_INTERACTIONS_BOXCOX_NORMALIZED']

# Define the weight for affiliate use
affiliate_weight = .5

# Increase the total weight where both IS_AN_AFFILIATE and USES_AFFILIATES are 1
affiliate_condition = (interaction_preperation_dataframe['IS_AN_AFFILIATE'] == 1) & (interaction_preperation_dataframe['USES_AFFILIATES'] == 1)
interaction_preperation_dataframe.loc[affiliate_condition, 'TOTAL_WEIGHT'] += affiliate_weight

# Define the weight for matching territories
territory_weight = .1

# Increase the total weight where the user and brand territory match
territory_condition = (interaction_preperation_dataframe['TERRITORY_ID_BRAND'] == interaction_preperation_dataframe['TERRITORY_ID_USER'])
interaction_preperation_dataframe.loc[territory_condition, 'TOTAL_WEIGHT'] += territory_weight

# Define the weight if brand and user have past collaborations
collaborations_weight = .3

# Increase the total weight where both the brand and the user have at least one collaboration
# (previously this added affiliate_weight, leaving collaborations_weight unused)
collaborations_condition = (interaction_preperation_dataframe['TOTAL_COLLABORATIONS_BRAND'] > 0) & (interaction_preperation_dataframe['TOTAL_COLLABORATIONS_USER'] > 0)
interaction_preperation_dataframe.loc[collaborations_condition, 'TOTAL_WEIGHT'] += collaborations_weight

In [6]:
# Scale the normalised score by (1 + TOTAL_WEIGHT). The result is computed from
# the unweighted column instead of multiplying in place, so re-running this
# cell does not compound the weighting.
interaction_preperation_dataframe['TOTAL_INTERACTIONS_BOXCOX_NORMALIZED_WEIGHTED'] = interaction_preperation_dataframe['TOTAL_INTERACTIONS_BOXCOX_NORMALIZED'] * (1 + interaction_preperation_dataframe['TOTAL_WEIGHT'])

In [7]:
# Keep only the identifier columns and the weighted score, exposed as SCORE
score_column = 'TOTAL_INTERACTIONS_BOXCOX_NORMALIZED_WEIGHTED'
interaction_final_dataframe = (
    interaction_preperation_dataframe[['USER_ID', 'BRAND_ID', score_column]]
    .rename(columns={score_column: 'SCORE'})
)

# Path of the prepared output file (this rebinds the name used for the raw input path)
interaction_data_file = 'Data/interaction_data_prepared.csv'

# Write the prepared interactions to CSV without the index column
interaction_final_dataframe.to_csv(interaction_data_file, index=False)