# Data Preparation

This file prepares the data using our findings from data exploration together with domain knowledge about our industry, brands, and users, and about what we know contributes to successful matches.

In [8]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from scipy import stats

In [11]:
# Locations of the sanitised input CSVs (brands, users, interactions)
brand_data_file, user_data_file, interaction_data_file = (
    'data/brand_data_sanitised.csv',
    'data/user_data_sanitised.csv',
    'data/interaction_data_sanitised.csv',
)

# Load each CSV into its own dataframe, in the same order as the paths above
brand_dataframe, user_dataframe, interaction_dataframe = map(
    pd.read_csv,
    (brand_data_file, user_data_file, interaction_data_file),
)

In [13]:
# Treat any row with more than 100 total interactions as an outlier and drop it
within_interaction_limit = interaction_dataframe['TOTAL_INTERACTIONS'] <= 100
interaction_dataframe = interaction_dataframe.loc[within_interaction_limit]

In [14]:
# Attach brand features, then user features, to every interaction row.
# Both joins are inner joins, so interactions without a matching brand or user are dropped.
# Columns shared by the brand-enriched frame and the user frame get the
# '_BRAND' / '_USER' suffixes respectively.
interaction_preperation_dataframe = (
    interaction_dataframe
    .merge(brand_dataframe, how='inner', on='BRAND_ID')
    .merge(user_dataframe, how='inner', on='USER_ID', suffixes=('_BRAND', '_USER'))
)


In [15]:
# The interaction counts are heavily skewed, so apply a Box-Cox transformation
# to try to improve their distribution (the fitted lambda is kept for reference)
total_interactions_boxcox, fitted_lambda = stats.boxcox(
    interaction_preperation_dataframe['TOTAL_INTERACTIONS']
)
interaction_preperation_dataframe['TOTAL_INTERACTIONS_BOXCOX'] = total_interactions_boxcox

# Rescale the Box-Cox values with a default MinMaxScaler
interaction_preperation_dataframe['TOTAL_INTERACTIONS_BOXCOX_NORMALISED'] = MinMaxScaler().fit_transform(
    interaction_preperation_dataframe[['TOTAL_INTERACTIONS_BOXCOX']]
)

# We do not want 0 scores in our training, so find the smallest strictly
# positive normalised value ...
normalised_scores = interaction_preperation_dataframe['TOTAL_INTERACTIONS_BOXCOX_NORMALISED']
min_non_zero_value = normalised_scores[normalised_scores > 0].min()

# ... and use it in place of every exact zero
interaction_preperation_dataframe.loc[
    normalised_scores == 0, 'TOTAL_INTERACTIONS_BOXCOX_NORMALISED'
] = min_non_zero_value


In [16]:
# Accumulate an additive TOTAL_WEIGHT per interaction from several domain signals.
# We're weighting the scores after normalisation (rather than before) because of the
# large distribution of values.
# Start from 0.0 (float, not int) so that adding fractional weights to a subset of
# rows never has to upcast the column's dtype.
interaction_preperation_dataframe['TOTAL_WEIGHT'] = 0.0

# Seed the weighted score with the plain normalised score; TOTAL_WEIGHT is applied
# to this column in a later step.
interaction_preperation_dataframe['TOTAL_INTERACTIONS_BOXCOX_NORMALISED_WEIGHTED'] = interaction_preperation_dataframe['TOTAL_INTERACTIONS_BOXCOX_NORMALISED']

# Add weight for affiliate use (both the IS_AN_AFFILIATE and USES_AFFILIATES flags are set)
affiliate_weight = .5
affiliate_condition = (
    (interaction_preperation_dataframe['IS_AN_AFFILIATE'] == 1)
    & (interaction_preperation_dataframe['USES_AFFILIATES'] == 1)
)
interaction_preperation_dataframe.loc[affiliate_condition, 'TOTAL_WEIGHT'] += affiliate_weight

# Add weight for matching territories
territory_weight = .1
territory_condition = (
    interaction_preperation_dataframe['TERRITORY_ID_BRAND']
    == interaction_preperation_dataframe['TERRITORY_ID_USER']
)
interaction_preperation_dataframe.loc[territory_condition, 'TOTAL_WEIGHT'] += territory_weight

# Add weight if both the brand side and the user side have past collaborations
collaborations_weight = .3
collaborations_condition = (
    (interaction_preperation_dataframe['TOTAL_COLLABORATIONS_BRAND'] > 0)
    & (interaction_preperation_dataframe['TOTAL_COLLABORATIONS_USER'] > 0)
)
interaction_preperation_dataframe.loc[collaborations_condition, 'TOTAL_WEIGHT'] += collaborations_weight

# Add weight based on the domain authority of the client, rescaled into [0, 0.1]
scaler = MinMaxScaler(feature_range=(0, 0.1))
interaction_preperation_dataframe['DOMAIN_WEIGHT'] = scaler.fit_transform(interaction_preperation_dataframe[['DOMAIN_AUTHORITY']])
interaction_preperation_dataframe['TOTAL_WEIGHT'] += interaction_preperation_dataframe['DOMAIN_WEIGHT']

In [17]:
# Boost each normalised score by its accumulated weight: score * (1 + TOTAL_WEIGHT)
interaction_preperation_dataframe['TOTAL_INTERACTIONS_BOXCOX_NORMALISED_WEIGHTED'] = (
    interaction_preperation_dataframe['TOTAL_INTERACTIONS_BOXCOX_NORMALISED_WEIGHTED']
    * (1 + interaction_preperation_dataframe['TOTAL_WEIGHT'])
)

In [18]:
# Keep only the identifiers and the weighted, normalised interaction value,
# exposing the latter as 'SCORE' for clarity
interaction_final_dataframe = (
    interaction_preperation_dataframe[
        ['USER_ID', 'BRAND_ID', 'TOTAL_INTERACTIONS_BOXCOX_NORMALISED_WEIGHTED']
    ]
    .rename(columns={'TOTAL_INTERACTIONS_BOXCOX_NORMALISED_WEIGHTED': 'SCORE'})
)

# Destination of the prepared data
interaction_data_file = 'data/interaction_data_prepared.csv'

# Write the prepared interactions to CSV without the index column
interaction_final_dataframe.to_csv(interaction_data_file, index=False)