# Data Sanitisation

In [1]:
import pandas as pd

In [2]:
# Locations of the raw CSV exports (one file per entity)
brand_data_file = 'Data/brand_data.csv'
user_data_file = 'Data/user_data.csv'
interaction_data_file = 'Data/interaction_data.csv'

# Read each CSV into its own dataframe, in the order brands, users, interactions
brand_dataframe, user_dataframe, interaction_dataframe = (
    pd.read_csv(csv_path)
    for csv_path in (brand_data_file, user_data_file, interaction_data_file)
)

In [3]:
# Check for NaN values in the brands dataframe (before sanitisation)
nan_values = brand_dataframe.isnull().sum()
print("Number of NaN values in brand data (before sanitisation):")
print()
print(nan_values)
print()

# Replace DOMAIN_AUTHORITY NaN values with 1.0 (the data default).
# Only this column is filled: a frame-wide fillna(1.0) would also write 1.0
# into any other column that happened to contain NaN (IDs, counts, flags),
# hiding missing data instead of surfacing it.
brand_dataframe['DOMAIN_AUTHORITY'] = brand_dataframe['DOMAIN_AUTHORITY'].fillna(1.0)

# Replace every default (1.0) value with the 25th percentile of the
# non-default values in this column
is_default = brand_dataframe['DOMAIN_AUTHORITY'] == 1.0
non_default_values = brand_dataframe.loc[~is_default, 'DOMAIN_AUTHORITY']
percentile_25 = non_default_values.quantile(0.25)

# quantile() of an empty Series is NaN, so when every value is the default
# there is nothing to derive a percentile from; keep the 1.0 defaults rather
# than overwriting the whole column with NaN.
if not non_default_values.empty:
    brand_dataframe.loc[is_default, 'DOMAIN_AUTHORITY'] = percentile_25

# Check for NaN values in the brands dataframe (after sanitisation)
nan_values = brand_dataframe.isnull().sum()
print("Number of NaN values in brand data (after sanitisation):")
print()
print(nan_values)


Number of NaN values in brand data (before sanitisation):

BRAND_ID                0
TERRITORY_ID            0
PRIMARY_CATEGORY        0
DOMAIN_AUTHORITY        3
TOTAL_PRODUCTS          0
TOTAL_PRESS_RELEASES    0
TOTAL_COLLABORATIONS    0
HAS_FACEBOOK            0
HAS_TWITTER             0
HAS_PINTEREST           0
HAS_INSTAGRAM           0
IS_AN_AFFILIATE         0
dtype: int64

Number of NaN values in brand data (after sanitisation):

BRAND_ID                0
TERRITORY_ID            0
PRIMARY_CATEGORY        0
DOMAIN_AUTHORITY        0
TOTAL_PRODUCTS          0
TOTAL_PRESS_RELEASES    0
TOTAL_COLLABORATIONS    0
HAS_FACEBOOK            0
HAS_TWITTER             0
HAS_PINTEREST           0
HAS_INSTAGRAM           0
IS_AN_AFFILIATE         0
dtype: int64


In [4]:
# Count, per column, how many brand rows hold a 0 in the identifier/category
# fields (before sanitisation)
brand_key_columns = brand_dataframe[['BRAND_ID', 'TERRITORY_ID', 'PRIMARY_CATEGORY']]
zero_values = brand_key_columns.eq(0).sum()
print("Number of 0 values in relevant brand data fields (before sanitisation):", end="\n\n")
print(zero_values)

Number of 0 values in relevant brand data fields (before sanitisation):

BRAND_ID            0
TERRITORY_ID        0
PRIMARY_CATEGORY    0
dtype: int64


In [5]:
# Per-column count of missing (NaN) entries in the users dataframe
# (before sanitisation)
nan_values = user_dataframe.isna().sum()
print("Number of NaN values in user data (before sanitisation):", end="\n\n")
print(nan_values)

Number of NaN values in user data (before sanitisation):

USER_ID                 0
TERRITORY_ID            0
TYPE_IN_HOUSE           0
TYPE_FREELANCE          0
TOTAL_DOWNLOADS         0
TOTAL_COLLABORATIONS    0
HAS_FACEBOOK            0
HAS_TWITTER             0
HAS_PINTEREST           0
HAS_INSTAGRAM           0
USES_AFFILIATES         0
dtype: int64


In [6]:
# Count, per column, how many user rows hold a 0 in the identifier fields
# (before sanitisation)
user_key_columns = user_dataframe[['USER_ID', 'TERRITORY_ID']]
zero_values = user_key_columns.eq(0).sum()
print("Number of 0 values in relevant user data fields (before sanitisation):", end="\n\n")
print(zero_values)

Number of 0 values in relevant user data fields (before sanitisation):

USER_ID         0
TERRITORY_ID    0
dtype: int64


In [7]:
# Per-column count of missing (NaN) entries in the interactions dataframe
# (before sanitisation)
nan_values = interaction_dataframe.isna().sum()
print("Number of NaN values in interaction data (before sanitisation):", end="\n\n")
print(nan_values)

Number of NaN values in interaction data (before sanitisation):

BRAND_ID              0
USER_ID               0
TOTAL_INTERACTIONS    0
dtype: int64


In [8]:
# Count, per column, how many interaction rows hold a 0 in the listed fields
# (before sanitisation)
interaction_key_columns = interaction_dataframe[['BRAND_ID', 'USER_ID', 'TOTAL_INTERACTIONS']]
zero_values = interaction_key_columns.eq(0).sum()
print("Number of 0 values in relevant interaction data fields (before sanitisation):", end="\n\n")
print(zero_values)

Number of 0 values in relevant interaction data fields (before sanitisation):

BRAND_ID              0
USER_ID               0
TOTAL_INTERACTIONS    0
dtype: int64


In [9]:
# Cast every brand column to its intended dtype: integer identifiers and
# counts, a float score, and boolean flags
brand_column_dtypes = dict(
    BRAND_ID=int,
    TERRITORY_ID=int,
    PRIMARY_CATEGORY=int,
    DOMAIN_AUTHORITY=float,
    TOTAL_PRODUCTS=int,
    TOTAL_PRESS_RELEASES=int,
    TOTAL_COLLABORATIONS=int,
    HAS_FACEBOOK=bool,
    HAS_TWITTER=bool,
    HAS_PINTEREST=bool,
    HAS_INSTAGRAM=bool,
    IS_AN_AFFILIATE=bool,
)
brand_dataframe = brand_dataframe.astype(brand_column_dtypes)

# Ensure correct data types on the user dataframe: integer identifiers and
# counts, boolean flags. Each column appears exactly once in the mapping
# (a repeated key in a dict literal is silently overwritten by the later one).
# NOTE(review): astype(bool) maps any non-zero / non-empty value to True;
# this presumably relies on the flags being stored as 0/1 — confirm against the CSV.
user_dataframe = user_dataframe.astype({
    'USER_ID': int,
    'TERRITORY_ID': int,
    'TYPE_IN_HOUSE': bool,
    'TYPE_FREELANCE': bool,
    'TOTAL_DOWNLOADS': int,
    'TOTAL_COLLABORATIONS': int,
    'HAS_FACEBOOK': bool,
    'HAS_TWITTER': bool,
    'HAS_PINTEREST': bool,
    'HAS_INSTAGRAM': bool,
    'USES_AFFILIATES': bool
})

# Cast all three interaction columns (both identifiers and the interaction
# count) to integers
interaction_column_dtypes = dict(
    BRAND_ID=int,
    USER_ID=int,
    TOTAL_INTERACTIONS=int,
)
interaction_dataframe = interaction_dataframe.astype(interaction_column_dtypes)

In [10]:
# Output locations for the sanitised data. The same *_data_file names are
# rebound here, so from this point on they refer to the sanitised files.
brand_data_file = 'Data/brand_data_sanitised.csv'
user_data_file = 'Data/user_data_sanitised.csv'
interaction_data_file = 'Data/interaction_data_sanitised.csv'

# Write each dataframe to its CSV without the row index column:
# brands first, then users, then interactions
brand_dataframe.to_csv(path_or_buf=brand_data_file, index=False)
user_dataframe.to_csv(path_or_buf=user_data_file, index=False)
interaction_dataframe.to_csv(path_or_buf=interaction_data_file, index=False)