# Import libraries

In [7]:
# Importing necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency, mannwhitneyu
from scipy.stats import zscore
import numpy as np


# Load Data

In [8]:
# Reading Digital Footprints data
file_path_1 = '../Raw Data/df_final_web_data_pt_1.txt'
file_path_2 = '../Raw Data/df_final_web_data_pt_2.txt'
file_path_demo = '../Raw Data/df_final_demo.txt'
file_path_experiment = '../Raw Data/df_final_experiment_clients.txt'

df_final_demo = pd.read_csv(file_path_demo, sep=",")
df_final_experiment_clients = pd.read_csv(file_path_experiment, sep=",")

df1 = pd.read_csv(file_path_1, sep=",")
df2 = pd.read_csv(file_path_2, sep=',')

# Combining Digital Footprints data
df_combined = pd.concat([df1, df2])

# Saving combined data
output_path = '../Raw Data/df_combined.txt'
df_combined.to_csv(output_path, sep=',', index=False)
print("Data successfully merged and saved in:", output_path)


FileNotFoundError: [Errno 2] No such file or directory: '../Raw Data/df_final_demo.txt'

# Merge Dataset's

In [None]:
# Standardizing the column names
df_final_demo.columns = df_final_demo.columns.str.strip().str.lower()
df_final_experiment_clients.columns = df_final_experiment_clients.columns.str.strip().str.lower()
df_combined_web_data.columns = df_combined_web_data.columns.str.strip().str.lower()

# Merging the dataframes
merged_df = pd.merge(df_final_demo, df_final_experiment_clients, on='client_id', how='inner')
merged_df = pd.merge(merged_df, df_combined_web_data, on='client_id', how='inner')

# Check the merged dataframe
print(merged_df.head())


   client_id  clnt_tenure_yr  clnt_tenure_mnth  clnt_age gendr  num_accts  \
0     836976             6.0              73.0      60.5     U        2.0   
1     836976             6.0              73.0      60.5     U        2.0   
2     836976             6.0              73.0      60.5     U        2.0   
3     836976             6.0              73.0      60.5     U        2.0   
4     836976             6.0              73.0      60.5     U        2.0   

       bal  calls_6_mnth  logons_6_mnth variation            visitor_id  \
0  45105.3           6.0            9.0      Test  427070339_1413275162   
1  45105.3           6.0            9.0      Test  427070339_1413275162   
2  45105.3           6.0            9.0      Test  427070339_1413275162   
3  45105.3           6.0            9.0      Test  427070339_1413275162   
4  45105.3           6.0            9.0      Test  427070339_1413275162   

                      visit_id process_step            date_time  
0  228976764_468254

# Clean Data
Merge all datasets into a single dataframe.

In [None]:
# Drop rows with missing values
Clean_Data = merged_df.dropna()

# Check for remaining missing values
print(Clean_Data.isnull().sum())


client_id           0
clnt_tenure_yr      0
clnt_tenure_mnth    0
clnt_age            0
gendr               0
num_accts           0
bal                 0
calls_6_mnth        0
logons_6_mnth       0
variation           0
visitor_id          0
visit_id            0
process_step        0
date_time           0
dtype: int64


# Data Cleaning

Remove rows with missing values.

In [None]:
# Remove duplicates
Clean_Data = Clean_Data.drop_duplicates()


# Handle Duplicates
Remove any duplicate rows.

In [None]:
# Remove duplicates
Clean_Data = Clean_Data.drop_duplicates()


# Consistency in Data Types
Ensure numerical and categorical data types are correct.


In [None]:
# Convert numerical columns to appropriate data types
numerical_columns = ['clnt_age', 'num_accts', 'bal', 'calls_6_mnth', 'logons_6_mnth']
Clean_Data[numerical_columns] = Clean_Data[numerical_columns].apply(pd.to_numeric, errors='coerce')

# Ensure 'gendr' is treated as a category
Clean_Data['gendr'] = Clean_Data['gendr'].astype('category')


# Data Validation

Identify and remove outliers in numerical columns, especially 'bal'.

In [None]:
# Calculate z-scores for 'bal'
Clean_Data['bal_zscore'] = zscore(Clean_Data['bal'])

# Filter out outliers (those with z-score < -3 or > 3)
Clean_Data_no_outliers = Clean_Data[(Clean_Data['bal_zscore'] >= -3) & (Clean_Data['bal_zscore'] <= 3)]

# Drop the z-score column as it was only needed for outlier detection
Clean_Data_no_outliers = Clean_Data_no_outliers.drop(columns=['bal_zscore'])


# Clean 'gendr' Column
Remove rows where 'gendr' is 'U' or 'X', and drop NaN values

In [None]:
# Filter out 'U' and 'X' from 'gendr' and drop NaN values
Clean_Data_no_outliers = Clean_Data_no_outliers[Clean_Data_no_outliers['gendr'].isin(['M', 'F'])].dropna(subset=['gendr'])

# Check unique values in 'gendr' to confirm cleaning
print(Clean_Data_no_outliers['gendr'].unique())


['M', 'F']
Categories (4, object): ['F', 'M', 'U', 'X']


# Final Cleanup and Save
Ensure the dataset is fully cleaned and save it.

In [None]:
# Final check for missing values and datatypes
print(Clean_Data_no_outliers.isnull().sum())
print(Clean_Data_no_outliers.dtypes)

# Save the cleaned dataset
Clean_Data_no_outliers.to_csv('Clean_Data.csv', index=False)


client_id           0
clnt_tenure_yr      0
clnt_tenure_mnth    0
clnt_age            0
gendr               0
num_accts           0
bal                 0
calls_6_mnth        0
logons_6_mnth       0
variation           0
visitor_id          0
visit_id            0
process_step        0
date_time           0
dtype: int64
client_id              int64
clnt_tenure_yr       float64
clnt_tenure_mnth     float64
clnt_age             float64
gendr               category
num_accts            float64
bal                  float64
calls_6_mnth         float64
logons_6_mnth        float64
variation             object
visitor_id            object
visit_id              object
process_step          object
date_time             object
dtype: object


In [None]:
num_rows, num_cols = Clean_Data.shape
print(f"Number of rows: {num_rows}")
print(f"Number of columns: {num_cols}")

Number of rows: 317123
Number of columns: 15


client_id: A unique identifier for each client.
clnt_tenure_yr: The duration of the client's membership with Vanguard in years.
clnt_tenure_mnth: The duration of the client's membership with Vanguard in months.
clnt_age: The age of the customer.
gendr: The gender of the customer, with possible values being “M” (male), “F” (female), “U” (unknown), or “X” (unspecified).
num_accts: The number of accounts the customer has with Vanguard.
bal: The total balance across all of the customer's accounts.
calls_6_mnth: The number of calls the customer has made to customer service in the last six months.
logons_6_mnth: The number of times the customer has logged on to the Vanguard platform in the last six months.
variation: The group assignment for the A/B test (either “Control” or “Test”).
visitor_id: A unique identifier for each customer-device combination.
visit_id: A unique identifier for each visit/session on the website.
process_step: The step in the digital process that the customer is in (e.g., “step_1”, “step_2”, “step_3”, or “confirm”).
date_time: The timestamp of when the customer performed a specific action on the website.
