# 🧩 Vanguard Digital Experiment — EDA & Data Cleaning
This notebook performs initial **data exploration**, **cleaning**, and **visualization** for the Vanguard digital experiment project.

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Display configuration: never truncate columns when a DataFrame is rendered
pd.set_option('display.max_columns', None)
# set_theme is the preferred interface; sns.set is only a legacy alias for it
sns.set_theme(style='whitegrid', palette='pastel')

## 2. Load Data

In [None]:
# Read the source CSVs (paths are relative to the notebook's working directory)
demo = pd.read_csv('df_final_demo.csv')
web1, web2 = (
    pd.read_csv(part_path)
    for part_path in ('df_final_web_data_pt_1.csv', 'df_final_web_data_pt_2.csv')
)
exp = pd.read_csv('df_final_experiment_clients.csv')

# Stack the two web-data parts into one frame with a fresh 0..n-1 index
web_data = pd.concat((web1, web2), ignore_index=True)

# Report (rows, columns) of each frame as a quick sanity check
print(f'Demo shape: {demo.shape}')
print(f'Web data shape: {web_data.shape}')
print(f'Experiment shape: {exp.shape}')

## 3. Inspect Data

In [None]:
# Quick look at each dataset.
# A notebook cell only renders its *last* expression automatically, so a bare
# `demo.head()` / `web_data.head()` in the middle of the cell is evaluated and
# thrown away. Passing each preview to display() renders all three, in order.
# (display is made available by the IPython kernel without an import.)
print('\nDemo Data Preview:')
display(demo.head())

print('\nWeb Data Preview:')
display(web_data.head())

print('\nExperiment Data Preview:')
display(exp.head())

### Data Info and Summary

In [None]:
# Print dtype / non-null summaries for the three frames, in load order
for frame in (demo, web_data, exp):
    frame.info()

# Summary statistics for every demographics column (numeric and non-numeric);
# left as the final expression so the notebook renders it as a table
demo.describe(include='all')

## 4. Data Cleaning

In [None]:
# Convert date_time to datetime
web_data['date_time'] = pd.to_datetime(web_data['date_time'], errors='coerce')

# Check for missing values
print('Missing values per dataset:')
print('Demo:', demo.isna().sum().sum())
print('Web data:', web_data.isna().sum().sum())
print('Experiment:', exp.isna().sum().sum())

### Merge Datasets

In [None]:
# Merge demographics, web data, and experiment info
merged = web_data.merge(exp, on='client_id', how='left').merge(demo, on='client_id', how='left')

print('Merged dataset shape:', merged.shape)
merged.head()

## 5. Exploratory Data Analysis (EDA)

### 5.1 Demographics Overview

In [None]:
# Age distribution
sns.histplot(demo['clnt_age'], kde=True)
plt.title('Client Age Distribution')
plt.show()

# Gender distribution
demo['gendr'].value_counts().plot(kind='bar', title='Gender Distribution')
plt.show()

### 5.2 Client Tenure and Balance

In [None]:
# Balance spread for each gender category
balance_ax = sns.boxplot(data=demo, x='gendr', y='bal')
balance_ax.set_title('Balance by Gender')
plt.show()

# Tenure (years) histogram, 20 bins, with a KDE curve overlaid
tenure_ax = sns.histplot(data=demo, x='clnt_tenure_yr', bins=20, kde=True)
tenure_ax.set_title('Client Tenure Distribution')
plt.show()

### 5.3 Process Behavior

In [None]:
# How often each process step appears across all logged web events
web_data['process_step'].value_counts().plot(kind='bar')
plt.title('Frequency of Each Process Step')
plt.show()

# Completion rate estimation
# The variation label is taken from `merged` (web events joined with `exp`)
# rather than from `web_data`.
# NOTE(review): this assumes the variation column comes from the experiment file
# and is absent from the raw web logs; the exact header is not visible here, so
# it is matched case-insensitively - confirm against the CSV.
variation_col = next(
    (col for col in merged.columns if str(col).lower() == 'variation'),
    None,
)
if variation_col is None:
    raise KeyError("No 'variation' column found in merged; check the experiment file's column names")

# One boolean per (variation, client): did that client ever log a 'confirm' step?
# Counting clients instead of event rows keeps the rate from being diluted by
# clients who simply generate more events. Rows without a variation are dropped.
client_completed = (
    merged
    .dropna(subset=[variation_col])
    .assign(is_confirm=lambda events: events['process_step'].eq('confirm'))
    .groupby([variation_col, 'client_id'])['is_confirm']
    .any()
)

completion_counts = client_completed.groupby(level=0).sum()   # clients who confirmed
total_counts = client_completed.groupby(level=0).size()       # all clients in the variation
completion_rate = (completion_counts / total_counts) * 100
print('Completion Rate by Variation (%):')
print(completion_rate)

### 5.4 Correlation Heatmap (Demographics)

In [None]:
# Columns of the demographics table included in the correlation matrix
numeric_cols = [
    'clnt_age',
    'clnt_tenure_yr',
    'bal',
    'logons_6_mnth',
    'calls_6_mnth',
]

# Pairwise correlations, drawn as an annotated heatmap
corr_matrix = demo[numeric_cols].corr()
heatmap_ax = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
heatmap_ax.set_title('Correlation Heatmap of Client Metrics')
plt.show()

## 6. Early Insights

Use this section to note key takeaways:
- Who are the primary clients (age, gender, tenure)?
- Are there visible differences between control and test groups?
- Are there any anomalies, missing data, or outliers?

💡 These insights will guide KPI and hypothesis testing in the next steps.