# Univariate, Bivariate, and Multivariate Analysis

This notebook performs comprehensive analysis at different variable levels:
- Univariate Analysis: Single variable analysis
- Bivariate Analysis: Two variable relationships
- Multivariate Analysis: Multiple variable relationships


In [None]:
# Import libraries
import itertools
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency

warnings.filterwarnings('ignore')

# Global plot appearance used by every figure below
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# Read the raw CSV and derive the calendar features in one chained step.
# TreatmentStart is parsed first so the later lambdas see a datetime column.
df = (
    pd.read_csv('../../data/mock_treatment_starts_2016.csv')
    .assign(
        TreatmentStart=lambda raw: pd.to_datetime(raw['TreatmentStart'], format='%m/%d/%y'),
        Year=lambda d: d['TreatmentStart'].dt.year,
        Month=lambda d: d['TreatmentStart'].dt.month,
        MonthName=lambda d: d['TreatmentStart'].dt.strftime('%B'),
        Weekday=lambda d: d['TreatmentStart'].dt.day_name(),
    )
)

print("Dataset loaded successfully!")
print(f"Shape: {df.shape}")
# Last expression of the cell: rich preview of the first rows
df.head()


## 1. Univariate Analysis


In [None]:
# Univariate Analysis: Numerical Variable (Dosage)
# Six views of the same column: histogram, box, density, violin, Q-Q, and CDF.
banner = "=" * 60
print(banner)
print("UNIVARIATE ANALYSIS: DOSAGE")
print(banner)

dosage = df['Dosage']
dosage_mean = dosage.mean()
dosage_median = dosage.median()

fig, axes = plt.subplots(2, 3, figsize=(18, 10))
(ax_hist, ax_box, ax_kde), (ax_violin, ax_qq, ax_cdf) = axes

# Panel 1: histogram with mean / median reference lines
ax_hist.hist(dosage, bins=15, edgecolor='black', alpha=0.7, color='steelblue')
ax_hist.axvline(dosage_mean, color='r', linestyle='--', label=f'Mean: {dosage_mean:.2f}')
ax_hist.axvline(dosage_median, color='g', linestyle='--', label=f'Median: {dosage_median:.2f}')
ax_hist.set(title='Histogram: Dosage Distribution', xlabel='Dosage', ylabel='Frequency')
ax_hist.legend()
ax_hist.grid(True, alpha=0.3)

# Panel 2: box plot
ax_box.boxplot(dosage, vert=True)
ax_box.set(title='Box Plot: Dosage', ylabel='Dosage')
ax_box.grid(True, alpha=0.3)

# Panel 3: kernel density estimate
dosage.plot.density(ax=ax_kde, color='purple')
ax_kde.set(title='Density Plot: Dosage', xlabel='Dosage')
ax_kde.grid(True, alpha=0.3)

# Panel 4: violin plot
sns.violinplot(y=dosage, ax=ax_violin, color='lightblue')
ax_violin.set(title='Violin Plot: Dosage', ylabel='Dosage')
ax_violin.grid(True, alpha=0.3)

# Panel 5: Q-Q plot against a normal distribution.
# probplot draws its own title, so ours is applied afterwards.
stats.probplot(dosage, dist="norm", plot=ax_qq)
ax_qq.set_title('Q-Q Plot: Normality Check')
ax_qq.grid(True, alpha=0.3)

# Panel 6: empirical cumulative distribution (normalised cumulative histogram)
ax_cdf.hist(dosage, bins=15, cumulative=True, density=True,
            edgecolor='black', alpha=0.7, color='orange')
ax_cdf.set(title='Cumulative Distribution: Dosage', xlabel='Dosage',
           ylabel='Cumulative Probability')
ax_cdf.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Numeric summary to accompany the figure
print("\nSummary Statistics:")
print(dosage.describe())
print(f"\nSkewness: {dosage.skew():.4f}")
print(f"Kurtosis: {dosage.kurtosis():.4f}")


In [None]:
# Univariate Analysis: Categorical Variables
# Left panel: number of records per drug. Right panel: number of treatment
# starts per calendar month, shown in chronological order.
print("=" * 60)
print("UNIVARIATE ANALYSIS: CATEGORICAL VARIABLES")
print("=" * 60)

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Drug distribution (value_counts lists drugs from most to least frequent)
drug_counts = df['Drug'].value_counts()
axes[0].bar(drug_counts.index, drug_counts.values, color=['steelblue', 'coral'])
axes[0].set_title('Drug Distribution')
axes[0].set_xlabel('Drug')
axes[0].set_ylabel('Count')
axes[0].grid(True, alpha=0.3, axis='y')
for i, v in enumerate(drug_counts.values):
    axes[0].text(i, v, str(v), ha='center', va='bottom')

# Month distribution.
# Sorting the month *names* would give alphabetical order (April, February, ...),
# so the names are ordered by their numeric Month instead.
months_chronological = (
    df[['Month', 'MonthName']]
    .dropna()
    .drop_duplicates()
    .sort_values('Month')['MonthName']
    .tolist()
)
month_counts = df['MonthName'].value_counts().reindex(months_chronological)
axes[1].bar(range(len(month_counts)), month_counts.values, color='teal')
axes[1].set_title('Treatment Starts by Month')
axes[1].set_xlabel('Month')
axes[1].set_ylabel('Count')
axes[1].set_xticks(range(len(month_counts)))
axes[1].set_xticklabels(month_counts.index, rotation=45, ha='right')
axes[1].grid(True, alpha=0.3, axis='y')
for i, v in enumerate(month_counts.values):
    axes[1].text(i, v, str(v), ha='center', va='bottom')

plt.tight_layout()
plt.show()

print("\nDrug Distribution:")
print(df['Drug'].value_counts())
print(f"\nProportions:\n{df['Drug'].value_counts(normalize=True) * 100}")


## 2. Bivariate Analysis


In [None]:
# Bivariate Analysis: Dosage vs Drug
# Four views of Dosage split by Drug: box, violin, mean bar chart, strip plot.
print("=" * 60)
print("BIVARIATE ANALYSIS: DOSAGE vs DRUG")
print("=" * 60)

# Mean dosage per drug; groupby returns the drugs in sorted order.
mean_dosage = df.groupby('Drug')['Dosage'].mean()
# Use that same order in every panel. Without an explicit `order`, seaborn
# orders string categories by first appearance in the data, which can differ
# from the sorted order of the bar chart and make the panels disagree.
drug_order = mean_dosage.index.tolist()

fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1. Box plot: Dosage by Drug
sns.boxplot(data=df, x='Drug', y='Dosage', order=drug_order, ax=axes[0, 0])
axes[0, 0].set_title('Dosage Distribution by Drug')
axes[0, 0].grid(True, alpha=0.3, axis='y')

# 2. Violin plot: Dosage by Drug
sns.violinplot(data=df, x='Drug', y='Dosage', order=drug_order, ax=axes[0, 1])
axes[0, 1].set_title('Dosage Distribution (Violin) by Drug')
axes[0, 1].grid(True, alpha=0.3, axis='y')

# 3. Bar plot: Mean Dosage by Drug
axes[1, 0].bar(mean_dosage.index, mean_dosage.values, color=['steelblue', 'coral'])
axes[1, 0].set_title('Mean Dosage by Drug')
axes[1, 0].set_xlabel('Drug')
axes[1, 0].set_ylabel('Mean Dosage')
axes[1, 0].grid(True, alpha=0.3, axis='y')
for i, v in enumerate(mean_dosage.values):
    axes[1, 0].text(i, v, f'{v:.2f}', ha='center', va='bottom')

# 4. Strip plot (individual observations with jitter). The title names the
#    plot type that is actually drawn; this is sns.stripplot, not swarmplot.
sns.stripplot(data=df, x='Drug', y='Dosage', order=drug_order,
              ax=axes[1, 1], size=8, alpha=0.6)
axes[1, 1].set_title('Dosage by Drug (Strip Plot)')
axes[1, 1].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

# Statistical summary
print("\nDosage by Drug:")
print(df.groupby('Drug')['Dosage'].describe())
print("\nMean Dosage Comparison:")
print(mean_dosage)


In [None]:
# Bivariate Analysis: Dosage vs Month
# Mean dosage trend, dosage spread, record counts, and drug mix per month.
print("=" * 60)
print("BIVARIATE ANALYSIS: DOSAGE vs MONTH")
print("=" * 60)

fig, axes = plt.subplots(2, 2, figsize=(18, 10))

# Order months chronologically.
# January-June is always shown (the window this analysis was written for);
# the list extends automatically to the latest month present in the data so
# that records from later months are not silently dropped from the panels.
# `month_order` is reused by later cells, so it stays a plain list of names.
all_month_names = pd.date_range('2016-01-01', periods=12, freq='MS').month_name().tolist()
latest_month = df['Month'].max()
last_month_shown = 6 if pd.isna(latest_month) else max(6, int(latest_month))
month_order = all_month_names[:last_month_shown]
month_counts_ordered = df['MonthName'].value_counts().reindex(month_order, fill_value=0)

# 1. Line plot: Mean Dosage by Month (months without data stay NaN -> gap in the line)
mean_dosage_month = df.groupby('MonthName')['Dosage'].mean().reindex(month_order)
axes[0, 0].plot(range(len(mean_dosage_month)), mean_dosage_month.values,
                marker='o', linewidth=2, markersize=8, color='steelblue')
axes[0, 0].set_title('Mean Dosage by Month')
axes[0, 0].set_xlabel('Month')
axes[0, 0].set_ylabel('Mean Dosage')
axes[0, 0].set_xticks(range(len(mean_dosage_month)))
axes[0, 0].set_xticklabels(mean_dosage_month.index, rotation=45, ha='right')
axes[0, 0].grid(True, alpha=0.3)

# 2. Box plot: Dosage by Month
month_df = df[df['MonthName'].isin(month_order)]
sns.boxplot(data=month_df, x='MonthName', y='Dosage', order=month_order, ax=axes[0, 1])
axes[0, 1].set_title('Dosage Distribution by Month')
axes[0, 1].tick_params(axis='x', rotation=45)
axes[0, 1].grid(True, alpha=0.3, axis='y')

# 3. Bar plot: Treatment Count by Month
axes[1, 0].bar(range(len(month_counts_ordered)), month_counts_ordered.values, color='teal')
axes[1, 0].set_title('Treatment Starts Count by Month')
axes[1, 0].set_xlabel('Month')
axes[1, 0].set_ylabel('Count')
axes[1, 0].set_xticks(range(len(month_counts_ordered)))
axes[1, 0].set_xticklabels(month_counts_ordered.index, rotation=45, ha='right')
axes[1, 0].grid(True, alpha=0.3, axis='y')

# 4. Heatmap: Drug usage by Month (counts, so a missing month is a true zero)
drug_month_crosstab = pd.crosstab(df['MonthName'], df['Drug']).reindex(month_order, fill_value=0)
sns.heatmap(drug_month_crosstab, annot=True, fmt='d', cmap='YlOrRd', ax=axes[1, 1])
axes[1, 1].set_title('Drug Usage by Month (Heatmap)')
axes[1, 1].set_xlabel('Drug')
axes[1, 1].set_ylabel('Month')

plt.tight_layout()
plt.show()

print("\nMean Dosage by Month:")
print(mean_dosage_month)
print("\nDrug Usage by Month:")
print(drug_month_crosstab)


In [None]:
# Bivariate Analysis: Correlation and Association
# Pearson correlations between Dosage, Drug (binary-encoded) and Month, plus a
# chi-square test of independence between Drug and Month.
print("=" * 60)
print("BIVARIATE ANALYSIS: CORRELATIONS AND ASSOCIATIONS")
print("=" * 60)

# Create numerical encoding for categorical variables.
# df_encoded is reused by a later cell, so its column names are kept as-is.
df_encoded = df.copy()
df_encoded['Drug_encoded'] = df_encoded['Drug'].map({'Cisplatin': 0, 'Nivolumab': 1})
df_encoded['Month_encoded'] = df_encoded['Month']

# Series.map leaves labels missing from the mapping as NaN; surface that
# instead of letting those rows silently drop out of the correlations.
unmapped_drugs = df_encoded.loc[
    df_encoded['Drug_encoded'].isna() & df_encoded['Drug'].notna(), 'Drug'
].unique()
if len(unmapped_drugs) > 0:
    print(f"WARNING: drugs without a numeric encoding: {list(unmapped_drugs)}")

# Correlation matrix
corr_vars = ['Dosage', 'Drug_encoded', 'Month_encoded']
corr_matrix = df_encoded[corr_vars].corr()

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Correlation heatmap (tick labels replaced with the shorter variable names)
sns.heatmap(corr_matrix, annot=True, fmt='.3f', cmap='coolwarm', center=0,
            square=True, ax=axes[0], cbar_kws={'label': 'Correlation'})
axes[0].set_title('Correlation Matrix')
axes[0].set_xticklabels(['Dosage', 'Drug', 'Month'], rotation=45, ha='right')
axes[0].set_yticklabels(['Dosage', 'Drug', 'Month'], rotation=0)

# Scatter plot: Dosage vs Month. The scatter handle is kept so the colorbar is
# bound to it directly rather than to whatever sits first in axes.collections.
drug_scatter = axes[1].scatter(df_encoded['Month'], df_encoded['Dosage'],
                               c=df_encoded['Drug_encoded'], cmap='viridis', alpha=0.6, s=100)
axes[1].set_title('Dosage vs Month (colored by Drug)')
axes[1].set_xlabel('Month')
axes[1].set_ylabel('Dosage')
axes[1].grid(True, alpha=0.3)
cbar = plt.colorbar(drug_scatter, ax=axes[1])
cbar.set_label('Drug (0=Cisplatin, 1=Nivolumab)')

plt.tight_layout()
plt.show()

print("\nCorrelation Matrix:")
print(corr_matrix)

# Chi-square test for Drug and Month association
# (chi2_contingency is imported in the notebook's import cell)
contingency_table = pd.crosstab(df['Drug'], df['MonthName'])
chi2, p_value, dof, expected = chi2_contingency(contingency_table)
print("\nChi-square test for Drug-Month association:")
print(f"  Chi-square statistic: {chi2:.4f}")
print(f"  p-value: {p_value:.4f}")
print(f"  Degrees of freedom: {dof}")
print(f"  Significant association: {'Yes' if p_value < 0.05 else 'No'} (α=0.05)")

# The chi-square approximation is commonly considered reliable only when every
# expected cell count is at least 5; report the smallest one so the reader can
# judge how much weight to put on the p-value above.
min_expected = expected.min()
print(f"  Smallest expected cell count: {min_expected:.2f}")
if min_expected < 5:
    print("  Caution: some expected counts are below 5; the chi-square p-value may be unreliable.")


## 3. Multivariate Analysis


In [None]:
# Multivariate Analysis: Multiple Variables
# Dosage examined jointly against Drug and Month.
# Relies on `month_order` defined in the "Dosage vs Month" cell above.
print("=" * 60)
print("MULTIVARIATE ANALYSIS")
print("=" * 60)

fig, axes = plt.subplots(2, 2, figsize=(18, 12))

# 1. Grouped bar chart: Mean Dosage by Drug and Month.
# Drug x Month combinations with no records are left as NaN rather than 0:
# a zero would be drawn and printed as a genuine mean dosage of 0.
drug_month_mean = (
    df.groupby(['Drug', 'MonthName'])['Dosage'].mean()
    .unstack()
    .reindex(columns=month_order)
)
# One distinct colour per month (the legend entries). A shorter colour list
# would be cycled, giving several months the same colour.
month_palette = sns.color_palette('viridis', n_colors=len(month_order))
drug_month_mean.plot(kind='bar', ax=axes[0, 0], color=month_palette)
axes[0, 0].set_title('Mean Dosage by Drug and Month')
axes[0, 0].set_xlabel('Drug')
axes[0, 0].set_ylabel('Mean Dosage')
axes[0, 0].legend(title='Month', bbox_to_anchor=(1.05, 1), loc='upper left')
axes[0, 0].grid(True, alpha=0.3, axis='y')
axes[0, 0].tick_params(axis='x', rotation=0)

# 2. Heatmap: Mean Dosage by Drug and Month (NaN cells are left blank)
sns.heatmap(drug_month_mean, annot=True, fmt='.1f', cmap='YlOrRd', ax=axes[0, 1], cbar_kws={'label': 'Mean Dosage'})
axes[0, 1].set_title('Mean Dosage Heatmap: Drug × Month')
axes[0, 1].set_xlabel('Month')
axes[0, 1].set_ylabel('Drug')

# 3. Overlaid histograms: Dosage distribution per Drug
for drug, drug_data in df.groupby('Drug', sort=False):
    axes[1, 0].hist(drug_data['Dosage'], alpha=0.6, label=drug, bins=10)
axes[1, 0].set_title('Dosage Distribution by Drug')
axes[1, 0].set_xlabel('Dosage')
axes[1, 0].set_ylabel('Frequency')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# 4. Bubble scatter: Month on x, Dosage on y, colour = Drug, marker size = Dosage
scatter = axes[1, 1].scatter(df['Month'], df['Dosage'],
                             c=df['Drug'].map({'Cisplatin': 0, 'Nivolumab': 1}),
                             s=df['Dosage']/5, alpha=0.6, cmap='viridis')
axes[1, 1].set_title('Multivariate View: Month vs Dosage\n(size=Dosage, color=Drug)')
axes[1, 1].set_xlabel('Month')
axes[1, 1].set_ylabel('Dosage')
axes[1, 1].grid(True, alpha=0.3)
cbar = plt.colorbar(scatter, ax=axes[1, 1])
cbar.set_label('Drug (0=Cisplatin, 1=Nivolumab)')

plt.tight_layout()
plt.show()

print("\nMean Dosage by Drug and Month:")
print(drug_month_mean)
print("\nSummary Statistics by Drug and Month:")
print(df.groupby(['Drug', 'MonthName'])['Dosage'].describe())


In [None]:
# Multivariate Analysis: Advanced visualizations
# Correlation heatmap plus a bubble scatter, followed by grouped summaries.
# Relies on `df_encoded` (correlation cell) and `month_order` (Dosage vs Month
# cell) defined earlier in the notebook.
print("=" * 60)
print("MULTIVARIATE ANALYSIS: ADVANCED VISUALIZATIONS")
print("=" * 60)

fig, axes = plt.subplots(1, 2, figsize=(18, 6))

# Numeric drug code used to colour the scatter points
drug_numeric = df['Drug'].map({'Cisplatin': 0, 'Nivolumab': 1})

# 1. Correlation heatmap over Dosage, encoded Drug, and Month
extended_corr = df_encoded[['Dosage', 'Drug_encoded', 'Month']].corr()
sns.heatmap(extended_corr, annot=True, fmt='.3f', cmap='coolwarm', center=0,
            square=True, ax=axes[0], cbar_kws={'label': 'Correlation'})
axes[0].set_title('Extended Correlation Matrix')
axes[0].set_xticklabels(['Dosage', 'Drug', 'Month'], rotation=45, ha='right')
axes[0].set_yticklabels(['Dosage', 'Drug', 'Month'], rotation=0)

# 2. 3D-style visualization using size and color
scatter = axes[1].scatter(df['Month'], df['Dosage'],
                          c=drug_numeric, s=df['Dosage']/3,
                          alpha=0.7, cmap='RdYlBu', edgecolors='black', linewidth=1)
axes[1].set_title('Multivariate Scatter: Month, Dosage, Drug\n(Color=Drug, Size=Dosage)')
axes[1].set_xlabel('Month')
axes[1].set_ylabel('Dosage')
axes[1].grid(True, alpha=0.3)
cbar = plt.colorbar(scatter, ax=axes[1])
cbar.set_label('Drug (0=Cisplatin, 1=Nivolumab)')

plt.tight_layout()
plt.show()

# Statistical summary of multivariate relationships
print("\nMultivariate Summary:")
print("\n1. Dosage statistics by Drug:")
print(df.groupby('Drug')['Dosage'].agg(['mean', 'std', 'min', 'max']))

print("\n2. Dosage statistics by Month:")
# Months with no records get count 0, but mean/std stay NaN: filling them
# with 0 would read as a real mean dosage of zero.
month_stats = df.groupby('MonthName')['Dosage'].agg(['mean', 'std', 'count']).reindex(month_order)
month_stats['count'] = month_stats['count'].fillna(0).astype(int)
print(month_stats)

print("\n3. Drug distribution by Month:")
# 'All' is appended to the row order; reindexing with the month names alone
# would drop the totals row that margins=True adds.
drug_by_month = pd.crosstab(df['MonthName'], df['Drug'], margins=True)
print(drug_by_month.reindex(list(month_order) + ['All'], fill_value=0))
