In [None]:
# Cell 1: Import necessary libraries
import pandas as pd
from scipy import stats
import numpy as np


In [None]:
# Cell 2: Load the dataset
# Location of the raw export; fields are parsed with '|' as the separator
# (change `sep` if the file uses a different one).
file_path = './data/MachineLearningRating_v3.txt'
df = pd.read_csv(file_path, sep='|')
df.head()


In [None]:
# Cell 3: Function to calculate claim frequency, severity, and margin
def calculate_risk_metrics(df):
    """Add per-row claim indicator, frequency, severity and margin columns.

    The new columns are written onto ``df`` itself (the frame is modified
    in place) and that same frame object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``TotalClaims`` and ``TotalPremium`` columns.

    Returns
    -------
    pandas.DataFrame
        The input frame, now carrying:

        * ``ClaimOccurred``  - 1 where ``TotalClaims > 0``, otherwise 0.
        * ``ClaimFrequency`` - same values as ``ClaimOccurred``.
        * ``ClaimSeverity``  - ``TotalClaims`` where a claim occurred,
          NaN elsewhere.
        * ``Margin``         - ``TotalPremium`` minus ``TotalClaims``.
    """
    claims = df['TotalClaims']

    has_claim = claims.gt(0).astype(int)
    df['ClaimOccurred'] = has_claim
    # At row level the frequency is just the 0/1 indicator.
    df['ClaimFrequency'] = has_claim

    # Severity is only kept for rows with a claim; the rest become NaN.
    df['ClaimSeverity'] = claims.where(has_claim.eq(1))

    df['Margin'] = df['TotalPremium'].sub(claims)
    return df

# Attach the derived risk columns, then preview them next to their inputs
df = calculate_risk_metrics(df)
df.loc[:, ['TotalPremium', 'TotalClaims', 'ClaimOccurred',
           'ClaimFrequency', 'ClaimSeverity', 'Margin']].head()


In [None]:
# Cell 4: T-test for numerical comparisons
def perform_t_test(group1_data, group2_data, alpha=0.05):
    """Compare two numeric samples with Welch's unequal-variance t-test.

    Missing values are removed from each group via ``.dropna()`` before
    the test is run.

    Parameters
    ----------
    group1_data, group2_data
        Samples to compare; each must expose a ``.dropna()`` method.
    alpha : float, default 0.05
        Significance threshold applied to the p-value.

    Returns
    -------
    tuple
        ``(statistic, p_value, is_significant)`` where ``is_significant``
        is the result of ``p_value < alpha``.
    """
    sample_a = group1_data.dropna()
    sample_b = group2_data.dropna()

    # equal_var=False selects Welch's variant of the independent t-test.
    result = stats.ttest_ind(sample_a, sample_b, equal_var=False)
    t_statistic, p_val = result.statistic, result.pvalue
    return t_statistic, p_val, p_val < alpha


In [None]:
# Cell 5: Chi-squared test for categorical comparisons
def perform_chi_squared_test(group1_claims_occurred, group1_no_claims,
                             group2_claims_occurred, group2_no_claims, alpha=0.05,
                             correction=True):
    """Chi-squared test of independence on a 2x2 claim / no-claim table.

    The four counts are arranged as::

                    claims   no claims
        group 1   [  g1_c  ,   g1_n   ]
        group 2   [  g2_c  ,   g2_n   ]

    and passed to ``scipy.stats.chi2_contingency``.

    Parameters
    ----------
    group1_claims_occurred, group1_no_claims : int
        Number of policies with / without a claim in group 1.
    group2_claims_occurred, group2_no_claims : int
        Number of policies with / without a claim in group 2.
    alpha : float, default 0.05
        Significance threshold applied to the p-value.
    correction : bool, default True
        Forwarded to ``chi2_contingency``. With the default, SciPy applies
        Yates' continuity correction to a 2x2 table (one degree of
        freedom), which yields a smaller statistic and a larger p-value
        than the uncorrected Pearson test. Pass ``False`` for the
        uncorrected statistic.

    Returns
    -------
    tuple
        ``(chi2_statistic, p_value, is_significant)`` where
        ``is_significant`` is the result of ``p_value < alpha``.

    Raises
    ------
    ValueError
        Raised by SciPy when an expected frequency is zero, e.g. when
        neither group (or only one category) has any observations.
    """
    contingency_table = np.array([[group1_claims_occurred, group1_no_claims],
                                  [group2_claims_occurred, group2_no_claims]])
    # Only the statistic and p-value are used; dof and the expected table are dropped.
    chi2_stat, p_value, _, _ = stats.chi2_contingency(contingency_table,
                                                      correction=correction)
    return chi2_stat, p_value, p_value < alpha
