In [46]:
from typing import List, Tuple
from helper.helper_functions import load_dataset, save_model
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

In [47]:
data = load_dataset('../data/assignment2_income_cleaned.xlsx')

In [48]:
# Splitting the data into features (X) and target (y)
X = data.drop(columns=['income'])
y = data['income']

### a
Point-biserial correlation is a special case of Pearson correlation coefficient used when one variable is continuous and the other is dichotomous (binary).

https://en.wikipedia.org/wiki/Point-biserial_correlation_coefficient
https://resources.nu.edu/statsresources/Pointbiserial

In [49]:
from scipy.stats import shapiro, kstest

continuous_features = ['age', 'workinghours', 'education', 'ability to speak english']

for feature in continuous_features:
    # Shapiro-Wilk Test
    shapiro_stat, shapiro_p_value = shapiro(data[feature])
    print(f"Shapiro-Wilk Test ({feature}) - Statistic: {shapiro_stat}, P-value: {shapiro_p_value:.10f}")

    # Kolmogorov-Smirnov Test
    ks_stat, ks_p_value = kstest(data[feature], 'norm')
    print(f"Kolmogorov-Smirnov Test ({feature}) - Statistic: {ks_stat}, P-value: {ks_p_value:.10f}")

Shapiro-Wilk Test (age) - Statistic: 0.974037822220245, P-value: 0.0000000000
Kolmogorov-Smirnov Test (age) - Statistic: 1.0, P-value: 0.0000000000
Shapiro-Wilk Test (workinghours) - Statistic: 0.8889242919715895, P-value: 0.0000000000
Kolmogorov-Smirnov Test (workinghours) - Statistic: 0.9946501019683699, P-value: 0.0000000000
Shapiro-Wilk Test (education) - Statistic: 0.8713723419028663, P-value: 0.0000000000
Kolmogorov-Smirnov Test (education) - Statistic: 0.9906663800150948, P-value: 0.0000000000
Shapiro-Wilk Test (ability to speak english) - Statistic: 0.19080425962324588, P-value: 0.0000000000
Kolmogorov-Smirnov Test (ability to speak english) - Statistic: 0.5, P-value: 0.0000000000


  res = hypotest_fun_out(*samples, **kwds)


In [50]:
from scipy.stats import pointbiserialr
# Encode the binary class feature as numeric values
binary_class_feature = 'binary_class_feature'
data[binary_class_feature] = data['income'].map({'low': 0, 'high': 1})

for feature in continuous_features:
    point_biserial_corr, p_value = pointbiserialr(data[feature], data[binary_class_feature])
    print(f"Point-Biserial Correlation Coefficient for '{feature}': {point_biserial_corr:.3f}, P-value: {p_value:.6f}")

Point-Biserial Correlation Coefficient for 'age': 0.280, P-value: 0.000000
Point-Biserial Correlation Coefficient for 'workinghours': 0.295, P-value: 0.000000
Point-Biserial Correlation Coefficient for 'education': 0.289, P-value: 0.000000
Point-Biserial Correlation Coefficient for 'ability to speak english': -0.041, P-value: 0.000084


In [51]:
from scipy.stats import spearmanr, kendalltau

# Calculate Spearman's rank correlation coefficient for each continuous feature
print("Spearman's Rank Correlation Coefficients:")
for feature in continuous_features:
    spearman_corr, spearman_p_value = spearmanr(data[feature], data[binary_class_feature])
    print(f"   {feature}: {spearman_corr:.3f} (p-value: {spearman_p_value:.3f})")

# Calculate Kendall's tau correlation coefficient for each continuous feature
print("\nKendall's Tau Correlation Coefficients:")
for feature in continuous_features:
    kendall_tau_corr, kendall_tau_p_value = kendalltau(data[feature], data[binary_class_feature])
    print(f"   {feature}: {kendall_tau_corr:.3f} (p-value: {kendall_tau_p_value:.3f})")

Spearman's Rank Correlation Coefficients:
   age: 0.284 (p-value: 0.000)
   workinghours: 0.328 (p-value: 0.000)
   education: 0.329 (p-value: 0.000)
   ability to speak english: -0.031 (p-value: 0.003)

Kendall's Tau Correlation Coefficients:
   age: 0.234 (p-value: 0.000)
   workinghours: 0.290 (p-value: 0.000)
   education: 0.286 (p-value: 0.000)
   ability to speak english: -0.031 (p-value: 0.003)


In [52]:
# Calculate chi-squared statistics and p-values between each feature and the target class
chi2_stat, p_values = chi2(X, y)

# Create a DataFrame to display chi-squared statistics and p-values
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Chi-squared statistic': chi2_stat, 'P-value': p_values})

# Sort the DataFrame by chi-squared statistic in descending order
feature_importance_df = feature_importance_df.sort_values(by='Chi-squared statistic', ascending=False)

print("Feature importances (Chi-squared test):")
print(feature_importance_df)

ValueError: could not convert string to float: 'self employed'

In [ ]:
# Calculate mutual information between each feature and the target class
mutual_info = mutual_info_classif(X, y)

# Create a DataFrame to display feature importances
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': mutual_info})

# Sort the DataFrame by importance
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Print feature importances
print("Feature importances (Mutual Information):")
print(feature_importance_df)