In [1]:
from typing import List, Tuple
from helper.helper_functions import load_dataset, save_model, get_features_and_target, encode_nominal_features
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

In [2]:
data = load_dataset('../data/assignment2_income_cleaned.xlsx')

In [3]:
# Splitting the data into features (X) and target (y)
X, y = get_features_and_target(data, target_column='income')

In [4]:
columns_to_use = ['age', 'workclass', 'education', 'marital status', 'occupation', 'workinghours', 'sex', 'ability to speak english', 'gave birth this year']

- **Categorical features**:
    - **nominal features**:
        'workclass', 'marital status', 'occupation', 'sex', and 'gave birth this year'
    - **ordinal features**:
        'education', 'ability to speak English'
- **Continuous features**:
    'age', 'workinghours'

### a
Point-biserial correlation is a special case of Pearson correlation coefficient used when one variable is continuous and the other is dichotomous (binary).

https://en.wikipedia.org/wiki/Point-biserial_correlation_coefficient
https://resources.nu.edu/statsresources/Pointbiserial

In [5]:
from scipy.stats import shapiro, kstest

continuous_features = ['age', 'workinghours']

for feature in continuous_features:
    # Shapiro-Wilk Test
    shapiro_stat, shapiro_p_value = shapiro(data[feature])
    print(f"Shapiro-Wilk Test ({feature}) - Statistic: {shapiro_stat}, P-value: {shapiro_p_value:.10f}")

    # Kolmogorov-Smirnov Test
    ks_stat, ks_p_value = kstest(data[feature], 'norm')
    print(f"Kolmogorov-Smirnov Test ({feature}) - Statistic: {ks_stat}, P-value: {ks_p_value:.10f}")

Shapiro-Wilk Test (age) - Statistic: 0.974037822220245, P-value: 0.0000000000
Kolmogorov-Smirnov Test (age) - Statistic: 1.0, P-value: 0.0000000000
Shapiro-Wilk Test (workinghours) - Statistic: 0.8889242919715895, P-value: 0.0000000000
Kolmogorov-Smirnov Test (workinghours) - Statistic: 0.9946501019683699, P-value: 0.0000000000


  res = hypotest_fun_out(*samples, **kwds)


In [6]:
from scipy.stats import pointbiserialr
# Encode the binary class feature as numeric values
binary_class_feature = 'binary_class_feature'
data[binary_class_feature] = data['income'].map({'low': 0, 'high': 1})

for feature in continuous_features:
    point_biserial_corr, p_value = pointbiserialr(data[feature], data[binary_class_feature])
    print(f"Point-Biserial Correlation Coefficient for '{feature}': {point_biserial_corr:.3f}, P-value: {p_value:.6f}")

Point-Biserial Correlation Coefficient for 'age': 0.280, P-value: 0.000000
Point-Biserial Correlation Coefficient for 'workinghours': 0.295, P-value: 0.000000


In [7]:
from scipy.stats import spearmanr, kendalltau

# Calculate Spearman's rank correlation coefficient for each continuous feature
print("Spearman's Rank Correlation Coefficients:")
for feature in continuous_features:
    spearman_corr, spearman_p_value = spearmanr(data[feature], data[binary_class_feature])
    print(f"   {feature}: {spearman_corr:.3f} (p-value: {spearman_p_value:.3f})")

# Calculate Kendall's tau correlation coefficient for each continuous feature
print("\nKendall's Tau Correlation Coefficients:")
for feature in continuous_features:
    kendall_tau_corr, kendall_tau_p_value = kendalltau(data[feature], data[binary_class_feature])
    print(f"   {feature}: {kendall_tau_corr:.3f} (p-value: {kendall_tau_p_value:.3f})")

Spearman's Rank Correlation Coefficients:
   age: 0.284 (p-value: 0.000)
   workinghours: 0.328 (p-value: 0.000)

Kendall's Tau Correlation Coefficients:
   age: 0.234 (p-value: 0.000)
   workinghours: 0.290 (p-value: 0.000)


### b

In [8]:
# List of nominal features
nominal_features_lc = ['workclass', 'sex', 'marital status', 'gave birth this year'] # low cardinality features
nominal_features_hc = ['occupation'] # high cardinality features

X_encoded = encode_nominal_features(X, nominal_features_lc, nominal_features_hc)

  y = column_or_1d(y, warn=True)


In [9]:
X_encoded

Unnamed: 0,age,education,workinghours,ability to speak english,occupation_encoded,workclass_governmental,workclass_no paid work,workclass_private,workclass_self employed,sex_Female,sex_Male,marital status_Divorced,marital status_Husband,marital status_Never married,marital status_Separated,marital status_Widowed,marital status_Wife,gave birth this year_No,gave birth this year_Yes
0,52,16,50,0,17,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,60,20,30,0,6,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,64,21,40,0,8,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,64,17,40,0,17,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,31,15,40,0,17,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8995,28,16,40,0,2,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
8996,61,24,40,0,2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
8997,34,23,50,0,6,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
8998,60,19,40,0,8,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [10]:
from sklearn.feature_selection import SelectPercentile, chi2

# Feature selection using SelectPercentile with chi-squared test
percentile_selector = SelectPercentile(score_func=chi2, percentile=10)
X_selected = percentile_selector.fit_transform(X_encoded, y)
# Get the indices of selected features
selected_feature_indices = percentile_selector.get_support(indices=True)
# Get the importance scores of selected features
feature_importance_scores = percentile_selector.scores_
# Create a DataFrame to display feature importances
feature_importance_df = pd.DataFrame({'Feature': X_encoded.columns, 'Importance': feature_importance_scores})

In [11]:
feature_importance_df

Unnamed: 0,Feature,Importance
0,age,3783.249409
1,education,424.146521
2,workinghours,3098.182416
3,ability to speak english,31.502407
4,occupation_encoded,146.337835
5,workclass_governmental,62.348295
6,workclass_no paid work,3.282243
7,workclass_private,17.983133
8,workclass_self employed,3.606871
9,sex_Female,253.022927


In [12]:
# Calculate chi-squared statistics and p-values between each feature and the target class (**in pairs**)
chi2_stat, p_values = chi2(X_encoded, y)
# Create a DataFrame to display chi-squared statistics and p-values
feature_importance_df = pd.DataFrame({'Feature': X_encoded.columns, 'Chi-squared statistic': chi2_stat, 'P-value': [round(p, 6) for p in p_values]})
# Sort the DataFrame by chi-squared statistic value
feature_importance_df = feature_importance_df.sort_values(by='Chi-squared statistic', ascending=False)

In [13]:
feature_importance_df

Unnamed: 0,Feature,Chi-squared statistic,P-value
0,age,3783.249409,0.0
2,workinghours,3098.182416,0.0
12,marital status_Husband,673.531893,0.0
13,marital status_Never married,556.620501,0.0
1,education,424.146521,0.0
9,sex_Female,253.022927,0.0
4,occupation_encoded,146.337835,0.0
10,sex_Male,126.511463,0.0
16,marital status_Wife,67.470942,0.0
5,workclass_governmental,62.348295,0.0


In [14]:
# Calculate mutual information between each feature and the target class
mutual_info = mutual_info_classif(X_encoded, y)
# Create a DataFrame to display feature importances
feature_importance_df = pd.DataFrame({'Feature': X_encoded.columns, 'Importance': mutual_info})
# Sort the DataFrame by importance
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

In [15]:
feature_importance_df

Unnamed: 0,Feature,Importance
4,occupation_encoded,0.07555
12,marital status_Husband,0.066462
0,age,0.065038
2,workinghours,0.054646
1,education,0.053464
13,marital status_Never married,0.046691
10,sex_Male,0.023961
9,sex_Female,0.022999
17,gave birth this year_No,0.009813
6,workclass_no paid work,0.00605
