In [296]:
from typing import List, Tuple
from helper.helper_functions import load_dataset, save_model
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

In [297]:
data = load_dataset('../data/assignment2_income_cleaned.xlsx')

In [298]:
# Splitting the data into features (X) and target (y)
X = data.drop(columns=['income'])
y = data['income']

In [299]:
columns_to_use = ['age', 'workclass', 'education', 'marital status', 'occupation', 'workinghours', 'sex', 'ability to speak english', 'gave birth this year']

- **Categorical features**:
    - **nominal features**:
        'workclass', 'marital status', 'occupation', 'sex', and 'gave birth this year'
    - **ordinal features**:
        'education', 'ability to speak English'
- **Continuous features**:
    'age', 'workinghours'

### a
Point-biserial correlation is a special case of Pearson correlation coefficient used when one variable is continuous and the other is dichotomous (binary).

https://en.wikipedia.org/wiki/Point-biserial_correlation_coefficient
https://resources.nu.edu/statsresources/Pointbiserial

In [300]:
from scipy.stats import shapiro, kstest

continuous_features = ['age', 'workinghours']

for feature in continuous_features:
    # Shapiro-Wilk Test
    shapiro_stat, shapiro_p_value = shapiro(data[feature])
    print(f"Shapiro-Wilk Test ({feature}) - Statistic: {shapiro_stat}, P-value: {shapiro_p_value:.10f}")

    # Kolmogorov-Smirnov Test
    ks_stat, ks_p_value = kstest(data[feature], 'norm')
    print(f"Kolmogorov-Smirnov Test ({feature}) - Statistic: {ks_stat}, P-value: {ks_p_value:.10f}")

Shapiro-Wilk Test (age) - Statistic: 0.974037822220245, P-value: 0.0000000000
Kolmogorov-Smirnov Test (age) - Statistic: 1.0, P-value: 0.0000000000
Shapiro-Wilk Test (workinghours) - Statistic: 0.8889242919715895, P-value: 0.0000000000
Kolmogorov-Smirnov Test (workinghours) - Statistic: 0.9946501019683699, P-value: 0.0000000000


  res = hypotest_fun_out(*samples, **kwds)


In [301]:
from scipy.stats import pointbiserialr
# Encode the binary class feature as numeric values
binary_class_feature = 'binary_class_feature'
data[binary_class_feature] = data['income'].map({'low': 0, 'high': 1})

for feature in continuous_features:
    point_biserial_corr, p_value = pointbiserialr(data[feature], data[binary_class_feature])
    print(f"Point-Biserial Correlation Coefficient for '{feature}': {point_biserial_corr:.3f}, P-value: {p_value:.6f}")

Point-Biserial Correlation Coefficient for 'age': 0.280, P-value: 0.000000
Point-Biserial Correlation Coefficient for 'workinghours': 0.295, P-value: 0.000000


In [302]:
from scipy.stats import spearmanr, kendalltau

# Calculate Spearman's rank correlation coefficient for each continuous feature
print("Spearman's Rank Correlation Coefficients:")
for feature in continuous_features:
    spearman_corr, spearman_p_value = spearmanr(data[feature], data[binary_class_feature])
    print(f"   {feature}: {spearman_corr:.3f} (p-value: {spearman_p_value:.3f})")

# Calculate Kendall's tau correlation coefficient for each continuous feature
print("\nKendall's Tau Correlation Coefficients:")
for feature in continuous_features:
    kendall_tau_corr, kendall_tau_p_value = kendalltau(data[feature], data[binary_class_feature])
    print(f"   {feature}: {kendall_tau_corr:.3f} (p-value: {kendall_tau_p_value:.3f})")

Spearman's Rank Correlation Coefficients:
   age: 0.284 (p-value: 0.000)
   workinghours: 0.328 (p-value: 0.000)

Kendall's Tau Correlation Coefficients:
   age: 0.234 (p-value: 0.000)
   workinghours: 0.290 (p-value: 0.000)


### b

In [303]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_selection import SelectPercentile, chi2

# List of nominal features
nominal_features_lc = ['workclass', 'sex', 'marital status', 'gave birth this year'] # low count
nominal_features_hc = ['occupation']

# Initialize encoder
one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
label_encoder = LabelEncoder()

In [304]:
X_encoded = X.copy()
# Encode nominal features with high count using label encoding
# nominal_encoded_hc = label_encoder.fit_transform(X[nominal_features_hc])
# nominal_encoded_hc_df = pd.DataFrame(nominal_encoded_hc, columns=[f'{feature}_encoded' for feature in nominal_features_hc])
X_encoded['occupation'] = label_encoder.fit_transform(X['occupation'])

# Encode nominal features
nominal_encoded_lc = one_hot_encoder.fit_transform(X[nominal_features_lc])
nominal_encoded_df_lc = pd.DataFrame(nominal_encoded_lc, columns=one_hot_encoder.get_feature_names_out(nominal_features_lc))

X_encoded = pd.concat([X_encoded, nominal_encoded_df_lc], axis=1).drop(columns=nominal_features_lc + nominal_features_hc)

In [305]:
X_encoded

Unnamed: 0,age,education,workinghours,ability to speak english,workclass_governmental,workclass_no paid work,workclass_private,workclass_self employed,sex_Female,sex_Male,marital status_Divorced,marital status_Husband,marital status_Never married,marital status_Separated,marital status_Widowed,marital status_Wife,gave birth this year_No,gave birth this year_Yes
0,52,16,50,0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,60,20,30,0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,64,21,40,0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,64,17,40,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,31,15,40,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8995,28,16,40,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
8996,61,24,40,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
8997,34,23,50,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
8998,60,19,40,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [306]:
# Feature selection using SelectPercentile with chi-squared test
percentile_selector = SelectPercentile(score_func=chi2, percentile=10)
X_selected = percentile_selector.fit_transform(X_encoded, y)

# Get the indices of selected features
selected_feature_indices = percentile_selector.get_support(indices=True)

# Get the importance scores of selected features
feature_importance_scores = percentile_selector.scores_

# Create a DataFrame to display feature importances
feature_importance_df = pd.DataFrame({'Feature': X_encoded.columns, 'Importance': feature_importance_scores})

In [307]:
feature_importance_df

Unnamed: 0,Feature,Importance
0,age,3783.249409
1,education,424.146521
2,workinghours,3098.182416
3,ability to speak english,31.502407
4,workclass_governmental,62.348295
5,workclass_no paid work,3.282243
6,workclass_private,17.983133
7,workclass_self employed,3.606871
8,sex_Female,253.022927
9,sex_Male,126.511463


In [308]:
# Calculate chi-squared statistics and p-values between each feature and the target class
chi2_stat, p_values = chi2(X_encoded, y)

# Create a DataFrame to display chi-squared statistics and p-values
feature_importance_df = pd.DataFrame({'Feature': X_encoded.columns, 'Chi-squared statistic': chi2_stat, 'P-value': p_values})

# Sort the DataFrame by chi-squared statistic in descending order
feature_importance_df = feature_importance_df.sort_values(by='Chi-squared statistic', ascending=False)

print("Feature importances (Chi-squared test):")
print(feature_importance_df)

Feature importances (Chi-squared test):
                         Feature  Chi-squared statistic        P-value
0                            age            3783.249409   0.000000e+00
2                   workinghours            3098.182416   0.000000e+00
11        marital status_Husband             673.531893  1.704223e-148
12  marital status_Never married             556.620501  4.568543e-123
1                      education             424.146521   3.054340e-94
8                     sex_Female             253.022927   5.694163e-57
9                       sex_Male             126.511463   2.376251e-29
15           marital status_Wife              67.470942   2.138152e-16
4         workclass_governmental              62.348295   2.877805e-15
3       ability to speak english              31.502407   1.991931e-08
6              workclass_private              17.983133   2.228710e-05
17      gave birth this year_Yes              16.481382   4.913017e-05
13      marital status_Separated     

In [309]:
# Calculate mutual information between each feature and the target class
mutual_info = mutual_info_classif(X_encoded, y)

# Create a DataFrame to display feature importances
feature_importance_df = pd.DataFrame({'Feature': X_encoded.columns, 'Importance': mutual_info})

# Sort the DataFrame by importance
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Print feature importances
print("Feature importances (Mutual Information):")
print(feature_importance_df)

Feature importances (Mutual Information):
                         Feature  Importance
11        marital status_Husband    0.067315
2                   workinghours    0.062132
0                            age    0.061749
1                      education    0.055201
12  marital status_Never married    0.039803
9                       sex_Male    0.022598
8                     sex_Female    0.018220
3       ability to speak english    0.008561
6              workclass_private    0.006189
16       gave birth this year_No    0.006153
15           marital status_Wife    0.003860
4         workclass_governmental    0.001674
17      gave birth this year_Yes    0.000550
10       marital status_Divorced    0.000116
7        workclass_self employed    0.000000
5         workclass_no paid work    0.000000
13      marital status_Separated    0.000000
14        marital status_Widowed    0.000000
