In [1]:
import sys
from pathlib import Path

# Put the parent of the notebook's working directory at the front of sys.path
# (unless it is already listed) so project packages such as `src` can be imported.
project_root = Path.cwd().parent
if not any(entry == str(project_root) for entry in sys.path):
    sys.path.insert(0, str(project_root))

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from src.preprocessing import preprocess_data

In [3]:
# Load the raw retail-customer CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/raw/retail_customers_COMPLETE_CATEGORICAL.csv")

In [4]:
# Run the project's preprocessing helper (src.preprocessing.preprocess_data) on the loaded frame.
# NOTE(review): `df` is overwritten with the processed result, so re-running only this
# cell feeds already-processed data back into preprocess_data — re-run the load cell first.
df = preprocess_data(df)

In [5]:
# Feature-only view for the correlation analysis: the Churn and ChurnRiskCategory
# columns are removed; `df` itself is left untouched.
features_df = df.drop(["Churn", "ChurnRiskCategory"], axis=1)

In [6]:
# Find feature pairs whose absolute correlation exceeds `threshold` and, for each
# pair, record which member is less correlated (in absolute value) with Churn.
threshold = 0.8
corr_matrix = features_df.corr().abs()

# Scan only the strict upper triangle (k=1): this skips the diagonal and reports
# each symmetric pair exactly once, in row-major order.
upper_triangle = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
pair_rows, pair_cols = np.where((corr_matrix > threshold).to_numpy() & upper_triangle)
high_corr = [
    (corr_matrix.index[i], corr_matrix.columns[j], corr_matrix.iloc[i, j])
    for i, j in zip(pair_rows, pair_cols)
]

# Absolute correlation of every column of the full frame with the Churn column.
churn_corr = df.corr()["Churn"].abs()

# Each entry: (feature 1, feature 2, |corr| between them, less Churn-correlated feature).
# A conditional expression is used instead of `cond and a or b`, which would silently
# pick the second feature whenever the first label is falsy (e.g. a column named 0).
# On a tie (or a NaN comparison) the second feature is the one reported.
high_corr_with_churn = [
    (
        feat1,
        feat2,
        corr_value,
        feat1 if churn_corr[feat1] < churn_corr[feat2] else feat2,
    )
    for feat1, feat2, corr_value in high_corr
]

# Strongest pairwise correlations first (stable sort keeps scan order among ties).
high_corr_with_churn.sort(key=lambda entry: entry[2], reverse=True)


high_corr_with_churn

[('Frequency', 'UniqueInvoices', np.float64(1.0), 'UniqueInvoices'),
 ('NegativeQuantityCount',
  'CancelledTransactions',
  np.float64(1.0),
  'CancelledTransactions'),
 ('UniqueProducts',
  'UniqueDescriptions',
  np.float64(0.9999299055172687),
  'UniqueProducts'),
 ('MonetaryMin', 'MonetaryMax', np.float64(0.9939474266440925), 'MonetaryMin'),
 ('Age', 'AgeCategory', np.float64(0.9813350752506724), 'AgeCategory'),
 ('MonetaryStd', 'MinQuantity', np.float64(0.9740468050249654), 'MinQuantity'),
 ('MonetaryStd', 'MaxQuantity', np.float64(0.9725849496075486), 'MaxQuantity'),
 ('MonetaryStd', 'MonetaryMin', np.float64(0.9673247200463374), 'MonetaryMin'),
 ('MonetaryStd', 'MonetaryMax', np.float64(0.9661346474200498), 'MonetaryMax'),
 ('AvgProductsPerTransaction',
  'AvgLinesPerInvoice',
  np.float64(0.9632101745073993),
  'AvgProductsPerTransaction'),
 ('MinQuantity', 'MaxQuantity', np.float64(0.9611744124388267), 'MaxQuantity'),
 ('CustomerTenureDays',
  'LoyaltyLevel',
  np.float64(0.9

In [7]:
# Collect the "less important" member (fourth tuple field) of every highly
# correlated pair; these are the candidates for removal.
# NOTE(review): every pair contributes a feature, even when its partner was already
# selected through another pair, so this can flag more features than strictly needed.
features_to_drop = {entry[3] for entry in high_corr_with_churn}

features_to_drop

{'AgeCategory',
 'AvgProductsPerTransaction',
 'CancelledTransactions',
 'CustomerType_Perdu',
 'LoyaltyLevel',
 'MaxQuantity',
 'MinQuantity',
 'MonetaryMax',
 'MonetaryMin',
 'MonetaryTotal',
 'PreferredHour',
 'Region_UK',
 'RegistrationYear',
 'TotalTransactions',
 'UniqueInvoices',
 'UniqueProducts',
 'WeekendPreference_Weekend'}

In [8]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def _vif_table(frame: pd.DataFrame) -> pd.DataFrame:
    """Compute the variance inflation factor (VIF) of every column in ``frame``.

    Args:
        frame: Numeric feature matrix without NaN values.

    Returns:
        DataFrame with columns ``feature`` and ``VIF``, sorted by VIF in
        descending order (an empty table when ``frame`` has no columns).
    """
    values = frame.values
    return pd.DataFrame(
        {
            "feature": frame.columns,
            "VIF": [
                variance_inflation_factor(values, i) for i in range(frame.shape[1])
            ],
        }
    ).sort_values("VIF", ascending=False)


# Prepare feature matrix for VIF (exclude target)
X_vif = (
    # Leave out the features already flagged by the pairwise-correlation step so the
    # VIF pass only surfaces additional collinearity.
    features_df.drop(columns=list(features_to_drop), errors="ignore")
    .select_dtypes(include=[np.number, "bool"])
    .astype(float)
)

# VIF cannot handle NaN values, so fill gaps with each column's median.
X_vif = X_vif.fillna(X_vif.median(numeric_only=True))

# NOTE(review): no intercept column is added before calling variance_inflation_factor;
# the regression uses exactly the columns given. Consider whether a constant column
# should be included — confirm against the statsmodels documentation.
vif_threshold = 10.0
features_to_drop_vif = []

# Repeatedly drop the feature with the highest VIF until the worst remaining VIF is
# finite and at most `vif_threshold`. The emptiness check prevents an IndexError if
# every column ends up being dropped.
vif_scores = _vif_table(X_vif)
while not vif_scores.empty:
    max_feature = vif_scores.iloc[0]["feature"]
    max_vif = vif_scores.iloc[0]["VIF"]

    if np.isfinite(max_vif) and max_vif <= vif_threshold:
        break

    features_to_drop_vif.append(max_feature)
    X_vif = X_vif.drop(columns=[max_feature])
    vif_scores = _vif_table(X_vif)

# Final VIF table after iterative drops: the table from the last iteration already
# describes the surviving columns, so it is reused rather than recomputed.
final_vif = vif_scores

features_to_drop_vif = set(features_to_drop_vif)
features_to_drop_vif, final_vif.head(20)

({'BasketSizeCategory',
  'Country',
  'DaysSinceRegistration',
  'FirstPurchaseDaysAgo',
  'GeoIP',
  'RFMSegment',
  'SpendingCategory',
  'UniqueCountries'},
                      feature       VIF
 16                       Age  9.956508
 8             PreferredMonth  9.030529
 19        PreferredTimeOfDay  6.472549
 11        UniqueDescriptions  6.162488
 1                  Frequency  6.006228
 6         CustomerTenureDays  5.698298
 0                    Recency  5.639207
 18         SatisfactionScore  5.534103
 15        AvgLinesPerInvoice  5.203764
 7         PreferredDayOfWeek  5.166373
 45         RegistrationMonth  4.757821
 46           RegistrationDay  4.172900
 21  CustomerType_Occasionnel  3.235785
 47     RegistrationDayOfWeek  3.202385
 20      CustomerType_Nouveau  3.094922
 23      FavoriteSeason_Hiver  2.794140
 24  FavoriteSeason_Printemps  2.719504
 12     NegativeQuantityCount  2.391126
 9       WeekendPurchaseRatio  2.318822
 22     CustomerType_RÃ©gulier  2.26030

In [9]:
# Every feature flagged by either the VIF pass or the pairwise-correlation pass.
features_to_drop_vif.union(features_to_drop)

{'AgeCategory',
 'AvgProductsPerTransaction',
 'BasketSizeCategory',
 'CancelledTransactions',
 'Country',
 'CustomerType_Perdu',
 'DaysSinceRegistration',
 'FirstPurchaseDaysAgo',
 'GeoIP',
 'LoyaltyLevel',
 'MaxQuantity',
 'MinQuantity',
 'MonetaryMax',
 'MonetaryMin',
 'MonetaryTotal',
 'PreferredHour',
 'RFMSegment',
 'Region_UK',
 'RegistrationYear',
 'SpendingCategory',
 'TotalTransactions',
 'UniqueCountries',
 'UniqueInvoices',
 'UniqueProducts',
 'WeekendPreference_Weekend'}

In [10]:
# Absolute correlation of every column with Age, strongest first.
age_corr = (
    df.corr()
    .loc[:, "Age"]
    .abs()
    .sort_values(ascending=False)
)

age_corr

Age                      1.000000
AgeCategory              0.981335
FavoriteSeason_Hiver     0.060121
BasketSizeCategory       0.039999
NegativeQuantityCount    0.039449
                           ...   
CustomerType_Perdu       0.001801
Region_Autre             0.001173
GeoIP                    0.000683
Region_Europe du Nord    0.000604
Country                  0.000274
Name: Age, Length: 75, dtype: float64