In [14]:
import sys
from pathlib import Path

# Make the repository root (the parent of the current working directory)
# importable so that `src.*` modules resolve from inside this notebook.
project_root = Path.cwd().parent
if not any(entry == str(project_root) for entry in sys.path):
    # Prepend so the project's packages take precedence on import.
    sys.path.insert(0, str(project_root))

In [15]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from src.preprocessing import prepare_features

ImportError: cannot import name 'prepare_features' from 'src.preprocessing' (d:\ml_prj\src\preprocessing.py)

In [None]:
# Load the raw customer table. The path is relative, so it is resolved
# against the kernel's current working directory.
df = pd.read_csv(
    filepath_or_buffer="../data/raw/retail_customers_COMPLETE_CATEGORICAL.csv"
)

In [None]:
# Run the project's feature-preparation step on the loaded frame.
# NOTE: `df` is rebound to the result, so re-running this cell alone feeds the
# already-prepared frame back into prepare_features.
df = df.pipe(prepare_features)

In [None]:
# Feature-only view for the correlation analysis: everything in `df` except
# the Churn target and the ChurnRiskCategory column.
features_df = df.drop(["Churn", "ChurnRiskCategory"], axis="columns")

In [None]:
# Find feature pairs whose absolute pairwise correlation exceeds `threshold`
# and, for each pair, record which member is less correlated with Churn.
threshold = 0.8

# A single correlation pass over the full frame serves both purposes: the
# feature-vs-feature matrix and the feature-vs-Churn column. Each pairwise
# correlation is computed from its own two columns only, so the feature
# submatrix carries the same values as features_df.corr().
full_corr = df.corr().abs()
feature_cols = full_corr.columns[full_corr.columns.isin(features_df.columns)]
corr_matrix = full_corr.loc[feature_cols, feature_cols]
churn_corr = full_corr["Churn"]

# Strict upper triangle (k=1): every unordered pair is reported exactly once
# and the diagonal (self-correlation) is never considered.
upper_mask = np.triu(corr_matrix.to_numpy() > threshold, k=1)
high_corr = [
    (corr_matrix.index[i], corr_matrix.columns[j], corr_matrix.iloc[i, j])
    for i, j in zip(*np.nonzero(upper_mask))
]

# Each entry: (feature 1, feature 2, |corr|, less-important feature).
# A conditional expression is used instead of `cond and a or b`, which would
# wrongly fall through to the second feature whenever the first label is falsy
# (e.g. 0 or ""). Ties and NaN comparisons select the second feature.
high_corr_with_churn = sorted(
    (
        (
            feat1,
            feat2,
            corr_value,
            feat1 if churn_corr[feat1] < churn_corr[feat2] else feat2,
        )
        for feat1, feat2, corr_value in high_corr
    ),
    key=lambda entry: entry[2],
    reverse=True,  # strongest correlations first; the sort is stable
)


high_corr_with_churn

[('Frequency', 'UniqueInvoices', np.float64(1.0), 'UniqueInvoices'),
 ('NegativeQuantityCount',
  'CancelledTransactions',
  np.float64(1.0),
  'CancelledTransactions'),
 ('UniqueProducts',
  'UniqueDescriptions',
  np.float64(0.9999299055172687),
  'UniqueProducts'),
 ('MonetaryMin', 'MonetaryMax', np.float64(0.9939474266440925), 'MonetaryMin'),
 ('Age', 'AgeCategory', np.float64(0.9813350752506724), 'AgeCategory'),
 ('MonetaryStd', 'MinQuantity', np.float64(0.9740468050249654), 'MinQuantity'),
 ('MonetaryStd', 'MaxQuantity', np.float64(0.9725849496075486), 'MaxQuantity'),
 ('MonetaryStd', 'MonetaryMin', np.float64(0.9673247200463374), 'MonetaryMin'),
 ('MonetaryStd', 'MonetaryMax', np.float64(0.9661346474200498), 'MonetaryMax'),
 ('AvgProductsPerTransaction',
  'AvgLinesPerInvoice',
  np.float64(0.9632101745073993),
  'AvgProductsPerTransaction'),
 ('MinQuantity', 'MaxQuantity', np.float64(0.9611744124388267), 'MaxQuantity'),
 ('CustomerTenureDays',
  'LoyaltyLevel',
  np.float64(0.9

In [None]:
# Gather the features that should be dropped: for every highly correlated
# pair, take the member flagged as less correlated with Churn. Using a set
# de-duplicates features that are flagged by more than one pair.
features_to_drop = {weaker for _, _, _, weaker in high_corr_with_churn}

features_to_drop

{'AgeCategory',
 'AvgProductsPerTransaction',
 'CancelledTransactions',
 'CustomerType_Perdu',
 'LoyaltyLevel',
 'MaxQuantity',
 'MinQuantity',
 'MonetaryMax',
 'MonetaryMin',
 'MonetaryTotal',
 'PreferredHour',
 'Region_UK',
 'RegistrationYear',
 'TotalTransactions',
 'UniqueInvoices',
 'UniqueProducts',
 'WeekendPreference_Weekend'}

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

def _vif_table(features: pd.DataFrame) -> pd.DataFrame:
    """Return the variance inflation factor of every column in `features`.

    An intercept column is added with ``sm.add_constant`` before scoring,
    because ``variance_inflation_factor`` regresses each column on the others
    exactly as given and does not add an intercept itself. The ``const`` row
    is removed from the result so it can never be picked for elimination.

    Parameters
    ----------
    features : pd.DataFrame
        Numeric, NaN-free feature matrix with at least one column.

    Returns
    -------
    pd.DataFrame
        Columns ``feature`` and ``VIF``, sorted by ``VIF`` in descending order.
    """
    design = sm.add_constant(features)
    design_values = design.values
    scores = pd.DataFrame(
        {
            "feature": design.columns,
            "VIF": [
                variance_inflation_factor(design_values, i)
                for i in range(design_values.shape[1])
            ],
        }
    )
    return scores[scores["feature"] != "const"].sort_values("VIF", ascending=False)


# Prepare feature matrix for VIF (exclude target)
X_vif = (
    # Skip the features already flagged by the pairwise-correlation pass so the
    # VIF pass does not re-detect the same redundancy. The set is converted to a
    # list of labels: wrapping it as `[features_to_drop]` would pass the set
    # itself as a single label and none of the intended columns would be removed.
    features_df.drop(columns=list(features_to_drop), errors="ignore")
    .select_dtypes(include=[np.number, "bool"])
    .astype(float)
)

# VIF cannot handle NaN values
X_vif = X_vif.fillna(X_vif.median(numeric_only=True))

vif_threshold = 10.0
features_to_drop_vif = []
# Stays empty if every feature ends up being eliminated.
final_vif = pd.DataFrame(columns=["feature", "VIF"])

# Iteratively remove the feature with the highest VIF until the worst
# remaining one is finite and at or below the threshold. The column-count
# guard prevents an IndexError on `iloc[0]` once nothing is left to score.
while X_vif.shape[1] > 0:
    vif_scores = _vif_table(X_vif)

    # Grab the worst offender
    max_feature = vif_scores.iloc[0]["feature"]
    max_vif = vif_scores.iloc[0]["VIF"]

    # Stop once the worst offender is acceptable; the table just computed
    # already describes the final feature set, so it is reused as-is.
    if np.isfinite(max_vif) and max_vif <= vif_threshold:
        final_vif = vif_scores
        break

    # Otherwise, drop it and repeat
    features_to_drop_vif.append(max_feature)
    X_vif = X_vif.drop(columns=[max_feature])

features_to_drop_vif = set(features_to_drop_vif)

# Display results
print(f"Dropped {len(features_to_drop_vif)} features due to VIF.")
features_to_drop_vif, final_vif.head(20)

Dropped 1 features due to VIF.


({'CustomerTenureDays'},
                         feature       VIF
 21                   RFMSegment  5.817318
 1                       Recency  5.700959
 2                     Frequency  4.769223
 12           UniqueDescriptions  4.326566
 7          FirstPurchaseDaysAgo  3.883543
 22             SpendingCategory  3.745612
 27     CustomerType_Occasionnel  3.148528
 26         CustomerType_Nouveau  3.141822
 25                      Country  2.814175
 17           AvgLinesPerInvoice  2.606223
 30     FavoriteSeason_Printemps  2.468894
 14        NegativeQuantityCount  2.252092
 29         FavoriteSeason_Hiver  2.207449
 28        CustomerType_Régulier  2.184555
 55        DaysSinceRegistration  2.134249
 9                PreferredMonth  1.946256
 10         WeekendPurchaseRatio  1.931992
 5                 TotalQuantity  1.926559
 44      ProductDiversity_Modéré  1.913370
 45  ProductDiversity_Spécialisé  1.897545)

In [None]:
# Combined drop list: features eliminated by the VIF pass together with the
# ones flagged by the pairwise-correlation pass.
features_to_drop_vif.union(features_to_drop)

{'AgeCategory',
 'AvgProductsPerTransaction',
 'BasketSizeCategory',
 'CancelledTransactions',
 'Country',
 'CustomerType_Perdu',
 'DaysSinceRegistration',
 'FirstPurchaseDaysAgo',
 'GeoIP',
 'LoyaltyLevel',
 'MaxQuantity',
 'MinQuantity',
 'MonetaryMax',
 'MonetaryMin',
 'MonetaryTotal',
 'PreferredHour',
 'RFMSegment',
 'Region_UK',
 'RegistrationYear',
 'SpendingCategory',
 'TotalTransactions',
 'UniqueCountries',
 'UniqueInvoices',
 'UniqueProducts',
 'WeekendPreference_Weekend'}

Age                      1.000000
AgeCategory              0.981335
FavoriteSeason_Hiver     0.060121
BasketSizeCategory       0.039999
NegativeQuantityCount    0.039449
                           ...   
CustomerType_Perdu       0.001801
Region_Autre             0.001173
GeoIP                    0.000683
Region_Europe du Nord    0.000604
Country                  0.000274
Name: Age, Length: 75, dtype: float64