In [None]:
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the three raw CSV extracts (transactions, products, households) from
# the working directory into their own DataFrames, in that order.
transactions_df, products_df, households_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        '400_transactions.csv',
        '400_products.csv',
        '400_households.csv',
    )
)

In [None]:
# Parse the purchase date column (month/day/four-digit-year strings) into
# real timestamps so date arithmetic works below.
transactions_df['PURCHASE_'] = pd.to_datetime(
    transactions_df['PURCHASE_'],
    format='%m/%d/%Y',
)

# Reference point for "now": the most recent purchase anywhere in the data.
dataset_end_date = transactions_df['PURCHASE_'].max()
churn_threshold = pd.Timedelta(days=90)  # 3 months

# Most recent purchase per household.
last_purchase_dates = transactions_df.groupby('HSHD_NUM')['PURCHASE_'].max()

# A household counts as churned when its last purchase is more than
# `churn_threshold` before the end of the dataset.
churned_customers = (dataset_end_date - last_purchase_dates) > churn_threshold

# Two-column lookup table: HSHD_NUM -> is_churned (boolean).
churn_status = churned_customers.rename('is_churned').reset_index()

In [None]:
def calculate_customer_features(transactions_df, households_df, churn_labels=None):
    """
    Build one row of behavioural and demographic features per household.

    Parameters
    ----------
    transactions_df : pandas.DataFrame
        Transaction rows with ``HSHD_NUM``, ``BASKET_NUM``, ``SPEND`` and
        ``PURCHASE_`` columns. ``PURCHASE_`` must already be a datetime
        column, because the lifetime is taken with ``.dt.days``.
    households_df : pandas.DataFrame
        Household attributes keyed by ``HSHD_NUM``; left-joined onto the
        behavioural features.
    churn_labels : pandas.DataFrame, optional
        Frame with ``HSHD_NUM`` and ``is_churned`` columns. When omitted, the
        notebook-level ``churn_status`` frame is used, which preserves the
        original two-argument call.

    Returns
    -------
    pandas.DataFrame
        One row per household found in ``transactions_df`` with:
        ``total_trips`` (distinct baskets), ``avg_spend`` / ``total_spend`` /
        ``spend_std`` (mean / sum / std of ``SPEND`` over transaction rows),
        ``first_purchase``, ``last_purchase``, ``customer_lifetime`` (days
        between first and last purchase), ``avg_time_between_trips``, the
        household columns and ``is_churned``.

    Notes
    -----
    ``avg_time_between_trips`` is ``customer_lifetime / (total_trips - 1)``:
    N trips are separated by N - 1 gaps. Households with a single trip have
    no gap, so the value is NaN for them instead of a misleading 0.
    """
    if churn_labels is None:
        # Backward-compatible fallback to the frame built in the labelling cell.
        churn_labels = churn_status

    # Named aggregation ties every output column to its source column and
    # statistic, instead of renaming a flattened MultiIndex by position.
    customer_patterns = (
        transactions_df
        .groupby('HSHD_NUM')
        .agg(
            total_trips=('BASKET_NUM', 'nunique'),
            avg_spend=('SPEND', 'mean'),
            total_spend=('SPEND', 'sum'),
            spend_std=('SPEND', 'std'),
            first_purchase=('PURCHASE_', 'min'),
            last_purchase=('PURCHASE_', 'max'),
        )
        .reset_index()
    )

    customer_patterns['customer_lifetime'] = (
        customer_patterns['last_purchase'] - customer_patterns['first_purchase']
    ).dt.days

    # Number of gaps between consecutive trips; NaN when there is only one
    # trip so the division below can never produce 0/0 or a division by zero.
    trip_counts = customer_patterns['total_trips']
    gap_counts = (trip_counts - 1).where(trip_counts > 1)
    customer_patterns['avg_time_between_trips'] = (
        customer_patterns['customer_lifetime'] / gap_counts
    )

    # Left joins keep every household that has transactions, even when the
    # demographic or label row is missing.
    customer_features = customer_patterns.merge(
        households_df,
        on='HSHD_NUM',
        how='left'
    )
    customer_features = customer_features.merge(
        churn_labels,
        on='HSHD_NUM',
        how='left'
    )

    return customer_features

# Build the per-household feature table used by every later cell.
customer_features = calculate_customer_features(transactions_df, households_df)

# 2x2 overview of how churn relates to spend, shopping cadence and
# household demographics.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_spend, ax_cadence), (ax_income, ax_size) = axes

# Top-left: spend distribution for churned vs. retained households.
sns.boxplot(x='is_churned', y='avg_spend', data=customer_features, ax=ax_spend)
ax_spend.set_title('Average Spend by Churn Status')
ax_spend.set_xlabel('Churned')
ax_spend.set_ylabel('Average Spend per Trip')

# Top-right: trip cadence for churned vs. retained households.
sns.boxplot(
    x='is_churned',
    y='avg_time_between_trips',
    data=customer_features,
    ax=ax_cadence,
)
ax_cadence.set_title('Shopping Frequency by Churn Status')
ax_cadence.set_xlabel('Churned')
ax_cadence.set_ylabel('Average Days Between Trips')

# Bottom-left: share of churned households within each income band
# (the mean of a boolean column is the churn rate).
churn_by_income = customer_features.groupby('INCOME_RANGE')['is_churned'].mean()
churn_by_income.plot(kind='bar', ax=ax_income)
ax_income.set_title('Churn Rate by Income Range')
for tick_label in ax_income.get_xticklabels():
    tick_label.set_rotation(45)
ax_income.set_ylabel('Churn Rate')

# Bottom-right: share of churned households per household size.
churn_by_size = customer_features.groupby('HH_SIZE')['is_churned'].mean()
churn_by_size.plot(kind='bar', ax=ax_size)
ax_size.set_title('Churn Rate by Household Size')
ax_size.set_xlabel('Household Size')
ax_size.set_ylabel('Churn Rate')

fig.tight_layout()
plt.show()

In [None]:
# Churn model: impute -> one-hot encode -> scale -> logistic regression.
#
# NOTE(review): customer_lifetime, total_trips and last_purchase-derived
# features are computed over the full transaction history, including the final
# 90 days whose inactivity defines is_churned. The label is therefore partly
# encoded in the features and the metrics below are likely optimistic.

numeric_features = [
    'total_trips', 'avg_spend', 'total_spend', 'spend_std',
    'customer_lifetime', 'avg_time_between_trips', 'HH_SIZE', 'CHILDREN'
]

categorical_features = ['INCOME_RANGE', 'L', 'AGE_RANGE', 'MARITAL', 'HOMEOWNER']

# Report missing values on the raw feature columns. (`feature_df` does not
# exist yet at this point, so it cannot be inspected here on a fresh kernel.)
print("\nMissing values in our features:")
print(customer_features[numeric_features + categorical_features].isnull().sum())

target = customer_features['is_churned']

# Decide the train/test rows first, so every preprocessing statistic below is
# learned from training rows only. Splitting row positions with the same
# test_size / random_state selects the same rows as splitting the frame itself.
train_pos, test_pos = train_test_split(
    np.arange(len(customer_features)),
    test_size=0.2,
    random_state=42
)

numeric_raw = customer_features[numeric_features]
categorical_raw = customer_features[categorical_features]

# Fit imputers on the training rows, then fill gaps in every row.
numeric_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')
numeric_imputer.fit(numeric_raw.iloc[train_pos])
categorical_imputer.fit(categorical_raw.iloc[train_pos])

numeric_df = pd.DataFrame(
    numeric_imputer.transform(numeric_raw),
    columns=numeric_features,
    index=numeric_raw.index
)

categorical_df = pd.DataFrame(
    categorical_imputer.transform(categorical_raw),
    columns=categorical_features,
    index=categorical_raw.index
)

# One-hot encode over all rows so train and test share one column layout.
categorical_dummies = pd.get_dummies(categorical_df)

feature_df = pd.concat([numeric_df, categorical_dummies], axis=1)

# Standardise numeric columns using the training rows' mean and variance.
scaler = StandardScaler()
scaler.fit(feature_df[numeric_features].iloc[train_pos])
feature_df[numeric_features] = scaler.transform(feature_df[numeric_features])

# feature_df stays row-aligned with customer_features; later cells score
# every household with it.
assert feature_df.index.equals(customer_features.index)

X_train = feature_df.iloc[train_pos]
X_test = feature_df.iloc[test_pos]
y_train = target.iloc[train_pos]
y_test = target.iloc[test_pos]

model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

# Evaluate model performance
y_pred = model.predict(X_test)
print("\nChurn Prediction Model Performance:")
print(classification_report(y_test, y_pred))

# Rank features by absolute coefficient size; the numeric inputs are
# standardised, so their magnitudes are on a comparable scale.
feature_importance = pd.DataFrame({
    'feature': feature_df.columns,
    'importance': np.abs(model.coef_[0])
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

plt.figure(figsize=(12, 6))
sns.barplot(x='importance', y='feature', data=feature_importance.head(10))
plt.title('Top 10 Features for Predicting Customer Churn')
plt.show()

In [None]:
# Score every household: column 1 of predict_proba is the probability of the
# second class in model.classes_.
churn_proba = model.predict_proba(feature_df)
customer_features['churn_probability'] = churn_proba[:, 1]

# Split households into four equally sized risk bands by predicted
# probability (quartiles, lowest to highest).
risk_labels = ['Low Risk', 'Medium-Low Risk', 'Medium-High Risk', 'High Risk']
customer_features['risk_segment'] = pd.qcut(
    customer_features['churn_probability'],
    q=4,
    labels=risk_labels
)

print("\nCharacteristics of High-Risk Customers:")
in_top_band = customer_features['risk_segment'] == 'High Risk'
high_risk = customer_features.loc[in_top_band]

# Summarise the behavioural metrics of the top risk band.
summary_columns = ['avg_spend', 'total_trips', 'avg_time_between_trips']
print("\nAverage Metrics for High-Risk Customers:")
print(high_risk[summary_columns].mean())

def generate_retention_recommendations(customer_features):
    """
    Turn summary statistics of the customer table into retention actions.

    Three independent rules are evaluated, in this order:

    1. mean ``avg_spend`` is below its median -> targeted promotions;
    2. mean ``avg_time_between_trips`` exceeds 14 -> early warning system;
    3. both loyalty flags ``'Y'`` and ``'N'`` are present in column ``L`` and
       the ``'Y'`` group has the lower mean ``is_churned`` -> loyalty
       programme push.

    Returns the list of recommendation strings whose rule fired (possibly
    empty), in the order above.
    """
    spend_per_customer = customer_features['avg_spend']
    spend_skews_low = spend_per_customer.mean() < spend_per_customer.median()

    trips_are_sparse = customer_features['avg_time_between_trips'].mean() > 14

    # Churn rate per loyalty flag; `in` on a Series tests its index labels.
    churn_rate_by_flag = customer_features.groupby('L')['is_churned'].mean()
    loyalty_reduces_churn = (
        'Y' in churn_rate_by_flag
        and 'N' in churn_rate_by_flag
        and churn_rate_by_flag['Y'] < churn_rate_by_flag['N']
    )

    rules = (
        (
            spend_skews_low,
            "Implement targeted promotions for high-risk customers based on their purchase history",
        ),
        (
            trips_are_sparse,
            "Develop an early warning system for customers showing decreased shopping frequency",
        ),
        (
            loyalty_reduces_churn,
            "Enhance loyalty program benefits and actively recruit non-members",
        ),
    )
    return [advice for triggered, advice in rules if triggered]

# Print each triggered retention strategy as a bullet point.
print("\nRecommended Retention Strategies:")
retention_actions = generate_retention_recommendations(customer_features)
for action in retention_actions:
    print(f"- {action}")