In [1]:
import pandas as pd
import numpy as np

# Read the preprocessed churn dataset into a DataFrame
df = pd.read_csv('../data/processed/preprocessed_churn_data.csv')

# Horizontal rule reused under every section heading
heavy_rule = "=" * 50

# Numbered listing of every column name (numbering starts at 1)
print("All columns in the dataset:")
print(heavy_rule)
for i, col in enumerate(df.columns, start=1):
    print(f"{i:2d}. {col}")

# For each column: its dtype and the first five entries of its unique() values
print("\nColumn information and sample values:")
print(heavy_rule)
for col in df.columns:
    print(
        f"\nColumn: {col}",
        f"Type: {df[col].dtype}",
        f"Sample unique values: {df[col].unique()[:5]}",
        "-" * 30,
        sep="\n",
    )

# Summary statistics as returned by DataFrame.describe()
print("\nNumeric columns statistics:")
print(heavy_rule)
print(df.describe())

All columns in the dataset:
 1. visitorid
 2. ses_rec
 3. ses_rec_avg
 4. ses_rec_sd
 5. ses_rec_cv
 6. user_rec
 7. ses_n
 8. ses_n_r
 9. int_n
10. int_n_r
11. tran_n
12. tran_n_r
13. rev_sum
14. rev_sum_r
15. major_spend_r
16. int_cat_n_avg
17. int_itm_n_avg
18. ses_mo_avg
19. ses_mo_sd
20. ses_ho_avg
21. ses_ho_sd
22. ses_wknd_r
23. ses_len_avg
24. time_to_int
25. time_to_tran
26. int_cat1_n
27. int_cat2_n
28. int_cat3_n
29. int_cat4_n
30. int_cat5_n
31. int_cat6_n
32. int_cat7_n
33. int_cat8_n
34. int_cat9_n
35. int_cat10_n
36. int_cat11_n
37. int_cat12_n
38. int_cat13_n
39. int_cat15_n
40. int_cat16_n
41. int_cat17_n
42. int_cat18_n
43. int_cat19_n
44. int_cat20_n
45. int_cat21_n
46. int_cat22_n
47. int_cat23_n
48. int_cat24_n
49. target_class
50. engagement_ratio
51. conversion_rate
52. avg_items_per_interaction
53. session_consistency
54. weekend_preference
55. category_diversity
56. avg_transaction_value
57. value_segment
58. recency_score
59. user_lifetime
60. activity_regular

# Column Descriptions

## User Identification
- `visitorid`: Unique identifier for each customer

## Session and Interaction Metrics
- `ses_rec`: Recent session activity
- `ses_n`: Number of sessions
- `int_n`: Number of interactions
- `tran_n`: Number of transactions
- `rev_sum`: Total revenue/purchase amount

## Time-based Metrics
- `time_to_int`: Time to interaction
- `time_to_tran`: Time to transaction
- `user_lifetime`: Customer's lifetime with the platform

## Engagement Metrics
- `engagement_ratio`: User engagement score
- `conversion_rate`: Conversion rate for the user
- `avg_items_per_interaction`: Average items per interaction
- `session_consistency`: Consistency of user sessions
- `weekend_preference`: Preference for weekend activity

## Value and Risk Metrics
- `avg_transaction_value`: Average value per transaction
- `value_segment`: Customer value segment (Low, Medium, High)
- `churn_risk_score`: Probability of customer churning
- `risk_segment`: Risk categorization (Low, Medium, High, Very High)

## Behavioral Metrics
- `category_diversity`: Diversity in category interactions
- `activity_regularity`: Regularity of user activity
- `peak_hour_activity`: Activity during peak hours
- `activity_decline`: Whether activity is declining

## Target Variable
- `target_class`: Churn status (1 for churned, 0 for not churned)

## Category Interaction Counts
- `int_cat1_n` through `int_cat24_n`: Interaction counts for different product categories (23 columns; there is no `int_cat14_n`)

In [3]:
import pandas as pd
import numpy as np

# Load the preprocessed data (same CSV path as the first cell; this cell re-reads
# the file and rebinds `df` rather than reusing the earlier frame)
df = pd.read_csv('../data/processed/preprocessed_churn_data.csv')

# Create a structured summary of columns
def get_column_groups():
    """Build the category -> column-name mapping used to organise the column summary.

    Returns:
        dict: A newly built dict on every call. Keys are category labels (in the
        order they should be reported) and values are lists of column-name strings.
    """
    groups = {}

    groups['Session Metrics'] = [
        'ses_rec', 'ses_rec_avg', 'ses_rec_sd', 'ses_rec_cv',
        'ses_n', 'ses_n_r',
        'ses_mo_avg', 'ses_mo_sd',
        'ses_ho_avg', 'ses_ho_sd',
        'ses_wknd_r',
        'ses_len_avg',
    ]
    groups['User Metrics'] = ['visitorid', 'user_rec', 'user_lifetime']
    groups['Interaction Metrics'] = [
        'int_n', 'int_n_r',
        'int_cat_n_avg', 'int_itm_n_avg',
    ]
    groups['Transaction Metrics'] = [
        'tran_n', 'tran_n_r',
        'rev_sum', 'rev_sum_r',
        'major_spend_r',
    ]
    groups['Time Metrics'] = ['time_to_int', 'time_to_tran']

    # Category columns are numbered 1 through 24, with number 14 left out.
    groups['Category Interactions'] = [
        f'int_cat{number}_n' for number in range(1, 25) if number != 14
    ]

    groups['Engagement and Behavioral'] = [
        'engagement_ratio',
        'conversion_rate',
        'avg_items_per_interaction',
        'session_consistency',
        'weekend_preference',
        'category_diversity',
        'avg_transaction_value',
    ]
    groups['Segments and Risk'] = [
        'value_segment',
        'recency_score',
        'recency_segment',
        'activity_regularity',
        'peak_hour_activity',
        'activity_decline',
        'low_engagement',
        'recency_risk',
        'engagement_risk',
        'value_risk',
        'churn_risk_score',
        'risk_segment',
    ]
    groups['Target'] = ['target_class']

    return groups

# Get column groups
column_groups = get_column_groups()


def _format_column_line(frame, col):
    """Return the aligned '  - name | Type | Example' report line for one column.

    Args:
        frame: DataFrame that contains ``col``.
        col: Column label to describe.

    Returns:
        str: The column name, its dtype, and the first row's value (stringified
        and cut to 50 characters), or ``n/a`` as the example when ``frame`` has
        no rows.
    """
    dtype = str(frame[col].dtype)
    # .iloc[0] raises IndexError on a frame with zero rows, so fall back to a placeholder.
    sample = str(frame[col].iloc[0])[:50] if len(frame) else "n/a"  # Truncate long values
    return f"  - {col:<25} | Type: {dtype:<15} | Example: {sample}"


# Print summary
print("COMPLETE COLUMN SUMMARY")
print("=" * 50)
print(f"Total number of columns: {len(df.columns)}")
print("\nBreakdown by category:")
print("-" * 30)

for category, cols in column_groups.items():
    print(f"\n{category} ({len(cols)} columns):")
    for col in cols:
        if col in df.columns:
            print(_format_column_line(df, col))
        else:
            # Grouped name that the loaded dataset does not contain
            print(f"  - {col:<25} | Not found in dataset")

# Verify all columns are accounted for
all_grouped_cols = [col for cols in column_groups.values() for col in cols]
missing_cols = set(df.columns) - set(all_grouped_cols)   # in the dataset, but in no group
extra_cols = set(all_grouped_cols) - set(df.columns)     # in a group, but not in the dataset

if missing_cols:
    print("\nWarning: Following columns are not categorized:")
    for col in sorted(missing_cols):
        print(_format_column_line(df, col))

if extra_cols:
    print("\nWarning: Following categorized columns are not in dataset:")
    for col in sorted(extra_cols):
        print(f"  - {col}")

COMPLETE COLUMN SUMMARY
Total number of columns: 69

Breakdown by category:
------------------------------

Session Metrics (12 columns):
  - ses_rec                   | Type: float64         | Example: 0.4375
  - ses_rec_avg               | Type: float64         | Example: 0.9649122807017544
  - ses_rec_sd                | Type: float64         | Example: 0.0
  - ses_rec_cv                | Type: float64         | Example: 0.0
  - ses_n                     | Type: float64         | Example: 0.0
  - ses_n_r                   | Type: float64         | Example: -0.3736263514536896
  - ses_mo_avg                | Type: float64         | Example: 0.0
  - ses_mo_sd                 | Type: float64         | Example: 0.0
  - ses_ho_avg                | Type: float64         | Example: -0.5419354489073905
  - ses_ho_sd                 | Type: float64         | Example: 1.0283819382372748
  - ses_wknd_r                | Type: float64         | Example: 1.0
  - ses_len_avg               | Type: 