In [36]:
import pandas as pd

df = pd.read_csv('train.csv')

df.info(), df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 490513 entries, 0 to 490512
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   PERIOD        490513 non-null  object 
 1   cl_id         490513 non-null  int64  
 2   MCC           490513 non-null  int64  
 3   channel_type  487603 non-null  object 
 4   currency      490513 non-null  int64  
 5   TRDATETIME    490513 non-null  object 
 6   amount        490513 non-null  float64
 7   trx_category  490513 non-null  object 
 8   target_flag   490513 non-null  int64  
 9   target_sum    490513 non-null  float64
dtypes: float64(2), int64(4), object(4)
memory usage: 37.4+ MB


(None,
        PERIOD  cl_id   MCC channel_type  currency        TRDATETIME   amount  \
 0  01/10/2017      0  5200          NaN       810  21OCT17:00:00:00   5023.0   
 1  01/10/2017      0  6011          NaN       810  12OCT17:12:24:07  20000.0   
 2  01/12/2017      0  5921          NaN       810  05DEC17:00:00:00    767.0   
 3  01/10/2017      0  5411          NaN       810  21OCT17:00:00:00   2031.0   
 4  01/10/2017      0  6012          NaN       810  24OCT17:13:14:24  36562.0   
 
   trx_category  target_flag  target_sum  
 0          POS            0         0.0  
 1      DEPOSIT            0         0.0  
 2          POS            0         0.0  
 3          POS            0         0.0  
 4      C2C_OUT            0         0.0  )

In [37]:
df.dropna(inplace=True)
df.isnull().sum()

PERIOD          0
cl_id           0
MCC             0
channel_type    0
currency        0
TRDATETIME      0
amount          0
trx_category    0
target_flag     0
target_sum      0
dtype: int64

In [38]:
# Feature engineering
df['TRDATETIME'] = pd.to_datetime(df['TRDATETIME'], format='%d%b%y:%H:%M:%S')

# Number of tx per customer
transaction_freq = df.groupby(
    'cl_id').size().reset_index(name='transaction_freq')

# Avg tx amount per customer
avg_transaction_amount = df.groupby(
    'cl_id')['amount'].mean().reset_index(name='avg_transaction_amount')

# Days since last transaction
last_transaction = df.groupby(
    'cl_id')['TRDATETIME'].max().reset_index(name='last_transaction')
last_transaction['time_since_last_transaction'] = (
    pd.to_datetime('now') - last_transaction['last_transaction']).dt.days

# Unique MCC codes per customer
mcc_diversity = df.groupby(
    'cl_id')['MCC'].nunique().reset_index(name='mcc_diversity')

# Preferred channel per customer
channel_preference = df.groupby(
    ['cl_id', 'channel_type']).size().reset_index(name='channel_count')
channel_preference = channel_preference.loc[channel_preference.groupby(
    'cl_id')['channel_count'].idxmax()].reset_index(drop=True)
channel_preference = channel_preference[['cl_id', 'channel_type']].rename(
    columns={'channel_type': 'preferred_channel'})

# Tx time: Hour of the day, day of the week, month
df['transaction_hour'] = df['TRDATETIME'].dt.hour
df['transaction_day_of_week'] = df['TRDATETIME'].dt.dayofweek
df['transaction_month'] = df['TRDATETIME'].dt.month

# Aggregate time-based features per customer
time_features = df.groupby('cl_id').agg({
    # Mean and standard deviation of transaction hour
    'transaction_hour': ['mean', 'std'],
    # Mean and standard deviation of day of week
    'transaction_day_of_week': ['mean', 'std'],
    # Mean and standard deviation of transaction month
    'transaction_month': ['mean', 'std']
}).reset_index()
time_features.columns = ['cl_id', 'mean_hour', 'std_hour',
                         'mean_day_of_week', 'std_day_of_week', 'mean_month', 'std_month']

# Merge all features
features = transaction_freq.merge(avg_transaction_amount, on='cl_id') \
                           .merge(last_transaction[['cl_id', 'time_since_last_transaction']], on='cl_id') \
                           .merge(mcc_diversity, on='cl_id') \
                           .merge(channel_preference, on='cl_id') \
                           .merge(time_features, on='cl_id')

# Add target columns if needed
targets = df[['cl_id', 'target_flag', 'target_sum']].drop_duplicates()
features = features.merge(targets, on='cl_id', how='left')

print(features.head())

   cl_id  transaction_freq  avg_transaction_amount  \
0     50                47             5683.168298   
1     52                19            27088.392632   
2     54               167             1049.635689   
3     55               120             2937.106833   
4     56                40            27661.519250   

   time_since_last_transaction  mcc_diversity preferred_channel  mean_hour  \
0                         2689             15             type5   1.595745   
1                         2670              4             type5   6.578947   
2                         2788             24             type5   0.694611   
3                         2836             28             type5   0.100000   
4                         2685              7             type5   7.950000   

   std_hour  mean_day_of_week  std_day_of_week  mean_month  std_month  \
0  4.981093          3.744681         1.823376    8.510638   0.856493   
1  8.180708          2.842105         1.424514    8.000000  

In [39]:
features.dropna(inplace=True)
features.isnull().sum()

cl_id                          0
transaction_freq               0
avg_transaction_amount         0
time_since_last_transaction    0
mcc_diversity                  0
preferred_channel              0
mean_hour                      0
std_hour                       0
mean_day_of_week               0
std_day_of_week                0
mean_month                     0
std_month                      0
target_flag                    0
target_sum                     0
dtype: int64

In [43]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Prepare data
X = features.drop(columns=['cl_id', 'target_flag', 'target_sum'])
y = features['target_flag']

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Preprocessing
numerical_features = ['transaction_freq', 'avg_transaction_amount', 'time_since_last_transaction',
                      'mcc_diversity', 'mean_hour', 'std_hour', 'mean_day_of_week', 'std_day_of_week', 'mean_month', 'std_month']
categorical_features = ['preferred_channel']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Define models
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Neural Network': MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=1000, random_state=42)
}

# Evaluate individual models
results = {}
for name, model in models.items():
    # Create a pipeline
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])

    # Train the model
    pipeline.fit(X_train, y_train)

    # Make predictions
    y_pred = pipeline.predict(X_test)
    y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
    scores = cross_val_score(pipeline, X_train, y_train, cv=5)

    # Evaluate performance
    # results[name] = {
    #     'Accuracy': accuracy_score(y_test, y_pred),
    #     'Precision': precision_score(y_test, y_pred),
    #     'Recall': recall_score(y_test, y_pred),
    #     'F1-Score': f1_score(y_test, y_pred),
    #     'AUC-ROC': roc_auc_score(y_test, y_pred_proba),
    #     'Mean CV score': scores.mean()
    # }
    print(f"Classification Report: {name}")
    print(classification_report(y_test, y_pred))
    cm_rf = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(4, 3))
    sns.heatmap(
        cm_rf,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=['No Churn', 'Churn'],
        yticklabels=['No Churn', 'Churn']
    )
    plt.title(f'Confusion Matrix: {name}')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()

# Display results
for model_name, metrics in results.items():
    print(f"Model: {model_name}")
    for metric_name, value in metrics.items():
        print(f"{metric_name}: {value:.4f}")
    print()

Model: Random Forest
Accuracy: 0.7250
Precision: 0.7689
Recall: 0.7389
F1-Score: 0.7536
AUC-ROC: 0.7939
Mean CV score: 0.7341

Model: Gradient Boosting
Accuracy: 0.7159
Precision: 0.7701
Recall: 0.7140
F1-Score: 0.7410
AUC-ROC: 0.7921
Mean CV score: 0.7376

Model: Logistic Regression
Accuracy: 0.6775
Precision: 0.7133
Recall: 0.7247
F1-Score: 0.7189
AUC-ROC: 0.7502
Mean CV score: 0.6784

Model: Neural Network
Accuracy: 0.6603
Precision: 0.7204
Recall: 0.6590
F1-Score: 0.6883
AUC-ROC: 0.7375
Mean CV score: 0.6890



In [44]:
from sklearn.ensemble import StackingClassifier

# Define base models
base_models = [
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
    ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=42)),
    ('Neural Network', MLPClassifier(hidden_layer_sizes=(
        64, 32), max_iter=1000, random_state=42))
]

# Define meta-model
meta_model = LogisticRegression()

# Create stacking ensemble
stacking_ensemble = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    stack_method='predict_proba'  # Use predicted probabilities as input to the meta-model
)

# Create pipeline for the ensemble
ensemble_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', stacking_ensemble)
])

# Train the ensemble
ensemble_pipeline.fit(X_train, y_train)

# Evaluate the ensemble
y_pred_ensemble = ensemble_pipeline.predict(X_test)
y_pred_proba_ensemble = ensemble_pipeline.predict_proba(X_test)[:, 1]
scores = cross_val_score(ensemble_pipeline, X_train, y_train, cv=5)

ensemble_metrics = {
    'Accuracy': accuracy_score(y_test, y_pred_ensemble),
    'Precision': precision_score(y_test, y_pred_ensemble),
    'Recall': recall_score(y_test, y_pred_ensemble),
    'F1-Score': f1_score(y_test, y_pred_ensemble),
    'AUC-ROC': roc_auc_score(y_test, y_pred_proba_ensemble),
    'Mean CV score': scores.mean()
}

# Display ensemble results
print("Ensemble Model Performance:")
for metric_name, value in ensemble_metrics.items():
    print(f"{metric_name}: {value:.4f}")

Ensemble Model Performance:
Accuracy: 0.7250
Precision: 0.7720
Recall: 0.7336
F1-Score: 0.7523
AUC-ROC: 0.7980
Mean CV score: 0.7399


In [7]:
import pickle

# Save the ensemble model
with open('stacking_ensemble.pkl', 'wb') as file:
    pickle.dump(ensemble_pipeline, file)

In [8]:
import joblib
joblib.dump(ensemble_pipeline, 'ensemble_model.joblib')

['ensemble_model.joblib']

In [9]:
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import LabelEncoder, StandardScaler
# from sklearn.impute import SimpleImputer

# features = df.drop(
#     columns=['PERIOD', 'cl_id', 'TRDATETIME', 'target_flag', 'target_sum'])
# target_flag = df['target_flag']
# target_sum = df['target_sum']

# label_encoders = {}
# for column in ['channel_type', 'trx_category']:
#     le = LabelEncoder()
#     features[column] = le.fit_transform(features[column])
#     label_encoders[column] = le

In [10]:

# scaler = StandardScaler()
# features[['MCC', 'currency', 'amount']] = scaler.fit_transform(
#     features[['MCC', 'currency', 'amount']])

In [11]:
# X_train, X_test, y_flag_train, y_flag_test, y_sum_train, y_sum_test = train_test_split(
#     features, target_flag, target_sum, test_size=0.2, random_state=42)

# X_train.shape, X_test.shape

In [12]:
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense
# from sklearn.ensemble import RandomForestRegressor

# # Define a simple neural network model
# nn_model = Sequential([
#     Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
#     Dense(32, activation='relu'),
#     Dense(1, activation='sigmoid')  # Binary classification for target_flag
# ])
# nn_model.compile(optimizer='adam', loss='binary_crossentropy',
#                  metrics=['accuracy'])

In [13]:
# # Define a random forest regressor model for target_sum
# rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

In [14]:
# # Train the neural network
# nn_model.fit(X_train, y_flag_train, epochs=10,
#              batch_size=32, validation_split=0.2)

In [15]:
# # Train the random forest
# rf_model.fit(X_train, y_sum_train)

In [16]:
# # Neural network for classification
# nn_predictions = nn_model.predict(X_test).flatten()
# nn_predictions = (nn_predictions > 0.5).astype(int)

# # Random forest for regression
# rf_predictions = rf_model.predict(X_test)

In [17]:
# from sklearn.metrics import accuracy_score, mean_squared_error

# # Evaluate the neural network on target_flag
# nn_accuracy = accuracy_score(y_flag_test, nn_predictions)

# # Evaluate the random forest on target_sum
# rf_mse = mean_squared_error(y_sum_test, rf_predictions)

# print("Neural Network Accuracy (target_flag):", nn_accuracy)
# print("Random Forest MSE (target_sum):", rf_mse)