In [1]:
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Read the unseen observations from CSV.
clv_unseen = pd.read_csv('new_obs_unseen_dummy3.csv')

# Divide the rows 50/50 while keeping the 'Response' class balance the same in
# both halves. train_test_split returns the "train" side first, which becomes
# the untreated half; the "test" side becomes the treated half.
response_labels = clv_unseen['Response']
clv_unseen_untreated, clv_unseen_treated = train_test_split(
    clv_unseen,
    test_size=0.5,
    random_state=42,  # fixed seed so the two halves are reproducible
    stratify=response_labels,
)

In [2]:
# Section 1: Preparing and testing treated data with pipe_tuned_pipeline

# New log1p feature name -> raw column it is derived from (and which it replaces).
log_feature_sources = {
    'CLV_log': 'Customer Lifetime Value',
    'Income_Log': 'Income',
    'TCA_Log': 'Total Claim Amount',
}

# Remove the columns that are not passed to the pipeline.
# NOTE(review): the minimal-treatment section also drops 'Unnamed: 0'; it is kept
# here - confirm that pipe_tuned_pipeline expects (or ignores) that column.
clv_unseen_treated = clv_unseen_treated.drop(columns=['Customer', 'Effective To Date'])

# Encode the target as 1.0 ('Yes') / 0.0 ('No').
clv_unseen_treated['Response'] = (
    clv_unseen_treated['Response']
    .map({'Yes': 1, 'No': 0})
    .astype(float)
)

# Append the log1p-transformed features in a fixed order, then discard the raw
# columns they were computed from.
for log_column, raw_column in log_feature_sources.items():
    clv_unseen_treated[log_column] = np.log1p(clv_unseen_treated[raw_column])
clv_unseen_treated = clv_unseen_treated.drop(columns=list(log_feature_sources.values()))

# Separate the target from the features.
y_unseen_treated = clv_unseen_treated['Response']
X_unseen_treated = clv_unseen_treated.drop(columns=['Response'])

# Restore the saved pipeline and score the treated features with it.
# joblib.load unpickles the file, so only load artifacts from a trusted source.
pipe_tuned_pipeline = joblib.load('pipe_tuned_pipeline.pkl')
y_pred_treated = pipe_tuned_pipeline.predict(X_unseen_treated)

In [3]:
# Compare the treated-data predictions with the true labels and print each
# metric under its own label, in the order accuracy -> report -> confusion matrix.
print("\nPerformance on Treated Data (pipe_tuned_pipeline):")
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Classification Report:\n", classification_report),
    ("Confusion Matrix:\n", confusion_matrix),
):
    print(metric_label, metric_fn(y_unseen_treated, y_pred_treated))


Performance on Treated Data (pipe_tuned_pipeline):
Accuracy: 0.8982494529540481
Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      0.89      0.94       783
         1.0       0.59      0.98      0.73       131

    accuracy                           0.90       914
   macro avg       0.79      0.93      0.84       914
weighted avg       0.94      0.90      0.91       914

Confusion Matrix:
 [[693  90]
 [  3 128]]


In [4]:
# Tally the predicted classes. The target was encoded as 1 for 'Yes' and 0 for
# 'No', so summing the predictions gives the number of 'Yes' predictions and the
# remainder are 'No'.
n_treated_predictions = len(y_pred_treated)
treated_yes_count = sum(y_pred_treated)
treated_no_count = n_treated_predictions - treated_yes_count
print(f"Treated Data - 'Yes' Predictions: {treated_yes_count}, 'No' Predictions: {treated_no_count}")

# Put the true and predicted labels side by side and write them to disk
# (the row index is not written).
treated_prediction_columns = {
    'Actual': y_unseen_treated,
    'Predicted': y_pred_treated,
}
treated_results = pd.DataFrame(treated_prediction_columns)
treated_results.to_csv('treated_predictions.csv', index=False)
print("Treated predictions saved to 'treated_predictions.csv'.")

Treated Data - 'Yes' Predictions: 218.0, 'No' Predictions: 696.0
Treated predictions saved to 'treated_predictions.csv'.


In [5]:
# Section 2: Minimal Treatment (One-Hot Encoding)

# Dropping unnecessary columns
clv_unseen_untreated = clv_unseen_untreated.drop(columns=['Customer', 'Effective To Date', 'Unnamed: 0'])

# Mapping 'Yes'/'No' to 1/0 in the Response column
clv_unseen_untreated['Response'] = clv_unseen_untreated['Response'].map({'Yes': 1, 'No': 0}).astype(float)

# Split the data into features and target variable
X_unseen_untreated = clv_unseen_untreated.drop('Response', axis=1)
y_unseen_untreated = clv_unseen_untreated['Response']

# Identify categorical columns automatically
categorical_columns = X_unseen_untreated.select_dtypes(include=['object', 'category']).columns.tolist()

# Identify numerical columns automatically
numerical_columns = X_unseen_untreated.select_dtypes(exclude=['object', 'category']).columns.tolist()

# Apply One-Hot Encoding to all categorical columns.
# NOTE(review): the encoder is fitted on this unseen half, not restored from
# training, so the columns it emits depend on which categories occur in this
# sample. Persisting the training-time encoder alongside the model would be the
# more robust setup; the alignment step further down guards against the mismatch.
one_hot_encoder = OneHotEncoder(sparse_output=False)
X_unseen_untreated_categorical = one_hot_encoder.fit_transform(X_unseen_untreated[categorical_columns])

# Convert One-Hot Encoded data to DataFrame with appropriate column names
one_hot_encoded_columns = one_hot_encoder.get_feature_names_out(categorical_columns)
X_unseen_untreated_categorical = pd.DataFrame(
    X_unseen_untreated_categorical,
    columns=one_hot_encoded_columns,
    index=X_unseen_untreated.index,  # keep row alignment with the numerical features
)

# Combine the numerical features with the one-hot encoded categorical features
X_unseen_combined = pd.concat([X_unseen_untreated[numerical_columns], X_unseen_untreated_categorical], axis=1)

# Load the knn_tuned model.
# joblib.load unpickles the file, so only load artifacts from a trusted source.
knn_tuned_model = joblib.load('knn_tuned_model.pkl')

# Align the feature matrix with the columns the model was trained on, when the
# model recorded them (feature_names_in_ is only set if it was fitted on a
# DataFrame). Dummy columns for categories absent from this half are added as
# all-zero, dummy columns for categories the model never saw are dropped, and
# the training column order is restored. Any other mismatch is a real schema
# problem and is raised instead of being papered over.
expected_columns = getattr(knn_tuned_model, 'feature_names_in_', None)
if expected_columns is not None:
    expected_columns = list(expected_columns)
    expected_set = set(expected_columns)
    current_set = set(X_unseen_combined.columns)
    one_hot_prefixes = tuple(f"{column}_" for column in categorical_columns)
    one_hot_set = set(one_hot_encoded_columns)

    missing_columns = [column for column in expected_columns if column not in current_set]
    extra_columns = [column for column in X_unseen_combined.columns if column not in expected_set]

    unexplained_columns = [
        column for column in missing_columns if not str(column).startswith(one_hot_prefixes)
    ]
    unexplained_columns += [column for column in extra_columns if column not in one_hot_set]
    if unexplained_columns:
        raise ValueError(
            "Feature mismatch with knn_tuned_model that one-hot encoding cannot explain: "
            f"{unexplained_columns}"
        )

    if missing_columns or extra_columns:
        print(
            "Aligning features to knn_tuned_model: "
            f"zero-filling {missing_columns}, dropping {extra_columns}"
        )
    X_unseen_combined = X_unseen_combined.reindex(columns=expected_columns, fill_value=0.0)

# Predict using the untreated data model
y_pred_untreated = knn_tuned_model.predict(X_unseen_combined)

In [6]:
# Compare the minimally-treated predictions with the true labels and print each
# metric under its own label, in the order accuracy -> report -> confusion matrix.
print("Performance on Minimally Treated Data (One-Hot Encoding):")
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Classification Report:\n", classification_report),
    ("Confusion Matrix:\n", confusion_matrix),
):
    print(metric_label, metric_fn(y_unseen_untreated, y_pred_untreated))

Performance on Minimally Treated Data (One-Hot Encoding):
Accuracy: 0.8970427163198248
Classification Report:
               precision    recall  f1-score   support

         0.0       0.96      0.92      0.94       782
         1.0       0.61      0.76      0.68       131

    accuracy                           0.90       913
   macro avg       0.79      0.84      0.81       913
weighted avg       0.91      0.90      0.90       913

Confusion Matrix:
 [[720  62]
 [ 32  99]]


In [7]:
# Tally the predicted classes for the untreated half. The target was encoded as
# 1 for 'Yes' and 0 for 'No', so summing the predictions gives the number of
# 'Yes' predictions and the remainder are 'No'.
n_untreated_predictions = len(y_pred_untreated)
untreated_yes_count = sum(y_pred_untreated)
untreated_no_count = n_untreated_predictions - untreated_yes_count
print(f"Untreated Data - 'Yes' Predictions: {untreated_yes_count}, 'No' Predictions: {untreated_no_count}")

# Put the true and predicted labels side by side and write them to disk
# (the row index is not written).
untreated_prediction_columns = {
    'Actual': y_unseen_untreated,
    'Predicted': y_pred_untreated,
}
untreated_results = pd.DataFrame(untreated_prediction_columns)
untreated_results.to_csv('untreated_predictions.csv', index=False)
print("Untreated predictions saved to 'untreated_predictions.csv'.")

Untreated Data - 'Yes' Predictions: 161.0, 'No' Predictions: 752.0
Untreated predictions saved to 'untreated_predictions.csv'.


## Percentage Amplification Analysis

### 1. 'Yes' Predictions:
- **Untreated:** 161
- **Treated:** 218

**Percentage Amplification (Yes):**
<br>**(218 - 161) / 161 * 100** =
<br>57 / 161 * 100 ≈ **35.4%**

### 2. 'No' Predictions:
- **Untreated:** 752
- **Treated:** 696

**Percentage Amplification (No):**
<br>**(696 - 752) / 752 * 100** =
<br>-56 / 752 * 100 ≈ **-7.45%**

### Summary:
- **Yes Predictions:** The treated data resulted in a **35.4% increase** in 'Yes' predictions compared to the untreated data.
- **No Predictions:** The treated data resulted in a **7.45% decrease** in 'No' predictions compared to the untreated data.

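This analysis suggests that the treated workflow (log-transformed features scored with `pipe_tuned_pipeline`) was more likely to predict 'Yes', and slightly less likely to predict 'No', than the minimally treated workflow (one-hot encoding scored with `knn_tuned_model`). Note that the two workflows were evaluated on different stratified halves of the unseen data (914 and 913 rows) and with different saved models, so the difference in counts reflects the combined effect of preprocessing, model, and sample rather than the preprocessing treatments alone.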