In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/project-1-me-4127-e/submission_sample.csv
/kaggle/input/project-1-me-4127-e/train_data.csv
/kaggle/input/project-1-me-4127-e/test_data_file.csv


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.preprocessing import RobustScaler, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import cross_val_predict

In [4]:
# Read the training and test tables from the Kaggle input directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/project-1-me-4127-e/train_data.csv',
        '/kaggle/input/project-1-me-4127-e/test_data_file.csv',
    )
)


In [6]:
# Separate the label from the predictors.
# 'id' is dropped together with 'target'; the remaining columns are used as features.
y = train_df['target'].astype(int)  # cast the label to integer
non_feature_cols = ['target', 'id']
X = train_df.drop(columns=non_feature_cols)

In [7]:
# Partition the feature columns by dtype: object-typed columns are treated as
# categorical, every other dtype as numerical.
object_dtype = ['object']
numerical_cols = X.select_dtypes(exclude=object_dtype).columns
categorical_cols = X.select_dtypes(include=object_dtype).columns

In [8]:
# One-hot encode the categorical columns.
# drop='first' omits the first level of each feature, sparse_output=False yields a
# dense array, and handle_unknown='ignore' lets transform() accept unseen categories.
encoder = OneHotEncoder(
    drop='first',
    sparse_output=False,
    handle_unknown='ignore',
)
encoded_values = encoder.fit_transform(X[categorical_cols])
X_encoded = pd.DataFrame(encoded_values)  # columns get default integer labels


In [9]:
# Standardize the numerical columns, keeping their original column names.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X[numerical_cols])
X_scaled = pd.DataFrame(scaled_values, columns=numerical_cols)

In [10]:
# Place the scaled numerical features and the encoded categorical features side by side.
feature_parts = [
    X_scaled.reset_index(drop=True),
    X_encoded.reset_index(drop=True),
]
X_processed = pd.concat(feature_parts, axis=1)

In [12]:
# The encoded columns carry integer labels; cast every column label to str so the
# frame has uniformly typed column names.
string_labels = X_processed.columns.astype(str)
X_processed.columns = string_labels


In [13]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X_processed,
    y,
    random_state=42,
    test_size=0.2,
)

In [12]:
# Search space for the elastic-net logistic regression.
param_grid = dict(
    C=[0.001, 0.01, 0.1, 1, 10],   # inverse regularization strength
    solver=['saga'],               # solver that supports the elastic-net penalty
    penalty=['elasticnet'],        # combination of L1 and L2 penalties
    l1_ratio=[0.1, 0.5, 0.9],      # share of the penalty assigned to L1
)

In [13]:
# Suppress ConvergenceWarning messages emitted during the search
from sklearn.exceptions import ConvergenceWarning
import warnings
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Tune the logistic regression over param_grid with 5-fold CV, scored by negative
# log loss. error_score='raise' makes a failing fit raise instead of being scored;
# the except clause below reports that error rather than stopping the notebook.
try:
    base_model = LogisticRegression(
        max_iter=1000,
        random_state=42,
        class_weight='balanced',
    )
    grid_search = GridSearchCV(
        base_model,
        param_grid,
        cv=5,
        scoring='neg_log_loss',
        error_score='raise',
    )
    grid_search.fit(X_train, y_train)
    print("Grid search completed successfully.")

    # Keep the refitted best model
    log_reg = grid_search.best_estimator_
    print(f"Best estimator: {log_reg}")
except Exception as e:
    print(f"Error during grid search: {e}")

# Re-check the outcome: only a fitted search exposes best_estimator_
search_succeeded = 'grid_search' in locals() and hasattr(grid_search, 'best_estimator_')
if not search_succeeded:
    print("Grid search did not complete successfully.")
else:
    log_reg = grid_search.best_estimator_



KeyboardInterrupt: 

In [14]:
# Elastic-net logistic regression with fixed hyperparameters
log_reg = LogisticRegression(
    penalty='elasticnet',
    solver='saga',
    C=0.1,
    l1_ratio=0.1,
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)

# Step 2: Out-of-fold class probabilities for the training rows (5-fold CV)
y_pred_prob = cross_val_predict(
    log_reg,
    X_train,
    y_train,
    cv=5,
    method='predict_proba',
)

In [20]:

# Step 3: Wrap the logistic regression in a sigmoid calibrator and fit it on the
# training split. CalibratedClassifierCV is fitted directly on (X_train, y_train);
# the y_pred_prob array computed earlier is not passed to it.
# `estimator` replaces the `base_estimator` keyword, which scikit-learn deprecated
# in 1.2 and removed in 1.4.
calibrated_logreg = CalibratedClassifierCV(estimator=log_reg, method='sigmoid')
calibrated_logreg.fit(X_train, y_train)




In [26]:

# Step 4: Calibrated probabilities for the validation rows; column 1 is kept
val_class_probs = calibrated_logreg.predict_proba(X_val)
y_val_pred_prob = val_class_probs[:, 1]

In [32]:
# Step 5: Log loss of the calibrated probabilities on the validation split
validation_log_loss = log_loss(
    y_val,
    y_val_pred_prob,
)
print(f"Validation Log Loss: {validation_log_loss}")

Validation Log Loss: 0.3695408184282856


In [34]:
# Calibrated probabilities for the training rows; column 1 is kept
train_class_probs = calibrated_logreg.predict_proba(X_train)
calibrated_prob = train_class_probs[:, 1]

In [35]:
# Step 5: Log loss of the calibrated probabilities on the training split
mean_log_loss = log_loss(
    y_train,
    calibrated_prob,
)
print(f"Mean Log Loss after Calibration: {mean_log_loss}")

Mean Log Loss after Calibration: 0.36797394713941134


In [47]:
# Step 9: Report the training-split log loss rounded to four decimals
calibration_report = f"Mean Log Loss after Calibration: {mean_log_loss:.4f}"
print(calibration_report)


Mean Log Loss after Calibration: 0.3680


In [37]:
# Log loss of the calibrated model on the rows it was fitted on
train_class_probs = calibrated_logreg.predict_proba(X_train)
y_train_pred_prob = train_class_probs[:, 1]  # keep column 1
training_log_loss = log_loss(y_train, y_train_pred_prob)


In [48]:
# Show the training and validation log loss, one per line
print(
    f"Training Log Loss: {training_log_loss}",
    f"Validation Log Loss: {validation_log_loss}",
    sep="\n",
)

 

Training Log Loss: 0.36797394713941134
Validation Log Loss: 0.3695408184282856


In [39]:
# Prepare the test features the same way as the training features: drop 'id', then
# split the remaining columns into object-typed and non-object-typed groups.
X_test = test_df.drop(columns=['id'])
object_dtype = ['object']
numerical_cols_test = X_test.select_dtypes(exclude=object_dtype).columns
categorical_cols_test = X_test.select_dtypes(include=object_dtype).columns


In [40]:
# Transform the test features with the encoder and scaler already fitted on the
# training data. The column lists chosen at fit time (categorical_cols /
# numerical_cols) are reused instead of lists re-derived from the test dtypes, so
# each transformer receives exactly the columns, in the order, it was fitted on,
# even if pandas infers a different dtype for some test column.
X_test_encoded = pd.DataFrame(encoder.transform(X_test[categorical_cols]))
X_test_scaled = pd.DataFrame(scaler.transform(X_test[numerical_cols]), columns=numerical_cols)




In [41]:
# Join the scaled numerical and encoded categorical test features column-wise
test_feature_parts = [
    X_test_scaled.reset_index(drop=True),
    X_test_encoded.reset_index(drop=True),
]
X_test_processed = pd.concat(test_feature_parts, axis=1)


In [42]:
# Cast every column label to str, as was done for the training frame
test_string_labels = X_test_processed.columns.astype(str)
X_test_processed.columns = test_string_labels

# Select the columns in the training frame's order so the features line up
training_columns = X_processed.columns
X_test_processed = X_test_processed[training_columns]

# Calibrated probabilities for the test rows; column 1 is kept
test_class_probs = calibrated_logreg.predict_proba(X_test_processed)
test_pred_prob = test_class_probs[:, 1]


In [43]:
# Check for overfitting
# 1. Compare the log loss on the training split with the log loss on the held-out
#    validation split; a validation loss far above the training loss suggests overfitting.
y_train_pred_prob = calibrated_logreg.predict_proba(X_train)[:, 1]
training_log_loss = log_loss(y_train, y_train_pred_prob)

y_val_pred_prob = calibrated_logreg.predict_proba(X_val)[:, 1]
validation_log_loss = log_loss(y_val, y_val_pred_prob)

print(f"Training Log Loss: {training_log_loss}")
print(f"Validation Log Loss: {validation_log_loss}")


Training Log Loss: 0.36797394713941134
Validation Log Loss: 0.3695408184282856


In [45]:
# 2. 5-fold cross-validated log loss on the training split.
# The 'neg_log_loss' scorer returns negated losses, so the mean is negated back.
cv_log_losses = cross_val_score(
    calibrated_logreg,
    X_train,
    y_train,
    cv=5,
    scoring='neg_log_loss',
)
mean_cv_log_loss = -cv_log_losses.mean()
print(f"Mean CV Log Loss: {mean_cv_log_loss}")





Mean CV Log Loss: 0.3699615924103472


In [49]:
# Assemble the submission table: the test ids alongside their predicted probabilities
submission_columns = dict(
    id=test_df['id'],
    target=test_pred_prob,
)
submission_df = pd.DataFrame(submission_columns)

In [50]:
# Write the submission to CSV without the index column
submission_path = 'submission12.csv'
submission_df.to_csv(submission_path, index=False)
