In [1]:
import pandas as pd

# Read the full feature table; rows are matched to labels by customer_id below.
df = pd.read_csv("main.csv")

# The label files define the train/test split. Each is joined on customer_id
# and is expected to carry the churn target used further down.
train_labels = pd.read_csv("train_cxid.csv")
test_labels = pd.read_csv("test_cxid.csv")

# Attach the features to every labelled customer. A left join keeps labelled
# customers that have no row in main.csv; their feature columns become NaN.
train_df = pd.merge(train_labels, df, on="customer_id", how="left")
test_df = pd.merge(test_labels, df, on="customer_id", how="left")

# Per-column NaN counts, so unmatched or incomplete rows are visible before
# they are removed.
print("Missing values in Train Data:", train_df.isna().sum())
print("Missing values in Test Data:", test_df.isna().sum())

# Remove every row that has at least one missing value. Done in place so that
# train_df / test_df stay the same objects the later steps write into.
for split_df in (train_df, test_df):
    split_df.dropna(inplace=True)

# Encode categorical (object-dtype) feature columns as integer codes.
# Each encoder is fitted on the training split only and stored in
# `label_encoders` (keyed by column name) so the mapping can be reused later.
import warnings

from sklearn.preprocessing import LabelEncoder

label_encoders = {}
# Object columns are discovered on the main table; customer_id is the join
# key, not a feature, so it is excluded from encoding.
categorical_cols = df.select_dtypes(include=['object']).columns.difference(['customer_id'])
for col in categorical_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])

    # LabelEncoder.transform raises ValueError when the test split contains a
    # category that never occurred in training. Map through the fitted classes
    # instead: known categories receive exactly the code the encoder assigned
    # (their position in le.classes_), unknown ones come back as NaN.
    code_for_class = {cls: code for code, cls in enumerate(le.classes_.tolist())}
    test_codes = test_df[col].map(code_for_class)

    n_unseen = int(test_codes.isna().sum())
    if n_unseen:
        # Surface the problem instead of crashing or hiding it.
        warnings.warn(
            f"Column {col!r}: {n_unseen} test row(s) contain categories not "
            "seen in the training data; they are encoded as -1."
        )

    # -1 is outside the encoder's 0..n_classes-1 range, so it cannot collide
    # with a real category code.
    test_df[col] = test_codes.fillna(-1).astype("int64")
    label_encoders[col] = le

# Separate the target from the model inputs. customer_id is only an
# identifier and churn is the target, so neither belongs in the feature matrix.
non_feature_cols = ['customer_id', 'churn']

y_train = train_df['churn']
X_train = train_df.drop(non_feature_cols, axis=1)

y_test = test_df['churn']
X_test = test_df.drop(non_feature_cols, axis=1)

# Standardise all feature columns. The scaler is fitted on the training
# features only and then applied unchanged to both splits, so no test-set
# statistics influence the transformation.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Fit an L2-regularised logistic regression on the scaled training features
# and report its performance on the held-out test split.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

log_reg = LogisticRegression(
    solver='liblinear',
    penalty='l2',
    C=1.0,
    max_iter=500,
).fit(X_train, y_train)

# Hard class predictions for the test customers.
y_pred = log_reg.predict(X_test)

# Compute all metrics first, then print them in a fixed order.
accuracy = accuracy_score(y_test, y_pred)
clf_report = classification_report(y_test, y_pred)
conf_mat = confusion_matrix(y_test, y_pred)

print(f"Logistic Regression Accuracy: {accuracy:.2f}")
print("Classification Report:\n", clf_report)
print("Confusion Matrix:\n", conf_mat)

Missing values in Train Data: customer_id    0
churn          0
usage_type     0
Day_1          0
Day_2          0
              ..
Day_86         0
Day_87         0
Day_88         0
Day_89         0
Day_90         0
Length: 93, dtype: int64
Missing values in Test Data: customer_id    0
churn          0
usage_type     0
Day_1          0
Day_2          0
              ..
Day_86         0
Day_87         0
Day_88         0
Day_89         0
Day_90         0
Length: 93, dtype: int64
Logistic Regression Accuracy: 0.79
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.96      0.88    120276
           1       0.61      0.23      0.34     35736

    accuracy                           0.79    156012
   macro avg       0.71      0.60      0.61    156012
weighted avg       0.76      0.79      0.75    156012

Confusion Matrix:
 [[114958   5318]
 [ 27355   8381]]
