In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

INPUT = Path("D:/Project/WA_Fn-UseC_-Telco-Customer-Churn.csv") 
OUTPATH = Path("D:/Project/churn_50k_synthetic.csv")

df = pd.read_csv(INPUT)
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'].fillna(df['MonthlyCharges'] * df['tenure'], inplace=True)

rng = np.random.default_rng(42)
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
TARGET = 'Churn'
rows = []
for _ in range(50000):
    base = df.sample(n=1, replace=True, random_state=rng.integers(0,1<<31)).iloc[0].copy()
    for c in numeric_cols:
        noise = rng.normal(0, 0.02 * max(1.0, abs(base[c])))
        val = base[c] + noise
        base[c] = int(round(val)) if c == 'tenure' else max(0, val)
    base['customerID'] = f"SYN{rng.integers(1_000_000_000):09d}"
    # occasional categorical swaps
    if rng.random() < 0.03:
        base['Contract'] = rng.choice(df['Contract'].unique())
    if rng.random() < 0.02:
        base['PaymentMethod'] = rng.choice(df['PaymentMethod'].unique())
    rows.append(base)
df_syn = pd.DataFrame(rows)
df_syn.to_csv(OUTPATH, index=False)
print("Saved synthetic dataset to:", OUTPATH)

X = df_syn.drop(columns=['customerID', TARGET])
y = (df_syn[TARGET] == 'Yes').astype(int)

X_sub, _, y_sub, _ = train_test_split(X, y, train_size=30000, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X_sub, y_sub, test_size=0.25, random_state=42, stratify=y_sub)

numeric_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
categorical_features = [c for c in X_train.columns if c not in numeric_features]

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
])


pipeline = Pipeline([
    ('pre', preprocessor),
    ('clf', RandomForestClassifier(n_estimators=120, max_depth=12, random_state=42, n_jobs=-1))
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(df['MonthlyCharges'] * df['tenure'], inplace=True)


Saved synthetic dataset to: D:\Project\churn_50k_synthetic.csv
Accuracy: 0.8997333333333334
              precision    recall  f1-score   support

           0       0.92      0.95      0.93      5487
           1       0.84      0.77      0.81      2013

    accuracy                           0.90      7500
   macro avg       0.88      0.86      0.87      7500
weighted avg       0.90      0.90      0.90      7500

