In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv("telecom_churn.csv")

# Example cleanup
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
df = df.dropna()

# DO NOT do get_dummies here
df.to_csv("telecom_churn_processed.csv", index=False)

In [3]:
print(df.columns.tolist())


['CustomerID', 'Gender', 'SeniorCitizen', 'Partner', 'Dependents', 'Tenure', 'PhoneService', 'InternetService', 'OnlineSecurity', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'CHURN']


In [4]:
for col in df.columns:
    if 'churn' in col.lower():
        print(col)


CHURN


In [5]:
df = df.rename(columns={"CHURN": "Churn"})


In [6]:
df = df.dropna()

In [7]:
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()


In [8]:
# Remove target variable from features
if 'Churn' in categorical_cols:
    categorical_cols.remove('Churn')
if 'Churn' in numerical_cols:
    numerical_cols.remove('Churn')

print("Categorical columns:", categorical_cols)
print("Numerical columns:", numerical_cols)



Categorical columns: ['CustomerID', 'Gender', 'Partner', 'Dependents', 'PhoneService', 'InternetService', 'OnlineSecurity', 'Contract', 'PaperlessBilling', 'PaymentMethod']
Numerical columns: ['SeniorCitizen', 'Tenure', 'MonthlyCharges', 'TotalCharges']


In [9]:
# Encode categorical variables
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [10]:
if df_encoded['Churn'].dtype == 'object':
   df_encoded['Churn'] = df_encoded['Churn'].map({'Yes': 1, 'No': 0})

In [11]:
# Save processed data for Snowflake upload
df_encoded.to_csv('telecom_churn_processed.csv', index=False)

print("\nProcessed data saved as 'telecom_churn_processed.csv'")
print("Shape:", df_encoded.shape)
print("\nFirst 5 rows of processed data:")
print(df_encoded.head())


Processed data saved as 'telecom_churn_processed.csv'
Shape: (5000, 5018)

First 5 rows of processed data:
   SeniorCitizen  Tenure  MonthlyCharges  TotalCharges  Churn  \
0              0      61           66.89       7414.08      0   
1              0      68           68.88       1170.55      0   
2              0      62           76.01       7466.19      0   
3              1       1           83.63       5113.06      1   
4              0      53           84.83       5285.79      0   

   CustomerID_CUST00001  CustomerID_CUST00002  CustomerID_CUST00003  \
0                 False                 False                 False   
1                  True                 False                 False   
2                 False                  True                 False   
3                 False                 False                  True   
4                 False                 False                 False   

   CustomerID_CUST00004  CustomerID_CUST00005  ...  \
0                 Fa

In [13]:
feature_cols = [
    "SeniorCitizen",
    "Tenure",
    "MonthlyCharges",
    "TotalCharges"
]

target_col = "Churn"


In [14]:
X = df[feature_cols]
y = df[target_col]


In [16]:
# # df already loaded earlier (CSV / preprocessing step)
# X = df[feature_cols]
# preds = model.predict(X)
