Cell 1: Imports + path

In [1]:
import pandas as pd
from pathlib import Path

# This notebook lives in project_3_churn_prediction/notebooks, so the project
# root is the parent of the current working directory.
PROJECT_ROOT = Path.cwd().parent        # -> project_3_churn_prediction
RAW_PATH = PROJECT_ROOT.joinpath("data", "raw", "telco-customer-churn.csv")

# Echo the resolved locations so a wrong working directory is easy to spot
# before the CSV is read.
print(
    f"Current working dir: {Path.cwd()}",
    f"PROJECT_ROOT: {PROJECT_ROOT}",
    f"RAW_PATH: {RAW_PATH}",
    f"Exists: {RAW_PATH.exists()}",
    sep="\n",
)

# Load the raw data and preview the first rows (last expression is echoed).
df = pd.read_csv(RAW_PATH)
df.head()


Current working dir: c:\Users\Lenovo\Desktop\Portfolio\data-portfolio-nichagan\project_3_churn_prediction\notebooks
PROJECT_ROOT: c:\Users\Lenovo\Desktop\Portfolio\data-portfolio-nichagan\project_3_churn_prediction
RAW_PATH: c:\Users\Lenovo\Desktop\Portfolio\data-portfolio-nichagan\project_3_churn_prediction\data\raw\telco-customer-churn.csv
Exists: True


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


Cell 2: Shape + info

In [2]:
# A notebook only echoes the LAST expression of a cell. The shape and column
# list are therefore printed explicitly; as a bare expression followed by
# another statement their value would be computed and then thrown away.
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())

# Per-column dtype and non-null count (df.info() prints its report itself).
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


Cell 3: Missing + duplicates

In [3]:
# Columns with the most missing values (top 20). Printed explicitly because a
# bare expression that is not the last line of a cell is never displayed.
# NOTE: isna() only counts real NaN/None values; empty or whitespace-only
# strings in object columns are not counted here.
missing_per_column = df.isna().sum().sort_values(ascending=False).head(20)
print("Missing values per column (top 20):")
print(missing_per_column.to_string())

# Number of fully duplicated rows (last expression, so the notebook echoes it).
df.duplicated().sum()


np.int64(0)

Cell 4: Target distribution

In [4]:
# Possible names of the label column, in priority order.
target_candidates = ["Churn", "churn", "Exited", "Target"]

# Pick the first candidate that is an actual column of df; None if none match.
target_col = next(
    filter(lambda name: name in df.columns, target_candidates),
    None,
)
target_col


'Churn'

In [5]:
# Class balance of the target: absolute counts first, then proportions.
tuple(df[target_col].value_counts(normalize=as_share) for as_share in (False, True))


(Churn
 No     5174
 Yes    1869
 Name: count, dtype: int64,
 Churn
 No     0.73463
 Yes    0.26537
 Name: proportion, dtype: float64)

In [7]:
from pathlib import Path

# ===============================
# 1) paths
# ===============================
PROJECT_ROOT = Path.cwd().parent
PROCESSED_DIR = PROJECT_ROOT / "data" / "processed"
# Create the output folder if needed; exist_ok makes re-running the cell safe.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

PROCESSED_PATH = PROCESSED_DIR / "churn_processed.csv"

# ===============================
# 2) basic cleaning
# ===============================
# Work on a copy so the raw frame `df` is left untouched.
df_processed = df.copy()

# Drop customerID (not useful for the model).
if "customerID" in df_processed.columns:
    df_processed = df_processed.drop(columns=["customerID"])

# Convert TotalCharges to numeric (this dataset tends to break here).
if "TotalCharges" in df_processed.columns:
    missing_before = int(df_processed["TotalCharges"].isna().sum())
    df_processed["TotalCharges"] = pd.to_numeric(
        df_processed["TotalCharges"], errors="coerce"
    )
    # errors="coerce" replaces every unparseable entry with NaN without
    # raising, so report how many values were lost instead of saving them
    # silently. The NaNs are kept as-is; how to impute or drop them is left
    # to the downstream modelling step.
    coerced_to_nan = int(df_processed["TotalCharges"].isna().sum()) - missing_before
    if coerced_to_nan > 0:
        print(
            f"WARNING: TotalCharges had {coerced_to_nan} value(s) that could "
            "not be parsed as numbers; they were set to NaN."
        )

# ===============================
# 3) save processed
# ===============================
df_processed.to_csv(PROCESSED_PATH, index=False)

print("✅ Saved processed file at:")
print(PROCESSED_PATH)
print("Exists:", PROCESSED_PATH.exists())


✅ Saved processed file at:
c:\Users\Lenovo\Desktop\Portfolio\data-portfolio-nichagan\project_3_churn_prediction\data\processed\churn_processed.csv
Exists: True
