In [18]:
import pandas as pd

df = pd.read_csv("../dataset/churn_data.csv")
df.head()


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [19]:
df.drop('customerID', axis=1, inplace=True)


In [20]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)


In [21]:
def profile_columns(df):
    profile = pd.DataFrame({
        
        'Data Type': df.dtypes,
        'Null Count': df.isnull().sum(),
        'Unique Values': df.nunique(),
        'Example Value': df.apply(lambda x: x.dropna().unique()[0] if x.dropna().size > 0 else None),
        'Sample Values': df.apply(lambda x: x.dropna().unique()[:5])
    })

    return profile.sort_values(by='Unique Values', ascending=False)

profile = profile_columns(df)
pd.set_option('display.max_rows', None)  
display(profile)

Unnamed: 0,Data Type,Null Count,Unique Values,Example Value,Sample Values
TotalCharges,float64,0,6531,29.85,"[29.85, 1889.5, 108.15, 1840.75, 151.65]"
MonthlyCharges,float64,0,1585,29.85,"[29.85, 56.95, 53.85, 42.3, 70.7]"
tenure,int64,0,73,1,"[1, 34, 2, 45, 8]"
PaymentMethod,object,0,4,Electronic check,"[Electronic check, Mailed check, Bank transfer..."
StreamingMovies,object,0,3,No,"[No, Yes, No internet service]"
TechSupport,object,0,3,No,"[No, Yes, No internet service]"
OnlineBackup,object,0,3,Yes,"[Yes, No, No internet service]"
StreamingTV,object,0,3,No,"[No, Yes, No internet service]"
DeviceProtection,object,0,3,No,"[No, Yes, No internet service]"
MultipleLines,object,0,3,No phone service,"[No phone service, No, Yes]"


In [22]:
# Binary mappings
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})
df['Partner'] = df['Partner'].map({'Yes': 1, 'No': 0})
df['gender'] = df['gender'].map({'Female': 1, 'Male': 0})
df['Dependents'] = df['Dependents'].map({'Yes': 1, 'No': 0})
df['PhoneService'] = df['PhoneService'].map({'Yes': 1, 'No': 0})
df['PaperlessBilling'] = df['PaperlessBilling'].map({'Yes': 1, 'No': 0})

# Handle 'No service' cases as 0
df['MultipleLines'] = df['MultipleLines'].map({'Yes': 1, 'No': 0, 'No phone service': 0})
df['InternetService'] = df['InternetService'].map({'DSL': 1, 'Fiber optic': 2, 'No': 0})
df['OnlineSecurity'] = df['OnlineSecurity'].map({'Yes': 1, 'No': 0, 'No internet service': 0})
df['OnlineBackup'] = df['OnlineBackup'].map({'Yes': 1, 'No': 0, 'No internet service': 0})
df['DeviceProtection'] = df['DeviceProtection'].map({'Yes': 1, 'No': 0, 'No internet service': 0})
df['TechSupport'] = df['TechSupport'].map({'Yes': 1, 'No': 0, 'No internet service': 0})
df['StreamingTV'] = df['StreamingTV'].map({'Yes': 1, 'No': 0, 'No internet service': 0})
df['StreamingMovies'] = df['StreamingMovies'].map({'Yes': 1, 'No': 0, 'No internet service': 0})

# Ordinal mapping for Contract
df['Contract'] = df['Contract'].map({'Month-to-month': 0, 'One year': 1, 'Two year': 2})

# Nominal mapping for PaymentMethod – use Label Encoding or Manual Mapping
df['PaymentMethod'] = df['PaymentMethod'].map({
    'Electronic check': 0,
    'Mailed check': 1,
    'Bank transfer (automatic)': 2,
    'Credit card (automatic)': 3
})

