In [2]:
import pandas as pd

# Location of the Telco customer CSV. It is a relative path, so it is resolved
# against the current working directory when the file is read.
DATASET_PATH = 'telco_customer.csv'
# Echo the path before loading so a wrong location is obvious in the output.
print(f"Attempting to load data from: {DATASET_PATH}\n")

Attempting to load data from: telco_customer.csv



In [3]:
# --- 1. Load the Dataset ---
# Read the CSV at DATASET_PATH into `df`, the DataFrame the following cells
# inspect and transform.
try:
    df = pd.read_csv(DATASET_PATH)
except FileNotFoundError:
    print(f"Error: The file '{DATASET_PATH}' was not found.")
    print("Please ensure the CSV file is in the correct directory or update the DATASET_PATH variable.")
    # Re-raise instead of calling exit(): inside a Jupyter kernel exit() asks
    # the kernel to shut down, which throws away all state and hides the
    # traceback. Re-raising stops "Run All" at this cell with the real error.
    raise
else:
    # Only reached when read_csv succeeded.
    print("Dataset loaded successfully!\n")


Dataset loaded successfully!



In [4]:
# --- 2. Initial Inspection: View the first few rows ---
# Banner, preview and separator go through a single print call; sep="\n"
# supplies the line breaks that separate print calls would otherwise add.
print(
    "--- First 5 rows of the DataFrame (df.head()) ---\n",
    df.head(),
    "\n" + "="*80 + "\n",  # Separator for readability
    sep="\n",
)

--- First 5 rows of the DataFrame (df.head()) ---

   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSuppo

In [5]:
# --- 3. Initial Inspection: Get a summary of the DataFrame ---
# df.info() reports the non-null count and dtype of every column. It writes
# straight to stdout itself, so it is called bare rather than inside print().
print("--- DataFrame Information (df.info()) ---\n")
df.info()
# Blank line, 80-character rule, blank line.
print("\n", "="*80, "\n", sep="")

--- DataFrame Information (df.info()) ---

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16 

In [6]:
# --- 4. Initial Inspection: Check the number of rows and columns ---
# df.shape is a (rows, columns) tuple; each element is reported on its own line.
print(
    "--- DataFrame Shape (df.shape) ---",
    f"Number of rows: {df.shape[0]}",
    f"Number of columns: {df.shape[1]}",
    "\n" + "="*80 + "\n",
    sep="\n",
)

--- DataFrame Shape (df.shape) ---
Number of rows: 7043
Number of columns: 21




In [7]:
# --- 5. Initial Inspection: List all column names ---
# The column Index is converted to a plain list so it prints as a simple
# Python list of names.
print(
    "--- Column Names (df.columns) ---\n",
    list(df.columns),
    "\n" + "="*80 + "\n",
    sep="\n",
)

--- Column Names (df.columns) ---

['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']




In [10]:
# --- 6. Initial Inspection: Get statistical summaries of numerical columns ---
# describe() yields count, mean, std, min, max and quartiles for the columns
# pandas parsed as numeric.
print(
    "--- Statistical Summary of Numerical Columns (df.describe()) ---\n",
    df.describe(),
    "\n" + "="*80 + "\n",
    sep="\n",
)

# --- Additional helpful inspection (not explicitly requested but good practice) ---
# Per-column count of null entries.
print(
    "--- Missing Values Count per Column (df.isnull().sum()) ---\n",
    df.isna().sum(),
    "\n" + "="*80 + "\n",
    sep="\n",
)

# Distinct labels of a few categorical columns, to see which categories exist.
# Each heading after the first starts with "\n" to leave a blank line between
# sections.
print(
    "--- Unique values for 'gender' ---",
    df['gender'].unique(),
    "\n--- Unique values for 'Partner' ---",
    df['Partner'].unique(),
    "\n--- Unique values for 'Dependents' ---",
    df['Dependents'].unique(),
    "\n--- Unique values for 'Churn' (our target variable) ---",
    df['Churn'].unique(),
    "\n" + "="*80 + "\n",
    sep="\n",
)

--- Statistical Summary of Numerical Columns (df.describe()) ---

       SeniorCitizen       tenure  MonthlyCharges
count    7043.000000  7043.000000     7043.000000
mean        0.162147    32.371149       64.761692
std         0.368612    24.559481       30.090047
min         0.000000     0.000000       18.250000
25%         0.000000     9.000000       35.500000
50%         0.000000    29.000000       70.350000
75%         0.000000    55.000000       89.850000
max         1.000000    72.000000      118.750000


--- Missing Values Count per Column (df.isnull().sum()) ---

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
Tot

In [11]:
# --- START: Data Cleaning Steps ---
print(
    "--- Starting Data Cleaning ---",
    "Converting 'TotalCharges' to numeric and handling errors...",
    sep="\n",
)

# 7. Convert 'TotalCharges' to numeric
# df.describe() above leaves 'TotalCharges' out of the numeric summary, so the
# column was not parsed as a number (likely blank or whitespace entries).
# errors='coerce' converts every value that cannot be parsed into NaN instead
# of raising; those NaNs are dealt with in the next step.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
print("Conversion complete.\n")

--- Starting Data Cleaning ---
Converting 'TotalCharges' to numeric and handling errors...
Conversion complete.



In [12]:
# 8. Handle missing values in 'TotalCharges'
# Coercing to numeric turned the unparseable entries into NaN. Count them and
# fill them:
#   * rows with tenure == 0 get 0;
#   * any other NaN gets the median of the observed (non-missing) values.
missing_mask = df['TotalCharges'].isnull()
total_charges_missing_count = missing_mask.sum()
if total_charges_missing_count > 0:
    print(f"Found {total_charges_missing_count} missing values in 'TotalCharges' after conversion.")

    # Check how many of the NaNs belong to rows with tenure == 0.
    zero_tenure_missing_mask = missing_mask & (df['tenure'] == 0)
    nan_total_charges_and_zero_tenure = int(zero_tenure_missing_mask.sum())
    print(f"Number of rows where 'TotalCharges' is NaN AND 'tenure' is 0: {nan_total_charges_and_zero_tenure}")

    if nan_total_charges_and_zero_tenure == total_charges_missing_count:
        print("All NaN 'TotalCharges' correspond to tenure = 0. Filling with 0.")
        df['TotalCharges'] = df['TotalCharges'].fillna(0)
    else:
        # Take the median BEFORE any filling so it reflects observed values
        # only and is not pulled down by the zeros inserted below.
        median_total_charges = df['TotalCharges'].median()
        if nan_total_charges_and_zero_tenure > 0:
            # Previously these rows also received the median even though the
            # message below only spoke of the "remaining" NaNs.
            print(f"Filling {nan_total_charges_and_zero_tenure} NaN 'TotalCharges' where tenure = 0 with 0.")
            df.loc[zero_tenure_missing_mask, 'TotalCharges'] = 0
        print(f"Filling remaining {total_charges_missing_count - nan_total_charges_and_zero_tenure} NaN 'TotalCharges' with median: {median_total_charges}")
        df['TotalCharges'] = df['TotalCharges'].fillna(median_total_charges)
else:
    print("No missing values found in 'TotalCharges' after conversion. (This case is unlikely given previous info)")


Found 11 missing values in 'TotalCharges' after conversion.
Number of rows where 'TotalCharges' is NaN AND 'tenure' is 0: 11
All NaN 'TotalCharges' correspond to tenure = 0. Filling with 0.


In [13]:
# 9. Drop 'customerID' column
# Guarded on column presence so that re-running this cell after the column is
# already gone is a no-op instead of a KeyError.
if 'customerID' in df.columns:
    print("Dropping 'customerID' column...")
    df = df.drop(columns='customerID')
    print("Column 'customerID' dropped.\n")
else:
    print("Column 'customerID' is not present (already dropped); skipping.\n")

print("--- Data Cleaning Complete ---\n")

# --- Re-inspect after cleaning to confirm changes ---
# df.info() writes to stdout itself, so it is not wrapped in print().
print("--- DataFrame Information (df.info()) AFTER Cleaning ---\n")
df.info()
print("\n" + "="*80 + "\n")

print("--- Statistical Summary of Numerical Columns (df.describe()) AFTER Cleaning ---\n")
print(df.describe())
print("\n" + "="*80 + "\n")

print("--- Missing Values Count per Column (df.isnull().sum()) AFTER Cleaning ---\n")
print(df.isnull().sum())
print("\n" + "="*80 + "\n")

# Show head again to see effects of cleaning
print("--- First 5 rows of the DataFrame (df.head()) AFTER Cleaning ---\n")
print(df.head())
print("\n" + "="*80 + "\n")


Dropping 'customerID' column...
Column 'customerID' dropped.

--- Data Cleaning Complete ---

--- DataFrame Information (df.info()) AFTER Cleaning ---

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null

In [14]:
# --- START: Categorical Feature Encoding ---
print("\n--- Starting Categorical Feature Encoding ---")

# 10. Encode the target variable 'Churn'
# 'Yes' -> 1, 'No' -> 0
print("Encoding 'Churn' target variable (Yes=1, No=0)...")
if pd.api.types.is_numeric_dtype(df['Churn']):
    # Series.map() turns every value missing from the mapping into NaN, so
    # mapping an already-encoded 0/1 column again would wipe out the target.
    print("'Churn' is already numeric; skipping re-encoding.")
else:
    churn_encoded = df['Churn'].map({'Yes': 1, 'No': 0})
    # More NaNs after mapping than before means a label other than
    # 'Yes'/'No' was present; fail loudly rather than carry NaNs forward.
    if churn_encoded.isnull().sum() > df['Churn'].isnull().sum():
        raise ValueError("Column 'Churn' contains values other than 'Yes'/'No'.")
    df['Churn'] = churn_encoded
print("'Churn' encoding complete.\n")



--- Starting Categorical Feature Encoding ---
Encoding 'Churn' target variable (Yes=1, No=0)...
'Churn' encoding complete.



In [15]:
# 11. Encode simple binary columns (Yes/No, Male/Female)
# These columns have only two unique values, typically 'Yes'/'No' or 'Female'/'Male'.
binary_cols_yes_no = [
    'Partner', 'Dependents', 'PaperlessBilling'
]
binary_cols_gender = ['gender']

print("Encoding simple binary 'Yes'/'No' columns (Yes=1, No=0)...")
for col in binary_cols_yes_no:
    # Series.map() turns values missing from the mapping into NaN, so an
    # already-encoded (numeric) column must not be mapped a second time.
    if pd.api.types.is_numeric_dtype(df[col]):
        continue
    encoded = df[col].map({'Yes': 1, 'No': 0})
    # New NaNs mean the column held a label other than 'Yes'/'No'.
    if encoded.isnull().sum() > df[col].isnull().sum():
        raise ValueError(f"Column '{col}' contains values other than 'Yes'/'No'.")
    df[col] = encoded
print("Simple binary 'Yes'/'No' columns encoding complete.\n")

print("Encoding 'gender' column (Male=1, Female=0)...")
for col in binary_cols_gender:
    # Same re-run guard and label check as above, with the gender mapping.
    if pd.api.types.is_numeric_dtype(df[col]):
        continue
    encoded = df[col].map({'Male': 1, 'Female': 0})
    if encoded.isnull().sum() > df[col].isnull().sum():
        raise ValueError(f"Column '{col}' contains values other than 'Male'/'Female'.")
    df[col] = encoded
print("'gender' encoding complete.\n")


Encoding simple binary 'Yes'/'No' columns (Yes=1, No=0)...
Simple binary 'Yes'/'No' columns encoding complete.

Encoding 'gender' column (Male=1, Female=0)...
'gender' encoding complete.



In [16]:

# 12. Encode complex binary columns with 'No service' options
# Besides 'Yes'/'No', these columns may contain 'No phone service' or
# 'No internet service'. Both are treated as 'No', so the final encoding is
# Yes=1 and everything else listed here =0.
complex_binary_cols = [
    'PhoneService', 'MultipleLines', 'OnlineSecurity', 'OnlineBackup',
    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies'
]
# One mapping covers both steps (collapse 'No ... service' to 'No', then
# Yes/No -> 1/0).
service_flag_mapping = {
    'Yes': 1,
    'No': 0,
    'No phone service': 0,
    'No internet service': 0,
}

print("Encoding complex binary columns (e.g., 'No service' to 'No', then Yes=1, No=0)...")
for col in complex_binary_cols:
    # Series.map() turns values missing from the mapping into NaN, so an
    # already-encoded (numeric) column must not be mapped a second time.
    if pd.api.types.is_numeric_dtype(df[col]):
        continue
    encoded = df[col].map(service_flag_mapping)
    # New NaNs mean the column held a label the mapping does not know.
    if encoded.isnull().sum() > df[col].isnull().sum():
        raise ValueError(f"Column '{col}' contains unexpected values: {df[col].unique().tolist()}")
    df[col] = encoded
print("Complex binary columns encoding complete.\n")


Encoding complex binary columns (e.g., 'No service' to 'No', then Yes=1, No=0)...
Complex binary columns encoding complete.



In [17]:

# 13. One-Hot Encode multi-categorical columns
# These columns have more than two distinct categories and need one-hot encoding.
multi_categorical_cols = [
    'InternetService', 'Contract', 'PaymentMethod'
]

print("One-Hot Encoding multi-categorical columns...")
# get_dummies() replaces each source column with its indicator columns, so on
# a re-run the source columns no longer exist. Encoding only the columns that
# are still present keeps the cell re-runnable instead of raising KeyError.
cols_to_encode = [name for name in multi_categorical_cols if name in df.columns]
if cols_to_encode:
    df = pd.get_dummies(df, columns=cols_to_encode, dtype=int) # Use dtype=int to get 0/1 integers
else:
    print("Multi-categorical columns are already one-hot encoded; skipping.")
print("One-Hot Encoding complete.\n")

print("--- Data Cleaning & Encoding Complete ---\n")


# --- Re-inspect after cleaning and encoding to confirm changes ---
# df.info() writes to stdout itself, so it is not wrapped in print().
print("--- DataFrame Information (df.info()) AFTER Cleaning & Encoding ---\n")
df.info()
print("\n" + "="*80 + "\n")

print("--- Statistical Summary of Numerical Columns (df.describe()) AFTER Cleaning & Encoding ---\n")
print(df.describe())
print("\n" + "="*80 + "\n")

# Any non-zero count here would point at labels the encoding steps did not map.
print("--- Missing Values Count per Column (df.isnull().sum()) AFTER Cleaning & Encoding ---\n")
print(df.isnull().sum())
print("\n" + "="*80 + "\n")

print("--- First 5 rows of the DataFrame (df.head()) AFTER Cleaning & Encoding ---\n")
print(df.head())
print("\n" + "="*80 + "\n")

print("--- All Column Names (df.columns) AFTER Cleaning & Encoding ---\n")
print(df.columns.tolist())
print("\n" + "="*80 + "\n")


One-Hot Encoding multi-categorical columns...
One-Hot Encoding complete.

--- Data Cleaning & Encoding Complete ---

--- DataFrame Information (df.info()) AFTER Cleaning & Encoding ---

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 27 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   gender                                   7043 non-null   int64  
 1   SeniorCitizen                            7043 non-null   int64  
 2   Partner                                  7043 non-null   int64  
 3   Dependents                               7043 non-null   int64  
 4   tenure                                   7043 non-null   int64  
 5   PhoneService                             7043 non-null   int64  
 6   MultipleLines                            7043 non-null   int64  
 7   OnlineSecurity                           7043 non-null   int64  
 8   On