In [None]:
import pandas as pd

# Location of the raw Telco customer-churn CSV (served from IBM's GitHub repository).
DATA_URL = (
    "https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/"
    "master/data/Telco-Customer-Churn.csv"
)

# Download the CSV and parse it into a pandas DataFrame.
df = pd.read_csv(DATA_URL)


In [None]:
# First, examine the data.
# The shape: what are the overall dimensions (rows, columns) of our data?
df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [None]:
# Preview the first rows: this gives us a feel for the columns and the values within them.
df.head()

In [None]:
# This is the most critical diagnostic tool at this stage: df.info() lists each
# column's non-null count and dtype, which reveals potential problems such as
# missing values and incorrect data types.
df.info()

In [None]:
# Coerce 'TotalCharges' to a numeric dtype. With errors="coerce", any entry that
# cannot be parsed as a number becomes NaN instead of raising an error.
df["TotalCharges"] = pd.to_numeric(
    df["TotalCharges"],
    errors="coerce",
)


In [None]:

# Run .info() again and compare with the earlier report: look at the dtype and
# non-null count now shown for 'TotalCharges'.
df.info()

In [None]:
# Print the shape before dropping rows so it can be compared with the shape afterwards.
print("Shape before dropping NaN:", df.shape)


In [None]:

# Drop every row that contains at least one NaN value.
# dropna() returns a new DataFrame and `df` is re-bound to it. Re-assigning is
# preferred over inplace=True: it keeps the data flow explicit and chainable,
# and inplace=True brings no performance benefit.
df = df.dropna()


In [None]:
# Print the shape again to confirm how many rows were removed.
print("Shape after dropping NaN:", df.shape)

# You can also run df.info() again to check that all columns now have the same non-null count.

In [None]:
# Work on a copy so that the cleaned `df` is not modified by the processing steps that follow.
df_processed = df.copy()


In [None]:

# 1. Encode the target variable 'Churn' as 0 ("No") / 1 ("Yes").
# The labels are read from the untouched `df` rather than from `df_processed`,
# which makes this cell safe to re-run: Series.map() returns NaN for any value
# missing from the mapping, so re-mapping an already-encoded 0/1 column would
# silently wipe out the whole target.
df_processed["Churn"] = df["Churn"].map({"No": 0, "Yes": 1})

# Fail fast if a label other than "No"/"Yes" was turned into NaN by the mapping.
assert df_processed["Churn"].notna().all(), "Unexpected value in 'Churn' column"


In [None]:

# 2. Drop the unnecessary customerID column.
# errors="ignore" keeps this cell re-runnable: on a second run the column is
# already gone, and drop() would otherwise raise a KeyError.
df_processed = df_processed.drop(columns="customerID", errors="ignore")


In [None]:

# 3. Collect the names of the remaining text columns that will be one-hot encoded.
# Instead of listing them by hand, let pandas pick every column whose dtype is 'object'.
categorical_cols = df_processed.select_dtypes(include="object").columns


In [None]:

# 4. One-hot encode the categorical columns with pandas get_dummies().
# dtype=int pins the indicator columns to 0/1 integers. Without it the result
# depends on the installed pandas version (bool from pandas 2.0, uint8 before),
# which would not match the 0/1 explanation below.
df_processed = pd.get_dummies(
    df_processed,
    columns=categorical_cols,
    drop_first=True,
    dtype=int,
)

# drop_first=True is good practice to avoid redundancy: it drops the first
# category's indicator column for each encoded feature.
# For a 'gender' column (Male/Female), it creates only one new column, 'gender_Male'.
# If 'gender_Male' is 1, the customer is Male; if it is 0, they must be Female.


In [None]:

# Inspect the result of the encoding.
print("--- DataFrame after Encoding ---")
# info() writes its report straight to stdout and returns None, so it must not
# be wrapped in print(): that would emit a stray "None" line after the report.
df_processed.info()

print("\n--- First 5 rows of the new data ---")
# Leave the frame as the cell's last expression so Jupyter renders it as a table.
df_processed.head()