In [1]:
# Importing pandas
import pandas as pd

In [2]:
# Read the raw churn data from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer='telco_churn.csv')

# Show the first 5 rows as a quick sanity check of the load
df.head(n=5)

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,KS,128,415,No,Yes,25,265.1,110.0,45.07,197.4,99.0,16.78,244.7,91.0,11.01,10.0,3,2.7,1.0,False
1,OH,107,415,No,Yes,26,161.6,123.0,27.47,195.5,103.0,16.62,254.4,103.0,11.45,13.7,3,3.7,1.0,False
2,NJ,137,415,No,No,0,243.4,114.0,41.38,121.2,110.0,10.3,162.6,104.0,7.32,12.2,5,3.29,0.0,False
3,OH,84,408,Yes,No,0,299.4,71.0,50.9,61.9,88.0,5.26,196.9,89.0,8.86,6.6,7,1.78,2.0,False
4,OK,75,415,Yes,No,0,166.7,113.0,28.34,148.3,122.0,12.61,186.9,121.0,8.41,10.1,3,2.73,3.0,False


In [3]:
# Count the NaN entries in every column to see where cleaning is needed
missing_per_column = df.isna().sum()
print("Missing values per column:")
print(missing_per_column)

Missing values per column:
State                      0
Account length             0
Area code                  0
International plan         0
Voice mail plan            0
Number vmail messages      0
Total day minutes         10
Total day calls           10
Total day charge          18
Total eve minutes          9
Total eve calls            8
Total eve charge           0
Total night minutes        0
Total night calls          1
Total night charge         0
Total intl minutes         0
Total intl calls           0
Total intl charge          5
Customer service calls     5
Churn                      8
dtype: int64


In [6]:
# Impute gaps in numeric columns with that column's mean.
# Non-numeric columns are excluded from the means (numeric_only=True), so
# their missing values are left untouched by this step.
# NOTE(review): count-like columns that have gaps (e.g. 'Total day calls',
# 'Customer service calls') are filled with a mean as well, which is generally
# not a whole number -- confirm that is acceptable for downstream use.
column_means = df.mean(numeric_only=True)
df = df.fillna(value=column_means)

In [7]:
# Re-count NaNs per column to confirm which gaps are still open
remaining_missing = df.isna().sum()
print(remaining_missing)


State                     0
Account length            0
Area code                 0
International plan        0
Voice mail plan           0
Number vmail messages     0
Total day minutes         0
Total day calls           0
Total day charge          0
Total eve minutes         0
Total eve calls           0
Total eve charge          0
Total night minutes       0
Total night calls         0
Total night charge        0
Total intl minutes        0
Total intl calls          0
Total intl charge         0
Customer service calls    0
Churn                     8
dtype: int64


In [8]:
# A row without a 'Churn' value has no label, so discard it
target_column = 'Churn'
df = df.dropna(subset=[target_column])

# Print the per-column NaN counts once more to verify nothing is left
print(df.isna().sum())


State                     0
Account length            0
Area code                 0
International plan        0
Voice mail plan           0
Number vmail messages     0
Total day minutes         0
Total day calls           0
Total day charge          0
Total eve minutes         0
Total eve calls           0
Total eve charge          0
Total night minutes       0
Total night calls         0
Total night charge        0
Total intl minutes        0
Total intl calls          0
Total intl charge         0
Customer service calls    0
Churn                     0
dtype: int64


In [9]:
# Flag every row that is an exact repeat of an earlier row
duplicate_rows = df.duplicated(keep='first')

# Summing the boolean flags gives the number of repeated rows
duplicate_count = duplicate_rows.sum()
print("Number of duplicate rows:", duplicate_count)


Number of duplicate rows: 0


In [10]:
# Keep only the first occurrence of each fully identical row
df = df.drop_duplicates(keep='first')

# Verify that no repeated rows remain
remaining_duplicates = df.duplicated().sum()
print("Number of duplicate rows after cleaning:", remaining_duplicates)


Number of duplicate rows after cleaning: 0


In [11]:
# Show the dtype pandas assigned to every column
column_dtypes = df.dtypes
print(column_dtypes)


State                      object
Account length              int64
Area code                   int64
International plan         object
Voice mail plan            object
Number vmail messages       int64
Total day minutes         float64
Total day calls           float64
Total day charge          float64
Total eve minutes         float64
Total eve calls           float64
Total eve charge          float64
Total night minutes       float64
Total night calls         float64
Total night charge        float64
Total intl minutes        float64
Total intl calls            int64
Total intl charge         float64
Customer service calls    float64
Churn                      object
dtype: object


In [14]:
# Convert the 'Yes'/'No' plan flags to Boolean True/False.
# The mapping also accepts values that are already booleans, so re-running
# this cell leaves the columns unchanged instead of turning them into NaN.
yes_no_to_bool = {'Yes': True, 'No': False, True: True, False: False}
for plan_column in ['International plan', 'Voice mail plan']:
    df[plan_column] = df[plan_column].map(yes_no_to_bool)
    # Series.map yields NaN for any value missing from the mapping; fail
    # loudly rather than letting unexpected spellings silently become NaN.
    assert df[plan_column].notna().all(), (
        f"Unexpected values in {plan_column!r}; expected only 'Yes'/'No'"
    )

# Convert 'Churn' to Boolean True/False.
# The column has dtype `object`, which does not tell us whether the values are
# the strings 'True'/'False' or real Python booleans (read_csv can produce the
# latter for a True/False column that contained blanks). A mapping keyed only
# on the strings would match nothing in the boolean case and turn the whole
# target column into NaN, so both representations are accepted here.
churn_to_bool = {'True': True, 'False': False, True: True, False: False}
df['Churn'] = df['Churn'].map(churn_to_bool)
assert df['Churn'].notna().all(), (
    "Unexpected values in 'Churn'; expected only True/False"
)

# Every value is now a real boolean, so store the column with a bool dtype.
df['Churn'] = df['Churn'].astype(bool)


In [15]:
# Save cleaned DataFrame to a new CSV file.
# index=False keeps the DataFrame's row index out of the written file.
output_path = 'telco_churn_cleaned.csv'
df.to_csv(output_path, index=False)

# Download it to your local computer when running inside Google Colab.
# The google.colab package is only importable in a Colab runtime; anywhere
# else the file has already been written next to the notebook, so report
# that instead of failing the cell on the import.
try:
    from google.colab import files
except ImportError:
    print(f"Not running in Google Colab; cleaned data saved to {output_path}")
else:
    files.download(output_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>