Python code for data loading and analysis - 
Creating the DataDictionary

In [6]:
import pandas as pd
# Load the dataset
df = pd.read_csv('Telco-Customer-Churn.csv')
# Get data structure overview
data_dictionary = pd.DataFrame({
    "Attribute Name": df.columns,
    "Data Type": df.dtypes.values,
    "Missing Values": df.isnull().sum().values,
    "Unique Values": df.nunique().values
})
# Display the data dictionary
#print("Data Dictionary Overview:")
#print(data_dictionary)
from IPython.display import display
# Display Data Dictionary in Jupyter Notebook
display(data_dictionary)

Unnamed: 0,Attribute Name,Data Type,Missing Values,Unique Values
0,customerID,object,0,7043
1,gender,object,0,2
2,SeniorCitizen,int64,0,2
3,Partner,object,0,2
4,Dependents,object,0,2
5,tenure,int64,0,73
6,PhoneService,object,0,2
7,MultipleLines,object,0,3
8,InternetService,object,0,3
9,OnlineSecurity,object,0,3


Data Ingestion - with Pandas

In [7]:
#import pandas as pd

# Load the dataset into a Pandas DataFrame
df = pd.read_csv('Telco-Customer-Churn.csv')
# Display first few rows
df.head()


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


Data Ingestion - SQL DB

In [15]:

#import pandas as pd
from sqlalchemy import create_engine
import psycopg2

# Load the dataset into a Pandas DataFrame
df = pd.read_csv('Telco-Customer-Churn.csv')

# Define PostgreSQL connection using default credentials
db_engine = create_engine('postgresql://postgres:postgres@localhost:5432/postgres')

# Load DataFrame into PostgreSQL (Table: telco_customer_churn)
df.to_sql('telco_customer_churn', db_engine, if_exists='replace', index=False)

print("✅ Data successfully loaded into PostgreSQL database!")

# Connect to PostgreSQL to execute a query
try:
    conn = psycopg2.connect("dbname='postgres' user='postgres' password='postgres' host='localhost' port='5432'")
    cursor = conn.cursor()

    # Execute SQL query to fetch first 5 rows
    query = "SELECT * FROM telco_customer_churn LIMIT 5;"
    cursor.execute(query)

    # Fetch and display results
    rows = cursor.fetchall()
    print("\n📊 First 5 Rows from telco_customer_churn Table:")
    for row in rows:
        print(row)

    # Close connection
    cursor.close()
    conn.close()
except Exception as e:
    print("❌ Error connecting to PostgreSQL:", e)


✅ Data successfully loaded into PostgreSQL database!

📊 First 5 Rows from telco_customer_churn Table:
('7590-VHVEG', 'Female', 0, 'Yes', 'No', 1, 'No', 'No phone service', 'DSL', 'No', 'Yes', 'No', 'No', 'No', 'No', 'Month-to-month', 'Yes', 'Electronic check', 29.85, '29.85', 'No')
('5575-GNVDE', 'Male', 0, 'No', 'No', 34, 'Yes', 'No', 'DSL', 'Yes', 'No', 'Yes', 'No', 'No', 'No', 'One year', 'No', 'Mailed check', 56.95, '1889.5', 'No')
('3668-QPYBK', 'Male', 0, 'No', 'No', 2, 'Yes', 'No', 'DSL', 'Yes', 'Yes', 'No', 'No', 'No', 'No', 'Month-to-month', 'Yes', 'Mailed check', 53.85, '108.15', 'Yes')
('7795-CFOCW', 'Male', 0, 'No', 'No', 45, 'No', 'No phone service', 'DSL', 'Yes', 'No', 'Yes', 'Yes', 'No', 'No', 'One year', 'No', 'Bank transfer (automatic)', 42.3, '1840.75', 'No')
('9237-HQITU', 'Female', 0, 'No', 'No', 2, 'Yes', 'No', 'Fiber optic', 'No', 'No', 'No', 'No', 'No', 'No', 'Month-to-month', 'Yes', 'Electronic check', 70.7, '151.65', 'Yes')


Checking for missing values using Pandas

In [16]:
missing_values = df.isnull().sum()
print("Missing Values:\n", missing_values)

Missing Values:
 customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


finding missing values using SQL

In [8]:
import pandas as pd
from sqlalchemy import create_engine
import psycopg2

# Load the dataset into a Pandas DataFrame
df = pd.read_csv("Telco-Customer-Churn.csv")

# Define PostgreSQL connection using default credentials
db_engine = create_engine('postgresql://postgres:postgres@localhost:5432/postgres')

# Load DataFrame into PostgreSQL (Table: telco_customer_churn)
df.to_sql('telco_customer_churn', db_engine, if_exists='replace', index=False)

print("✅ Data successfully loaded into PostgreSQL database!")

missing_values = df.isnull().sum()
print("Missing Values:\n", missing_values)
# Convert missing values count to a DataFrame for better visualization
missing_df = missing_values.to_frame(name="Missing Count")
display(missing_df)


✅ Data successfully loaded into PostgreSQL database!
Missing Values:
 customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


Unnamed: 0,Missing Count
customerID,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
MultipleLines,0
InternetService,0
OnlineSecurity,0


In [19]:
# Define PostgreSQL connection
conn = psycopg2.connect("dbname='postgres' user='postgres' password='postgres' host='localhost' port='5432'")
cursor = conn.cursor()

# Function to run SQL queries and return results
def run_sql_query(query):
    cursor.execute(query)
    rows = cursor.fetchall()
    for row in rows:
        print(row)

# Ensure connection works
print("✅ Connected to PostgreSQL successfully!")


✅ Connected to PostgreSQL successfully!


Statistical summary of numeric columns - helps in understanding the range and distribution of numerical columns

In [9]:
numeric_summary = df.describe()
print("Statistical Summary:\n", numeric_summary)

Statistical Summary:
        SeniorCitizen       tenure  MonthlyCharges
count    7043.000000  7043.000000     7043.000000
mean        0.162147    32.371149       64.761692
std         0.368612    24.559481       30.090047
min         0.000000     0.000000       18.250000
25%         0.000000     9.000000       35.500000
50%         0.000000    29.000000       70.350000
75%         0.000000    55.000000       89.850000
max         1.000000    72.000000      118.750000


Check for Duplicate  records - to ensure that there is no redundant data

In [10]:
duplicate_count = df.duplicated().sum()
print("\nTotal Duplicate Records:", duplicate_count)


Total Duplicate Records: 0


Check Data Types and Unique Values - to identify categorical vs numerical columns and spot inconsistencies

In [11]:
data_info = pd.DataFrame({
    "Data Type": df.dtypes,
    "Unique Values": df.nunique()
})

print("\nData Types and Unique Values:\n", data_info)


Data Types and Unique Values:
                  Data Type  Unique Values
customerID          object           7043
gender              object              2
SeniorCitizen        int64              2
Partner             object              2
Dependents          object              2
tenure               int64             73
PhoneService        object              2
MultipleLines       object              3
InternetService     object              3
OnlineSecurity      object              3
OnlineBackup        object              3
DeviceProtection    object              3
TechSupport         object              3
StreamingTV         object              3
StreamingMovies     object              3
Contract            object              3
PaperlessBilling    object              2
PaymentMethod       object              4
MonthlyCharges     float64           1585
TotalCharges        object           6531
Churn               object              2
