Python code for data loading and analysis - 
Creating the DataDictionary

In [6]:
import pandas as pd
# Load the dataset
df = pd.read_csv('Telco-Customer-Churn.csv')
# Get data structure overview
data_dictionary = pd.DataFrame({
    "Attribute Name": df.columns,
    "Data Type": df.dtypes.values,
    "Missing Values": df.isnull().sum().values,
    "Unique Values": df.nunique().values
})
# Display the data dictionary
#print("Data Dictionary Overview:")
#print(data_dictionary)
from IPython.display import display
# Display Data Dictionary in Jupyter Notebook
display(data_dictionary)

Unnamed: 0,Attribute Name,Data Type,Missing Values,Unique Values
0,customerID,object,0,7043
1,gender,object,0,2
2,SeniorCitizen,int64,0,2
3,Partner,object,0,2
4,Dependents,object,0,2
5,tenure,int64,0,73
6,PhoneService,object,0,2
7,MultipleLines,object,0,3
8,InternetService,object,0,3
9,OnlineSecurity,object,0,3


Data Ingestion - with Pandas

In [7]:
#import pandas as pd

# Load the dataset into a Pandas DataFrame
df = pd.read_csv('Telco-Customer-Churn.csv')
# Display first few rows
df.head()


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


Data Ingestion - SQL DB

In [15]:

#import pandas as pd
from sqlalchemy import create_engine
import psycopg2

# Load the dataset into a Pandas DataFrame
df = pd.read_csv('Telco-Customer-Churn.csv')

# Define PostgreSQL connection using default credentials
db_engine = create_engine('postgresql://postgres:postgres@localhost:5432/postgres')

# Load DataFrame into PostgreSQL (Table: telco_customer_churn)
df.to_sql('telco_customer_churn', db_engine, if_exists='replace', index=False)

print("✅ Data successfully loaded into PostgreSQL database!")

# Connect to PostgreSQL to execute a query
try:
    conn = psycopg2.connect("dbname='postgres' user='postgres' password='postgres' host='localhost' port='5432'")
    cursor = conn.cursor()

    # Execute SQL query to fetch first 5 rows
    query = "SELECT * FROM telco_customer_churn LIMIT 5;"
    cursor.execute(query)

    # Fetch and display results
    rows = cursor.fetchall()
    print("\n📊 First 5 Rows from telco_customer_churn Table:")
    for row in rows:
        print(row)

    # Close connection
    cursor.close()
    conn.close()
except Exception as e:
    print("❌ Error connecting to PostgreSQL:", e)


✅ Data successfully loaded into PostgreSQL database!

📊 First 5 Rows from telco_customer_churn Table:
('7590-VHVEG', 'Female', 0, 'Yes', 'No', 1, 'No', 'No phone service', 'DSL', 'No', 'Yes', 'No', 'No', 'No', 'No', 'Month-to-month', 'Yes', 'Electronic check', 29.85, '29.85', 'No')
('5575-GNVDE', 'Male', 0, 'No', 'No', 34, 'Yes', 'No', 'DSL', 'Yes', 'No', 'Yes', 'No', 'No', 'No', 'One year', 'No', 'Mailed check', 56.95, '1889.5', 'No')
('3668-QPYBK', 'Male', 0, 'No', 'No', 2, 'Yes', 'No', 'DSL', 'Yes', 'Yes', 'No', 'No', 'No', 'No', 'Month-to-month', 'Yes', 'Mailed check', 53.85, '108.15', 'Yes')
('7795-CFOCW', 'Male', 0, 'No', 'No', 45, 'No', 'No phone service', 'DSL', 'Yes', 'No', 'Yes', 'Yes', 'No', 'No', 'One year', 'No', 'Bank transfer (automatic)', 42.3, '1840.75', 'No')
('9237-HQITU', 'Female', 0, 'No', 'No', 2, 'Yes', 'No', 'Fiber optic', 'No', 'No', 'No', 'No', 'No', 'No', 'Month-to-month', 'Yes', 'Electronic check', 70.7, '151.65', 'Yes')


Checking for missing values using Pandas

In [16]:
missing_values = df.isnull().sum()
print("Missing Values:\n", missing_values)

Missing Values:
 customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


finding missing values using SQL

In [18]:
import pandas as pd
from sqlalchemy import create_engine
import psycopg2

# Load the dataset into a Pandas DataFrame
df = pd.read_csv("Telco-Customer-Churn.csv")

# Define PostgreSQL connection using default credentials
db_engine = create_engine('postgresql://postgres:postgres@localhost:5432/postgres')

# Load DataFrame into PostgreSQL (Table: telco_customer_churn)
df.to_sql('telco_customer_churn', db_engine, if_exists='replace', index=False)

print("✅ Data successfully loaded into PostgreSQL database!")


✅ Data successfully loaded into PostgreSQL database!


In [19]:
# Define PostgreSQL connection
conn = psycopg2.connect("dbname='postgres' user='postgres' password='postgres' host='localhost' port='5432'")
cursor = conn.cursor()

# Function to run SQL queries and return results
def run_sql_query(query):
    cursor.execute(query)
    rows = cursor.fetchall()
    for row in rows:
        print(row)

# Ensure connection works
print("✅ Connected to PostgreSQL successfully!")


✅ Connected to PostgreSQL successfully!


In [20]:
missing_values = df.isnull().sum()
print("Missing Values:\n", missing_values)


Missing Values:
 customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


In [21]:
run_sql_query("""
SELECT column_name, COUNT(*) - COUNT(column_name) AS missing_values
FROM information_schema.columns
LEFT JOIN (
    SELECT * FROM telco_customer_churn
) t ON column_name = column_name
GROUP BY column_name;
""")


('provolatile', 0)
('srsublsn', 0)
('stxkind', 0)
('aggmtransspace', 0)
('indisprimary', 0)
('funcname', 0)
('comments', 0)
('stacoll2', 0)
('mode', 0)
('virtualxid', 0)
('pubowner', 0)
('privtype', 0)
('adnum', 0)
('StreamingTV', 0)
('applied', 0)
('identity_maximum', 0)
('MultipleLines', 0)
('ref_dtd_identifier', 0)
('cfgowner', 0)
('umoptions', 0)
('last_autovacuum', 0)
('blks_exists', 0)
('datoid', 0)
('relfrozenxid', 0)
('srrelid', 0)
('amopsortfamily', 0)
('adbin', 0)
('maptokentype', 0)
('ext_stats_computed', 0)
('client_dn', 0)
('blk_write_time', 0)
('partitions_done', 0)
('relacl', 0)
('provariadic', 0)
('subslotname', 0)
('identity_start', 0)
('ordering_form', 0)
('event_object_catalog', 0)
('is_grantable', 0)
('indimmediate', 0)
('rolpassword', 0)
('seqstart', 0)
('action_timing', 0)
('column_default', 0)
('numeric_precision', 0)
('xact_start', 0)
('wal_distance', 0)
('indrelid', 0)
('lanvalidator', 0)
('relpartbound', 0)
('extrelocatable', 0)
('aggfnoid', 0)
('xact_commit',