In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression

In [None]:
# Load Integrated Dataset (Member 2's Output)

df = pd.read_csv("02.integrated_telco_data.csv")
print("Loaded integrated data:", df.shape)
df.head()

Loaded integrated data: (60175, 23)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,total_diff_abs,total_conflict
0,CUST00001,Male,0.0,No,Yes,3.0428,Yes,Yes,No,No,...,No,No,Month-to-month,No,Mailed check,71.90798,189.052945,Yes,118.99142,True
1,CUST00002,Male,1.0,Yes,No,3.0428,Yes,Yes,DSL,No,...,Unknown,No,One year,Yes,Bank transfer (automatic),21.351177,73.679395,No,34.84278,True
2,CUST00003,Female,0.0,No,No,36.87,Yes,Yes,DSL,No,...,Yes,Yes,Month-to-month,No,Electronic check,41.157794,1569.943735,Yes,209.819138,True
3,CUST00005,Male,1.0,Yes,Yes,14.1083,Yes,Unknown,Fiber optic,Yes,...,No,No,Two year,Yes,Electronic check,21.351177,307.119546,Yes,23.560582,True
4,CUST00006,Male,0.0,Yes,No,20.0413,Yes,No,Fiber optic,No,...,Unknown,No,One year,No,Electronic check,34.567089,658.802015,Yes,135.870739,True


Smoothing 

In [None]:
#smoothing

#remove noise in charges with regression

def smooth_charges(df, charge_col):
    valid = df[(df['tenure'] > 0) & (df[charge_col].notna())]
    if len(valid) > 1:
        X = valid[['tenure']]
        y = valid[charge_col]
        
        reg = LinearRegression()
        reg.fit(X, y)
        
        y_pred = reg.predict(X)
        residuals = y - y_pred
        std_residuals = np.std(residuals)
        
        outliers = np.abs(residuals) > 3 * std_residuals
        df.loc[valid[outliers].index, charge_col] = reg.predict(valid[outliers][['tenure']])
        print(f"Smoothed {outliers.sum()} outliers in {charge_col}")
    return df

df = smooth_charges(df, 'MonthlyCharges')
df = smooth_charges(df, 'TotalCharges')


Smoothed 3561 outliers in MonthlyCharges
Smoothed 1700 outliers in TotalCharges


Normalization

In [None]:

# Normalization

# Min-Max scaling for MonthlyCharges, TotalCharges
minmax = MinMaxScaler()
df[['MonthlyCharges','TotalCharges']] = minmax.fit_transform(df[['MonthlyCharges','TotalCharges']])

# Z-score scaling for tenure
zscore = StandardScaler()
df[['tenure']] = zscore.fit_transform(df[['tenure']])

df[['tenure','MonthlyCharges','TotalCharges']].head()


Unnamed: 0,tenure,MonthlyCharges,TotalCharges
0,-1.205173,0.320794,0.040216
1,-1.205173,0.0,0.0
2,1.17701,0.125677,0.521554
3,-0.425917,0.0,0.08137
4,-0.008103,0.083858,0.203956


Feature Construction

In [None]:
#Feature Construction

# Average monthly charge
df['AvgMonthlyCharge'] = df['TotalCharges'] / (df['tenure'] + 1e-5)

# Count how many services a customer has
service_cols = ['PhoneService','MultipleLines','OnlineSecurity','OnlineBackup',
                'DeviceProtection','TechSupport','StreamingTV','StreamingMovies']

df['TotalServices'] = df[service_cols].apply(lambda row: sum(row=='Yes'), axis=1)

# Premium if more than 4 services
df['PremiumCustomer'] = (df['TotalServices'] > 4).astype(int)

# Indicator for electronic payment
electronic_payments = ['Electronic check','Bank transfer (automatic)','Credit card (automatic)']
df['ElectronicPayment'] = df['PaymentMethod'].isin(electronic_payments).astype(int)

df[['AvgMonthlyCharge','TotalServices','PremiumCustomer','ElectronicPayment']].head()


Unnamed: 0,AvgMonthlyCharge,TotalServices,PremiumCustomer,ElectronicPayment
0,-0.03337,2,0,0
1,-0.0,3,0,1
2,0.443114,5,1,1
3,-0.191052,3,0,1
4,-25.202626,2,0,1


Discretization (Binning)

In [None]:
# Discretization

# Discretize tenure into Short, Medium, Long
df['TenureGroup'] = pd.cut(df['tenure'], bins=[-np.inf,-0.5,0.5,np.inf],
                           labels=['Short','Medium','Long'])

# Discretize MonthlyCharges, TotalCharges, AvgMonthlyCharge into Low, Medium, High
df['MonthlyChargeGroup'] = pd.qcut(df['MonthlyCharges'], 3, labels=['Low','Medium','High'])
df['TotalChargeGroup']   = pd.qcut(df['TotalCharges'], 3, labels=['Low','Medium','High'])
df['AvgMonthlyChargeGroup'] = pd.qcut(df['AvgMonthlyCharge'], 3, labels=['Low','Medium','High'])

df[['tenure','TenureGroup','MonthlyCharges','MonthlyChargeGroup']].head()


Unnamed: 0,tenure,TenureGroup,MonthlyCharges,MonthlyChargeGroup
0,-1.205173,Short,0.320794,High
1,-1.205173,Short,0.0,Low
2,1.17701,Long,0.125677,Medium
3,-0.425917,Medium,0.0,Low
4,-0.008103,Medium,0.083858,Low


Concept Hierarchy

In [None]:
#Concept Hierachy

# Map Contract and InternetService to ordinal levels

contract_hierarchy = {'Month-to-month':1, 'One year':2, 'Two year':3}
df['ContractLevel'] = df['Contract'].map(contract_hierarchy)

internet_hierarchy = {'No':1, 'DSL':2, 'Fiber optic':3}
df['InternetServiceLevel'] = df['InternetService'].map(internet_hierarchy)

df[['Contract','ContractLevel','InternetService','InternetServiceLevel']].head()


Unnamed: 0,Contract,ContractLevel,InternetService,InternetServiceLevel
0,Month-to-month,1.0,No,1
1,One year,2.0,DSL,2
2,Month-to-month,1.0,DSL,2
3,Two year,3.0,Fiber optic,3
4,One year,2.0,Fiber optic,3


Encoding Categorical Variables

In [None]:
# Encoding

# One-hot encoding for categorical
cat_cols = ['gender','InternetService','PaymentMethod','Contract',
            'TenureGroup','MonthlyChargeGroup','TotalChargeGroup','AvgMonthlyChargeGroup']

df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

# Convert Yes/No to 1/0
binary_cols = ['Partner','Dependents','PaperlessBilling','Churn']
for col in binary_cols:
    if col in df.columns:
        df[col] = df[col].map({'Yes':1,'No':0})

df.head()


Unnamed: 0,customerID,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,...,Contract_One year,Contract_Two year,TenureGroup_Medium,TenureGroup_Long,MonthlyChargeGroup_Medium,MonthlyChargeGroup_High,TotalChargeGroup_Medium,TotalChargeGroup_High,AvgMonthlyChargeGroup_Medium,AvgMonthlyChargeGroup_High
0,CUST00001,0.0,0.0,1.0,-1.205173,Yes,Yes,No,No,No,...,False,False,False,False,False,True,False,False,True,False
1,CUST00002,1.0,1.0,0.0,-1.205173,Yes,Yes,No,No,No,...,True,False,False,False,False,False,False,False,True,False
2,CUST00003,0.0,0.0,0.0,1.17701,Yes,Yes,No,Yes,No,...,False,False,False,True,True,False,False,True,False,True
3,CUST00005,1.0,1.0,1.0,-0.425917,Yes,Unknown,Yes,No,Yes,...,False,True,True,False,False,False,False,False,True,False
4,CUST00006,0.0,1.0,0.0,-0.008103,Yes,No,No,No,Unknown,...,True,False,True,False,False,False,True,False,False,False


In [None]:

df.to_csv("03.transformed_telco_data.csv", index=False)

print(" Data Transformation & Discretization complete. Saved as 03.transformed_telco_data.csv")
print("Final dataset shape:", df.shape)

 Data Transformation & Discretization complete. Saved as 03.transformed_telco_data.csv
Final dataset shape: (60175, 44)
