In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression

In [2]:
# Load the integrated dataset (Member 2's output) and preview it.

integrated_csv = "02.integrated_telco_data.csv"
df = pd.read_csv(integrated_csv)

# Report the (rows, columns) shape, then show the first rows as the cell output.
print("Loaded integrated data:", df.shape)
df.head()

Loaded integrated data: (70000, 21)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,CUST00001,Male,0.0,No,Yes,3,Yes,Yes,No,No,...,No,No,No,No,Month-to-Month,No,Mailed check,68.61,205.83,Yes
1,CUST00002,Male,1.0,Yes,No,2,Yes,Yes,DSL,No,...,No,Yes,Unknown,No,One year,Yes,Bank transfer (automatic),23.15,46.3,No
2,CUST00003,Female,0.0,No,No,42,Yes,Yes,DSL,No,...,No,Unknown,Yes,Yes,Month-to-Month,No,Electronic check,42.63,1790.46,Yes
3,CUST00004,Female,0.0,No,Yes,40,Yes,Yes,Fiber optic,No,...,Yes,No,No,No,Month-to-Month,No,Electronic check,75.04,3001.6,No
4,CUST00005,Male,1.0,Yes,Yes,17,Yes,Unknown,Fiber optic,Yes,...,Yes,No,No,No,Two year,Yes,Electronic check,22.38,380.46,Yes


Normalization

In [3]:
# Normalization

charge_cols = ['MonthlyCharges', 'TotalCharges']
tenure_cols = ['tenure']

# Min-Max scaling for the two charge columns.
# The fitted scaler is kept in `minmax` so it can be saved later.
minmax = MinMaxScaler()
minmax.fit(df[charge_cols])
df[charge_cols] = minmax.transform(df[charge_cols])

# Z-score (standard) scaling for tenure.
# The fitted scaler is kept in `zscore` so it can be saved later.
zscore = StandardScaler()
zscore.fit(df[tenure_cols])
df[tenure_cols] = zscore.transform(df[tenure_cols])

# Preview the three scaled columns.
df[tenure_cols + charge_cols].head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges
0,-1.244974,0.034155,0.000306
1,-1.307522,0.003476,6.9e-05
2,1.194413,0.016622,0.002662
3,1.069316,0.038495,0.004463
4,-0.369296,0.002956,0.000566


In [4]:
# Display the DataFrame after scaling (pandas truncates the repr of a large frame).
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,CUST00001,Male,0.0,No,Yes,-1.244974,Yes,Yes,No,No,...,No,No,No,No,Month-to-Month,No,Mailed check,0.034155,0.000306,Yes
1,CUST00002,Male,1.0,Yes,No,-1.307522,Yes,Yes,DSL,No,...,No,Yes,Unknown,No,One year,Yes,Bank transfer (automatic),0.003476,0.000069,No
2,CUST00003,Female,0.0,No,No,1.194413,Yes,Yes,DSL,No,...,No,Unknown,Yes,Yes,Month-to-Month,No,Electronic check,0.016622,0.002662,Yes
3,CUST00004,Female,0.0,No,Yes,1.069316,Yes,Yes,Fiber optic,No,...,Yes,No,No,No,Month-to-Month,No,Electronic check,0.038495,0.004463,No
4,CUST00005,Male,1.0,Yes,Yes,-0.369296,Yes,Unknown,Fiber optic,Yes,...,Yes,No,No,No,Two year,Yes,Electronic check,0.002956,0.000566,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,CUST69996,Male,1.0,Yes,Yes,0.443833,Yes,Yes,Fiber optic,Yes,...,No,Yes,No,No,Two year,No,Electronic check,0.038366,0.003339,No
69996,CUST69997,Male,0.0,Yes,Unknown,-0.494393,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-Month,Yes,Bank transfer (automatic),0.015299,0.000907,Yes
69997,CUST69998,Male,0.0,Unknown,No,-0.181651,Yes,No,DSL,Yes,...,No,No,Unknown,No,Month-to-Month,Yes,Mailed check,0.007720,0.000876,No
69998,CUST69999,Male,0.0,No,No,-0.119103,Yes,No,Fiber optic,No,...,No,No,Yes,No,Month-to-Month,Yes,Electronic check,0.019416,0.001460,No


In [5]:
import pickle


def _dump_pickle(obj, path):
    """Write `obj` to `path` in binary mode using pickle."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


# Output files for the two fitted scalers.
zscore_filename = 'tenure_zscore_scaler.pkl'
minmax_filename = 'charges_minmax_scaler.pkl'

# --- 1. Z-score scaler fitted on 'tenure' ---
_dump_pickle(zscore, zscore_filename)
print(f"Z-score scaler saved as {zscore_filename}")

# --- 2. Min-Max scaler fitted on 'MonthlyCharges' & 'TotalCharges' ---
_dump_pickle(minmax, minmax_filename)
print(f"Min-Max scaler saved as {minmax_filename}")

Z-score scaler saved as tenure_zscore_scaler.pkl
Min-Max scaler saved as charges_minmax_scaler.pkl


Encoding Categorical Variables

One-hot encoding is used because it suits the MLP classifier. Otherwise, if the categories were encoded as integers such as 1, 2, 3, the classifier might interpret those numbers as counts rather than as labels.

In [6]:
# Encoding
# One-hot encode the categorical features listed below.
cat_cols = [
    'Contract',
    'PaymentMethod',
    'gender',
    'Partner',
    'Dependents',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
]

# drop_first=True is intentionally not used: every level keeps its own column
# (chosen for the MLP classifier and to avoid information loss).
# get_dummies also removes the original cat_cols columns from the result.
df = pd.get_dummies(df, columns=cat_cols)


In [7]:
# Inspect column names, non-null counts and dtypes after one-hot encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 49 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   customerID                               70000 non-null  object 
 1   SeniorCitizen                            70000 non-null  float64
 2   tenure                                   70000 non-null  float64
 3   PhoneService                             70000 non-null  object 
 4   PaperlessBilling                         70000 non-null  object 
 5   MonthlyCharges                           70000 non-null  float64
 6   TotalCharges                             70000 non-null  float64
 7   Churn                                    70000 non-null  object 
 8   Contract_Month-to-Month                  70000 non-null  bool   
 9   Contract_One year                        70000 non-null  bool   
 10  Contract_Two year                        70000

In [8]:
# The columns below have only two categories each, so a single binary
# (label-encoded) column is enough for them; one-hot encoding is not needed.
# ['PhoneService','PaperlessBilling','Churn']

In [9]:
from sklearn.preprocessing import LabelEncoder

In [10]:
# Preview only: fit the encoder on Churn and show the integer codes it produces.
# Nothing is written back to df in this cell.
encorder = LabelEncoder()
churn_preview = encorder.fit_transform(df['Churn'])
churn_preview

array([1, 0, 1, ..., 0, 0, 1], shape=(70000,))

Churn label encoding:

Yes -> 1
No -> 0

In [11]:
# Overwrite Churn with its integer label codes (Yes -> 1, No -> 0).
df['Churn'] = encorder.fit_transform(df['Churn'])

In [12]:
# Check dtypes again: Churn should now be an integer column instead of object.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 49 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   customerID                               70000 non-null  object 
 1   SeniorCitizen                            70000 non-null  float64
 2   tenure                                   70000 non-null  float64
 3   PhoneService                             70000 non-null  object 
 4   PaperlessBilling                         70000 non-null  object 
 5   MonthlyCharges                           70000 non-null  float64
 6   TotalCharges                             70000 non-null  float64
 7   Churn                                    70000 non-null  int64  
 8   Contract_Month-to-Month                  70000 non-null  bool   
 9   Contract_One year                        70000 non-null  bool   
 10  Contract_Two year                        70000

In [13]:
mapping = {'Yes': 1, 'No': 0}


def _yes_no_to_bool(column):
    """Convert a Yes/No column to bool without guessing on unexpected values.

    Parameters
    ----------
    column : pandas.Series
        Column holding 'Yes'/'No' strings, or a column already converted to bool.

    Returns
    -------
    pandas.Series
        Boolean series: True where the value was 'Yes', False where it was 'No'.
        A column that is already bool is returned unchanged.

    Raises
    ------
    ValueError
        If any value is not a key of `mapping` (for example 'Unknown' or NaN).
    """
    # Re-running the cell must be a no-op. Mapping an already-bool column finds
    # no keys, so every value becomes NaN, and NaN casts to True.
    if column.dtype == bool:
        return column

    mapped = column.map(mapping)

    # Series.map leaves NaN for values missing from the dict; astype('bool')
    # would silently turn those into True, so fail loudly instead.
    unmapped = mapped.isna()
    if unmapped.any():
        unexpected = sorted(column[unmapped].astype(str).unique())
        raise ValueError(
            f"Column {column.name!r} has values outside {list(mapping)}: {unexpected}"
        )

    return mapped.astype('bool')


# Convert PhoneService
df['PhoneService'] = _yes_no_to_bool(df['PhoneService'])

# Convert PaperlessBilling
df['PaperlessBilling'] = _yes_no_to_bool(df['PaperlessBilling'])

In [14]:
# Display the DataFrame after all encoding steps.
df

Unnamed: 0,customerID,SeniorCitizen,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,Contract_Month-to-Month,Contract_One year,...,DeviceProtection_Yes,TechSupport_No,TechSupport_Unknown,TechSupport_Yes,StreamingTV_No,StreamingTV_Unknown,StreamingTV_Yes,StreamingMovies_No,StreamingMovies_Unknown,StreamingMovies_Yes
0,CUST00001,0.0,-1.244974,True,False,0.034155,0.000306,1,True,False,...,False,True,False,False,True,False,False,True,False,False
1,CUST00002,1.0,-1.307522,True,True,0.003476,0.000069,0,False,True,...,False,False,False,True,False,True,False,True,False,False
2,CUST00003,0.0,1.194413,True,False,0.016622,0.002662,1,True,False,...,False,False,True,False,False,False,True,False,False,True
3,CUST00004,0.0,1.069316,True,False,0.038495,0.004463,0,True,False,...,True,True,False,False,True,False,False,True,False,False
4,CUST00005,1.0,-0.369296,True,True,0.002956,0.000566,1,False,False,...,True,True,False,False,True,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,CUST69996,1.0,0.443833,True,False,0.038366,0.003339,0,False,False,...,False,False,False,True,True,False,False,True,False,False
69996,CUST69997,0.0,-0.494393,True,True,0.015299,0.000907,1,True,False,...,False,True,False,False,True,False,False,True,False,False
69997,CUST69998,0.0,-0.181651,True,True,0.007720,0.000876,0,True,False,...,False,True,False,False,False,True,False,True,False,False
69998,CUST69999,0.0,-0.119103,True,True,0.019416,0.001460,0,True,False,...,False,True,False,False,False,False,True,True,False,False


In [15]:
# Final dtype check: PhoneService and PaperlessBilling should now be bool.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 49 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   customerID                               70000 non-null  object 
 1   SeniorCitizen                            70000 non-null  float64
 2   tenure                                   70000 non-null  float64
 3   PhoneService                             70000 non-null  bool   
 4   PaperlessBilling                         70000 non-null  bool   
 5   MonthlyCharges                           70000 non-null  float64
 6   TotalCharges                             70000 non-null  float64
 7   Churn                                    70000 non-null  int64  
 8   Contract_Month-to-Month                  70000 non-null  bool   
 9   Contract_One year                        70000 non-null  bool   
 10  Contract_Two year                        70000

In [16]:

# Write the transformed dataset to disk without the index column.
transformed_csv = "03.transformed_telco_data.csv"
df.to_csv(transformed_csv, index=False)

# Report completion and the final (rows, columns) shape.
print(" Data Transformation & Discretization complete. Saved as 03.transformed_telco_data.csv")
print("Final dataset shape:", df.shape)

 Data Transformation & Discretization complete. Saved as 03.transformed_telco_data.csv
Final dataset shape: (70000, 49)
