In [None]:
import sys
# Put the parent directory on the import path so the `src.*` imports below resolve.
# NOTE(review): assumes the notebook is run from a folder one level below the
# project root — confirm, since ".." is resolved against the current working directory.
sys.path.append("..")
from src.data_exploratory.load_data import load_data
from src.data_exploratory.clean_data import data_info, replace_empty_string, drop_unused_columns, duplicated_rows, drop_duplicated_rows, check_missing_values, change_to_numeric, check_number_of_unique_values
from src.data_exploratory.split_data import split_data
from sklearn.metrics import roc_auc_score, classification_report
from src.data_exploratory.split_data import split_features_target
from src.pipeline.train import run_pipeline

In [2]:
# Location of the raw Telco churn workbook, relative to the notebook's working directory.
RAW_DATA_PATH = "../data/raw/Telco_customer_churn.xlsx"

# Read the raw dataset through the project loader.
df = load_data(RAW_DATA_PATH)

# Preview the first rows as a quick sanity check on the load.
df.head()

Unnamed: 0,CustomerID,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude,Gender,...,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Score,CLTV,Churn Reason
0,3668-QPYBK,1,United States,California,Los Angeles,90003,"33.964131, -118.272783",33.964131,-118.272783,Male,...,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1,86,3239,Competitor made better offer
1,9237-HQITU,1,United States,California,Los Angeles,90005,"34.059281, -118.30742",34.059281,-118.30742,Female,...,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1,67,2701,Moved
2,9305-CDSKC,1,United States,California,Los Angeles,90006,"34.048013, -118.293953",34.048013,-118.293953,Female,...,Month-to-month,Yes,Electronic check,99.65,820.5,Yes,1,86,5372,Moved
3,7892-POOKP,1,United States,California,Los Angeles,90010,"34.062125, -118.315709",34.062125,-118.315709,Female,...,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes,1,84,5003,Moved
4,0280-XJGEX,1,United States,California,Los Angeles,90015,"34.039224, -118.266293",34.039224,-118.266293,Male,...,Month-to-month,Yes,Bank transfer (automatic),103.7,5036.3,Yes,1,89,5340,Competitor had better devices


In [3]:
# Columns removed before modelling, kept in the order they were originally listed.
# 'Churn Label', 'Churn Score', 'Churn Reason' and 'CLTV' sit next to the target
# ('Churn Value') — presumably dropped to avoid leaking it; verify with the author.
columns_to_drop = [
    'Country', 'Zip Code', 'State', 'City', 'CustomerID',
    'Longitude', 'Latitude', 'Lat Long', 'Count',
    'Churn Label', 'Churn Score', 'Churn Reason', 'CLTV',
]
df = drop_unused_columns(df, columns_to_drop)

# Preview the frame again to confirm which columns remain.
df.head()


Unnamed: 0,Gender,Senior Citizen,Partner,Dependents,Tenure Months,Phone Service,Multiple Lines,Internet Service,Online Security,Online Backup,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Value
0,Male,No,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
1,Female,No,No,Yes,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1
2,Female,No,No,Yes,8,Yes,Yes,Fiber optic,No,No,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,1
3,Female,No,Yes,Yes,28,Yes,Yes,Fiber optic,No,No,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,1
4,Male,No,No,Yes,49,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,Month-to-month,Yes,Bank transfer (automatic),103.7,5036.3,1


In [4]:
# replace_empty_string hands back a new DataFrame (the warning emitted by this
# cell shows its body ending in `return df.replace(r'^\s*$', np.nan, regex=True)`)
# instead of modifying `df` in place. The result therefore has to be assigned
# back; calling it bare silently discards the replacement.
df = replace_empty_string(df)

# Summarise columns, non-null counts and dtypes after the replacement.
data_info(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             7043 non-null   object 
 1   Senior Citizen     7043 non-null   object 
 2   Partner            7043 non-null   object 
 3   Dependents         7043 non-null   object 
 4   Tenure Months      7043 non-null   int64  
 5   Phone Service      7043 non-null   object 
 6   Multiple Lines     7043 non-null   object 
 7   Internet Service   7043 non-null   object 
 8   Online Security    7043 non-null   object 
 9   Online Backup      7043 non-null   object 
 10  Device Protection  7043 non-null   object 
 11  Tech Support       7043 non-null   object 
 12  Streaming TV       7043 non-null   object 
 13  Streaming Movies   7043 non-null   object 
 14  Contract           7043 non-null   object 
 15  Paperless Billing  7043 non-null   object 
 16  Payment Method     7043 

  return df.replace(r'^\s*$', np.nan, regex=True)


In [5]:
# The helper's return value is the cell output: the number of duplicated rows in `df`.
duplicated_rows(df) # Check for duplicated rows

np.int64(22)

In [6]:
# Replace `df` with the de-duplicated frame returned by the helper.
# NOTE(review): the index does not appear to be reset afterwards (a later summary
# shows 7021 entries labelled 0 to 7042) — confirm that is intended.
df = drop_duplicated_rows(df) # Drop duplicated rows

# Re-count duplicates to confirm the drop worked; the expected result is 0.
duplicated_rows(df) # Check again for duplicated rows

np.int64(0)

In [7]:
# Convert 'Total Charges' to a numeric dtype. The return value is not assigned, so
# this relies on the helper updating `df` itself — NOTE(review): confirm it mutates
# in place. The printed summary reports 7010 non-null of 7021 entries, i.e. 11
# values are missing after conversion (presumably entries that could not be parsed).
change_to_numeric(df, 'Total Charges') # Change 'Total Charges' column to numeric

<class 'pandas.core.series.Series'>
Index: 7021 entries, 0 to 7042
Series name: Total Charges
Non-Null Count  Dtype  
--------------  -----  
7010 non-null   float64
dtypes: float64(1)
memory usage: 109.7 KB


In [8]:
# Report missing-value counts for `df`; the recorded output lists only 'Total Charges' (11).
# NOTE(review): no cell in this notebook imputes or drops those rows before the
# split — presumably run_pipeline handles missing values; confirm.
check_missing_values(df) # Check for missing values in the dataframe

Total Charges    11
dtype: int64

In [9]:
# Partition the cleaned frame into train / validation / test subsets.
# NOTE(review): no seed is passed here, so reproducibility of the split depends
# on split_data's own defaults — confirm.
train_df, val_df, test_df = split_data(df)

# Print the (rows, columns) shape of each subset, one per line.
for subset_label, subset in (
    ('train set', train_df),
    ('validation set', val_df),
    ('test set', test_df),
):
    print(subset_label, subset.shape)



train set (4212, 20)
validation set (1404, 20)
test set (1405, 20)


In [10]:
# Separate each subset into a feature part (X_*) and a target part (y_*).
# presumably the target is the 'Churn Value' column — verify in split_features_target.
# test_df is not split here; it is left untouched by this cell.
X_train, y_train = split_features_target(train_df) # Split features and target for training set
X_val, y_val = split_features_target(val_df) # Split features and target for validation set

In [12]:
# Fit the project's training pipeline on the training split only; the returned
# object is used below via .score() and .predict_proba().
pipeline = run_pipeline(X_train, y_train) # Run the pipeline to train the model

In [13]:
# Score the fitted pipeline on the held-out validation split.
# NOTE(review): if `pipeline` is a scikit-learn classifier pipeline this is mean
# accuracy at the model's default decision rule — confirm against run_pipeline.
pipeline.score(X_val, y_val) # Evaluate the model on the validation set

0.8148148148148148

In [None]:
# Decision threshold applied to the predicted positive-class probability.
# Named here instead of being buried as a magic number in the expression below.
CHURN_THRESHOLD = 0.3

# Predicted probability of the positive class (column 1 of predict_proba) for
# every validation row.
pred_proba = pipeline.predict_proba(X_val)[:, 1]

# Turn the probabilities into hard 0/1 labels: 1 when the probability reaches
# the threshold, 0 otherwise. (The original comment described this line as
# "getting probabilities", which it does not do.)
val_pred_labels = (pred_proba >= CHURN_THRESHOLD).astype(int)
# Backward-compatible alias: the original cell stored the labels under this
# (misleading) name, so keep it in case other cells still reference it.
pipeline_pred_proba = val_pred_labels

# AUC-ROC is computed from the raw probabilities, so it does not depend on the threshold.
print('AUC-ROC:', roc_auc_score(y_val, pred_proba))
# The classification report is computed from the thresholded labels, so its
# precision / recall / accuracy reflect CHURN_THRESHOLD.
print("CLASSIFICATION REPORTS:\n", classification_report(y_val, val_pred_labels))


AUC-ROC: 0.8577885981375181
CLASSIFICATION REPORTS:
               precision    recall  f1-score   support

           0       0.92      0.76      0.83      1061
           1       0.52      0.79      0.63       343

    accuracy                           0.77      1404
   macro avg       0.72      0.78      0.73      1404
weighted avg       0.82      0.77      0.78      1404

