In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # For creating plots
import matplotlib.ticker as mtick # For specifying the axes tick format 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib
import os
print(os.listdir("../"))


['Data_Acquisition_and_Understanding', '.DS_Store', 'Modeling', 'Readme.md', 'Deployment']


In [3]:
df = pd.read_csv('../../Sample_Data/Raw/WA_Fn-UseC_-Telco-Customer-Churn.csv')


In [4]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [5]:
# Checking the data types of all the columns
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [6]:
# Drop any columns not needed for prediction
df2 = df.drop(['customerID'], axis=1)

In [18]:
# Drop missing values if any
df2 = df2.dropna()

In [19]:
# Converting Total Charges to a numerical data type.
df2.TotalCharges = pd.to_numeric(df2.TotalCharges, errors='coerce')

In [20]:
#Convertin the churn results into a binary numeric variable
df2['Churn'].replace(to_replace='Yes', value=1, inplace=True)
df2['Churn'].replace(to_replace='No',  value=0, inplace=True)

In [21]:
df2.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1


In [22]:
# Convert categorical columns to numerical using one-hot encoding
df3 = pd.get_dummies(df2, drop_first=True)
df3.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,29.85,29.85,0,0,1,0,0,1,...,0,0,0,0,0,0,1,0,1,0
1,0,34,56.95,1889.5,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,1
2,0,2,53.85,108.15,1,1,0,0,1,0,...,0,0,0,0,0,0,1,0,0,1
3,0,45,42.3,1840.75,0,1,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
4,0,2,70.7,151.65,1,0,0,0,1,0,...,0,0,0,0,0,0,1,0,1,0


In [23]:
# Split the data into features and target variable
X = df3.drop('Churn', axis=1)
Y = df3['Churn']

In [24]:
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=100)

In [25]:
# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train a logistic regression model
model = LogisticRegression()
model.fit(X_train, Y_train)

LogisticRegression()

In [26]:
# Evaluate the model
Y_pred = model.predict(X_test)
print(f'Accuracy: {accuracy_score(Y_test, Y_pred)}')
print(classification_report(Y_test, Y_pred))

Accuracy: 0.7882018479033405
              precision    recall  f1-score   support

           0       0.84      0.87      0.86      1013
           1       0.64      0.57      0.60       394

    accuracy                           0.79      1407
   macro avg       0.74      0.72      0.73      1407
weighted avg       0.78      0.79      0.78      1407



In [27]:
# Save the model and scaler to disk
joblib.dump(model, 'churn_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

In [29]:
print(df.describe())

       SeniorCitizen       tenure  MonthlyCharges
count    7043.000000  7043.000000     7043.000000
mean        0.162147    32.371149       64.761692
std         0.368612    24.559481       30.090047
min         0.000000     0.000000       18.250000
25%         0.000000     9.000000       35.500000
50%         0.000000    29.000000       70.350000
75%         0.000000    55.000000       89.850000
max         1.000000    72.000000      118.750000


In [31]:
print(df3.columns.tolist())

['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges', 'Churn', 'gender_Male', 'Partner_Yes', 'Dependents_Yes', 'PhoneService_Yes', 'MultipleLines_No phone service', 'MultipleLines_Yes', 'InternetService_Fiber optic', 'InternetService_No', 'OnlineSecurity_No internet service', 'OnlineSecurity_Yes', 'OnlineBackup_No internet service', 'OnlineBackup_Yes', 'DeviceProtection_No internet service', 'DeviceProtection_Yes', 'TechSupport_No internet service', 'TechSupport_Yes', 'StreamingTV_No internet service', 'StreamingTV_Yes', 'StreamingMovies_No internet service', 'StreamingMovies_Yes', 'Contract_One year', 'Contract_Two year', 'PaperlessBilling_Yes', 'PaymentMethod_Credit card (automatic)', 'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check']


In [32]:
csv_filename = 'processed_dataset.csv'
df3.to_csv(csv_filename, index=False)

print(f"DataFrame saved to '{csv_filename}'")

DataFrame saved to 'processed_dataset.csv'


In [33]:
print(df.columns.tolist())

['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']


In [34]:
data = np.array([data["gender"], data["SeniorcITIZEN"], data["Partner"], data["Dependents"], \
                data["tenure"], data["PhoneService"], data["MultipleLines"], data["InternetService"], \
                data[""], data["PhoneService"]])
print()

NameError: name 'data' is not defined

In [38]:
import os

def print_directory_structure(startpath, indent_level=0):
    for root, dirs, files in os.walk(startpath):
        level = root.replace(startpath, '').count(os.sep)
        indent = '-' * 4 * (level + indent_level)
        print('{}{}/'.format(indent, os.path.basename(root)))
        subindent = ' ' * 4 * (level + 1 + indent_level)
        for f in files:
            print('{}{}'.format(subindent, f))

if __name__ == "__main__":
    base_path = '/Users/hanhan/Desktop/CMU/Software_Design_for_Data_Scientists/GitHub/cmu-95829-Azure-TDSP'
    print_directory_structure(base_path)

cmu-95829-Azure-TDSP/
    .flaskenv
    .DS_Store
    LICENSE-CODE.TXT
    __init__.py
    README.md
    NOTICE.TXT
    requirement.txt
    LICENSE.TXT
    main.py
----Code/
        .DS_Store
        Readme.md
--------Data_Acquisition_and_Understanding/
            scaler.pkl
            EDA.Rmd
            .DS_Store
            dataPrep.py
            DataReport1.txt
            Readme.md
            EDA-SimpleModel.ipynb
            churn_model.pkl
            datapipeline.json
------------.ipynb_checkpoints/
                EDA-SimpleModel-checkpoint.ipynb
--------Modeling/
            Readme.md
            model.R
--------Deployment/
            operationalization.py
            Readme.md
----__pycache__/
        main.cpython-311.pyc
----Docs/
        .DS_Store
        README.md
--------Project/
            System Architecture.docx
            Exit Report.md
            README.md
            Charter.md
--------Data_Dictionaries/
            data-dictionary-from-sql-table.PNG
      

                                    utils.cpython-311.pyc
                                    metadata.cpython-311.pyc
                                    _tokenizer.cpython-311.pyc
                                    _manylinux.cpython-311.pyc
                                    _parser.cpython-311.pyc
                                    _elffile.cpython-311.pyc
                                    version.cpython-311.pyc
                                    specifiers.cpython-311.pyc
                                    __init__.cpython-311.pyc
                                    _structures.cpython-311.pyc
----------------------------jaraco/
                                functools.py
                                __init__.py
                                context.py
--------------------------------__pycache__/
                                    functools.cpython-311.pyc
                                    context.cpython-311.pyc
                                    __init__.cpytho

                8965ba78f223747d79638f36ff094c50d410e3
                7a4b286174fdf26f3251631a2066eda2fa5bea
                cf2a7b0ee2898b72714b756e4b27fbbad4beab
------------a8/
                8b39903beba444bab6aae39f7459c1db3f2131
                00496cbcb7113ded50e997c64ee8e8eabdd9d3
                b301cb5bcfcc64e142f089e44221f077076f5d
                9428a855949d5e2b60245cc28eb2fcc0371218
                727ed8592533a009b6202be92f438d4152e793
                dff5a93b1e446f9e1de5c6102e6699348a8c53
                4721ccd68c33d98f54f5bfbfc2150dccf377a6
                52a7738c70c00a76093b560cfc03cf6088613d
                a3b09774e3b423d3345c742eadac7c1bbeb86d
------------de/
                80122400c8b6bced72768e9531599679ddc8a0
                a9201ca36ef426d7a821c38de485fd79533192
                6a0153b777f255a754c1ca9f8e4dc55cd3934b
                9a09a4ed3b078b37e7490a6686f660ae935aca
                240189f070e3586a26a609a3dd14edf8e1f672
                b4937f74f9a1ccc5f