# Churn - 01 - Import

## Setup

In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import yaml

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
# pd.set_option('display.max_rows', None)

sns.set_style("darkgrid")

from IPython.display import display, Markdown
from pprint import pprint 

DEBUG = True
SEED = 666

In [16]:
DATASET = "Churn"

import os, sys
COLAB = 'google.colab' in sys.modules
ROOT = "./"

if COLAB:
  from google.colab import drive
  if not os.path.isdir("/content/gdrive"):
    drive.mount("/content/gdrive")
    d = "/content/gdrive/MyDrive/datasets"
    if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d)
  ROOT = f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ','_')}/"
  if not os.path.isdir(ROOT): os.makedirs(ROOT)


def makedirs(d):
  if COLAB:
    if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d)
  else:
    if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d, mode=0o777, exist_ok=True)

for d in ['orig','data','output']: makedirs(d)

## Dataset

In [17]:
BASE_URL = "https://SETU-DataMining2.github.io/live/resources/churn"

for filename in ['data.csv','datasheet.yaml']:
  source = f"{BASE_URL}/{filename}"
  target = f"{ROOT}/orig/{filename}"

  if not os.path.isfile(target):
    print (f"Downloading remote file {filename}", sep="")
    import urllib.request
    urllib.request.urlretrieve(source, target)
  else:
    print(f"Using local copy of {filename}")

Using local copy of data.csv
Using local copy of datasheet.yaml


## Import

Standardise columns names.
Check for missing values.
I have used the datasheet to convert some of the labels to text you have the rest.
Convert variables to more appropriate datatype, to simplify EDA.
Save cleaned files to data subfolder (using pickle format).

### Comments

* Fix missing values in `TotalCharges` and replacing with NaN
* `TotalCharges` is object, should be float
* `SeniorSitizen` is int, should be bool
* `customerID` is a unique identifier, should be index
* Convert all object columns to category

In [18]:
df = pd.read_csv(ROOT+"orig/data.csv")
print(df.shape)
df.head(5)

(7043, 21)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,1,Female,0,Yes,No,1,No,No,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,2,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,4,Male,0,No,No,45,No,No,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,5,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [19]:
for col in df.columns:
  df.rename(columns={col:col.capitalize()}, inplace=True)
df.head(1)

Unnamed: 0,Customerid,Gender,Seniorcitizen,Partner,Dependents,Tenure,Phoneservice,Multiplelines,Internetservice,Onlinesecurity,Onlinebackup,Deviceprotection,Techsupport,Streamingtv,Streamingmovies,Contract,Paperlessbilling,Paymentmethod,Monthlycharges,Totalcharges,Churn
0,1,Female,0,Yes,No,1,No,No,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No


## Correct data types

### Functions

In [20]:
def categorize(data, col_to_categorize, cols_order=None):
  display(Markdown(f"### {col_to_categorize}"))
  if data[col_to_categorize].dtype == "O":
    print(f"Converting {col_to_categorize} to category from {data[col_to_categorize].dtype}")
    if cols_order is None:
      data[col_to_categorize] = pd.Categorical(data[col_to_categorize])
    else:
      data[col_to_categorize] = pd.Categorical(data[col_to_categorize], categories=cols_order, ordered=True)
  print(f"{col_to_categorize} is {data[col_to_categorize].dtype}")
  pprint(data[col_to_categorize].value_counts(dropna=False))
  return data

### Special cases

In [21]:
df.Seniorcitizen = df.Seniorcitizen.map({0:"Yes", 1:"No", "No":"No", "Yes":"Yes"})

In [22]:
df.loc[df.Totalcharges==" ", "Totalcharges"] = np.nan
df.Totalcharges = df.Totalcharges.astype(float)

In [23]:
df = categorize(df, "Contract", ["Month-to-month", "One year", "Two year"])

### Contract

Converting Contract to category from object
Contract is category
Month-to-month    3875
Two year          1695
One year          1473
Name: Contract, dtype: int64


### All columns

In [24]:
for col in df.columns:
  if df[col].dtype == "O":
    df = categorize(df, col)

### Gender

Converting Gender to category from object
Gender is category
Male      3555
Female    3488
Name: Gender, dtype: int64


### Seniorcitizen

Converting Seniorcitizen to category from object
Seniorcitizen is category
Yes    5901
No     1142
Name: Seniorcitizen, dtype: int64


### Partner

Converting Partner to category from object
Partner is category
No     3641
Yes    3402
Name: Partner, dtype: int64


### Dependents

Converting Dependents to category from object
Dependents is category
No     4933
Yes    2110
Name: Dependents, dtype: int64


### Phoneservice

Converting Phoneservice to category from object
Phoneservice is category
Yes    6361
No      682
Name: Phoneservice, dtype: int64


### Multiplelines

Converting Multiplelines to category from object
Multiplelines is category
No     4072
Yes    2971
Name: Multiplelines, dtype: int64


### Internetservice

Converting Internetservice to category from object
Internetservice is category
Fiber optic    3096
DSL            2421
No             1526
Name: Internetservice, dtype: int64


### Onlinesecurity

Converting Onlinesecurity to category from object
Onlinesecurity is category
No     5024
Yes    2019
Name: Onlinesecurity, dtype: int64


### Onlinebackup

Converting Onlinebackup to category from object
Onlinebackup is category
No     4614
Yes    2429
Name: Onlinebackup, dtype: int64


### Deviceprotection

Converting Deviceprotection to category from object
Deviceprotection is category
No     4621
Yes    2422
Name: Deviceprotection, dtype: int64


### Techsupport

Converting Techsupport to category from object
Techsupport is category
No     4999
Yes    2044
Name: Techsupport, dtype: int64


### Streamingtv

Converting Streamingtv to category from object
Streamingtv is category
No     4336
Yes    2707
Name: Streamingtv, dtype: int64


### Streamingmovies

Converting Streamingmovies to category from object
Streamingmovies is category
No     4311
Yes    2732
Name: Streamingmovies, dtype: int64


### Paperlessbilling

Converting Paperlessbilling to category from object
Paperlessbilling is category
Yes    4171
No     2872
Name: Paperlessbilling, dtype: int64


### Paymentmethod

Converting Paymentmethod to category from object
Paymentmethod is category
Electronic check             2365
Mailed check                 1612
Bank transfer (automatic)    1544
Credit card (automatic)      1522
Name: Paymentmethod, dtype: int64


### Churn

Converting Churn to category from object
Churn is category
No     5174
Yes    1869
Name: Churn, dtype: int64


In [25]:
if "Customerid" in df.columns:
  df.drop(columns=["Customerid"], inplace=True)

## Output

In [26]:
df.to_pickle(ROOT+"data/data.pkl")
