### Using the Titanic dataset

 1. Use the function defined in acquire.py to load the Titanic data.
 2. Use the function defined in prepare.py to prepare the titanic data.
 3. Encode the categorical columns on train dataset. Create dummy variables of the categorical columns and concatenate them onto the dataframe. Remove the columns they are replacing. Repeat on validate and test.
 4. Create a function named preprocess_titanic that accepts the train, validate, and test titanic data, and returns the dataframes ready for modeling.

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import acquire
import prepare

In [2]:
# get titanic data

df=acquire.get_titanic_data()

In [3]:
df

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.2500,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.9250,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1000,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.0500,S,Third,,Southampton,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,0,2,male,27.0,0,0,13.0000,S,Second,,Southampton,1
887,887,1,1,female,19.0,0,0,30.0000,S,First,B,Southampton,1
888,888,0,3,female,,1,2,23.4500,S,Third,,Southampton,0
889,889,1,1,male,26.0,0,0,30.0000,C,First,C,Cherbourg,1


In [4]:
# prepare titanic data

df = prepare.clean_titanic(df)

In [5]:
df

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embark_town,alone
0,0,3,male,1,0,7.2500,Southampton,0
1,1,1,female,1,0,71.2833,Cherbourg,0
2,1,3,female,0,0,7.9250,Southampton,1
3,1,1,female,1,0,53.1000,Southampton,0
4,0,3,male,0,0,8.0500,Southampton,1
...,...,...,...,...,...,...,...,...
886,0,2,male,0,0,13.0000,Southampton,1
887,1,1,female,0,0,30.0000,Southampton,1
888,0,3,female,1,2,23.4500,Southampton,0
889,1,1,male,0,0,30.0000,Cherbourg,1


In [6]:
# split titanic data
train, validate, test = prepare.split_data(df, 'survived')

train: 534 (60.0% of 891)
validate: 178 (20.0% of 891)
test: 179 (20.0% of 891)


In [7]:
train

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embark_town,alone
776,0,3,male,0,0,7.7500,Queenstown,1
829,1,1,female,0,0,80.0000,Southampton,1
215,1,1,female,1,0,113.2750,Cherbourg,0
258,1,1,female,0,0,512.3292,Cherbourg,1
129,0,3,male,0,0,6.9750,Southampton,1
...,...,...,...,...,...,...,...,...
125,1,3,male,1,0,11.2417,Cherbourg,0
360,0,3,male,1,4,27.9000,Southampton,0
55,1,1,male,0,0,35.5000,Southampton,1
298,1,1,male,0,0,30.5000,Southampton,1


In [8]:
# Using drop_first leaves sex_male, embark_town_Queenstown, and embark_town_Southampton.

dummy_train = pd.get_dummies(train[['sex','embark_town']], dummy_na=False, drop_first=[True, True]).astype(int)
dummy_train

Unnamed: 0,sex_male,embark_town_Queenstown,embark_town_Southampton
776,1,1,0
829,0,0,1
215,0,0,0
258,0,0,0
129,1,0,1
...,...,...,...
125,1,0,0
360,1,0,1
55,1,0,1
298,1,0,1


In [9]:
# Concatenate the dummy_df dataframe above with the original df and verify.

train = pd.concat([train, dummy_train], axis=1)
train

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
776,0,3,male,0,0,7.7500,Queenstown,1,1,1,0
829,1,1,female,0,0,80.0000,Southampton,1,0,0,1
215,1,1,female,1,0,113.2750,Cherbourg,0,0,0,0
258,1,1,female,0,0,512.3292,Cherbourg,1,0,0,0
129,0,3,male,0,0,6.9750,Southampton,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...
125,1,3,male,1,0,11.2417,Cherbourg,0,1,0,0
360,0,3,male,1,4,27.9000,Southampton,0,1,0,1
55,1,1,male,0,0,35.5000,Southampton,1,1,0,1
298,1,1,male,0,0,30.5000,Southampton,1,1,0,1


In [10]:
# Drop string values that have been replaced with encoded values.

train = train.drop(columns=['sex', 'embark_town'])
train.head()

Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
776,0,3,0,0,7.75,1,1,1,0
829,1,1,0,0,80.0,1,0,0,1
215,1,1,1,0,113.275,0,0,0,0
258,1,1,0,0,512.3292,1,0,0,0
129,0,3,0,0,6.975,1,1,0,1


In [11]:
# Using drop_first leaves sex_male, embark_town_Queenstown, and embark_town_Southampton.

dummy_val = pd.get_dummies(validate[['sex','embark_town']], dummy_na=False, drop_first=[True, True]).astype(int)
dummy_test = pd.get_dummies(test[['sex','embark_town']], dummy_na=False, drop_first=[True, True]).astype(int)

# Concatenate the dummy_df dataframe above with the original df.

validate = pd.concat([validate, dummy_val], axis=1)
test = pd.concat([test, dummy_test], axis=1)

# Drop string values that have been replaced with encoded values.

validate = validate.drop(columns=['sex', 'embark_town'])
test = test.drop(columns=['sex', 'embark_town'])


In [12]:
validate

Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
541,0,3,4,2,31.2750,0,0,0,1
204,1,3,0,0,8.0500,1,1,0,1
108,0,3,0,0,7.8958,1,1,0,1
88,1,1,3,2,263.0000,0,0,0,1
677,1,3,0,0,9.8417,1,0,0,1
...,...,...,...,...,...,...,...,...,...
171,0,3,4,1,29.1250,0,1,1,0
369,1,1,0,0,69.3000,1,0,0,0
114,0,3,0,0,14.4583,1,0,0,0
855,1,3,0,1,9.3500,0,0,0,1


In [13]:
test

Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
763,1,1,1,2,120.0000,0,0,0,1
112,0,3,0,0,8.0500,1,1,0,1
230,1,1,1,0,83.4750,0,0,0,1
500,0,3,0,0,8.6625,1,1,0,1
237,1,2,0,2,26.2500,0,0,0,1
...,...,...,...,...,...,...,...,...,...
883,0,2,0,0,10.5000,1,1,0,1
149,0,2,0,0,13.0000,1,1,0,1
843,0,3,0,0,6.4375,1,1,0,0
581,1,1,1,1,110.8833,0,0,0,0


In [14]:
train

Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
776,0,3,0,0,7.7500,1,1,1,0
829,1,1,0,0,80.0000,1,0,0,1
215,1,1,1,0,113.2750,0,0,0,0
258,1,1,0,0,512.3292,1,0,0,0
129,0,3,0,0,6.9750,1,1,0,1
...,...,...,...,...,...,...,...,...,...
125,1,3,1,0,11.2417,0,1,0,0
360,0,3,1,4,27.9000,0,1,0,1
55,1,1,0,0,35.5000,1,1,0,1
298,1,1,0,0,30.5000,1,1,0,1


### Using the Telco dataset

 1. Use the function defined in acquire.py to load the Telco data.
 2. Use the function defined in prepare.py to prepare the Telco data.
 3. Encode the categorical columns on train.
     - a. Encode at least one column using .replace
     - b. Encode at least one column using .map
     - c. Encode the rest of the columns by creating dummy variables and concatenating them onto the dataframe.
 4. Repeat the same steps on validate and test.
 5. Create a function named prep_telco that accepts the train, validate, and test telco data, and returns the dataframes ready for modeling.

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import acquire
import prepare

In [2]:
# get telco data

df=acquire.get_telco_data()

In [3]:
df

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,2,1,2,0002-ORFBO,Female,0,Yes,Yes,9,Yes,...,Yes,Yes,No,Yes,65.60,593.3,No,One year,DSL,Mailed check
1,2,1,1,0003-MKNFE,Male,0,No,No,9,Yes,...,No,No,Yes,No,59.90,542.4,No,Month-to-month,DSL,Mailed check
2,1,2,1,0004-TLHLJ,Male,0,No,No,4,Yes,...,No,No,No,Yes,73.90,280.85,Yes,Month-to-month,Fiber optic,Electronic check
3,1,2,1,0011-IGKFF,Male,1,Yes,No,13,Yes,...,No,Yes,Yes,Yes,98.00,1237.85,Yes,Month-to-month,Fiber optic,Electronic check
4,2,2,1,0013-EXCHZ,Female,1,Yes,No,3,Yes,...,Yes,Yes,No,Yes,83.90,267.4,Yes,Month-to-month,Fiber optic,Mailed check
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,2,1,2,9987-LUTYD,Female,0,No,No,13,Yes,...,Yes,No,No,No,55.15,742.9,No,One year,DSL,Mailed check
7039,1,2,1,9992-RRAMN,Male,0,Yes,No,22,Yes,...,No,No,Yes,Yes,85.10,1873.7,Yes,Month-to-month,Fiber optic,Electronic check
7040,2,1,1,9992-UJOEL,Male,0,No,No,2,Yes,...,No,No,No,Yes,50.30,92.75,No,Month-to-month,DSL,Mailed check
7041,2,1,3,9993-LHIEB,Male,0,Yes,Yes,67,Yes,...,Yes,No,Yes,No,67.85,4627.65,No,Two year,DSL,Mailed check


In [4]:
# prepare telco data

df = prepare.prep_telco(df)

In [5]:
df

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,Female,No,Yes,Yes,9,Yes,No,No,Yes,No,Yes,Yes,No,Yes,65.60,593.30,No,One year,DSL,Mailed check
1,Male,No,No,No,9,Yes,Yes,No,No,No,No,No,Yes,No,59.90,542.40,No,Month-to-month,DSL,Mailed check
2,Male,No,No,No,4,Yes,No,No,No,Yes,No,No,No,Yes,73.90,280.85,Yes,Month-to-month,Fiber optic,Electronic check
3,Male,Yes,Yes,No,13,Yes,No,No,Yes,Yes,No,Yes,Yes,Yes,98.00,1237.85,Yes,Month-to-month,Fiber optic,Electronic check
4,Female,Yes,Yes,No,3,Yes,No,No,No,No,Yes,Yes,No,Yes,83.90,267.40,Yes,Month-to-month,Fiber optic,Mailed check
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Female,No,No,No,13,Yes,No,Yes,No,No,Yes,No,No,No,55.15,742.90,No,One year,DSL,Mailed check
7039,Male,No,Yes,No,22,Yes,Yes,No,No,No,No,No,Yes,Yes,85.10,1873.70,Yes,Month-to-month,Fiber optic,Electronic check
7040,Male,No,No,No,2,Yes,No,No,Yes,No,No,No,No,Yes,50.30,92.75,No,Month-to-month,DSL,Mailed check
7041,Male,No,Yes,Yes,67,Yes,No,Yes,No,Yes,Yes,No,Yes,No,67.85,4627.65,No,Two year,DSL,Mailed check


In [6]:
# split telco data
train, validate, test = prepare.split_data(df, 'churn')

train: 4225 (60.0% of 7043)
validate: 1409 (20.0% of 7043)
test: 1409 (20.0% of 7043)


In [7]:
train

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
2865,Female,Yes,Yes,No,38,Yes,Yes,Yes,Yes,No,No,No,Yes,No,95.00,3591.25,No,One year,Fiber optic,Credit card (automatic)
4107,Female,Yes,No,No,2,Yes,No,No,No,No,No,No,No,Yes,70.35,139.05,Yes,Month-to-month,Fiber optic,Electronic check
3453,Male,Yes,Yes,No,11,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,No,19.95,214.75,Yes,Month-to-month,No internet service,Mailed check
923,Male,No,No,No,10,Yes,Yes,No,No,No,No,Yes,No,Yes,86.05,834.10,Yes,Month-to-month,Fiber optic,Bank transfer (automatic)
2370,Male,No,No,No,43,Yes,No,No,Yes,No,No,No,Yes,Yes,60.00,2548.55,No,Two year,DSL,Electronic check
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1286,Female,No,No,No,1,Yes,No,No,No,No,No,No,No,No,70.50,70.50,Yes,Month-to-month,Fiber optic,Electronic check
4999,Male,Yes,No,No,47,Yes,Yes,No,Yes,No,Yes,No,No,No,59.60,2754.00,No,Two year,DSL,Bank transfer (automatic)
6892,Male,No,Yes,No,72,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,116.95,8594.40,No,Two year,Fiber optic,Credit card (automatic)
3718,Male,No,No,No,8,Yes,Yes,No,No,No,No,No,Yes,Yes,86.55,649.65,Yes,Month-to-month,Fiber optic,Electronic check


In [8]:
# a. Encode at least one column using .replace

# Use .replace() to encode 'Yes' and 'No' to 1 and 0
train['senior_citizen'] = train['senior_citizen'].replace({'Yes': 1, 'No': 0})

# Display the updated DataFrame
train

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
2865,Female,1,Yes,No,38,Yes,Yes,Yes,Yes,No,No,No,Yes,No,95.00,3591.25,No,One year,Fiber optic,Credit card (automatic)
4107,Female,1,No,No,2,Yes,No,No,No,No,No,No,No,Yes,70.35,139.05,Yes,Month-to-month,Fiber optic,Electronic check
3453,Male,1,Yes,No,11,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,No,19.95,214.75,Yes,Month-to-month,No internet service,Mailed check
923,Male,0,No,No,10,Yes,Yes,No,No,No,No,Yes,No,Yes,86.05,834.10,Yes,Month-to-month,Fiber optic,Bank transfer (automatic)
2370,Male,0,No,No,43,Yes,No,No,Yes,No,No,No,Yes,Yes,60.00,2548.55,No,Two year,DSL,Electronic check
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1286,Female,0,No,No,1,Yes,No,No,No,No,No,No,No,No,70.50,70.50,Yes,Month-to-month,Fiber optic,Electronic check
4999,Male,1,No,No,47,Yes,Yes,No,Yes,No,Yes,No,No,No,59.60,2754.00,No,Two year,DSL,Bank transfer (automatic)
6892,Male,0,Yes,No,72,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,116.95,8594.40,No,Two year,Fiber optic,Credit card (automatic)
3718,Male,0,No,No,8,Yes,Yes,No,No,No,No,No,Yes,Yes,86.55,649.65,Yes,Month-to-month,Fiber optic,Electronic check


In [9]:
# b. Encode at least one column using .map
train.loc[:,'has_partner'] = train.partner.map({'No': 0, 'Yes': 1})

In [None]:
train.drop(columns=['partner'], inplace=True)


In [14]:
train

Unnamed: 0,gender,senior_citizen,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type,has_partner
2865,Female,1,No,38,Yes,Yes,Yes,Yes,No,No,No,Yes,No,95.00,3591.25,No,One year,Fiber optic,Credit card (automatic),1
4107,Female,1,No,2,Yes,No,No,No,No,No,No,No,Yes,70.35,139.05,Yes,Month-to-month,Fiber optic,Electronic check,0
3453,Male,1,No,11,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,No,19.95,214.75,Yes,Month-to-month,No internet service,Mailed check,1
923,Male,0,No,10,Yes,Yes,No,No,No,No,Yes,No,Yes,86.05,834.10,Yes,Month-to-month,Fiber optic,Bank transfer (automatic),0
2370,Male,0,No,43,Yes,No,No,Yes,No,No,No,Yes,Yes,60.00,2548.55,No,Two year,DSL,Electronic check,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1286,Female,0,No,1,Yes,No,No,No,No,No,No,No,No,70.50,70.50,Yes,Month-to-month,Fiber optic,Electronic check,0
4999,Male,1,No,47,Yes,Yes,No,Yes,No,Yes,No,No,No,59.60,2754.00,No,Two year,DSL,Bank transfer (automatic),0
6892,Male,0,No,72,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,116.95,8594.40,No,Two year,Fiber optic,Credit card (automatic),1
3718,Male,0,No,8,Yes,Yes,No,No,No,No,No,Yes,Yes,86.55,649.65,Yes,Month-to-month,Fiber optic,Electronic check,0


In [13]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4225 entries, 2865 to 5354
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   gender                 4225 non-null   object 
 1   senior_citizen         4225 non-null   int64  
 2   dependents             4225 non-null   object 
 3   tenure                 4225 non-null   int64  
 4   phone_service          4225 non-null   object 
 5   multiple_lines         4225 non-null   object 
 6   online_security        4225 non-null   object 
 7   online_backup          4225 non-null   object 
 8   device_protection      4225 non-null   object 
 9   tech_support           4225 non-null   object 
 10  streaming_tv           4225 non-null   object 
 11  streaming_movies       4225 non-null   object 
 12  paperless_billing      4225 non-null   object 
 13  monthly_charges        4225 non-null   float64
 14  total_charges          4225 non-null   float64
 15  churn 

In [15]:
def preprocess_telco(train_df, val_df, test_df):
    '''
    preprocess_telco will take in three pandas dataframes
    of our telco data, expected as cleaned versions of this 
    telco data set (see documentation on acquire.py and prepare.py)
    
    output:
    encoded, ML-ready versions of our clean data, with 
    columns sex and embark_town encoded in the one-hot fashion
    return: (pd.DataFrame, pd.DataFrame, pd.DataFrame)
    '''
    # with a looping structure:
    # go through the three dfs, set the index to customer id
    for df in [train_df, val_df, test_df]:
        df = df.set_index('customer_id')
        df['total_charges'] = df['total_charges'].astype(float)
    # initialize an empty list to see what needs to be encoded:
    encoding_vars = []
    # loop through the columns to fill encoded_vars with appropriate
    # datatype field names
    for col in train_df.columns:
        if train_df[col].dtype == 'O':
            encoding_vars.append(col)
    encoding_vars.remove('customer_id')
    # initialize an empty list to hold our encoded dataframes:
    encoded_dfs = []
    for df in [train_df, val_df, test_df]:
        df_encoded_cats = pd.get_dummies(
            df[encoding_vars],
              drop_first=True).astype(int)
        encoded_dfs.append(pd.concat(
            [df,
            df_encoded_cats],
            axis=1).drop(columns=encoding_vars))
    return encoded_dfs

In [18]:
preped_telco_train, preped_telco_val, preped_telco_test = preprocess_telco(train, validate, test)


KeyError: "None of ['customer_id'] are in the columns"

In [None]:
# c. Encode the rest of the columns by creating dummy variables and concatenating them onto the dataframe.


dummy_train = train[['dependents']].replace({'Yes': 1, 'No': 0})
dummy_train

In [29]:
dummy_train = pd.get_dummies(train[['gender', 'dependents','phone_service']], dummy_na=False, drop_first=[True, True])
# .astype(int)
dummy_train.dtypes

dependents_Yes       bool
phone_service_Yes    bool
dtype: object