In [101]:
# Import libraries
import pandas as pd
import numpy as np
import acquire
import matplotlib.pyplot as plt
import env

# Import functions
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

### Using the Iris Data:

1. Use the function defined in `acquire.py` to load the iris data.

In [105]:
iris = acquire.get_iris_data()

Reading from file...


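2. Clean up the column names - replace periods with underscores and convert to lowercase.

   *(No code cell is needed for this step in the notebook: the column names loaded here are already lowercase and use underscores, as the `iris.head()` output below shows.)*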

3. Drop the `species_id` and `measurement_id` columns.

In [109]:
iris = iris.drop(columns=['species_id','measurement_id'])

4. Rename the `species_name` column to just `species`.

In [112]:
iris = iris.rename(columns={'species_name':'species'})

5. Create a function named `prep_iris` that accepts the untransformed iris data, and returns the data with the transformations above applied.

In [115]:
def prep_iris(iris):
    """
    Clean the raw iris data.

    Applies the transformations from the steps above:
    lowercases the column names and replaces periods with underscores,
    drops the species_id and measurement_id columns, and renames
    species_name to species.

    Parameters
    ----------
    iris : pandas.DataFrame
        Untransformed iris data.

    Returns
    -------
    pandas.DataFrame
        A new dataframe with the transformations applied. The dataframe
        passed in is left unchanged.
    """
    # Normalize names first so the drop/rename below match regardless of the
    # original casing or separator. Non-string labels are left untouched.
    iris = iris.rename(
        columns=lambda name: name.lower().replace('.', '_') if isinstance(name, str) else name
    )

    # errors='ignore' makes the function safe to call on data that has
    # already been cleaned (the ID columns are simply not there any more).
    iris = iris.drop(columns=['species_id', 'measurement_id'], errors='ignore')
    iris = iris.rename(columns={'species_name': 'species'})

    return iris

In [117]:
iris.head()

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width
0,setosa,5.1,3.5,1.4,0.2
1,setosa,4.9,3.0,1.4,0.2
2,setosa,4.7,3.2,1.3,0.2
3,setosa,4.6,3.1,1.5,0.2
4,setosa,5.0,3.6,1.4,0.2


### Using the Titanic dataset

1. Use the function defined in `acquire.py` to load the Titanic data.

In [120]:
titanic = acquire.get_titanic_data()

Reading from file...


In [121]:
env.df_info(titanic)

Unnamed: 0,nunique,dtypes,isnull,sample
passenger_id,891,int64,0,749
survived,2,int64,0,0
pclass,3,int64,0,3
sex,2,object,0,male
age,88,float64,177,31.0
sibsp,7,int64,0,0
parch,7,int64,0,0
fare,248,float64,0,7.75
embarked,3,object,2,Q
class,3,object,0,Third


2. Drop any unnecessary, unhelpful, or duplicated columns.

In [123]:
titanic = titanic.drop(columns=['deck','class','embarked'])

In [199]:
env.df_info(titanic)

Unnamed: 0,nunique,dtypes,isnull,sample
passenger_id,891,int64,0,648
survived,2,int64,0,0
pclass,3,int64,0,3
sex,2,object,0,male
age,88,float64,177,
sibsp,7,int64,0,0
parch,7,int64,0,0
fare,248,float64,0,7.55
embark_town,3,object,2,Southampton
alone,2,int64,0,1


In [210]:
# Treat low-cardinality columns as categoricals: anything that is not already
# object dtype and has fewer than 10 distinct values is cast to object.
for col in titanic.columns:
    if titanic[col].dtype == 'object' or titanic[col].nunique() >= 10:
        continue
    titanic[col] = titanic[col].astype(object)

In [216]:
env.df_info(titanic)

Unnamed: 0,nunique,dtypes,isnull,sample
passenger_id,891,int64,0,118
survived,2,object,0,0
pclass,3,object,0,1
sex,2,object,0,male
age,88,float64,177,24.0
sibsp,7,object,0,0
parch,7,object,0,1
fare,248,float64,0,247.5208
embark_town,3,object,2,Cherbourg
alone,2,object,0,0


In [222]:
# manually assign passenger_id to object
titanic.passenger_id = titanic.passenger_id.astype(object)
env.df_info(titanic)

Unnamed: 0,nunique,dtypes,isnull,sample
passenger_id,891,object,0,280
survived,2,object,0,0
pclass,3,object,0,3
sex,2,object,0,male
age,88,float64,177,65.0
sibsp,7,object,0,0
parch,7,object,0,0
fare,248,float64,0,7.75
embark_town,3,object,2,Queenstown
alone,2,object,0,1


3. Create a function named `prep_titanic` that accepts the raw titanic data, and returns the data with the transformations above applied.


In [225]:
def prep_titanic(titanic):
    """
    Clean and prepare the titanic dataset.

    Removes the deck, class, and embarked columns, casts every non-object
    column with fewer than 10 unique values to object dtype, and casts
    passenger_id to object dtype.

    Parameters
    ----------
    titanic : pandas.DataFrame
        Raw titanic data. Must contain a passenger_id column.

    Returns
    -------
    pandas.DataFrame
        A new dataframe containing the cleaned titanic data. The dataframe
        passed in is left unchanged.
    """
    # Drop unnecessary columns. errors='ignore' (as in prep_telco) keeps the
    # function safe to call again on data that has already been prepared.
    titanic = titanic.drop(columns=['deck', 'class', 'embarked'], errors='ignore')

    # Turn categoricals into objects: low-cardinality numeric columns are
    # codes/flags rather than measurements.
    for col in titanic.columns:
        if titanic[col].dtype != 'object' and titanic[col].nunique() < 10:
            titanic[col] = titanic[col].astype(object)

    # passenger_id is an identifier, not a quantity; it has too many unique
    # values to be caught by the loop above, so cast it explicitly.
    titanic['passenger_id'] = titanic['passenger_id'].astype(object)

    return titanic

### Using the Telco dataset

1. Use the function defined in `acquire.py` to load the Telco data.

In [127]:
telco = acquire.get_telco_data()

Reading from file...


2. Drop any unnecessary, unhelpful, or duplicated columns. This could mean dropping foreign key columns but keeping the corresponding string values, for example.

In [129]:
telco.head(1)

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,2,1,2,0002-ORFBO,Female,0,Yes,Yes,9,Yes,...,Yes,Yes,No,Yes,65.6,593.3,No,One year,DSL,Mailed check


In [130]:
# Drop extra ID columns
telco = telco.drop(columns=['payment_type_id','internet_service_type_id','contract_type_id'],errors='ignore')

In [131]:
# telco.nunique()
# telco.dtypes
# telco.sample(1).iloc[0]

In [132]:
# create a dataframe holding info about telco
telco_info = env.df_info(telco)
telco_info

Unnamed: 0,nunique,dtypes,isnull,sample
customer_id,7043,object,0,5356-KZCKT
gender,2,object,0,Male
senior_citizen,2,int64,0,0
partner,2,object,0,No
dependents,2,object,0,No
tenure,73,int64,0,58
phone_service,2,object,0,Yes
multiple_lines,3,object,0,Yes
online_security,3,object,0,No internet service
online_backup,3,object,0,No internet service


In [133]:
# Convert categoricals into objects: any column that is not already object
# dtype and has fewer than 10 unique values is treated as categorical.
for col in telco.columns:
    if telco[col].dtype != 'object' and telco[col].nunique() < 10:
        telco[col] = telco[col].astype(object)

In [134]:
# Handle total_charges (should be a float)
# Find out why it's an object
telco.total_charges.value_counts()

total_charges
           11
20.2       11
19.75       9
19.9        8
20.05       8
           ..
2387.75     1
6302.8      1
2058.5      1
829.55      1
3707.6      1
Name: count, Length: 6531, dtype: int64

In [135]:
# Examine records with these fake null values
telco[telco.total_charges == ' ']

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
945,1371-DWPAZ,Female,0,Yes,Yes,0,No,No phone service,Yes,Yes,...,Yes,Yes,No,No,56.05,,No,Two year,DSL,Credit card (automatic)
1731,2520-SGTTA,Female,0,Yes,Yes,0,Yes,No,No internet service,No internet service,...,No internet service,No internet service,No internet service,No,20.0,,No,Two year,,Mailed check
1906,2775-SEFEE,Male,0,No,Yes,0,Yes,Yes,Yes,Yes,...,Yes,No,No,Yes,61.9,,No,Two year,DSL,Bank transfer (automatic)
2025,2923-ARZLG,Male,0,Yes,Yes,0,Yes,No,No internet service,No internet service,...,No internet service,No internet service,No internet service,Yes,19.7,,No,One year,,Mailed check
2176,3115-CZMZD,Male,0,No,Yes,0,Yes,No,No internet service,No internet service,...,No internet service,No internet service,No internet service,No,20.25,,No,Two year,,Mailed check
2250,3213-VVOLG,Male,0,Yes,Yes,0,Yes,Yes,No internet service,No internet service,...,No internet service,No internet service,No internet service,No,25.35,,No,Two year,,Mailed check
2855,4075-WKNIU,Female,0,Yes,Yes,0,Yes,Yes,No,Yes,...,Yes,Yes,No,No,73.35,,No,Two year,DSL,Mailed check
3052,4367-NUYAO,Male,0,Yes,Yes,0,Yes,Yes,No internet service,No internet service,...,No internet service,No internet service,No internet service,No,25.75,,No,Two year,,Mailed check
3118,4472-LVYGI,Female,0,Yes,Yes,0,No,No phone service,Yes,No,...,Yes,Yes,No,Yes,52.55,,No,Two year,DSL,Bank transfer (automatic)
4054,5709-LVOEQ,Female,0,Yes,Yes,0,Yes,No,Yes,Yes,...,No,Yes,Yes,No,80.85,,No,Two year,DSL,Mailed check


In [136]:
# Correct fake nulls: replace the single-space placeholder with
# tenure * monthly_charges. The blank rows displayed above all have tenure 0,
# so for those rows the fill value works out to 0.
telco.total_charges = np.where(telco.total_charges==' ',telco.tenure*telco.monthly_charges,telco.total_charges)

In [137]:
telco.total_charges = telco.total_charges.astype(float)

3. Handle null values.

In [139]:
# See why there are so many nulls in internet_service_type
telco.internet_service_type.value_counts(dropna=False)

internet_service_type
Fiber optic    3096
DSL            2421
NaN            1526
Name: count, dtype: int64

In [140]:
pd.crosstab(telco.internet_service_type,telco.streaming_movies,dropna=False)

streaming_movies,No,No internet service,Yes
internet_service_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DSL,1440,0,981
Fiber optic,1345,0,1751
,0,1526,0


In [141]:
# Replace nulls with the string 'None' (not Python's None). Per the crosstab
# above, every null row falls under streaming_movies == 'No internet service',
# so the null means "no internet service" rather than missing data.
telco.internet_service_type = np.where(telco.internet_service_type.isnull(),'None',telco.internet_service_type)

4. Create a function named `prep_telco` that accepts the raw telco data, and returns the data with the transformations above applied.

In [143]:
def prep_telco(telco):
    """
    Clean and prepare the raw telco data.

    Drops the payment_type_id, internet_service_type_id, and
    contract_type_id columns (if present), replaces nulls in
    internet_service_type with the string 'None', converts total_charges to
    float (filling blank entries with tenure * monthly_charges), and casts
    every non-object column with fewer than 10 unique values to object dtype.

    Parameters
    ----------
    telco : pandas.DataFrame
        Raw telco data. Must contain internet_service_type, total_charges,
        tenure, and monthly_charges columns.

    Returns
    -------
    pandas.DataFrame
        A new dataframe with the transformations applied. The dataframe
        passed in is left unchanged.

    Raises
    ------
    ValueError
        If total_charges contains a non-blank value that cannot be parsed
        as a number.
    """
    # Drop extra ID columns
    telco = telco.drop(
        columns=['payment_type_id', 'internet_service_type_id', 'contract_type_id'],
        errors='ignore',
    )

    # Replace nulls in internet_service_type with the string 'None'
    telco['internet_service_type'] = telco['internet_service_type'].fillna('None')

    # Fix fake nulls in total_charges. Any empty or whitespace-only string
    # counts as blank (not only a single space), so variants such as '' or
    # '  ' no longer break the float conversion.
    raw_charges = telco['total_charges']
    is_blank = raw_charges.astype(str).str.strip() == ''
    # Blank entries are masked out before parsing; anything else that is not
    # numeric still raises, so bad data is not silently hidden.
    numeric_charges = pd.to_numeric(raw_charges.mask(is_blank))
    telco['total_charges'] = numeric_charges.mask(
        is_blank, telco['tenure'] * telco['monthly_charges']
    ).astype(float)

    # Convert categoricals into objects
    for col in telco.columns:
        if telco[col].dtype != 'object' and telco[col].nunique() < 10:
            telco[col] = telco[col].astype(object)

    return telco

In [144]:
# test the function
telco = acquire.get_telco_data()

Reading from file...


In [145]:
telco.head()

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,2,1,2,0002-ORFBO,Female,0,Yes,Yes,9,Yes,...,Yes,Yes,No,Yes,65.6,593.3,No,One year,DSL,Mailed check
1,2,1,1,0003-MKNFE,Male,0,No,No,9,Yes,...,No,No,Yes,No,59.9,542.4,No,Month-to-month,DSL,Mailed check
2,1,2,1,0004-TLHLJ,Male,0,No,No,4,Yes,...,No,No,No,Yes,73.9,280.85,Yes,Month-to-month,Fiber optic,Electronic check
3,1,2,1,0011-IGKFF,Male,1,Yes,No,13,Yes,...,No,Yes,Yes,Yes,98.0,1237.85,Yes,Month-to-month,Fiber optic,Electronic check
4,2,2,1,0013-EXCHZ,Female,1,Yes,No,3,Yes,...,Yes,Yes,No,Yes,83.9,267.4,Yes,Month-to-month,Fiber optic,Mailed check


In [146]:
telco = prep_telco(telco)

In [147]:
telco.head()

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,0002-ORFBO,Female,0,Yes,Yes,9,Yes,No,No,Yes,...,Yes,Yes,No,Yes,65.6,593.3,No,One year,DSL,Mailed check
1,0003-MKNFE,Male,0,No,No,9,Yes,Yes,No,No,...,No,No,Yes,No,59.9,542.4,No,Month-to-month,DSL,Mailed check
2,0004-TLHLJ,Male,0,No,No,4,Yes,No,No,No,...,No,No,No,Yes,73.9,280.85,Yes,Month-to-month,Fiber optic,Electronic check
3,0011-IGKFF,Male,1,Yes,No,13,Yes,No,No,Yes,...,No,Yes,Yes,Yes,98.0,1237.85,Yes,Month-to-month,Fiber optic,Electronic check
4,0013-EXCHZ,Female,1,Yes,No,3,Yes,No,No,No,...,Yes,Yes,No,Yes,83.9,267.4,Yes,Month-to-month,Fiber optic,Mailed check


In [148]:
env.df_info(telco)

Unnamed: 0,nunique,dtypes,isnull,sample
customer_id,7043,object,0,1125-SNVCK
gender,2,object,0,Female
senior_citizen,2,object,0,1
partner,2,object,0,No
dependents,2,object,0,No
tenure,73,int64,0,49
phone_service,2,object,0,Yes
multiple_lines,3,object,0,No
online_security,3,object,0,No
online_backup,3,object,0,No


### Split your data

1. Write a function to split your data into train, test and validate datasets. Add this function to prepare.py.

In [151]:
# Run test to see if I can pass a string for usage in train_test_split
strat_var = 'churn'

In [152]:
# Trial 60/40 split, stratified on the column named by strat_var, to confirm
# that a column name held in a variable can be used with train_test_split.
train,test = train_test_split(telco,
                              train_size=0.60,
                              random_state=123,
                              stratify=telco[strat_var]
)
print(train)

     customer_id  gender senior_citizen partner dependents  tenure  \
2865  4083-BFNYK  Female              1     Yes         No      38   
4107  5804-LEPIM  Female              1      No         No       2   
3453  4895-TMWIR    Male              1     Yes         No      11   
923   1342-JPNKI    Male              0      No         No      10   
2370  3397-AVTKU    Male              0      No         No      43   
...          ...     ...            ...     ...        ...     ...   
1286  1915-IOFGU  Female              0      No         No       1   
4999  7025-WCBNE    Male              1      No         No      47   
6892  9788-HNGUT    Male              0     Yes         No      72   
3718  5229-PRWKT    Male              0      No         No       8   
5354  7562-UXTPG  Female              0      No         No      13   

     phone_service multiple_lines      online_security        online_backup  \
2865           Yes            Yes                  Yes                  Yes   
4

In [153]:
def split_df(df, strat_var, train_size=0.60, test_share=0.50, random_state=123):
    """
    Split a dataframe into train, validate, and test datasets.

    The split is done in two passes: the first carves off the train set, and
    the second divides the remaining rows between validate and test. With the
    default arguments this gives roughly a 60/20/20 split.

    Parameters
    ----------
    df : pandas.DataFrame
        The data to split.
    strat_var : str or None
        Name of the column to stratify both splits on. Pass None to split
        without stratification.
    train_size : float, default 0.60
        Fraction of df that goes to the train set.
    test_share : float, default 0.50
        Fraction of the non-train remainder that goes to the test set; the
        rest of the remainder becomes the validate set.
    random_state : int, default 123
        Seed passed to both splits so the result is reproducible.

    Returns
    -------
    tuple of pandas.DataFrame
        (train, validate, test), in that order.
    """
    # Run first split: train vs. everything else
    train, validate_test = train_test_split(
        df,
        train_size=train_size,
        random_state=random_state,
        stratify=None if strat_var is None else df[strat_var],
    )

    # Run second split: stratify on the remainder's own values of strat_var
    validate, test = train_test_split(
        validate_test,
        test_size=test_share,
        random_state=random_state,
        stratify=None if strat_var is None else validate_test[strat_var],
    )

    return train, validate, test

2. Run the function in your notebook on the Iris dataset, returning 3 datasets, train_iris, validate_iris and test_iris.

In [155]:
train_iris,validate_iris,test_iris = split_df(iris,'species')

In [156]:
print(f"""
train: {len(train_iris)}
validate: {len(validate_iris)}
test: {len(test_iris)}
""")


train: 90
validate: 30
test: 30



3. Run the function on the Titanic dataset, returning 3 datasets, train_titanic, validate_titanic and test_titanic.

In [233]:
titanic = acquire.get_titanic_data()

Reading from file...


In [237]:
titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [239]:
titanic = prep_titanic(titanic)

In [241]:
titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,Southampton,1


In [249]:
train_titanic,validate_titanic,test_titanic = split_df(titanic,'survived')

In [251]:
print(f"""
train: {len(train_titanic)}
validate: {len(validate_titanic)}
test: {len(test_titanic)}
""")


train: 534
validate: 178
test: 179



4. Run the function on the Telco dataset, returning 3 datasets, train_telco, validate_telco and test_telco.

In [159]:
train_telco,validate_telco,test_telco = split_df(telco,'churn')

In [160]:
print(f"""
train: {len(train_telco)}
validate: {len(validate_telco)}
test: {len(test_telco)}
""")


train: 4225
validate: 1409
test: 1409

