# Exercises: Data Preparation

<hr style="border:2px solid gray">

In [1]:
#standard ds libraries
import pandas as pd
import numpy as np

#acquire module
import acquire

# import splitting functions
from sklearn.model_selection import train_test_split

<b>A. Using the Iris Data</b>:

1. Use the function defined in acquire.py to load the iris data.

2. Drop the species_id and measurement_id columns.

3. Rename the species_name column to just species.

4. Create dummy variables of the species name and concatenate onto the iris dataframe. (This is for practice, we don't always have to encode the target, but if we used species as a feature, we would need to encode it).

5. Create a function named prep_iris that accepts the untransformed iris data, and returns the data with the transformations above applied.

<b>B. Using the Titanic dataset</b>:

1. Use the function defined in acquire.py to load the Titanic data.

2. Drop any unnecessary, unhelpful, or duplicated columns.

3. Encode the categorical columns. Create dummy variables of the categorical columns and concatenate them onto the dataframe.

4. Create a function named prep_titanic that accepts the raw titanic data, and returns the data with the transformations above applied.

<b>C. Using the Telco dataset</b>:

1. Use the function defined in acquire.py to load the Telco data.

2. Drop any unnecessary, unhelpful, or duplicated columns. This could mean dropping foreign key columns but keeping the corresponding string values, for example.

3. Encode the categorical columns. Create dummy variables of the categorical columns and concatenate them onto the dataframe.

4. Create a function named prep_telco that accepts the raw telco data, and returns the data with the transformations above applied.

<b>D. Split your data</b>:

1. Write a function to split your data into train, test and validate datasets. Add this function to prepare.py.

2. Run the function in your notebook on the Iris dataset, returning 3 datasets, train_iris, validate_iris and test_iris.

3. Run the function on the Titanic dataset, returning 3 datasets, train_titanic, validate_titanic and test_titanic.

4. Run the function on the Telco dataset, returning 3 datasets, train_telco, validate_telco and test_telco.

<hr style="border:1.5px solid black">
<hr style="border:1.5px solid black">

### A. Using the Iris Data

<b>1. Use the function defined in acquire.py to load the iris data.</b>

In [2]:
from acquire import new_iris_data

In [3]:
iris = new_iris_data()

In [4]:
iris.head()

Unnamed: 0,species_id,measurement_id,sepal_length,sepal_width,petal_length,petal_width,species_name
0,1,1,5.1,3.5,1.4,0.2,setosa
1,1,2,4.9,3.0,1.4,0.2,setosa
2,1,3,4.7,3.2,1.3,0.2,setosa
3,1,4,4.6,3.1,1.5,0.2,setosa
4,1,5,5.0,3.6,1.4,0.2,setosa


In [5]:
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   species_id      150 non-null    int64  
 1   measurement_id  150 non-null    int64  
 2   sepal_length    150 non-null    float64
 3   sepal_width     150 non-null    float64
 4   petal_length    150 non-null    float64
 5   petal_width     150 non-null    float64
 6   species_name    150 non-null    object 
dtypes: float64(4), int64(2), object(1)
memory usage: 8.3+ KB


In [6]:
# check again for nulls after info
# iris, where iris has a nan value present, tabulated by sum
iris.isna().sum()

species_id        0
measurement_id    0
sepal_length      0
sepal_width       0
petal_length      0
petal_width       0
species_name      0
dtype: int64

<b>2. Drop the species_id and measurement_id columns.</b>

In [7]:
#make sure these columns exist in my dataset
iris.columns

Index(['species_id', 'measurement_id', 'sepal_length', 'sepal_width',
       'petal_length', 'petal_width', 'species_name'],
      dtype='object')

In [8]:
#assign columns that will be dropped
columns_to_drop = ['species_id', 'measurement_id']

In [9]:
#drop those columns and assign them to a variable
cleaning_iris_data = iris.drop(columns=columns_to_drop)

In [10]:
#take a look at the clean dataframe
cleaning_iris_data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species_name
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


<b>3. Rename the species_name column to just species.</b>

In [11]:
#reassign variable with renamed column
cleaning_iris_data = cleaning_iris_data.rename(columns = {'species_name': 'species'})

In [12]:
#take a look at the renamed column
cleaning_iris_data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


<b>4. Create dummy variables of the species name and concatenate onto the iris dataframe. (This is for practice, we don't always have to encode the target, but if we used species as a feature, we would need to encode it).</b>

In [13]:
#create dummy variables to change object to numeric dtype
#drop_first takes a plain bool; the first species level is dropped and acts as the baseline
species_encoded = pd.get_dummies(cleaning_iris_data[['species']], dummy_na=False, drop_first=True)

In [14]:
#take a look
species_encoded.head()

Unnamed: 0,species_versicolor,species_virginica
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


In [15]:
#combine clean dataframe and dummy variable dataframe
clean_iris = pd.concat([cleaning_iris_data, species_encoded], axis=1)

In [16]:
#make sure the dataframes were concatenated
clean_iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_versicolor,species_virginica
0,5.1,3.5,1.4,0.2,setosa,0,0
1,4.9,3.0,1.4,0.2,setosa,0,0
2,4.7,3.2,1.3,0.2,setosa,0,0
3,4.6,3.1,1.5,0.2,setosa,0,0
4,5.0,3.6,1.4,0.2,setosa,0,0


<b>5. Create a function named prep_iris that accepts the untransformed iris data, and returns the data with the transformations above applied.</b>

In [17]:
def prep_iris(df):
    '''
    Clean the raw iris data.

    accepts: the untransformed iris DataFrame; it must contain the columns
        species_id, measurement_id and species_name
    returns: a new DataFrame with the two id columns dropped, species_name
        renamed to species, and dummy columns for species appended
        (the first species level is dropped and serves as the baseline)
    '''
    df = df.drop(columns=['species_id', 'measurement_id'])
    df = df.rename(columns={'species_name': 'species'})
    # drop_first is documented as a bool, so pass True rather than a list
    dummy_name = pd.get_dummies(df[['species']], dummy_na=False, drop_first=True)
    df = pd.concat([df, dummy_name], axis=1)
    return df

In [18]:
#use our new function 'prep_iris'
iris_df = prep_iris(acquire.new_iris_data())

In [19]:
#take a look at the data
iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_versicolor,species_virginica
0,5.1,3.5,1.4,0.2,setosa,0,0
1,4.9,3.0,1.4,0.2,setosa,0,0
2,4.7,3.2,1.3,0.2,setosa,0,0
3,4.6,3.1,1.5,0.2,setosa,0,0
4,5.0,3.6,1.4,0.2,setosa,0,0


In [20]:
#see what kinds of dtypes and columns we're dealing with
iris_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   sepal_length        150 non-null    float64
 1   sepal_width         150 non-null    float64
 2   petal_length        150 non-null    float64
 3   petal_width         150 non-null    float64
 4   species             150 non-null    object 
 5   species_versicolor  150 non-null    uint8  
 6   species_virginica   150 non-null    uint8  
dtypes: float64(4), object(1), uint8(2)
memory usage: 6.3+ KB


<hr style="border:1.5px solid black">

### B. Using the Titanic dataset:

<b>1. Use the function defined in acquire.py to load the Titanic data.</b>

In [21]:
#use acquire module to import titanic data and assign to variable
titanic = acquire.get_titanic_data()

In [22]:
#call the variable and look at the data
titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [23]:
#take a look at datatypes, nulls and columns
titanic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   passenger_id  891 non-null    int64  
 1   survived      891 non-null    int64  
 2   pclass        891 non-null    int64  
 3   sex           891 non-null    object 
 4   age           714 non-null    float64
 5   sibsp         891 non-null    int64  
 6   parch         891 non-null    int64  
 7   fare          891 non-null    float64
 8   embarked      889 non-null    object 
 9   class         891 non-null    object 
 10  deck          203 non-null    object 
 11  embark_town   889 non-null    object 
 12  alone         891 non-null    int64  
dtypes: float64(2), int64(6), object(5)
memory usage: 97.5+ KB


<b>2. Drop any unnecessary, unhelpful, or duplicated columns.</b>

<div class="alert alert-block alert-info">
<b>Instructor Note:</b> 
    <br>
        <br>
Walk through it:  
        <br>
- embarked or embark_town?
    <br>
-- embarked has a truncated version of what embark_town carries
    <br>
-- embark_town will be a little more detailed for the user and carry a more descriptive version as we encode it
<br>
<br>
- class or pclass?
<br>
-- pclass is pre-encoded and may contain useful ordinality. 
<br>
-- we will drop 'class'
<br>
<br>
- I may want to impute values into age in a more nuanced way on my second draft of this procedure, but for this first MVP run, I will just drop it out    
    </div>

In [24]:
#assign columns that will be dropped
columns_to_drop = ['class', 
                    'embarked',
                    'passenger_id',
                    'deck',
                    'age']

In [25]:
#drop those columns and assign them to a variable
cleaning_titanic_data = titanic.drop(columns=columns_to_drop)

In [26]:
#take a look at new dataframe
cleaning_titanic_data.head()

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embark_town,alone
0,0,3,male,1,0,7.25,Southampton,0
1,1,1,female,1,0,71.2833,Cherbourg,0
2,1,3,female,0,0,7.925,Southampton,1
3,1,1,female,1,0,53.1,Southampton,0
4,0,3,male,0,0,8.05,Southampton,1


In [27]:
#look at data types
cleaning_titanic_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   survived     891 non-null    int64  
 1   pclass       891 non-null    int64  
 2   sex          891 non-null    object 
 3   sibsp        891 non-null    int64  
 4   parch        891 non-null    int64  
 5   fare         891 non-null    float64
 6   embark_town  889 non-null    object 
 7   alone        891 non-null    int64  
dtypes: float64(1), int64(5), object(2)
memory usage: 62.6+ KB


<b>3. Encode the categorical columns. Create dummy variables of the categorical columns and concatenate them onto the dataframe.</b>

In [28]:
#create dummy variables to change object to numeric dtype
#drop_first takes a plain bool; the first level of each column is dropped and acts as the baseline
encoded_cols = pd.get_dummies(cleaning_titanic_data[['sex', 'embark_town']], dummy_na=False, drop_first=True)

In [29]:
#take a look at our new columns
encoded_cols.head()

Unnamed: 0,sex_male,embark_town_Queenstown,embark_town_Southampton
0,1,0,1
1,0,0,0
2,0,0,1
3,0,0,1
4,1,0,1


In [30]:
#combine clean dataframe and dummy variable dataframe
clean_titanic = pd.concat([cleaning_titanic_data, encoded_cols], axis=1)

In [31]:
#take a look at the concatenated dataframe
clean_titanic.head()

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,male,1,0,7.25,Southampton,0,1,0,1
1,1,1,female,1,0,71.2833,Cherbourg,0,0,0,0
2,1,3,female,0,0,7.925,Southampton,1,0,0,1
3,1,1,female,1,0,53.1,Southampton,0,0,0,1
4,0,3,male,0,0,8.05,Southampton,1,1,0,1


<b>4. Create a function named prep_titanic that accepts the raw titanic data, and returns the data with the transformations above applied.</b>

In [32]:
#first let's check for nulls
clean_titanic.isnull().sum()

survived                   0
pclass                     0
sex                        0
sibsp                      0
parch                      0
fare                       0
embark_town                2
alone                      0
sex_male                   0
embark_town_Queenstown     0
embark_town_Southampton    0
dtype: int64

In [33]:
#preview the frame with the null rows dropped
#NOTE: the result is not assigned, so clean_titanic itself still contains the null rows
clean_titanic.dropna()

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,male,1,0,7.2500,Southampton,0,1,0,1
1,1,1,female,1,0,71.2833,Cherbourg,0,0,0,0
2,1,3,female,0,0,7.9250,Southampton,1,0,0,1
3,1,1,female,1,0,53.1000,Southampton,0,0,0,1
4,0,3,male,0,0,8.0500,Southampton,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,0,0,13.0000,Southampton,1,1,0,1
887,1,1,female,0,0,30.0000,Southampton,1,0,0,1
888,0,3,female,1,2,23.4500,Southampton,0,0,0,1
889,1,1,male,0,0,30.0000,Cherbourg,1,1,0,0


In [34]:
def prep_titanic(titanic):
    '''
    Clean the raw titanic data.

    accepts: the raw titanic DataFrame; it must contain the columns
        class, embarked, passenger_id, deck, age, embark_town and sex
    returns: a new DataFrame with those first five columns dropped, every
        row that still holds a null removed, and dummy columns for
        embark_town and sex appended (first level of each dropped)
    '''
    unneeded_cols = ['class', 'embarked', 'passenger_id', 'deck', 'age']
    categorical_cols = ['embark_town', 'sex']

    # remove the unneeded columns first, then any row with a remaining null
    trimmed = titanic.drop(columns=unneeded_cols).dropna()

    # encode only after the null rows are gone so the dummies line up with the kept rows
    dummies = pd.get_dummies(trimmed[categorical_cols], drop_first=True)
    return pd.concat([trimmed, dummies], axis=1)

In [35]:
#let's use our new function
new_titanic = prep_titanic(titanic)

#and take a look at the data
new_titanic.head()

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,embark_town_Queenstown,embark_town_Southampton,sex_male
0,0,3,male,1,0,7.25,Southampton,0,0,1,1
1,1,1,female,1,0,71.2833,Cherbourg,0,0,0,0
2,1,3,female,0,0,7.925,Southampton,1,0,1,0
3,1,1,female,1,0,53.1,Southampton,0,0,1,0
4,0,3,male,0,0,8.05,Southampton,1,0,1,1


<hr style="border:1.5px solid black">

### C. Using the Telco dataset:

<b>1. Use the function defined in acquire.py to load the Telco data.</b>

In [36]:
#use the acquire module and assign variable telco to the dataframe
telco = acquire.get_telco_data()

In [37]:
#take a look at the data
telco.head()

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,2,1,2,0002-ORFBO,Female,0,Yes,Yes,9,Yes,...,Yes,Yes,No,Yes,65.6,593.3,No,One year,DSL,Mailed check
1,2,1,1,0003-MKNFE,Male,0,No,No,9,Yes,...,No,No,Yes,No,59.9,542.4,No,Month-to-month,DSL,Mailed check
2,1,2,1,0004-TLHLJ,Male,0,No,No,4,Yes,...,No,No,No,Yes,73.9,280.85,Yes,Month-to-month,Fiber optic,Electronic check
3,1,2,1,0011-IGKFF,Male,1,Yes,No,13,Yes,...,No,Yes,Yes,Yes,98.0,1237.85,Yes,Month-to-month,Fiber optic,Electronic check
4,2,2,1,0013-EXCHZ,Female,1,Yes,No,3,Yes,...,Yes,Yes,No,Yes,83.9,267.4,Yes,Month-to-month,Fiber optic,Mailed check


In [38]:
#let's look at the datatypes, columns and nulls
telco.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7043 entries, 0 to 7042
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   payment_type_id           7043 non-null   int64  
 1   internet_service_type_id  7043 non-null   int64  
 2   contract_type_id          7043 non-null   int64  
 3   customer_id               7043 non-null   object 
 4   gender                    7043 non-null   object 
 5   senior_citizen            7043 non-null   int64  
 6   partner                   7043 non-null   object 
 7   dependents                7043 non-null   object 
 8   tenure                    7043 non-null   int64  
 9   phone_service             7043 non-null   object 
 10  multiple_lines            7043 non-null   object 
 11  online_security           7043 non-null   object 
 12  online_backup             7043 non-null   object 
 13  device_protection         7043 non-null   object 
 14  tech_sup

<b>2. Drop any unnecessary, unhelpful, or duplicated columns. This could mean dropping foreign key columns but keeping the corresponding string values, for example.</b>

In [39]:
#use crosstab to check how payment_type_id values line up with payment_type values
pd.crosstab(telco.payment_type_id, telco.payment_type)

payment_type,Bank transfer (automatic),Credit card (automatic),Electronic check,Mailed check
payment_type_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,0,2365,0
2,0,0,0,1612
3,1544,0,0,0
4,0,1522,0,0


<div class="alert alert-block alert-info">
<b>Instructor Note:</b> 
    <br>
        <br>
Walk through dropping columns:  
        <br>
        <br>
- drop out all of the ids that appear in my dataset in this manner
<br>
-- including: payment_type_id, internet_service_type_id, contract_type_id
    </div>

In [40]:
# make that drop and reassign telco
telco = telco.drop(columns=['payment_type_id', 
                    'internet_service_type_id', 
                    'contract_type_id'])

In [41]:
telco.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   customer_id            7043 non-null   object 
 1   gender                 7043 non-null   object 
 2   senior_citizen         7043 non-null   int64  
 3   partner                7043 non-null   object 
 4   dependents             7043 non-null   object 
 5   tenure                 7043 non-null   int64  
 6   phone_service          7043 non-null   object 
 7   multiple_lines         7043 non-null   object 
 8   online_security        7043 non-null   object 
 9   online_backup          7043 non-null   object 
 10  device_protection      7043 non-null   object 
 11  tech_support           7043 non-null   object 
 12  streaming_tv           7043 non-null   object 
 13  streaming_movies       7043 non-null   object 
 14  paperless_billing      7043 non-null   object 
 15  mont

<b>3. Encode the categorical columns. Create dummy variables of the categorical columns and concatenate them onto the dataframe.</b>

<div class="alert alert-block alert-info">
<b>Instructor Note:</b> 
    <br>
        <br>
Walk through changing datatypes:  
        <br>
        <br>
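- total_charges is currently an object
    <br>
-- We can't convert an empty string into a float, so the blank entries have to be turned into a zero first. Concatenating '0' onto every string does that, but it is only safe when every non-blank value already contains a decimal point (otherwise '20' would become '200').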
    </div>

In [42]:
#change dtype for total_charges column
#blank entries become 0 and every other value is parsed exactly as written;
#unlike appending '0', this leaves whole-number strings intact and is safe to re-run
telco['total_charges'] = pd.to_numeric(
    telco.total_charges.astype(str).str.strip().replace('', '0')
).astype('float')

In [43]:
#create dummy variables for the selected categorical (object dtype) columns
encoded_df = pd.get_dummies(
    telco[[
        'multiple_lines',
        'online_security',
        'online_backup',
        'device_protection',
        'tech_support',
        'streaming_tv',
        'streaming_movies',
        'contract_type',
        'internet_service_type',
        'payment_type',
    ]],
    drop_first=True,
)

In [44]:
#assign telco to the concatenated original dataframe and encoded dataframe
telco = pd.concat( [telco, encoded_df], axis=1 )

In [45]:
#take a look
telco.head()

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,...,streaming_tv_Yes,streaming_movies_No internet service,streaming_movies_Yes,contract_type_One year,contract_type_Two year,internet_service_type_Fiber optic,internet_service_type_None,payment_type_Credit card (automatic),payment_type_Electronic check,payment_type_Mailed check
0,0002-ORFBO,Female,0,Yes,Yes,9,Yes,No,No,Yes,...,1,0,0,1,0,0,0,0,0,1
1,0003-MKNFE,Male,0,No,No,9,Yes,Yes,No,No,...,0,0,1,0,0,0,0,0,0,1
2,0004-TLHLJ,Male,0,No,No,4,Yes,No,No,No,...,0,0,0,0,0,1,0,0,1,0
3,0011-IGKFF,Male,1,Yes,No,13,Yes,No,No,Yes,...,1,0,1,0,0,1,0,0,1,0
4,0013-EXCHZ,Female,1,Yes,No,3,Yes,No,No,No,...,1,0,0,0,0,1,0,0,0,1


In [46]:
#let's make sure all of our columns are correct
telco.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7043 entries, 0 to 7042
Data columns (total 42 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   customer_id                            7043 non-null   object 
 1   gender                                 7043 non-null   object 
 2   senior_citizen                         7043 non-null   int64  
 3   partner                                7043 non-null   object 
 4   dependents                             7043 non-null   object 
 5   tenure                                 7043 non-null   int64  
 6   phone_service                          7043 non-null   object 
 7   multiple_lines                         7043 non-null   object 
 8   online_security                        7043 non-null   object 
 9   online_backup                          7043 non-null   object 
 10  device_protection                      7043 non-null   object 
 11  tech

<b>4. Create a function named prep_telco that accepts the raw telco data, and returns the data with the transformations above applied.</b>

In [47]:
def prep_telco(telco):
    '''
    Prepare the raw telco data.

    accepts: the raw telco DataFrame (it is not modified)
    returns: a new DataFrame with
        - internet_service_type_id, contract_type_id and payment_type_id dropped
        - total_charges converted to float (blank entries become 0)
        - gender, partner, dependents, phone_service, paperless_billing and
          churn mapped to 1/0 in new <column>_encoded columns
        - dummy columns appended for the remaining categorical columns
          (first level of each dropped)
    '''
    # drop first: this returns a new frame, so the column assignments below
    # never write into the DataFrame the caller passed in
    telco = telco.drop(columns=['internet_service_type_id', 'contract_type_id', 'payment_type_id'])

    # blank strings cannot be parsed as numbers, so replace them with '0' before
    # converting; appending '0' to every value would corrupt whole-number
    # strings ('20' -> '200')
    telco['total_charges'] = pd.to_numeric(
        telco.total_charges.astype(str).str.strip().replace('', '0')
    ).astype('float')

    telco['gender_encoded'] = telco.gender.map({'Female': 1, 'Male': 0})
    yes_no = {'Yes': 1, 'No': 0}
    for col in ['partner', 'dependents', 'phone_service', 'paperless_billing', 'churn']:
        telco[col + '_encoded'] = telco[col].map(yes_no)

    dummy_df = pd.get_dummies(
        telco[[
            'multiple_lines',
            'online_security',
            'online_backup',
            'device_protection',
            'tech_support',
            'streaming_tv',
            'streaming_movies',
            'contract_type',
            'internet_service_type',
            'payment_type',
        ]],
        drop_first=True,
    )
    telco = pd.concat([telco, dummy_df], axis=1)

    return telco

In [48]:
#call our new function
prepped_telco = prep_telco(acquire.get_telco_data())

In [49]:
#look at the data
prepped_telco.head()

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,...,streaming_tv_Yes,streaming_movies_No internet service,streaming_movies_Yes,contract_type_One year,contract_type_Two year,internet_service_type_Fiber optic,internet_service_type_None,payment_type_Credit card (automatic),payment_type_Electronic check,payment_type_Mailed check
0,0002-ORFBO,Female,0,Yes,Yes,9,Yes,No,No,Yes,...,1,0,0,1,0,0,0,0,0,1
1,0003-MKNFE,Male,0,No,No,9,Yes,Yes,No,No,...,0,0,1,0,0,0,0,0,0,1
2,0004-TLHLJ,Male,0,No,No,4,Yes,No,No,No,...,0,0,0,0,0,1,0,0,1,0
3,0011-IGKFF,Male,1,Yes,No,13,Yes,No,No,Yes,...,1,0,1,0,0,1,0,0,1,0
4,0013-EXCHZ,Female,1,Yes,No,3,Yes,No,No,No,...,1,0,0,0,0,1,0,0,0,1


<hr style="border:1.5px solid black">

### D. Split your data:

In [50]:
train, test = train_test_split(telco, 
                               train_size = 0.8,
                               random_state=1349,
                              stratify=telco.churn)

In [51]:
train, val = train_test_split(train,
                             train_size = 0.7,
                             random_state=1349,
                             stratify=train.churn)

In [52]:
train.shape, val.shape, test.shape

((3943, 42), (1691, 42), (1409, 42))

<b>1. Write a function to split your data into train, test and validate datasets. Add this function to prepare.py.</b>

In [53]:
def split_data(df, target=''):
    '''
    split_data will take in a single pandas dataframe
    and split it into a train, validate, and test set

    df: the dataframe to split
    target: name of the column to stratify the splits on; when left
        empty the splits are made without stratification

    returns three values:
    train, val, test (in this order) -- all pandas Dataframes,
    holding roughly 56%, 24% and 20% of the rows respectively
    '''
    # first hold out 20% of the rows as the test set
    train, test = train_test_split(df,
                                   train_size=0.8,
                                   random_state=1349,
                                   stratify=df[target] if target else None)
    # then divide the remaining 80% into 70% train / 30% validate
    train, val = train_test_split(train,
                                  train_size=0.7,
                                  random_state=1349,
                                  stratify=train[target] if target else None)
    return train, val, test

<b>2. Run the function in your notebook on the Iris dataset, returning 3 datasets, train_iris, validate_iris and test_iris.</b>

In [55]:
train, validate, test = split_data(iris_df, target='species')

In [56]:
train.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_versicolor,species_virginica
138,6.0,3.0,4.8,1.8,virginica,0,1
7,5.0,3.4,1.5,0.2,setosa,0,0
79,5.7,2.6,3.5,1.0,versicolor,1,0
74,6.4,2.9,4.3,1.3,versicolor,1,0
97,6.2,2.9,4.3,1.3,versicolor,1,0


In [57]:
train.shape, validate.shape, test.shape

((84, 7), (36, 7), (30, 7))

<b>3. Run the function on the Titanic dataset, returning 3 datasets, train_titanic, validate_titanic and test_titanic.</b>

In [58]:
titanic_train, titanic_val, titanic_test = split_data(new_titanic, target='survived')

In [59]:
titanic_train.shape, titanic_val.shape, titanic_test.shape

((497, 11), (214, 11), (178, 11))

In [60]:
titanic_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 497 entries, 474 to 336
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   survived                 497 non-null    int64  
 1   pclass                   497 non-null    int64  
 2   sex                      497 non-null    object 
 3   sibsp                    497 non-null    int64  
 4   parch                    497 non-null    int64  
 5   fare                     497 non-null    float64
 6   embark_town              497 non-null    object 
 7   alone                    497 non-null    int64  
 8   embark_town_Queenstown   497 non-null    uint8  
 9   embark_town_Southampton  497 non-null    uint8  
 10  sex_male                 497 non-null    uint8  
dtypes: float64(1), int64(5), object(2), uint8(3)
memory usage: 36.4+ KB


In [61]:
titanic_train.head()

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,embark_town_Queenstown,embark_town_Southampton,sex_male
474,0,3,female,0,0,9.8375,Southampton,1,0,1,0
167,0,3,female,1,4,27.9,Southampton,0,0,1,0
271,1,3,male,0,0,0.0,Southampton,1,0,1,1
887,1,1,female,0,0,30.0,Southampton,1,0,1,0
118,0,1,male,0,1,247.5208,Cherbourg,0,0,0,1


<b>4. Run the function on the Telco dataset, returning 3 datasets, train_telco, validate_telco and test_telco.</b>

In [62]:
telco_train, telco_val, telco_test = split_data(prepped_telco, target='churn')

In [63]:
telco_train.shape, telco_val.shape, telco_test.shape

((3943, 48), (1691, 48), (1409, 48))

In [64]:
telco_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3943 entries, 6832 to 2320
Data columns (total 48 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   customer_id                            3943 non-null   object 
 1   gender                                 3943 non-null   object 
 2   senior_citizen                         3943 non-null   int64  
 3   partner                                3943 non-null   object 
 4   dependents                             3943 non-null   object 
 5   tenure                                 3943 non-null   int64  
 6   phone_service                          3943 non-null   object 
 7   multiple_lines                         3943 non-null   object 
 8   online_security                        3943 non-null   object 
 9   online_backup                          3943 non-null   object 
 10  device_protection                      3943 non-null   object 
 11  t

In [65]:
telco_train.head()

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,...,streaming_tv_Yes,streaming_movies_No internet service,streaming_movies_Yes,contract_type_One year,contract_type_Two year,internet_service_type_Fiber optic,internet_service_type_None,payment_type_Credit card (automatic),payment_type_Electronic check,payment_type_Mailed check
6832,9705-IOVQQ,Male,1,Yes,Yes,61,Yes,Yes,No,Yes,...,1,0,1,1,0,1,0,0,1,0
433,0635-WKOLD,Male,0,Yes,No,40,Yes,Yes,No,Yes,...,1,0,0,1,0,0,0,1,0,0
25,0032-PGELS,Female,0,Yes,Yes,1,No,No phone service,Yes,No,...,0,0,0,0,0,0,0,0,0,0
3196,4581-LNWUM,Female,0,No,No,13,No,No phone service,No,No,...,1,0,1,0,0,0,0,0,1,0
4466,6297-NOOPG,Female,0,Yes,No,70,Yes,Yes,No,Yes,...,1,0,1,0,1,1,0,0,1,0
