In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import os
import acquire

## Using the iris data:

In [2]:
# Use the function defined in acquire.py to load 
# the iris data.

In [3]:
from pydataset import data
data('iris')

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa
6,5.4,3.9,1.7,0.4,setosa
7,4.6,3.4,1.4,0.3,setosa
8,5.0,3.4,1.5,0.2,setosa
9,4.4,2.9,1.4,0.2,setosa
10,4.9,3.1,1.5,0.1,setosa


In [4]:
# Use the function defined in acquire.py to load the iris data.
iris = acquire.get_iris_data()
iris

Unnamed: 0,species_id,species_name,sepal_length,sepal_width,petal_length,petal_width
0,1,setosa,5.1,3.5,1.4,0.2
1,1,setosa,4.9,3.0,1.4,0.2
2,1,setosa,4.7,3.2,1.3,0.2
3,1,setosa,4.6,3.1,1.5,0.2
4,1,setosa,5.0,3.6,1.4,0.2
5,1,setosa,5.4,3.9,1.7,0.4
6,1,setosa,4.6,3.4,1.4,0.3
7,1,setosa,5.0,3.4,1.5,0.2
8,1,setosa,4.4,2.9,1.4,0.2
9,1,setosa,4.9,3.1,1.5,0.1


In [5]:
# Drop the species_id and measurement_id columns.

In [6]:
iris = iris.drop(columns=['species_id'])

In [7]:
iris.head()

Unnamed: 0,species_name,sepal_length,sepal_width,petal_length,petal_width
0,setosa,5.1,3.5,1.4,0.2
1,setosa,4.9,3.0,1.4,0.2
2,setosa,4.7,3.2,1.3,0.2
3,setosa,4.6,3.1,1.5,0.2
4,setosa,5.0,3.6,1.4,0.2


In [8]:
# Rename the species_name column to just species.

In [9]:
iris.rename(columns={'species_name': 'species'}, inplace=True)
iris.head()

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width
0,setosa,5.1,3.5,1.4,0.2
1,setosa,4.9,3.0,1.4,0.2
2,setosa,4.7,3.2,1.3,0.2
3,setosa,4.6,3.1,1.5,0.2
4,setosa,5.0,3.6,1.4,0.2


In [10]:
# Create dummy variables of the species name and concatenate onto the iris dataframe.
# (This is for practice, we don't always have to encode the target, but if we used species as a feature, we would need to encode it).

In [11]:
species_encoded = pd.get_dummies(iris.species, drop_first=True)
iris = pd.concat([iris, species_encoded], axis=1)
iris

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width,versicolor,virginica
0,setosa,5.1,3.5,1.4,0.2,0,0
1,setosa,4.9,3.0,1.4,0.2,0,0
2,setosa,4.7,3.2,1.3,0.2,0,0
3,setosa,4.6,3.1,1.5,0.2,0,0
4,setosa,5.0,3.6,1.4,0.2,0,0
5,setosa,5.4,3.9,1.7,0.4,0,0
6,setosa,4.6,3.4,1.4,0.3,0,0
7,setosa,5.0,3.4,1.5,0.2,0,0
8,setosa,4.4,2.9,1.4,0.2,0,0
9,setosa,4.9,3.1,1.5,0.1,0,0


In [12]:
# Create a function named prep_iris that accepts the untransformed iris data, and returns the data with the transformations above applied.

In [13]:
def prep_iris(iris):
    
    iris = iris.drop(columns=['species_id'])
    iris.rename(columns={'species_name': 'species'}, inplace=True)
    species_encoded = pd.get_dummies(iris.species, drop_first=True)
    iris = pd.concat([iris, species_encoded], axis=1)
    return iris

In [14]:
prep_iris(acquire.get_iris_data())

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width,versicolor,virginica
0,setosa,5.1,3.5,1.4,0.2,0,0
1,setosa,4.9,3.0,1.4,0.2,0,0
2,setosa,4.7,3.2,1.3,0.2,0,0
3,setosa,4.6,3.1,1.5,0.2,0,0
4,setosa,5.0,3.6,1.4,0.2,0,0
5,setosa,5.4,3.9,1.7,0.4,0,0
6,setosa,4.6,3.4,1.4,0.3,0,0
7,setosa,5.0,3.4,1.5,0.2,0,0
8,setosa,4.4,2.9,1.4,0.2,0,0
9,setosa,4.9,3.1,1.5,0.1,0,0


In [15]:
iris.to_csv('anewiris.csv', index=False)

## Using the Titanic dataset


In [16]:
# Use the function defined in acquire.py to load the Titanic data.

In [17]:
titanic = acquire.get_titanic_data()


In [18]:
titanic.head()


Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [19]:
# Drop any unnecessary, unhelpful, or duplicated columns.

In [20]:
titanic = titanic.drop(columns=['class', 
                                'embarked',
                                'passenger_id',
                                'deck',
                                'age'])

In [21]:
titanic

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embark_town,alone
0,0,3,male,1,0,7.2500,Southampton,0
1,1,1,female,1,0,71.2833,Cherbourg,0
2,1,3,female,0,0,7.9250,Southampton,1
3,1,1,female,1,0,53.1000,Southampton,0
4,0,3,male,0,0,8.0500,Southampton,1
...,...,...,...,...,...,...,...,...
886,0,2,male,0,0,13.0000,Southampton,1
887,1,1,female,0,0,30.0000,Southampton,1
888,0,3,female,1,2,23.4500,Southampton,0
889,1,1,male,0,0,30.0000,Cherbourg,1


In [22]:
# Encode the categorical columns. Create dummy variables of the categorical columns and concatenate them onto the dataframe.

In [23]:
encoded_vars = pd.get_dummies(titanic[['embark_town', 'sex']], drop_first=True)
titanic = pd.concat([titanic, encoded_vars], axis=1)
titanic

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,embark_town_Queenstown,embark_town_Southampton,sex_male
0,0,3,male,1,0,7.2500,Southampton,0,0,1,1
1,1,1,female,1,0,71.2833,Cherbourg,0,0,0,0
2,1,3,female,0,0,7.9250,Southampton,1,0,1,0
3,1,1,female,1,0,53.1000,Southampton,0,0,1,0
4,0,3,male,0,0,8.0500,Southampton,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,0,0,13.0000,Southampton,1,0,1,1
887,1,1,female,0,0,30.0000,Southampton,1,0,1,0
888,0,3,female,1,2,23.4500,Southampton,0,0,1,0
889,1,1,male,0,0,30.0000,Cherbourg,1,0,0,1


In [24]:
# Create a function named prep_titanic that accepts the raw titanic data, and returns the data with the transformations above applied.

In [25]:
titanic.isna().sum()


survived                   0
pclass                     0
sex                        0
sibsp                      0
parch                      0
fare                       0
embark_town                2
alone                      0
embark_town_Queenstown     0
embark_town_Southampton    0
sex_male                   0
dtype: int64

In [26]:
# remove embark_town null rows
titanic.dropna()

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,embark_town_Queenstown,embark_town_Southampton,sex_male
0,0,3,male,1,0,7.2500,Southampton,0,0,1,1
1,1,1,female,1,0,71.2833,Cherbourg,0,0,0,0
2,1,3,female,0,0,7.9250,Southampton,1,0,1,0
3,1,1,female,1,0,53.1000,Southampton,0,0,1,0
4,0,3,male,0,0,8.0500,Southampton,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,0,0,13.0000,Southampton,1,0,1,1
887,1,1,female,0,0,30.0000,Southampton,1,0,1,0
888,0,3,female,1,2,23.4500,Southampton,0,0,1,0
889,1,1,male,0,0,30.0000,Cherbourg,1,0,0,1


In [27]:
def prep_titanic(titanic):
    titanic = titanic.drop(columns=['class', 
                                'embarked',
                                'passenger_id',
                                'deck',
                                'age'])
    tianic = titanic.dropna()
    encoded_vars = pd.get_dummies(titanic[['embark_town', 'sex']], drop_first=True)
    titanic = pd.concat([titanic, encoded_vars], axis=1)
    return titanic


# Using the Telco dataset

In [28]:
# Use the function defined in acquire.py to load the Telco data.

In [29]:
telco = acquire.get_telco_data()


In [30]:
telco.head()

Unnamed: 0_level_0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
customerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [31]:
telco.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7043 entries, 7590-VHVEG to 3186-AJIEK
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null 

In [32]:
# Drop any unnecessary, unhelpful, or duplicated columns. This could mean dropping foreign key columns but keeping the corresponding string values, for example.

In [33]:
telco[['payment_type_id', 'payment_type']]
pd.crosstab(telco.payment_type_id, telco.payment_type)

KeyError: "None of [Index(['payment_type_id', 'payment_type'], dtype='object')] are in the [columns]"

In [None]:
telco = telco.drop(columns=['payment_type_id', 
                    'internet_service_type_id', 
                    'contract_type_id'])

In [None]:
telco.info()

In [None]:
telco['TotalCharges'] = telco['TotalCharges'].fillna(telco.TotalCharges == '0').astype(float)

In [None]:
telco.info()

In [None]:
telco['TechSupport'].value_counts().size


In [None]:
telco.Churn.value_counts().size

In [None]:
pd.get_dummies(telco.gender)

In [None]:
categorical_columns = []
bin_cats = []
mult_cats = []
numerical_columns = []
for col in telco.columns:
    if telco[col].dtype == 'object':
        categorical_columns.append(col)
        if telco[col].value_counts().size > 2:
            mult_cats.append(col)
        else:
            bin_cats.append(col)
    else:
        numerical_columns.append(col)

In [None]:
categorical_columns

In [None]:
# Encode the categorical columns. Create dummy variables of the categorical columns and concatenate them onto the dataframe.

In [None]:
telco['gender'] = telco['gender'].map({'Male': 0, 'Female': 1})
telco['Partner'] = telco['Partner'].map({'No': 0, 'Yes': 1})
telco['Dependents'] = telco['Dependents'].map({'No': 0, 'Yes': 1})
telco['PhoneService'] = telco['PhoneService'].map({'No': 0, 'Yes': 1})
telco['PaperlessBilling'] = telco['PaperlessBilling'].map({'No': 0, 'Yes': 1})

In [36]:
telco = pd.concat([
    telco,
    pd.get_dummies(telco[mult_cats])
], axis=1)

NameError: name 'mult_cats' is not defined

In [None]:
# Create a function named prep_telco that accepts the raw telco data, and returns the data with the transformations above applied.

In [34]:
def prep_telco(telco):
    telco['total_charges'] = (telco.total_charges + '0').astype('float')
    telco = telco.drop(columns=['internet_service_type_id', 'contract_type_id', 'payment_type_id'])
    telco['gender_encoded'] = telco.gender.map({'Female': 1, 'Male': 0})
    telco['partner_encoded'] = telco.partner.map({'Yes': 1, 'No': 0})
    telco['dependents_encoded'] = telco.dependents.map({'Yes': 1, 'No': 0})
    telco['phone_service_encoded'] = telco.phone_service.map({'Yes': 1, 'No': 0})
    telco['paperless_billing_encoded'] = telco.paperless_billing.map({'Yes': 1, 'No': 0})
    telco['churn_encoded'] = telco.churn.map({'Yes': 1, 'No': 0})
    dummy_df = pd.get_dummies(telco[['multiple_lines', \
                              'online_security', \
                              'online_backup', \
                              'device_protection', \
                              'tech_support', \
                              'streaming_tv', \
                              'streaming_movies', \
                              'contract_type', \
                              'internet_service_type', \
                              'payment_type'
                            ]],
                              drop_first=True)
    telco = pd.concat( [telco, dummy_df], axis=1 )
    
    return telco

In [35]:
prepped_telco = prep_telco(acquire.get_telco_data())


AttributeError: 'DataFrame' object has no attribute 'total_charges'

# Split your data



In [None]:
# Write a function to split your data into train, test and validate datasets. Add this function to prepare.py.



In [None]:
train, test = train_test_split(telco, 
                               train_size = 0.8,
                               random_state=1349,
                              stratify=telco.Churn)

In [None]:
train, val = train_test_split(train,
                             train_size = 0.7,
                             random_state=1349,
                             stratify=train.Churn)


In [None]:
train.shape, val.shape, test.shape

In [38]:
def split_data(df, target=''):
    '''
    split_data will take in a single pandas dataframe
    it will split it into a train, validate, and test set
    and it will return three values:
    train, val, test (in this order) -- all pandas Dataframes
    '''
    train, test = train_test_split(df, 
                               train_size = 0.8,
                               random_state=1349,
                              stratify=df[target])
    train, val = train_test_split(train,
                             train_size = 0.7,
                             random_state=1349,
                             stratify=train[target])
    return train, val, test

In [None]:
# Run the function in your notebook on the Iris dataset, returning 3 datasets, train_iris, validate_iris and test_iris.

In [None]:
iris = acquire.get_iris_data()

In [None]:
train, validate, test = split_data(iris, target='species_name')


In [None]:
train.head()

In [None]:
train.size, validate.size, test.size

In [None]:
# Run the function on the Titanic dataset, returning 3 datasets, train_titanic, validate_titanic and test_titanic.

In [None]:
titanic_train, titanic_val, titanic_test = split_data(
titanic, target='survived')


In [None]:
titanic_train.size, titanic_val.size, titanic_test.size


In [None]:
# Run the function on the Telco dataset, returning 3 datasets, train_telco, validate_telco and test_telco.

In [None]:
telco_train, telco_val, telco_test = split_data(
telco, target='Churn')

In [None]:
telco_train.size, telco_val.size, telco_test.size


NameError: name 'split_data' is not defined

In [42]:
titanic_train, titanic_val, titanic_test = split_data(
titanic, target='survived')

In [44]:
train = titanic_train
val = titanic_val
test = titanic_test

In [48]:
cat_cols, num_cols = [], []
for col in train.columns:
    if train.dtype == '0':
        cat_cols.append(col)
    else:
        if train[col].nunique() < 10:
            cat_cols.append(col)
        else:
            num_cols.append(col)

AttributeError: 'DataFrame' object has no attribute 'dtype'