In [1]:
import pandas as pd
import numpy as np
from acquire import get_titanic_data 
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

In [2]:
df = get_titanic_data()

In [3]:
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [4]:
def impute_mode(train, validate, test):
    '''
    impute mode for embark_town
    '''
    imputer = SimpleImputer(strategy='most_frequent', missing_values=None)
    train['embark_town'] = imputer.fit_transform(train[['embark_town']])
    validate['embark_town'] = imputer.transform(validate[['embark_town']])
    test['embark_town'] = imputer.transform(test[['embark_town']])
    return train, validate, test

In [5]:
# imputing needs to be done POST split. So this function, is not best practice
def my_imputer(df):
    '''
    my crack at the imputer function
    '''
    imputer = SimpleImputer(strategy='most_frequent', missing_values=None)
    df['embark_town'] = imputer.fit_transform(df[['embark_town']])
    return df
    

In [6]:
def prep_titanic_data(df):
    '''
    takes in a dataframe of the titanic dataset as it is acquired and returns a cleaned dataframe
    arguments: df: a pandas DataFrame with the expected feature names and columns
    return: train, test, split: three dataframes with the cleaning operations performed on them
    '''
    df = df.drop_duplicates()
    df = df.drop(columns=['deck', 'embarked', 'class', 'age', 'passenger_id'])
    train, test = train_test_split(df, test_size=0.2, random_state=1349, stratify=df.survived)
    train, validate = train_test_split(train, train_size=0.7, random_state=1349, stratify=train.survived)
    train, validate, test = impute_mode(train, validate, test)
    train = pd.get_dummies(data = train, columns=['sex', 'embark_town'], drop_first=[True,True])
    validate = pd.get_dummies(data=validate, columns=['sex', 'embark_town'], drop_first=[True,True])
    test = pd.get_dummies(data=test, columns = ['sex', 'embark_town'], drop_first=[True,True])
    return train, validate, test

In [7]:
# Same as above. Imputing needs to be done POST split. Why? data leaks.
# You don't want to let any data info get into the validate and test. spoilers
# the test and validate aren't supposed to be peaking at the whole set prematurely
def prep_titanic_data_new(df):
    '''
    takes in a dataframe of the titanic dataset as it is acquired and returns a cleaned dataframe
    arguments: df: a pandas DataFrame with the expected feature names and columns
    return: train, test, split: three dataframes with the cleaning operations performed on them
    '''
    df = df.drop_duplicates()
    df = df.drop(columns=['deck', 'embarked', 'class', 'age', 'passenger_id'])
    df = my_imputer(df)
    train, test = train_test_split(df, test_size=0.2, random_state=1349, stratify=df.survived)
    train, validate = train_test_split(train, train_size=0.7, random_state=1349, stratify=train.survived)
    train = pd.get_dummies(data = train, columns=['sex', 'embark_town'], drop_first=[True,True])
    validate = pd.get_dummies(data=validate, columns=['sex', 'embark_town'], drop_first=[True,True])
    test = pd.get_dummies(data=test, columns = ['sex', 'embark_town'], drop_first=[True,True])
    return train, validate, test

In [13]:
# set with function from class 
train, validate, test = prep_titanic_data(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['embark_town'] = imputer.transform(test[['embark_town']])


In [14]:
train_new, validate_new, test_new = prep_titanic_data_new(df)

In [10]:
train.head()

Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
474,0,3,0,0,9.8375,1,0,0,1
370,1,1,1,0,55.4417,0,1,0,0
573,1,3,0,0,7.75,1,0,1,0
110,0,1,0,0,52.0,1,1,0,1
167,0,3,1,4,27.9,0,0,0,1


In [15]:
train_new.head()

Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
474,0,3,0,0,9.8375,1,0,0,1
370,1,1,1,0,55.4417,0,1,0,0
573,1,3,0,0,7.75,1,0,1,0
110,0,1,0,0,52.0,1,1,0,1
167,0,3,1,4,27.9,0,0,0,1


In [11]:
validate.head()

Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
569,1,3,0,0,7.8542,1,1,0,1
532,0,3,1,1,7.2292,0,1,0,0
223,0,3,0,0,7.8958,1,1,0,1
35,0,1,1,0,52.0,0,1,0,1
93,0,3,1,2,20.575,0,1,0,1


In [16]:
validate_new.head()

Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
569,1,3,0,0,7.8542,1,1,0,1
532,0,3,1,1,7.2292,0,1,0,0
223,0,3,0,0,7.8958,1,1,0,1
35,0,1,1,0,52.0,0,1,0,1
93,0,3,1,2,20.575,0,1,0,1


In [12]:
test.head()

Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
691,1,3,0,1,13.4167,0,0,0,0
762,1,3,0,0,7.2292,1,1,0,0
869,1,3,1,1,11.1333,0,1,0,1
174,0,1,0,0,30.6958,1,1,0,0
45,0,3,0,0,8.05,1,1,0,1


In [17]:
test_new.head()

Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
691,1,3,0,1,13.4167,0,0,0,0
762,1,3,0,0,7.2292,1,1,0,0
869,1,3,1,1,11.1333,0,1,0,1
174,0,1,0,0,30.6958,1,1,0,0
45,0,3,0,0,8.05,1,1,0,1
