# Using the Iris data, complete the following transformations in a notebook, then create a file named prepare.py that holds a function that will take in the data acquired from the acquire lesson and apply each of the transformations.

## Use the function defined in acquire.py to load the iris data.

In [1]:
import pandas as pd
import numpy as np

import acquire as a

# Load the iris data as df through the project's acquire module (imported above as `a`).
# NOTE(review): per the original author's note, get_iris_data() also writes a CSV of the
# data when one is not already present — confirm against acquire.py.
df = a.get_iris_data() 
df.head()

Unnamed: 0,species_id,species_name,sepal_length,sepal_width,petal_length,petal_width
0,1,setosa,5.1,3.5,1.4,0.2
1,1,setosa,4.9,3.0,1.4,0.2
2,1,setosa,4.7,3.2,1.3,0.2
3,1,setosa,4.6,3.1,1.5,0.2
4,1,setosa,5.0,3.6,1.4,0.2


## Drop the species_id and measurement_id columns.

In [2]:
# Drop a column using .drop(columns=column_name); the result is re-assigned to df.
# NOTE(review): the exercise heading also names measurement_id, but that column does not
# appear in the preview above, so only species_id is dropped here.
df = df.drop(columns='species_id')
df.head()

Unnamed: 0,species_name,sepal_length,sepal_width,petal_length,petal_width
0,setosa,5.1,3.5,1.4,0.2
1,setosa,4.9,3.0,1.4,0.2
2,setosa,4.7,3.2,1.3,0.2
3,setosa,4.6,3.1,1.5,0.2
4,setosa,5.0,3.6,1.4,0.2


## Rename the species_name column to just species.

In [3]:
# Rename a column using .rename(columns={current_column_name: replacement_column_name});
# here species_name becomes species.
df = df.rename(columns={'species_name':'species'})
df.head()

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width
0,setosa,5.1,3.5,1.4,0.2
1,setosa,4.9,3.0,1.4,0.2
2,setosa,4.7,3.2,1.3,0.2
3,setosa,4.6,3.1,1.5,0.2
4,setosa,5.0,3.6,1.4,0.2


## Create dummy variables of the species name.

In [4]:
# Create dummy (indicator) columns from the species column using pd.get_dummies();
# drop_first=False keeps a column for every species instead of dropping the first one.
dummy_df = pd.get_dummies(df['species'], drop_first=False)
dummy_df.head()

Unnamed: 0,setosa,versicolor,virginica
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0


In [5]:
# Combine the original df with the dummies df using pd.concat([original_df, dummy_df]);
# axis=1 appends the dummy columns side by side, with rows matched on the index.
df = pd.concat([df, dummy_df], axis=1)
df.head()

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width,setosa,versicolor,virginica
0,setosa,5.1,3.5,1.4,0.2,1,0,0
1,setosa,4.9,3.0,1.4,0.2,1,0,0
2,setosa,4.7,3.2,1.3,0.2,1,0,0
3,setosa,4.6,3.1,1.5,0.2,1,0,0
4,setosa,5.0,3.6,1.4,0.2,1,0,0


## Create a function named prep_iris that accepts the untransformed iris data, and returns the data with the transformations above applied.

In [6]:
# Reload the original, untransformed data so prep_iris can be run on the raw input
# (df was overwritten by the step-by-step transformations above).
df = a.get_iris_data() 

In [7]:
def prep_iris(df):
    '''
    Prepare the acquired iris data for exploration.

    Takes the untransformed iris DataFrame (it must contain a species_name
    column) and returns a new DataFrame in which:
      - the species_id and measurement_id columns are removed when present,
      - species_name is renamed to species,
      - one dummy (indicator) column per species is appended.

    The DataFrame passed in is left unchanged; every step below produces a
    new DataFrame.
    '''

    # drop the id columns using .drop(columns=[...]); the exercise asks for both
    # species_id and measurement_id to be removed, and errors='ignore' skips
    # whichever of them is absent from the acquired data
    df = df.drop(columns=['species_id', 'measurement_id'], errors='ignore')

    # rename column using .rename(columns={current_column_name: replacement_column_name})
    df = df.rename(columns={'species_name': 'species'})

    # create dummies dataframe using pd.get_dummies(); drop_first=False keeps a column for every species
    dummy_df = pd.get_dummies(df['species'], drop_first=False)

    # append the dummy columns to the original df using pd.concat(); axis=1 adds columns, rows matched on the index
    df = pd.concat([df, dummy_df], axis=1)

    return df

In [8]:
# Preview the first rows returned by prep_iris to compare with the step-by-step result above.
prep_iris(df).head()

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width,setosa,versicolor,virginica
0,setosa,5.1,3.5,1.4,0.2,1,0,0
1,setosa,4.9,3.0,1.4,0.2,1,0,0
2,setosa,4.7,3.2,1.3,0.2,1,0,0
3,setosa,4.6,3.1,1.5,0.2,1,0,0
4,setosa,5.0,3.6,1.4,0.2,1,0,0


## Adding Train/Validate/Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Reload the original, untransformed data before demonstrating the split functions.
df = a.get_iris_data() 

In [None]:
# adding function definition for function to split data into train/validate/test

def split_data(df, stratify_col='species', random_state=123):
    '''
    Split a DataFrame into train, validate, and test DataFrames.

    df: the DataFrame to split; it must contain the column named by stratify_col.
    stratify_col: name of the column to stratify on so each split keeps the
        same mix of classes (defaults to 'species').
    random_state: seed passed to both train_test_split calls so the split is
        reproducible (defaults to 123).

    Returns train, validate, test. 20% of the rows go to test; 30% of the
    remaining rows go to validate (about 24% of the total), leaving about 56%
    of the total for train.
    '''

    # splits df into train_validate and test using train_test_split(), stratifying to keep an even mix of each class
    train_validate, test = train_test_split(df,
                                            test_size=.2,
                                            random_state=random_state,
                                            stratify=df[stratify_col])

    # splits train_validate into train and validate using train_test_split(), stratifying the same way
    train, validate = train_test_split(train_validate,
                                       test_size=.3,
                                       random_state=random_state,
                                       stratify=train_validate[stratify_col])
    return train, validate, test

In [None]:
# adding function split_data to prep_iris function

def prep_iris_with_split(df):
    '''
    Prepare the acquired iris data and split it for modeling.

    Takes the untransformed iris DataFrame (it must contain a species_name
    column), then:
      - removes the species_id and measurement_id columns when present,
      - renames species_name to species,
      - appends one dummy (indicator) column per species,
      - passes the prepared DataFrame to split_data.

    Returns the train, validate, test DataFrames produced by split_data.
    The DataFrame passed in is left unchanged.
    '''

    # drop the id columns using .drop(columns=[...]); the exercise asks for both
    # species_id and measurement_id to be removed, and errors='ignore' skips
    # whichever of them is absent from the acquired data
    df = df.drop(columns=['species_id', 'measurement_id'], errors='ignore')

    # rename column using .rename(columns={current_column_name: replacement_column_name})
    df = df.rename(columns={'species_name': 'species'})

    # create dummies dataframe using pd.get_dummies(); drop_first=False keeps a column for every species
    dummy_df = pd.get_dummies(df['species'], drop_first=False)

    # append the dummy columns to the original df using pd.concat(); axis=1 adds columns, rows matched on the index
    df = pd.concat([df, dummy_df], axis=1)

    # split data into train/validate/test using split_data function
    train, validate, test = split_data(df)

    return train, validate, test

In [None]:
# Unpack the train, validate, and test DataFrames returned by prep_iris_with_split.
train, validate, test = prep_iris_with_split(df)

In [None]:
# Print the (rows, columns) shape of each split to check the split sizes.
print(f'train:{train.shape} validate:{validate.shape} test:{test.shape}')

## Importing Prepared Data

In [None]:
import prepare as p

# Get the original acquired data using get_iris_data from the acquire module.
df = a.get_iris_data() 

# Prepare the data using prep_iris imported from the prepare module.
df = p.prep_iris(df)

# Preview the first rows of the prepared data.
df.head()

## Importing Prepared Data With Split

In [None]:
# Get the original acquired data using get_iris_data from the acquire module.
df = a.get_iris_data() 

# Prepare and split the data using prep_iris_with_split imported from the prepare module.
train, validate, test = p.prep_iris_with_split(df)

In [None]:
# Print the (rows, columns) shape of each split returned by the prepare module.
print(f'train:{train.shape} validate:{validate.shape} test:{test.shape}')

In [None]:
# Preview the first rows of the train split.
train.head()