# data_prep_exercises

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# import splitting and imputing functions
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# turn off pink boxes for demo?
import warnings
warnings.filterwarnings('ignore')

# import my acquire and functions files
import acquire
import functions

### 1. Iris Data

In [2]:
# 1. Use the function defined in acquire.py to load the iris data.

# new_iris_data() lives in the project's acquire module (imported in the first cell);
# presumably it returns the raw iris measurements as a DataFrame -- see acquire.py to confirm
df_iris = acquire.new_iris_data()
df_iris.head(2)

Unnamed: 0,species_id,species_name,sepal_length,sepal_width,petal_length,petal_width
0,1,setosa,5.1,3.5,1.4,0.2
1,1,setosa,4.9,3.0,1.4,0.2


In [3]:
# 2. Drop the species_id and measurement_id columns.
#    The preview of the loaded data shows no measurement_id column,
#    so species_id is the only column removed here.

df_iris = df_iris.drop('species_id', axis='columns')
df_iris.head(2)


Unnamed: 0,species_name,sepal_length,sepal_width,petal_length,petal_width
0,setosa,5.1,3.5,1.4,0.2
1,setosa,4.9,3.0,1.4,0.2


In [4]:
# 3. Rename the species_name column to just species.

# rename() returns a new DataFrame, so the result is assigned back to df_iris
df_iris = df_iris.rename({'species_name': 'species'}, axis='columns')
df_iris.head(2)

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width
0,setosa,5.1,3.5,1.4,0.2
1,setosa,4.9,3.0,1.4,0.2


In [5]:
# 4. Create dummy variables of the species name and concatenate onto the iris dataframe.

# one-hot encode species; drop_first=True leaves out the first category's column,
# so only species_versicolor and species_virginica are produced
dummy_df_iris = pd.get_dummies(
    df_iris[['species']],
    drop_first=True,
)

# attach the indicator columns to the right-hand side of the existing frame
df_iris = pd.concat((df_iris, dummy_df_iris), axis='columns')

In [6]:
df_iris.head(2)

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width,species_versicolor,species_virginica
0,setosa,5.1,3.5,1.4,0.2,0,0
1,setosa,4.9,3.0,1.4,0.2,0,0


In [7]:
# 5. Create a function named prep_iris that accepts the untransformed iris data, and returns the data with 
#     the transformations above applied.

def prep_iris(df):
    '''
    Prepare the raw iris data for exploration and modeling.

    Accepts the untransformed iris DataFrame and returns a new DataFrame in which:
        - the 'species_id' column is dropped
        - the 'measurement_id' column is dropped when the data includes one
        - the 'species_name' col is renamed to just 'species'
        - dummy variables for the 'species' col are concatenated onto the df
          (drop_first=True, so the first category gets no column of its own)

    The DataFrame that is passed in is left unmodified.
    '''
    df = df.drop(columns=['species_id']) #drops 'species_id' column
    # measurement_id is only present in some versions of the data, so a missing column is not an error
    df = df.drop(columns=['measurement_id'], errors='ignore')
    df = df.rename(columns = {'species_name': 'species'}) #renames 'species_name' column to just 'species'
    dummy_df = pd.get_dummies(df[['species']], drop_first = True) #creates the dummy variable columns
    df = pd.concat([df, dummy_df], axis=1) #concatenates original df with dummy variable columns

    return df #returns the new df w/above changes
    

### 2. Titanic Data

In [8]:
# 1. Use the function defined in acquire.py to load the Titanic data.

# new_titanic_data() comes from the project's acquire module; presumably it returns
# the raw passenger records as a DataFrame -- see acquire.py to confirm
df_titanic = acquire.new_titanic_data()

In [9]:
df_titanic.head(2)

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0


In [10]:
# 2. Drop any unnecessary, unhelpful, or duplicated columns.

df_titanic.columns

Index(['passenger_id', 'survived', 'pclass', 'sex', 'age', 'sibsp', 'parch',
       'fare', 'embarked', 'class', 'deck', 'embark_town', 'alone'],
      dtype='object')

In [11]:
# print how many distinct values each column holds, one line per column with a blank line between

for column, unique_count in df_titanic.nunique().items():
    print(f'{column}: {unique_count} unique values')
    print()

passenger_id: 891 unique values

survived: 2 unique values

pclass: 3 unique values

sex: 2 unique values

age: 88 unique values

sibsp: 7 unique values

parch: 7 unique values

fare: 248 unique values

embarked: 3 unique values

class: 3 unique values

deck: 7 unique values

embark_town: 3 unique values

alone: 2 unique values



In [12]:
# show the frequency of every value, column by column

# the first column is skipped because every one of its values is unique
for column, values in df_titanic.iloc[:, 1:].items():
    print(column, end='\n\n')                   # column name followed by a blank line
    print(values.value_counts(), end='\n\n\n')  # counts followed by two blank lines

survived

0    549
1    342
Name: survived, dtype: int64


pclass

3    491
1    216
2    184
Name: pclass, dtype: int64


sex

male      577
female    314
Name: sex, dtype: int64


age

24.00    30
22.00    27
18.00    26
28.00    25
19.00    25
         ..
55.50     1
74.00     1
0.92      1
70.50     1
12.00     1
Name: age, Length: 88, dtype: int64


sibsp

0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: sibsp, dtype: int64


parch

0    678
1    118
2     80
3      5
5      5
4      4
6      1
Name: parch, dtype: int64


fare

8.0500     43
13.0000    42
7.8958     38
7.7500     34
26.0000    31
           ..
50.4958     1
13.8583     1
8.4583      1
7.7250      1
7.5208      1
Name: fare, Length: 248, dtype: int64


embarked

S    644
C    168
Q     77
Name: embarked, dtype: int64


class

Third     491
First     216
Second    184
Name: class, dtype: int64


deck

C    59
B    47
D    33
E    32
A    15
F    13
G     4
Name: deck, dtype: int64


embark_town



In [13]:
# dropping duplicate rows and unnecessary columns

# drop_duplicates() returns a new DataFrame rather than modifying in place,
# so the result has to be assigned back for the call to have any effect
df_titanic = df_titanic.drop_duplicates()

# 'embarked' and 'class' repeat the information in 'embark_town' and 'pclass'
# (their value counts match), and 'deck' is populated for only 203 of the 891 rows;
# 'age' is dropped as well -- presumably because of missing values, TODO confirm
df_titanic = df_titanic.drop(columns = ['deck', 'embarked', 'class', 'age'])
df_titanic.head(2)

Unnamed: 0,passenger_id,survived,pclass,sex,sibsp,parch,fare,embark_town,alone
0,0,0,3,male,1,0,7.25,Southampton,0
1,1,1,1,female,1,0,71.2833,Cherbourg,0


In [28]:
df_titanic.columns

Index(['passenger_id', 'survived', 'pclass', 'sex', 'sibsp', 'parch', 'fare',
       'embark_town', 'alone'],
      dtype='object')

In [29]:
# report every column that contains nulls, together with how many nulls it has

null_counts = df_titanic.isnull().sum() #number of nulls per column, computed once
for column, null_total in null_counts[null_counts > 0].items(): #only columns with at least one null
    print(f'The {column} column has {null_total} null value(s)')
    print()

The embark_town column has 2 null value(s)



In [33]:
# the rows with a missing embark_town (index 61 and 829)

df_titanic.loc[df_titanic['embark_town'].isna()]

Unnamed: 0,passenger_id,survived,pclass,sex,sibsp,parch,fare,embark_town,alone
61,61,1,1,female,0,0,80.0,,1
829,829,1,1,female,0,0,80.0,,1


In [37]:
# filling na values with most common value

# take the fill value from the data instead of hard-coding it; mode() ignores nulls
# and resolves to 'Southampton' here, the most prevalent value in this column
most_common_town = df_titanic.embark_town.mode()[0]

df_titanic['embark_town'] = df_titanic.embark_town.fillna(value=most_common_town)
df_titanic.embark_town.value_counts() #confirm the two nulls were added to the most common town

Southampton    646
Cherbourg      168
Queenstown      77
Name: embark_town, dtype: int64

In [40]:
# 2. Encode the categorical columns. Create dummy variables of the categorical columns and concatenate them onto 
#     the dataframe.
# Categorical variables:
#         - pclass
#         - sex
#         - embark_town

# check each column's dtype and non-null count before encoding
df_titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   passenger_id  891 non-null    int64  
 1   survived      891 non-null    int64  
 2   pclass        891 non-null    int64  
 3   sex           891 non-null    object 
 4   sibsp         891 non-null    int64  
 5   parch         891 non-null    int64  
 6   fare          891 non-null    float64
 7   embark_town   891 non-null    object 
 8   alone         891 non-null    int64  
dtypes: float64(1), int64(6), object(2)
memory usage: 62.8+ KB
