# 1. Introduction

## Instructions
Read the file titanic_survival.csv into a dataframe called titanic_survival.

In [97]:
import pandas as pd

# Load the passenger records from disk and preview the first three rows.
titanic_survival = pd.read_csv("titanic_survival.csv")
titanic_survival.head(n=3)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


# 2. Finding the Missing Data

## Instructions
Count how many values in the "age" column have null values:
Use pandas.isnull() on the age variable to create a Series of True and False values.
Use the resulting Series to select only the elements in age that are null, and assign the result to age_null_true.
Assign the length of age_null_true to age_null_count.
Print age_null_count to see how many null values are in the "age" column.

In [98]:
# Pull out the "age" column and build a boolean mask flagging its missing entries.
age = titanic_survival["age"]
age_is_null = age.isnull()
# Keep only the flagged entries; how many there are is the null count.
age_null_true = age.loc[age_is_null]
age_null_count = age_null_true.shape[0]
print(age_null_count)

264


# 3. What's the Big Deal with Missing Data?

## Instructions
Use age_is_null to create a vector that only contains values from the "age" column that aren't NaN.
Calculate the mean of the new vector, and assign the result to correct_mean_age.

In [99]:
# Boolean mask: True wherever the "age" value is missing.
age_is_null = titanic_survival["age"].isnull()

In [100]:
# Select the non-missing ages by inverting the null mask with `~`
# (instead of comparing the mask to False) in a single .loc lookup.
good_ages = titanic_survival.loc[~age_is_null, "age"]
print(good_ages)

0       29.0000
1        0.9167
2        2.0000
3       30.0000
4       25.0000
5       48.0000
6       63.0000
7       39.0000
8       53.0000
9       71.0000
10      47.0000
11      18.0000
12      24.0000
13      26.0000
14      80.0000
16      24.0000
17      50.0000
18      32.0000
19      36.0000
20      37.0000
21      47.0000
22      26.0000
23      42.0000
24      29.0000
25      25.0000
26      25.0000
27      19.0000
28      35.0000
29      28.0000
30      45.0000
         ...   
1269    33.0000
1270    28.0000
1271    28.0000
1272    47.0000
1273    18.0000
1274    31.0000
1275    16.0000
1276    31.0000
1277    22.0000
1278    20.0000
1279    14.0000
1280    22.0000
1281    22.0000
1285    32.5000
1286    38.0000
1287    51.0000
1288    18.0000
1289    21.0000
1290    47.0000
1294    28.5000
1295    21.0000
1296    27.0000
1298    36.0000
1299    27.0000
1300    15.0000
1301    45.5000
1304    14.5000
1306    26.5000
1307    27.0000
1308    29.0000
Name: age, Length: 1046,

In [101]:
# Series.mean() is vectorized and gives NaN for an empty Series, whereas
# sum(...) / len(...) iterates in Python and raises ZeroDivisionError.
correct_mean_age = good_ages.mean()
print(correct_mean_age)

29.8811345124


# 4. Easier Ways to Do Math

## Instructions
Assign the mean of the "fare" column to correct_mean_fare.

In [102]:
# Average the "fare" column directly with the Series method.
fares = titanic_survival["fare"]
correct_mean_fare = fares.mean()
correct_mean_fare

33.29547928134572

# 5. Calculating Summary Statistics

## Instructions
Use a for loop to iterate over passenger_classes. Within the for loop:
Select just the rows in titanic_survival where the pclass value is equivalent to the current iterator value (class).
Select just the fare column for the current subset of rows.
Use the Series.mean method to calculate the mean of this subset.
Add the mean of the class to the fares_by_class dictionary with class as the key.
Once the loop completes, the dictionary fares_by_class should have 1, 2, and 3 as keys, with the average fares as the corresponding values.

In [103]:
passenger_classes = [1, 2, 3]
fares_by_class = {}

# For each class, average the fares of the passengers whose pclass matches it.
for this_class in passenger_classes:
    in_class = titanic_survival["pclass"] == this_class
    fares_by_class[this_class] = titanic_survival[in_class]["fare"].mean()
print(fares_by_class)

{1: 87.50899164086687, 2: 21.1791963898917, 3: 13.302888700564957}


# 6. Making Pivot Tables

## Instructions
Use the DataFrame.pivot_table() method to calculate the mean age for each passenger class ("pclass").
Assign the result to passenger_age.
Display the passenger_age pivot table using the print() function.

In [104]:
# One row per passenger class, aggregating the "survived" column
# (no aggfunc is given, so pivot_table's default aggregation is used).
passenger_survival = titanic_survival.pivot_table(
    values="survived",
    index="pclass",
)
print(passenger_survival)

        survived
pclass          
1.0     0.619195
2.0     0.429603
3.0     0.255289


In [105]:
# One row per passenger class, aggregating the "age" column
# (no aggfunc is given, so pivot_table's default aggregation is used).
passenger_age = titanic_survival.pivot_table(
    values="age",
    index="pclass",
)
print(passenger_age)

              age
pclass           
1.0     39.159918
2.0     29.506705
3.0     24.816367


# 7. More Complex Pivot Tables

## Instructions
Make a pivot table that calculates the total fares collected ("fare") and total number of survivors ("survived") for each embarkation port ("embarked").
Assign the result to port_stats.
Display port_stats using the print() function.

In [106]:
import numpy as np

# Total fare collected and total survivors per embarkation port.
# The aggregation is named by string ("sum"): recent pandas versions
# deprecate passing NumPy callables such as np.sum as aggfunc.
port_stats = titanic_survival.pivot_table(
    index="embarked",
    values=["fare", "survived"],
    aggfunc="sum",
)
print(port_stats)

                fare  survived
embarked                      
C         16830.7922     150.0
Q          1526.3085      44.0
S         25033.3862     304.0


# 8. Drop Missing Values

## Instructions
Drop all columns in titanic_survival that have missing values and assign the result to drop_na_columns.
Drop all rows in titanic_survival where the columns "age" or "sex" have missing values and assign the result to new_titanic_survival.

In [107]:
# Discard every row, and separately every column, that holds a missing value.
drop_na_rows = titanic_survival.dropna(axis="index")
drop_na_columns = titanic_survival.dropna(axis="columns")

In [108]:
# Drop the rows where "age" or "sex" is missing (row-wise, any-missing is dropna's default).
new_titanic_survival = titanic_survival.dropna(subset=["age", "sex"])
new_titanic_survival.head(n=10)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
5,1.0,1.0,"Anderson, Mr. Harry",male,48.0,0.0,0.0,19952,26.55,E12,S,3,,"New York, NY"
6,1.0,1.0,"Andrews, Miss. Kornelia Theodosia",female,63.0,1.0,0.0,13502,77.9583,D7,S,10,,"Hudson, NY"
7,1.0,0.0,"Andrews, Mr. Thomas Jr",male,39.0,0.0,0.0,112050,0.0,A36,S,,,"Belfast, NI"
8,1.0,1.0,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53.0,2.0,0.0,11769,51.4792,C101,S,D,,"Bayside, Queens, NY"
9,1.0,0.0,"Artagaveytia, Mr. Ramon",male,71.0,0.0,0.0,PC 17609,49.5042,,C,,22.0,"Montevideo, Uruguay"


# 9. Using iloc to Access Rows by Position

In [109]:
# Positional slice: the rows at positions 0 through 4.
first_five_rows = new_titanic_survival.iloc[:5]
print(first_five_rows)

   pclass  survived                                             name     sex  \
0     1.0       1.0                    Allen, Miss. Elisabeth Walton  female   
1     1.0       1.0                   Allison, Master. Hudson Trevor    male   
2     1.0       0.0                     Allison, Miss. Helen Loraine  female   
3     1.0       0.0             Allison, Mr. Hudson Joshua Creighton    male   
4     1.0       0.0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female   

       age  sibsp  parch  ticket      fare    cabin embarked boat   body  \
0  29.0000    0.0    0.0   24160  211.3375       B5        S    2    NaN   
1   0.9167    1.0    2.0  113781  151.5500  C22 C26        S   11    NaN   
2   2.0000    1.0    2.0  113781  151.5500  C22 C26        S  NaN    NaN   
3  30.0000    1.0    2.0  113781  151.5500  C22 C26        S  NaN  135.0   
4  25.0000    1.0    2.0  113781  151.5500  C22 C26        S  NaN    NaN   

                         home.dest  
0                     St 

In [110]:
# Positional slice: the rows at positions 0 through 9.
first_ten_rows = new_titanic_survival.iloc[:10]
print(first_ten_rows)

   pclass  survived                                             name     sex  \
0     1.0       1.0                    Allen, Miss. Elisabeth Walton  female   
1     1.0       1.0                   Allison, Master. Hudson Trevor    male   
2     1.0       0.0                     Allison, Miss. Helen Loraine  female   
3     1.0       0.0             Allison, Mr. Hudson Joshua Creighton    male   
4     1.0       0.0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female   
5     1.0       1.0                              Anderson, Mr. Harry    male   
6     1.0       1.0                Andrews, Miss. Kornelia Theodosia  female   
7     1.0       0.0                           Andrews, Mr. Thomas Jr    male   
8     1.0       1.0    Appleton, Mrs. Edward Dale (Charlotte Lamson)  female   
9     1.0       0.0                          Artagaveytia, Mr. Ramon    male   

       age  sibsp  parch    ticket      fare    cabin embarked boat   body  \
0  29.0000    0.0    0.0     24160  211.3

In [111]:
# Position 4 is the fifth row; every column is kept.
row_position_fifth = new_titanic_survival.iloc[4, :]
print(row_position_fifth)

pclass                                                     1
survived                                                   0
name         Allison, Mrs. Hudson J C (Bessie Waldo Daniels)
sex                                                   female
age                                                       25
sibsp                                                      1
parch                                                      2
ticket                                                113781
fare                                                  151.55
cabin                                                C22 C26
embarked                                                   S
boat                                                     NaN
body                                                     NaN
home.dest                    Montreal, PQ / Chesterville, ON
Name: 4, dtype: object


In [112]:
# Label-based lookup: the row whose index label is 25, every column kept.
row_index_25 = new_titanic_survival.loc[25, :]
print(row_index_25)

pclass                         1
survived                       0
name         Birnbaum, Mr. Jakob
sex                         male
age                           25
sibsp                          0
parch                          0
ticket                     13905
fare                          26
cabin                        NaN
embarked                       C
boat                         NaN
body                         148
home.dest      San Francisco, CA
Name: 25, dtype: object


# 10. Using Column Indexes

## Instructions
Assign the value at row index label 1100, column index label "age" from new_titanic_survival to row_index_1100_age.
Assign the value at row index label 25, column index label "survived" from new_titanic_survival to row_index_25_survived.
Assign the first 5 rows and first three columns from new_titanic_survival to five_rows_three_cols.

In [113]:
# Positional lookup of a single cell: first row, first column.
first_row_first_column = new_titanic_survival.iloc[0, 0]
print(first_row_first_column)

1.0


In [114]:
# Every row, restricted to the columns at positions 0 through 2.
all_rows_first_three_columns = new_titanic_survival.iloc[:, :3]
print(all_rows_first_three_columns)

      pclass  survived                                               name
0        1.0       1.0                      Allen, Miss. Elisabeth Walton
1        1.0       1.0                     Allison, Master. Hudson Trevor
2        1.0       0.0                       Allison, Miss. Helen Loraine
3        1.0       0.0               Allison, Mr. Hudson Joshua Creighton
4        1.0       0.0    Allison, Mrs. Hudson J C (Bessie Waldo Daniels)
5        1.0       1.0                                Anderson, Mr. Harry
6        1.0       1.0                  Andrews, Miss. Kornelia Theodosia
7        1.0       0.0                             Andrews, Mr. Thomas Jr
8        1.0       1.0      Appleton, Mrs. Edward Dale (Charlotte Lamson)
9        1.0       0.0                            Artagaveytia, Mr. Ramon
10       1.0       0.0                             Astor, Col. John Jacob
11       1.0       1.0  Astor, Mrs. John Jacob (Madeleine Talmadge Force)
12       1.0       1.0                

In [115]:
# Same three-column selection, but only the first five rows are displayed.
all_rows_first_three_columns = new_titanic_survival.iloc[:, :3]
print(all_rows_first_three_columns.head(5))

   pclass  survived                                             name
0     1.0       1.0                    Allen, Miss. Elisabeth Walton
1     1.0       1.0                   Allison, Master. Hudson Trevor
2     1.0       0.0                     Allison, Miss. Helen Loraine
3     1.0       0.0             Allison, Mr. Hudson Joshua Creighton
4     1.0       0.0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)


In [116]:
# Label-based lookup of a single cell: row label 83, column "age".
row_index_83_age = new_titanic_survival.loc[83, "age"]
print(row_index_83_age)

64.0


In [117]:
# Label-based lookup of a single cell: row label 766, column "pclass".
row_index_766_pclass = new_titanic_survival.loc[766, "pclass"]
print(row_index_766_pclass)

3.0


In [118]:
# Label-based lookup of a single cell: row label 1100, column "age".
row_index_1100_age = new_titanic_survival.loc[
    1100, "age"
]
print(row_index_1100_age)

29.0


In [119]:
# Label-based lookup of a single cell: row label 25, column "survived".
row_index_25_survived = new_titanic_survival.loc[
    25, "survived"
]
print(row_index_25_survived)

0.0


In [120]:
# Positional block: first five rows by first three columns.
five_rows_three_cols = new_titanic_survival.iloc[:5, :3]
print(five_rows_three_cols)

   pclass  survived                                             name
0     1.0       1.0                    Allen, Miss. Elisabeth Walton
1     1.0       1.0                   Allison, Master. Hudson Trevor
2     1.0       0.0                     Allison, Miss. Helen Loraine
3     1.0       0.0             Allison, Mr. Hudson Joshua Creighton
4     1.0       0.0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)


# 11. Reindexing Rows

## Instructions
Reindex the new_titanic_survival dataframe so the row indexes start from 0, and the old index is dropped.
Assign the final result to titanic_reindexed.
Print the first 5 rows and the first 3 columns of titanic_reindexed.

In [121]:
# Renumber the rows from 0 and discard the old index rather than keeping it as a column.
titanic_reindexed = new_titanic_survival.reset_index(drop=True)
print(titanic_reindexed.iloc[:5, :3])

   pclass  survived                                             name
0     1.0       1.0                    Allen, Miss. Elisabeth Walton
1     1.0       1.0                   Allison, Master. Hudson Trevor
2     1.0       0.0                     Allison, Miss. Helen Loraine
3     1.0       0.0             Allison, Mr. Hudson Joshua Creighton
4     1.0       0.0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)


# 12. Apply Functions Over a DataFrame

## Instructions
Write a function that counts the number of null elements in a Series.
Use the DataFrame.apply() method along with your function to run across all the columns in titanic_survival.
Assign the result to column_null_count.

In [122]:
def hundredth_row(column):
    """Return the entry at position 99 (the hundredth item) of `column`.

    `column` must support positional `.iloc` access, e.g. a pandas Series.
    """
    return column.iloc[99]

# Run the helper once per column (axis=0 is apply's default direction).
hundredth_row_var = titanic_survival.apply(hundredth_row, axis=0)

In [123]:
import pandas as pd

def null_count(column):
    """Count the missing (null/NaN) entries in `column`.

    Parameters
    ----------
    column : pandas.Series (or other array-like accepted by pd.isnull)
        The values to inspect.

    Returns
    -------
    int
        Number of entries for which pd.isnull() is True.
    """
    # pd.isnull yields a boolean mask that is True where a value is missing;
    # summing the mask counts the True entries without building a filtered copy.
    is_null = pd.isnull(column)
    return int(is_null.sum())
    
# Count the nulls of each column (axis=0 is apply's default direction).
column_null_count = titanic_survival.apply(null_count, axis=0)
print(column_null_count)

pclass          1
survived        1
name            1
sex             1
age           264
sibsp           1
parch           1
ticket          1
fare            2
cabin        1015
embarked        3
boat          824
body         1189
home.dest     565
dtype: int64


# 13. Applying a Function to a Row

## Instructions
Create a function that returns the string "minor" if someone is under 18, "adult" if they are equal to or over 18, and "unknown" if their age is null.
Then, use the function along with .apply() to find the correct label for everyone in the titanic_survival dataframe.
Assign the result to age_labels.
You can use pd.isnull to check if a value is null or not.

In [124]:
def is_minor(row):
    """Return True when the row's "age" is below 18, otherwise False.

    `row` is anything indexable by the key "age" (e.g. a DataFrame row).
    A missing (NaN) age is not less than 18, so it yields False.
    """
    # Return the comparison directly; bool() keeps the result a plain Python bool.
    return bool(row["age"] < 18)
    
# Evaluate the helper once per row.
minors = titanic_survival.apply(is_minor, axis="columns")

In [125]:
def categorize_age(row):
    """Label a row by its "age" value.

    Returns "unknown" when the age is null, "minor" when it is below 18,
    and "adult" otherwise.
    """
    passenger_age = row["age"]
    # Handle the missing case first so the numeric comparison never sees a null.
    if pd.isnull(passenger_age):
        return "unknown"
    return "minor" if passenger_age < 18 else "adult"
    
# Label every passenger row, then show the first five labels.
age_labels = titanic_survival.apply(categorize_age, axis="columns")
print(age_labels.iloc[:5])

0    adult
1    minor
2    minor
3    adult
4    adult
dtype: object


# 14. Calculating Survival Percentage by Age Group

## Instructions
Create a pivot table that calculates the mean survival chance ("survived") for each age group ("age_labels") of the dataframe titanic_survival.
Assign the resulting Series object to age_group_survival.

In [None]:
# age_labels is a standalone Series, not a column of titanic_survival, so
# pivoting on index="age_labels" directly would raise a KeyError. Attach it
# as a column on a copy via assign() (leaving titanic_survival untouched).
# The aggregation is named by string: recent pandas versions deprecate
# passing NumPy callables such as np.mean as aggfunc.
age_group_survival = (
    titanic_survival
    .assign(age_labels=age_labels)
    .pivot_table(index="age_labels", values="survived", aggfunc="mean")
)