## Using Pandas for Data Processing

In [1]:
import pandas as pd

In [2]:
# Load the passenger records from the CSV file (path is relative to the
# notebook's working directory)
df = pd.read_csv(filepath_or_buffer='titanic.csv')

# Preview the first five records to sanity-check the parse
df.head(5)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [3]:
# Inspect the final five records as well
df.tail(n=5)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
1305,3.0,0.0,"Zabour, Miss. Thamine",female,,1.0,0.0,2665.0,14.4542,,C,,,
1306,3.0,0.0,"Zakarian, Mr. Mapriededer",male,26.5,0.0,0.0,2656.0,7.225,,C,,304.0,
1307,3.0,0.0,"Zakarian, Mr. Ortin",male,27.0,0.0,0.0,2670.0,7.225,,C,,,
1308,3.0,0.0,"Zimmerman, Mr. Leo",male,29.0,0.0,0.0,315082.0,7.875,,S,,,
1309,,,,,,,,,,,,,,


In [4]:
# Keep only passengers whose class and survival outcome are both recorded;
# a row missing either value is discarded
df = df.dropna(axis='index', how='any', subset=['pclass', 'survived'])
df.tail(n=5)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
1304,3.0,0.0,"Zabour, Miss. Hileni",female,14.5,1.0,0.0,2665,14.4542,,C,,328.0,
1305,3.0,0.0,"Zabour, Miss. Thamine",female,,1.0,0.0,2665,14.4542,,C,,,
1306,3.0,0.0,"Zakarian, Mr. Mapriededer",male,26.5,0.0,0.0,2656,7.225,,C,,304.0,
1307,3.0,0.0,"Zakarian, Mr. Ortin",male,27.0,0.0,0.0,2670,7.225,,C,,,
1308,3.0,0.0,"Zimmerman, Mr. Leo",male,29.0,0.0,0.0,315082,7.875,,S,,,


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns; the quartiles listed are the describe() defaults
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body
count,1309.0,1309.0,1046.0,1309.0,1309.0,1308.0,121.0
mean,2.294882,0.381971,29.881135,0.498854,0.385027,33.295479,160.809917
std,0.837836,0.486055,14.4135,1.041658,0.86556,51.758668,97.696922
min,1.0,0.0,0.1667,0.0,0.0,0.0,1.0
25%,2.0,0.0,21.0,0.0,0.0,7.8958,72.0
50%,3.0,0.0,28.0,0.0,0.0,14.4542,155.0
75%,3.0,1.0,39.0,1.0,0.0,31.275,256.0
max,3.0,1.0,80.0,8.0,9.0,512.3292,328.0


In [6]:
# Give the abbreviated family-count columns self-explanatory names
df = df.rename(columns={'parch': 'Parents and Children Aboard',
                        'sibsp': 'Siblings and Spouses Aboard'})

# Expand the single-letter port codes into full port names.
# `replace` only touches values that match a key, so missing values stay
# missing and re-running this cell leaves already-expanded names intact
# (`map` would turn every value not found in the dict into NaN).
df['embarked'] = df['embarked'].replace({'C': 'Cherbourg',
                                         'Q': 'Queenstown',
                                         'S': 'Southampton'})

df.head()

Unnamed: 0,pclass,survived,name,sex,age,Siblings and Spouses Aboard,Parents and Children Aboard,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,Southampton,2.0,,"St Louis, MO"
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,Southampton,11.0,,"Montreal, PQ / Chesterville, ON"
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,Southampton,,,"Montreal, PQ / Chesterville, ON"
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,Southampton,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,Southampton,,,"Montreal, PQ / Chesterville, ON"


In [7]:
# List each distinct embarkation value, missing values included
df.loc[:, 'embarked'].unique()

array(['Southampton', 'Cherbourg', nan, 'Queenstown'], dtype=object)

In [8]:
# Label passengers with no recorded port as 'Unknown'
df.loc[df['embarked'].isna(), 'embarked'] = 'Unknown'
df['embarked'].unique()

array(['Southampton', 'Cherbourg', 'Unknown', 'Queenstown'], dtype=object)

In [9]:
# Rows can also be filtered out by value: keep every passenger whose
# port is something other than the 'Unknown' placeholder

df = df.loc[df['embarked'] != 'Unknown']

df['embarked'].unique()

array(['Southampton', 'Cherbourg', 'Queenstown'], dtype=object)

In [10]:
# Derive the new feature columns in one step. `assign` returns a new frame
# instead of writing column-by-column into the filtered one, and the boolean
# comparison is cast straight to 0/1 rather than being mapped through a dict.
df = df.assign(
    cabin_letter=df['cabin'].str[0],  # first character of the cabin string
    is_male=df['sex'].eq('male').astype('int64'),  # 1 for 'male', otherwise 0
    # Column names containing spaces can't be keyword arguments, so unpack a dict
    **{'Total Family Aboard': df['Parents and Children Aboard']
                              + df['Siblings and Spouses Aboard']}
)

# Drop the columns we no longer care about
df = df.drop(columns=['name', 'cabin', 'ticket', 'home.dest', 'body', 'boat'])

df.head()

Unnamed: 0,pclass,survived,sex,age,Siblings and Spouses Aboard,Parents and Children Aboard,fare,embarked,cabin_letter,is_male,Total Family Aboard
0,1.0,1.0,female,29.0,0.0,0.0,211.3375,Southampton,B,0,0.0
1,1.0,1.0,male,0.9167,1.0,2.0,151.55,Southampton,C,1,3.0
2,1.0,0.0,female,2.0,1.0,2.0,151.55,Southampton,C,0,3.0
3,1.0,0.0,male,30.0,1.0,2.0,151.55,Southampton,C,1,3.0
4,1.0,0.0,female,25.0,1.0,2.0,151.55,Southampton,C,0,3.0


In [11]:
# Persist the processed frame. The row labels are written as the first
# (unnamed) column -- the to_csv default, spelled out here.
# NOTE(review): pass index=False instead if downstream readers don't need them.
df.to_csv('Titanic_Processed.csv', index=True)