In [298]:
#By: Jesus Plascencia 
#September 12, 2025
#Code used to analyze titanic dataset

In [19]:
# Some basic package imports
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import plotly.express as pxd
from plotly.subplots import make_subplots
import plotly.io as pio
pio.renderers.defaule = 'colab'

------------------------------------------------------

## Pandas Analysis - Day2 HW

In [15]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("yasserh/titanic-dataset")

print("Path to dataset files:", path)

Path to dataset files: /Users/jesusplascencia/.cache/kagglehub/datasets/yasserh/titanic-dataset/versions/1


In [21]:
# Copy the path to the data set
# For me this was:
file = '/Users/jesusplascencia/.cache/kagglehub/datasets/yasserh/titanic-dataset/versions/1/Titanic-Dataset.csv'
# Yours will be different!

df = pd. read_csv(file)

**Your goal is to do a quick analysis of the Titanic data! You can answer any questions that you find interesting but here are some things to start with:**

1. How many variables and observations? Which are Numerical/Categorical?
2. Do any of the columns have NaNs in them? What do NaNs mean?
3. How many passengers survived?
4. Is survival correlated with Fare?
5. How many passengers were alone vs. traveling with family?
6. Were people traveling alone more or less likely to survive?
7. Do the basic statistics change if you group by class?

and so on... see if you can come up with some questions of your own! Curiosity is a big part of data science!

How far can you get in just an hour or two?


---------------------------------

**Variable Notes**
- PassengerId:   Unique ID of the passenger
- Survived:   Survived (1) or died (0)
- Pclass:   Passenger’s class (1st, 2nd, or 3rd)
- Name:   Passenger’s name
- Sex:   Passenger’s sex
- Age:   Passenger’s age
- SibSp:   Number of siblings/spouses aboard the Titanic
- Parch:   Number of parents/children aboard the Titanic
- Ticket:   Ticket number
- Fare:   Fare paid for ticket
- Cabin:   Cabin number
- Embarked:   Where the passenger got on the ship (C — Cherbourg, S — Southampton, Q = Queenstown)

------------------------------------

Your final notebook should:

- [ ] Be a completely new notebook with just the Titanic stuff in it: HW2-Titanic.ipynb
- [ ] Be reproducible with junk code removed.
- [ ] Have lots of language describing what you are doing, especially for questions you are asking or things that you find interesting about the data. Use complete sentences, nice headings, and good markdown formatting: https://www.markdownguide.org/cheat-sheet/
- [ ] It should run without errors from start to finish.

In [407]:
#1. How many variables and observations? Which are Numerical/Categorical?
#df.value_counts().sort_index()  # I tried this but it only gave me a length of 183 which is not correct.

#or
#df.info()

#or

obs = list(df.shape)
var = list(df.columns)
    
print(f"There are {obs[0]} observations and {obs[1]} variables: {var}")


There are 891 observations and 12 variables: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [None]:
#1
#Which are Numerical/categorical?
#df.dtypes

#or 

df.info()

#This tells us that there are 5 categorical columns and 7 numerical columns that are either float or
#integers. There are 891 observations and 12 columns. 

In [405]:
#2. Do any of the columns have NaNs in them? What do NaNs mean?

print("Below are the columns along with their NaN values: ")
print(df.isna().sum())

#The NaNs means data is mising or the value is not a number. Only Age, Cabin, and Embarked have NaN values. 

Below are the columns along with their NaN values: 
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [421]:
#3. How many passengers survived?
survive = df['Survived'].value_counts()[1]

print(f'In total only {survive} passengers survived!')



In total only 342 passengers survived!


In [163]:
#4. Is survival correlated with Fare?
df['Survived'].corr(df['Fare'])

# Those who survived and their fare paid had a weak positive correlation. This means that as fare tickets
#become more expensive, the higher the likelihood to survive. 

0.2573065223849622

In [423]:
#5 How many passengers were alone vs. traveling with family?
df['Alone'] = (df['SibSp'] == 0) & (df['Parch'] == 0)

print(f"{df['Alone'].sum()} passengers were alone")

df['Company'] = (df['SibSp'] > 0) | (df['Parch'] > 0)
print(f"{df['Company'].sum()} passengers were with a family")


537 passengers were alone
354 passengers were with a family


In [251]:
#6] Were people traveling alone more or less likely to survive?
df['Survived'].corr(df['Alone'])

#The people traveling alone were not more likely to survive. There is a moderately negative 
#correlation between people who survived and people who survived when traveling alone. 


-0.20336708569989204

In [25]:
#7. Do the basic statistics change if you group by class?

#Grouping by categorical data: Gender
group_gender = df.groupby('Sex').describe()
#group_gender.describe()

#Grouping by gender does result in different summary statistics 

#Grouping by class 

groups_class = df.groupby('Pclass').describe()
groups_class.describe()

#Yes grouping my class does result in differnet summary statistics because each group is unique
#For example there are three classes, all of classs 1 is grouped together, group 2 is grouped together, 
#and group 3 is grouped together. Grouping reduces how many observations I have. So it represents an 
#aggregate but it ignores individual data which is why our summary stats differ. 


Unnamed: 0_level_0,PassengerId,PassengerId,PassengerId,PassengerId,PassengerId,PassengerId,PassengerId,PassengerId,Survived,Survived,...,Parch,Parch,Fare,Fare,Fare,Fare,Fare,Fare,Fare,Fare
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
count,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
mean,297.0,448.902843,254.01041,4.333333,235.083333,446.5,668.333333,889.333333,297.0,0.448273,...,0.333333,4.333333,297.0,39.497474,34.525304,0.0,17.22465,27.529167,45.0,218.459733
std,168.769073,11.507665,9.264846,4.932883,35.378607,22.152878,2.020726,2.081666,168.769073,0.194798,...,0.57735,1.527525,168.769073,38.831731,37.988446,0.0,12.150874,28.538418,42.329068,254.506087
min,184.0,439.154786,246.737616,1.0,200.0,432.0,666.5,887.0,184.0,0.242363,...,0.0,3.0,184.0,13.67555,11.778142,0.0,7.75,8.05,15.5,69.55
25%,200.0,442.555654,248.794888,1.5,217.25,433.75,667.25,888.5,200.0,0.357594,...,0.0,3.5,200.0,17.168867,12.59777,0.0,10.375,11.15,20.75,71.525
50%,216.0,445.956522,250.852161,2.0,234.5,435.5,668.0,890.0,216.0,0.472826,...,0.0,4.0,216.0,20.662183,13.417399,0.0,13.0,14.25,26.0,73.5
75%,353.5,453.776872,257.646807,6.0,252.625,453.75,669.25,890.5,353.5,0.551228,...,0.5,5.0,353.5,52.408435,45.898886,0.0,21.961975,37.26875,59.75,292.9146
max,491.0,461.597222,264.441453,10.0,270.75,472.0,670.5,891.0,491.0,0.62963,...,1.0,6.0,491.0,84.154687,78.380373,0.0,30.92395,60.2875,93.5,512.3292


In [439]:
#8.Come up with your own questions to explore?
#Before grouping I wanted do see if I got different summary stats or correlations if I sorted
df['Survived'].corr(df['Alone'])

Df_new = df.sort_values(by= 'Age', ascending = False)
df.describe()
#Df_new.describe()
    #No difference, theirs only a difference when grouping

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [29]:
#8. What happens if I try to chain more than two variables on the .corr function? Will I get a correlation for three variales?
df['Survived'].corr(df['Fare','Age'])
    #Not possible


KeyError: ('Fare', 'Age')

In [48]:
#If I create a new column called fare_per_age to see if older individuals paid less or more for the titanic fare.

df['fare_per_age'] = df['Fare'] / (df['Age'])
df.head(100)

#It is difficult to tell if older people were charged more based on their age. That is why a 
#correlation coefficient between Fare and Age is so useful.

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,cor,fare_per_age
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,0.257307,0.329545
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0.257307,1.875876
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,0.257307,0.304808
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,0.257307,1.517143
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,0.257307,0.230000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,0,3,"Shorney, Mr. Charles Joseph",male,,0,0,374910,8.0500,,S,0.257307,
96,97,0,1,"Goldschmidt, Mr. George B",male,71.0,0,0,PC 17754,34.6542,A5,C,0.257307,0.488087
97,98,1,1,"Greenfield, Mr. William Bertram",male,23.0,0,1,PC 17759,63.3583,D10 D12,C,0.257307,2.754709
98,99,1,2,"Doling, Mrs. John T (Ada Julia Bone)",female,34.0,0,1,231919,23.0000,,S,0.257307,0.676471
