# Tasks

1. Load the Titanic dataset from a CSV.
2. Display the first 10 rows.
3. Show the number of missing values per column.
4. Show the number of unique values in the Sex and Embarked columns.
5. Show summary statistics for numerical features.
6. Show mean Age and Fare grouped by Pclass.
7. Count the number of survivors (Survived == 1) by gender.
8. Calculate the correlation between Fare, Age, and Survived.
9. Create a new column AgeGroup categorizing ages into child (0-12), teen (13-19), adult (20-59), senior (60+).
10. Use .groupby() to find the average survival rate by Pclass and Sex.

In [2]:
# Task 1. Load the Titanic dataset from a CSV file

import pandas as pd

# Public copy of the Titanic passenger list; pandas downloads it directly.
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(filepath_or_buffer=url)


In [None]:
# Task 2. Display the first 10 rows of the DataFrame (positional slice of rows 0-9)
df.iloc[:10]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [None]:
# Task 3. Count the missing values in each column

# isna() flags every missing cell; summing the boolean flags gives a per-column count.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
# Task 4. Number of distinct values in the Sex and Embarked columns
# (nunique() ignores missing entries by default)
tuple(df[column].nunique() for column in ('Sex', 'Embarked'))

(2, 3)

In [7]:
# Task 5. Show summary statistics for the numerical columns
# Select the numeric columns explicitly, then summarise them.
df.select_dtypes(include='number').describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [9]:
# Task 6. Mean Age and mean Fare for each passenger class (Pclass)

# One aggregation per column; missing ages are skipped by the mean.
df.groupby('Pclass').agg({'Age': 'mean', 'Fare': 'mean'})

Unnamed: 0_level_0,Age,Fare
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,38.233441,84.154687
2,29.87763,20.662183
3,25.14062,13.67555


In [12]:
# Task 7. Count the number of survivors (Survived == 1) by gender

# Survived is a 0/1 flag, so summing it within each Sex group counts the survivors.
df['Survived'].groupby(df['Sex']).sum()

Sex
female    233
male      109
Name: Survived, dtype: int64

In [13]:
# Task 8. Pairwise correlation between Fare, Age and Survived
# Pearson is the pandas default; it is spelled out here for clarity.
correlation = df.loc[:, ['Fare', 'Age', 'Survived']].corr(method='pearson')
print(correlation)

              Fare       Age  Survived
Fare      1.000000  0.096067  0.257307
Age       0.096067  1.000000 -0.077221
Survived  0.257307 -0.077221  1.000000


Correlation (Pearson's r, the default for `.corr()`) measures how strongly two variables are linearly related (ranges from -1 to 1):
- 1 means perfect positive correlation,
- -1 means perfect negative correlation,
- 0 means no linear correlation.

In [16]:
# Task 9. Create a new column AgeGroup categorizing ages into child (0-12), teen (13-19), adult (20-59), senior (60+)
def categorize_age(age):
    """Map an age in years to an age-group label.

    Groups follow completed years of age: Child (0-12), Teen (13-19),
    Adult (20-59) and Senior (60+). A fractional age belongs to the group
    of its completed years, so 12.5 is a Child and 19.5 is a Teen.

    Args:
        age: Age in years; may be fractional or missing (NaN/None).

    Returns:
        'Unknown' when the age is missing, otherwise one of 'Child',
        'Teen', 'Adult' or 'Senior'.
    """
    if pd.isnull(age):
        return 'Unknown'
    # Compare with a strict `<` against the next group's lower bound so that
    # fractional ages between two whole years stay in the lower group.
    if age < 13:
        return 'Child'
    if age < 20:
        return 'Teen'
    if age < 60:
        return 'Adult'
    return 'Senior'
# Label every passenger with an age group; missing ages are passed through
# to categorize_age as well.
df['AgeGroup'] = df['Age'].map(categorize_age)
df.iloc[:10]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeGroup
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Adult
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Adult
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Adult
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Adult
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Adult
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,Unknown
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,Adult
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,Child
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,Adult
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,Teen


In [22]:
# Task 10. Use .groupby() to find the average survival rate by Pclass and Sex.
# The mean of the 0/1 Survived flag is the survival rate; Sex is then pivoted
# into columns so each row is one passenger class.
average_survival = (
    df.groupby(['Pclass', 'Sex'])['Survived']
    .agg('mean')
    .unstack(level='Sex')
)
average_survival

Sex,female,male
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.968085,0.368852
2,0.921053,0.157407
3,0.5,0.135447
