#   BOOTSTRAP    T-DAT-901 - Big Data - Jonathan Khalifa
##  SURVIVE THE TITANIC



## First we import the data into a dataframe and then extract the first ten rows.

In [1]:
import pandas as pd
import math
data = pd.read_csv('/Users/jonathankhalifa/Desktop/T-DAT-901/Bootstrap-20210923/titanic.csv')

data10 = data[:10]
data10

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
5,1,1,"Anderson, Mr. Harry",male,48.0,0,0,19952,26.55,E12,S,3,,"New York, NY"
6,1,1,"Andrews, Miss. Kornelia Theodosia",female,63.0,1,0,13502,77.9583,D7,S,10,,"Hudson, NY"
7,1,0,"Andrews, Mr. Thomas Jr",male,39.0,0,0,112050,0.0,A36,S,,,"Belfast, NI"
8,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53.0,2,0,11769,51.4792,C101,S,D,,"Bayside, Queens, NY"
9,1,0,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C,,22.0,"Montevideo, Uruguay"


## Then we find out how many people’s ages are missing

In [3]:
# missing age count
missingAge = data['age'].isnull().sum()
print("Null ages : ",missingAge)

# total rows 
print("Non null ages : ",data.index.stop-missingAge)

Null ages :  263
Non null ages :  1046


## Then we compute stats on :
how many are they \
what is their average age \
what is their average fare price

In [4]:
totSurvivors = data.loc[data['survived'] == 1]

print("There are ",len(totSurvivors)," survivors.")
formatted_avgAge = "{:.3f}".format(totSurvivors["age"].mean())
print("Their average age is ",formatted_avgAge," years old.")
formatted_avgFare = "{:.3f}".format(totSurvivors["fare"].mean())
print("Their average fare is ",formatted_avgFare," dollars.")

There are  500  survivors.
Their average age is  28.918  years old.
Their average fare is  49.361  dollars.


## We display the probability of surviving a Titanic disaster for all the possible values of the following variables: 
sex\
pclass\
age

In [5]:
# PROBABILITIES OF SURVIVABILITY

#surv prob basic
basic_surv_probs = data.groupby('survived').size().div(len(data))
print("Overall survivability prob is ","{:.2f}".format(basic_surv_probs[1])," , ( or ","{:.2f}".format(basic_surv_probs[1]*100),"%)")
print("  ")
print("  ")

# conditional probabilities

#by sex
surv_prob_by_age = data.groupby(['sex', 'survived']).size().div(len(data)).div(basic_surv_probs, axis=0, level='survived')
print("Survival probability by sex : ")
print(surv_prob_by_age)
print("  ")
print("  ")

#by pclass
surv_prob_by_pclass = data.groupby(['pclass', 'survived']).size().div(len(data)).div(basic_surv_probs, axis=0, level='survived')
print("Survival probability by pclass : ")
print(surv_prob_by_pclass)
print("  ")
print("  ")


#by age
surv_prob_by_age = data.groupby(['age', 'survived']).size().div(len(data)).div(basic_surv_probs, axis=0, level='survived')
print("Survival probability by age : ")
print(surv_prob_by_age)
print("  ")
print("  ")

print(" The result by age does not make sense since there are so many different ones and floats...")
print("  ")
print("  ")
print("  ")
print("  ")


Overall survivability prob is  0.38  , ( or  38.20 %)
  
  
Survival probability by sex : 
sex     survived
female  0           0.156984
        1           0.678000
male    0           0.843016
        1           0.322000
dtype: float64
  
  
Survival probability by pclass : 
pclass  survived
1       0           0.152040
        1           0.400000
2       0           0.195303
        1           0.238000
3       0           0.652658
        1           0.362000
dtype: float64
  
  
Survival probability by age : 
age      survived
0.1667   1           0.002000
0.3333   0           0.001236
0.4167   1           0.002000
0.6667   1           0.002000
0.7500   0           0.001236
                       ...   
70.5000  0           0.001236
71.0000  0           0.002472
74.0000  0           0.001236
76.0000  1           0.002000
80.0000  1           0.002000
Length: 159, dtype: float64
  
  
 The result by age does not make sense since there are so many different ones and floats...
  
 

## We add a new column to our dataframe and label each row depending on the age of the person

In [6]:
# we add a column in ordere to make age groups
data['age_group'] = ""

for i, row in data.iterrows():
    if ( row['age'] == 'nan' ) :
        data.at[i,'age_group'] = "age undefined"
    if (row['age']) > 0 and (row['age']) <= 10:
        data.at[i,'age_group'] = "0 to 10"
    if (row['age']) > 10 and (row['age']) <= 20:
        data.at[i,'age_group'] = "10 to 20"
    if (row['age']) > 20 and (row['age']) <= 30:
        data.at[i,'age_group'] = "20 to 30"
    if (row['age']) > 30 and (row['age']) <= 40:
        data.at[i,'age_group'] = "30 to 40"
    if (row['age']) > 40 and (row['age']) <= 50:
        data.at[i,'age_group'] = "40 to 50"
    if (row['age']) > 50 and (row['age']) <= 60:
        data.at[i,'age_group'] = "50 to 60"
    if (row['age']) > 60 and (row['age']) <= 70:
        data.at[i,'age_group'] = "60 to 70"
    if (row['age']) > 70 and (row['age']) <= 80:
        data.at[i,'age_group'] = "70 to 80"
    if (row['age']) > 80 and (row['age']) <= 90:
        data.at[i,'age_group'] = "80 to 90"
    if (row['age']) > 90 and (row['age']) <= 100:
        data.at[i,'age_group'] = "90 to 100"
    if (row['age']) > 100 and (row['age']) <= 110:
        data.at[i,'age_group'] = "100 to 110"
        
data





Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,age_group
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,2,,"St Louis, MO",20 to 30
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON",0 to 10
2,1,0,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",0 to 10
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON",20 to 30
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",20 to 30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,14.5000,1,0,2665,14.4542,,C,,328.0,,10 to 20
1305,3,0,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C,,,,
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5000,0,0,2656,7.2250,,C,,304.0,,20 to 30
1307,3,0,"Zakarian, Mr. Ortin",male,27.0000,0,0,2670,7.2250,,C,,,,20 to 30


## Survivability probabilities make more sense when grouped by 10 yeear chunks

In [7]:

surv_prob_by_age_group = data.groupby(['age_group', 'survived']).size().div(len(data)).div(basic_surv_probs, axis=0, level='survived')
print("Survival probability by age-group : ")
print(surv_prob_by_age_group)
print("  ")
print("  ")

Survival probability by age-group : 
age_group  survived
           0           0.234858
           1           0.146000
0 to 10    0           0.044499
           1           0.100000
10 to 20   0           0.121137
           1           0.128000
20 to 30   0           0.280593
           1           0.268000
30 to 40   0           0.149567
           1           0.178000
40 to 50   0           0.098888
           1           0.104000
50 to 60   0           0.039555
           1           0.060000
60 to 70   0           0.025958
           1           0.012000
70 to 80   0           0.004944
           1           0.004000
dtype: float64
  
  


## We import all our sncf tables as dataframes

In [8]:
data_sncf_routes = pd.read_csv('/Users/jonathankhalifa/Desktop/T-DAT-901/Bootstrap-20210923/data_sncf/routes.txt')
data_sncf_trips = pd.read_csv('/Users/jonathankhalifa/Desktop/T-DAT-901/Bootstrap-20210923/data_sncf/trips.txt')
data_sncf_agency = pd.read_csv('/Users/jonathankhalifa/Desktop/T-DAT-901/Bootstrap-20210923/data_sncf/agency.txt')
data_sncf_stop_times = pd.read_csv('/Users/jonathankhalifa/Desktop/T-DAT-901/Bootstrap-20210923/data_sncf/stop_times.txt')
data_sncf_stops = pd.read_csv('/Users/jonathankhalifa/Desktop/T-DAT-901/Bootstrap-20210923/data_sncf/stops.txt')
data_sncf_calendar = pd.read_csv('/Users/jonathankhalifa/Desktop/T-DAT-901/Bootstrap-20210923/data_sncf/calendar.txt')
data_sncf_calendar_dates = pd.read_csv('/Users/jonathankhalifa/Desktop/T-DAT-901/Bootstrap-20210923/data_sncf/calendar_dates.txt')

data_sncf_transfers = pd.read_csv('/Users/jonathankhalifa/Desktop/T-DAT-901/Bootstrap-20210923/data_sncf/transfers.txt')

## We merge our different tables via their respective IDs in order to have a single DF with correlated data

In [9]:
pd.options.display.max_columns = None

result = pd.merge(data_sncf_trips, data_sncf_routes, how="outer", on=["route_id"])
result = pd.merge(result, data_sncf_agency, how="outer", on=["agency_id"])
result = pd.merge(result, data_sncf_stop_times, how="outer", on=["trip_id"])
result = pd.merge(result, data_sncf_stops, how="outer", on=["stop_id"])
result = pd.merge(result, data_sncf_calendar, how="outer", on=["service_id"])
result = pd.merge(result, data_sncf_calendar_dates, how="outer", on=["service_id"])
result

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,direction_id,block_id,shape_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color,agency_name,agency_url,agency_timezone,agency_lang,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date,date,exception_type
0,OCE1506035,9939.0,OCESN037071R0100119847,37071.0,1.0,,,OCESN,,Paris-Vernon-Rouen-Le Havre,,2.0,,,,SNCF,http://www.ter-sncf.com,Europe/Paris,fr,23:05:00,23:05:00,StopPoint:OCECar TER-87381509,0.0,,0.0,0.0,,Gare de Mantes-la-Jolie,,48.989687,1.703294,,,0,StopArea:OCE87381509,1.0,1.0,1.0,1.0,0.0,0.0,0.0,20200220.0,20200520.0,20200413.0,2.0
1,OCE1506035,9939.0,OCESN037071R0100119847,37071.0,1.0,,,OCESN,,Paris-Vernon-Rouen-Le Havre,,2.0,,,,SNCF,http://www.ter-sncf.com,Europe/Paris,fr,23:35:00,23:35:00,StopPoint:OCECar TER-87415604,1.0,,0.0,0.0,,Gare de Vernon-Giverny,,49.091286,1.478363,,,0,StopArea:OCE87415604,1.0,1.0,1.0,1.0,0.0,0.0,0.0,20200220.0,20200520.0,20200413.0,2.0
2,OCE1506035,9939.0,OCESN037071R0100119847,37071.0,1.0,,,OCESN,,Paris-Vernon-Rouen-Le Havre,,2.0,,,,SNCF,http://www.ter-sncf.com,Europe/Paris,fr,23:55:00,23:55:00,StopPoint:OCECar TER-87415620,2.0,,0.0,0.0,,Gare de Gaillon-Aubevoye,,49.174632,1.352518,,,0,StopArea:OCE87415620,1.0,1.0,1.0,1.0,0.0,0.0,0.0,20200220.0,20200520.0,20200413.0,2.0
3,OCE1506035,9939.0,OCESN037071R0100119847,37071.0,1.0,,,OCESN,,Paris-Vernon-Rouen-Le Havre,,2.0,,,,SNCF,http://www.ter-sncf.com,Europe/Paris,fr,24:25:00,24:25:00,StopPoint:OCECar TER-87415877,3.0,,0.0,0.0,,Gare de Val-de-Reuil,,49.275399,1.224609,,,0,StopArea:OCE87415877,1.0,1.0,1.0,1.0,0.0,0.0,0.0,20200220.0,20200520.0,20200413.0,2.0
4,OCE1506035,9939.0,OCESN037071R0100119847,37071.0,1.0,,,OCESN,,Paris-Vernon-Rouen-Le Havre,,2.0,,,,SNCF,http://www.ter-sncf.com,Europe/Paris,fr,24:45:00,24:45:00,StopPoint:OCECar TER-87411207,4.0,,0.0,0.0,,Gare de Oissel,,49.343042,1.101821,,,0,StopArea:OCE87411207,1.0,1.0,1.0,1.0,0.0,0.0,0.0,20200220.0,20200520.0,20200413.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
972300,,,,,,,,,,,,,,,,,,,,,,StopPoint:OCECar TER-87144048,,,,,,Lépanges,,48.172680,6.672160,,,0,StopArea:OCE87144048,,,,,,,,,,,
972301,,,,,,,,,,,,,,,,,,,,,,StopPoint:OCECar TER-87317339,,,,,,Gare de Marquise Rinxent,,50.805649,1.729852,,,0,StopArea:OCE87317339,,,,,,,,,,,
972302,,,,,,,,,,,,,,,,,,,,,,StopPoint:OCECar TER-87317016,,,,,,Gare de Boulogne Tintelleries,,50.727250,1.609085,,,0,StopArea:OCE87317016,,,,,,,,,,,
972303,,,,,,,,,,,,,,,,,,,,,,StopPoint:OCECar TER-87342071,,,,,,Gare de Boisleux,,50.210403,2.771839,,,0,StopArea:OCE87342071,,,,,,,,,,,


## keep only the trains that leave Paris - Gare de l’est, in the morning before 10:00. Display, as a dataframe, their ids and departure time.

In [10]:
aa = result[(result['stop_name'] == "Gare de Paris-Est") & (result['departure_time'] <= "10:00:00") ]       
aa = aa[['trip_id', 'departure_time']]
aa

Unnamed: 0,trip_id,departure_time
46923,OCESN839133F0600638027,08:36:00
46924,OCESN839133F0600638027,08:36:00
46925,OCESN839133F0600638027,08:36:00
51952,OCESN839401F0400438176,07:42:00
51953,OCESN839401F0400438176,07:42:00
...,...,...
957737,OCESN839406F1101138188,09:16:00
957757,OCESN839404F0600638180,08:16:00
957761,OCESN839416F0400438218,07:46:00
958325,OCESN839101F0300337926,08:36:00
