In [66]:
#Import sqlalchemy
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, inspect, distinct
from sqlalchemy import extract

In [2]:
#Import dependencies
import pandas as pd
import numpy as np

#Import PostgreSQL password
from config import postgre_pw

# Set up SQLalchemy

In [3]:
#Connect to the database
engine = create_engine(f'postgresql://postgres:{postgre_pw}@localhost:5432/F1 Data')

#Reflect the database
Base = automap_base()

#Reflect the tables
Base.prepare(engine, reflect=True)

In [4]:
#Save the table references
Circuits = Base.classes.circuits
Constructor_results = Base.classes.constructorResults
Constructor_standings = Base.classes.constructorStandings
Constructors = Base.classes.constructors
Driver_standings = Base.classes.driverStandings
Drivers = Base.classes.drivers
Lap_times = Base.classes.lapTimes
Pit_stops = Base.classes.pitStops
Qualifying = Base.classes.qualifying
Races = Base.classes.races
Results = Base.classes.results
Seasons = Base.classes.seasons
Sprint_results = Base.classes.sprint_results
Status = Base.classes.status

In [5]:
#Create a session
session = Session(engine)

# Get 'all' mechanical failure results

In [6]:
#Create list of mechanical failures manually identified by statusId
mech_fail_id = [5, 6, 7, 8, 9, 10, 21, 22, 23, 24, 25, 26, 28, 30, 32, 34, 36, 37, 38, 39, 40, 42, 43, 44, 47, 48, 49, 51, 56, 129, 63, 66, 69, 70, 71, 72, 74, 75 ,76, 79, 80, 83, 84, 85, 86, 87, 91, 94, 95, 98, 99, 101, 102, 103, 105, 106, 108, 109, 110, 126, 131, 132, 135, 136, 140, 141]

In [7]:
#Create dataframe by filtering Results table for mechanical failures
mech_failures_df_raw = pd.read_sql(session.query(Results).filter(Results.statusId.in_(mech_fail_id)).statement, session.bind)

## Add matching data from other tables and make it pretty

In [8]:
#Clean mech_failures_df by dropping unnecessary columns
mech_failures_df = mech_failures_df_raw.drop(columns=['number', 'grid', 'points', 'laps', 'time', 'milliseconds', 'fastestLap', 'rank', 'fastestLapTime', 'fastedLapSpeed'])

### Add status description

In [9]:
#Append status description to mech_failures_df
mech_failures_df = mech_failures_df.merge(pd.read_sql(session.query(Status).statement, session.bind), on='statusId')

### Add driver names

In [10]:
#Add driver forename and surname
mech_failures_df = mech_failures_df.merge(pd.read_sql(session.query(Drivers).statement, session.bind), on='driverId')

In [11]:
#Couldn't find how to merge on specific columns, so I had to drop the unnecessary columns
mech_failures_df = mech_failures_df.drop(columns=['driverRef', 'number', 'code', 'dob', 'nationality', 'url'])

In [12]:
#Merge forename and surname into one column
mech_failures_df['driver_name'] = mech_failures_df['forename'] + ' ' + mech_failures_df['surname']

### Add constructor names

In [13]:
#Add constructor name
mech_failures_df = mech_failures_df.merge(pd.read_sql(session.query(Constructors).statement, session.bind), on='constructorId')

In [14]:
#Couldn't find how to merge on specific columns, so I had to drop the unnecessary columns
mech_failures_df = mech_failures_df.drop(columns=['constructorRef', 'nationality', 'url'])

In [15]:
#Rename constructor name column
mech_failures_df = mech_failures_df.rename(columns={'name': 'constructor_name'})

### Add race data

In [16]:
#year,round,circuitId,name,date

#Add race data
mech_failures_df = mech_failures_df.merge(pd.read_sql(session.query(Races).statement, session.bind), on=['raceId'])

In [17]:
#Couldn't find how to merge on specific columns, so I had to drop the unnecessary columns
mech_failures_df = mech_failures_df.drop(columns=['round', 'time', 'url', 'fp1_date', 'fp1_time', 'fp2_date', 'fp2_time', 'fp3_date', 'fp3_time', 'quali_date', 'quali_time', 'sprint_date', 'sprint_time'])

In [18]:
#Rename race columns
mech_failures_df = mech_failures_df.rename(columns={'year': 'race_year', 'name': 'race_name', 'date': 'race_date'})

# Final df

In [19]:
#Reorder columns
mech_failures_df = mech_failures_df[['resultId', 'raceId', 'race_year', 'circuitId', 'race_name', 'race_date', 'driverId', 'forename', 'surname', 'driver_name', 'position', 'positionText', 'positionOrder', 'constructorId', 'constructor_name', 'statusId', 'status']]

In [20]:
mech_failures_df

Unnamed: 0,resultId,raceId,race_year,circuitId,race_name,race_date,driverId,forename,surname,driver_name,position,positionText,positionOrder,constructorId,constructor_name,statusId,status
0,7,18,2008,1,Australian Grand Prix,2008-03-16,7,Sébastien,Bourdais,Sébastien Bourdais,7.0,7,7,5,Toro Rosso,5,Engine
1,8,18,2008,1,Australian Grand Prix,2008-03-16,8,Kimi,Räikkönen,Kimi Räikkönen,8.0,8,8,6,Ferrari,5,Engine
2,13,18,2008,1,Australian Grand Prix,2008-03-16,13,Felipe,Massa,Felipe Massa,,R,13,6,Ferrari,5,Engine
3,12,18,2008,1,Australian Grand Prix,2008-03-16,12,Nelson,Piquet Jr.,Nelson Piquet Jr.,,R,12,4,Renault,8,Clutch
4,16,18,2008,1,Australian Grand Prix,2008-03-16,16,Adrian,Sutil,Adrian Sutil,,R,16,10,Force India,9,Hydraulics
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6296,24965,1047,2020,24,Abu Dhabi Grand Prix,2020-12-13,815,Sergio,Pérez,Sergio Pérez,,R,20,211,Racing Point,7,Transmission
6297,24625,1030,2019,24,Abu Dhabi Grand Prix,2019-12-01,840,Lance,Stroll,Lance Stroll,,R,20,211,Racing Point,23,Brakes
6298,24685,1033,2020,11,Hungarian Grand Prix,2020-07-19,842,Pierre,Gasly,Pierre Gasly,,R,20,213,AlphaTauri,5,Engine
6299,25045,1055,2021,4,Spanish Grand Prix,2021-05-09,852,Yuki,Tsunoda,Yuki Tsunoda,,R,20,213,AlphaTauri,10,Electrical


In [21]:
#Export dataframe to csv
mech_failures_df.to_csv('mech_failures.csv', index=False)

# Order results by years and drivers

In [22]:
#Doing this helps the process of visualizing the data in Tableau

## Order by years

In [54]:
#Group by count of statusId per year
mech_failures_per_year = mech_failures_df.groupby(mech_failures_df['race_year']).count()['statusId']

In [58]:
#Create dataframe from groupby
mech_failures_per_year_df = pd.DataFrame(mech_failures_per_year)
mech_failures_per_year_df = mech_failures_per_year_df.rename(columns={'statusId': 'number_of_mechanical_failures'})

### Add column for number of races per year

In [60]:
#Add column for number of races each year
races_per_year = pd.read_sql(session.query(func.count(Races.raceId).label('number_of_races'), (Races.year).label('race_year')).group_by(Races.year).statement, session.bind)

#For some reason this statement returned only NaN values. Might be a cache issue, but restarting everything didn't help
#mech_failures_per_year_df['number_of_races'] = pd.read_sql(session.query(func.count(Races.raceId)).group_by(Races.year).order_by(Races.year).statement, session.bind)

In [61]:
#Merge races_per_year with mech_failures_per_year_df
mech_failures_per_year_df = mech_failures_per_year_df.merge(races_per_year, on=['race_year'])

In [63]:
#Add column for avgerage number of mechanical failures per race per year
#Divide number of mechanical failures by number of races
mech_failures_per_year_df['average_mechanical_failure_per_race'] = mech_failures_per_year_df['number_of_mechanical_failures'] / mech_failures_per_year_df['number_of_races']

### Add average number of drivers per race per year

In [83]:
#In order to connect driverId with raceId with race_year, it has to go through the results table

In [75]:
#Create dataframe from drivers table to merge with results table
drivers_df = pd.read_sql(session.query(Drivers.driverId).statement, session.bind)

In [81]:
#Merge drivers_df with results table
drivers_df = drivers_df.merge(pd.read_sql(session.query(Results.raceId, Results.driverId).statement, session.bind), on='driverId')

In [91]:
#Merge with races table
drivers_df = drivers_df.merge(pd.read_sql(session.query((Races.year).label('race_year'), Races.raceId).statement, session.bind), on='raceId')

In [94]:
#Create new df with distinct driverId per year
drivers_per_year = pd.DataFrame(drivers_df.groupby(drivers_df['race_year']).nunique()['driverId'])

In [100]:
#Rename column
drivers_per_year = drivers_per_year.rename(columns={'driverId': 'number_of_drivers'})

In [101]:
#Merge with mech_failures_per_year_df
mech_failures_per_year_df = mech_failures_per_year_df.merge(drivers_per_year, on=['race_year'])

In [105]:
#Add column for average number of drivers per race per year
mech_failures_per_year_df['average_drivers_per_race'] = mech_failures_per_year_df['number_of_drivers'] / mech_failures_per_year_df['number_of_races']

### Final df

In [106]:
mech_failures_per_year_df

Unnamed: 0,race_year,number_of_mechanical_failures,number_of_races,average_mechanical_failure_per_race,number_of_drivers,average_drivers_per_race
0,1950,59,7,8.428571,81,11.571429
1,1951,70,8,8.750000,84,10.500000
2,1952,66,8,8.250000,105,13.125000
3,1953,92,9,10.222222,108,12.000000
4,1954,86,9,9.555556,97,10.777778
...,...,...,...,...,...,...
68,2018,45,21,2.142857,20,0.952381
69,2019,28,21,1.333333,20,0.952381
70,2020,28,17,1.647059,23,1.352941
71,2021,21,22,0.954545,21,0.954545


In [65]:
#Export dataframe to csv
mech_failures_per_year_df.to_csv('mech_failures_per_year.csv', index=False)