### OCI Data Science - pull data
This notebook provides tools and techniques to pull required data for the project

##### Use fastF1 API 
* Pull Lap, weather, car, results, and position data 




#### This notebook uses formula1conda which is a custom conda
fastf1 package requires python>=3.8 and OCI prebuild packages at this time mostly come with python==3.7.

#### Steps to build and publish a custom conda:
1. create a yaml file and list your packages 
2. in the terminal execute: `odsc conda create -f environment.yaml -n my-conda-env`

In [1]:
path = '/home/datascience/WorkSpace/RedBull-Racining-TimeToPit/notebooks'
data_path = '../../RedBull/data/'

In [2]:
import os
os.chdir(path)
import pandas as pd
import logging
import json
import pickle
import requests
import numpy as np
import fastf1
import matplotlib.pyplot as plt
import pickle
fastf1.Cache.enable_cache(data_path)

In [3]:
schedule = fastf1.get_event_schedule(2022).to_dict()
schedule = pd.DataFrame.from_dict(schedule)
schedule.head(10)

Unnamed: 0,RoundNumber,Country,Location,OfficialEventName,EventDate,EventName,EventFormat,Session1,Session1Date,Session2,Session2Date,Session3,Session3Date,Session4,Session4Date,Session5,Session5Date,F1ApiSupport
0,0,Spain,Spain,FORMULA 1 PRE-SEASON TRACK SESSION 2022,2022-02-25 18:00:00,Pre-Season Track Session,testing,Practice 1,2022-02-23 09:00:00,Practice 2,2022-02-24 09:00:00,Practice 3,2022-02-25 09:00:00,,NaT,,NaT,False
1,0,Bahrain,Bahrain,FORMULA 1 ARAMCO PRE-SEASON TESTING 2022,2022-03-12 19:00:00,Pre-Season Test,testing,Practice 1,2022-03-10 10:00:00,Practice 2,2022-03-11 10:00:00,Practice 3,2022-03-12 10:00:00,,NaT,,NaT,True
2,1,Bahrain,Sakhir,FORMULA 1 GULF AIR BAHRAIN GRAND PRIX 2022,2022-03-20 20:00:00,Bahrain Grand Prix,conventional,Practice 1,2022-03-18 15:00:00,Practice 2,2022-03-18 18:00:00,Practice 3,2022-03-19 15:00:00,Qualifying,2022-03-19 18:00:00,Race,2022-03-20 18:00:00,True
3,2,Saudi Arabia,Jeddah,FORMULA 1 STC SAUDI ARABIAN GRAND PRIX 2022,2022-03-27 22:00:00,Saudi Arabian Grand Prix,conventional,Practice 1,2022-03-25 17:00:00,Practice 2,2022-03-25 20:00:00,Practice 3,2022-03-26 17:00:00,Qualifying,2022-03-26 20:00:00,Race,2022-03-27 20:00:00,True
4,3,Australia,Melbourne,FORMULA 1 HEINEKEN AUSTRALIAN GRAND PRIX 2022,2022-04-10 17:00:00,Australian Grand Prix,conventional,Practice 1,2022-04-08 13:00:00,Practice 2,2022-04-08 16:00:00,Practice 3,2022-04-09 13:00:00,Qualifying,2022-04-09 16:00:00,Race,2022-04-10 15:00:00,True
5,4,Italy,Imola,FORMULA 1 ROLEX GRAN PREMIO DEL MADE IN ITALY ...,2022-04-24 17:00:00,Emilia Romagna Grand Prix,sprint,Practice 1,2022-04-22 13:30:00,Qualifying,2022-04-22 17:00:00,Practice 2,2022-04-23 12:30:00,Sprint,2022-04-23 16:30:00,Race,2022-04-24 15:00:00,True
6,5,United States,Miami,FORMULA 1 CRYPTO.COM MIAMI GRAND PRIX 2022,2022-05-08 17:30:00,Miami Grand Prix,conventional,Practice 1,2022-05-06 14:30:00,Practice 2,2022-05-06 17:30:00,Practice 3,2022-05-07 13:00:00,Qualifying,2022-05-07 16:00:00,Race,2022-05-08 15:30:00,True
7,6,Spain,Barcelona,FORMULA 1 PIRELLI GRAN PREMIO DE ESPAÑA 2022,2022-05-22 17:00:00,Spanish Grand Prix,conventional,Practice 1,2022-05-20 14:00:00,Practice 2,2022-05-20 17:00:00,Practice 3,2022-05-21 13:00:00,Qualifying,2022-05-21 16:00:00,Race,2022-05-22 15:00:00,True
8,7,Monaco,Monaco,FORMULA 1 GRAND PRIX DE MONACO 2022,2022-05-29 17:00:00,Monaco Grand Prix,conventional,Practice 1,2022-05-27 14:00:00,Practice 2,2022-05-27 17:00:00,Practice 3,2022-05-28 13:00:00,Qualifying,2022-05-28 16:00:00,Race,2022-05-29 15:00:00,True
9,8,Azerbaijan,Baku,FORMULA 1 AZERBAIJAN GRAND PRIX 2022,2022-06-12 17:00:00,Azerbaijan Grand Prix,conventional,Practice 1,2022-06-10 15:00:00,Practice 2,2022-06-10 18:00:00,Practice 3,2022-06-11 15:00:00,Qualifying,2022-06-11 18:00:00,Race,2022-06-12 15:00:00,True


In [4]:
# EventName = list(schedule['EventName'][(schedule.index<13) & (schedule.index>1)])

In [4]:
def get_lap_data(session, schedule, evnt, ses, EventDate):
    '''get lap data from a session and
       retun as a dataframe
    '''
    lap = session.laps.to_dict()
    lap = pd.DataFrame.from_dict(lap)
    lap['EventName'] =evnt
    lap['country'] = schedule['Country'][schedule['EventName']==evnt].values[0]
    lap['session'] = ses
    lap['EventDate'] = schedule[EventDate][schedule['EventName']==evnt].values[0]
    return lap

In [5]:
def get_weather_data(session, schedule, evnt, ses,EventDate): 
    '''get weather data from a session and
       retun as a dataframe
    '''
    weather = session.weather_data.to_dict()
    weather = pd.DataFrame.from_dict(weather)
    weather['EventName'] =evnt
    weather['country'] = schedule['Country'][schedule['EventName']==evnt].values[0]
    weather['session'] = ses
    weather['EventDate'] = schedule[EventDate][schedule['EventName']==evnt].values[0]
    return weather

In [6]:
def get_car_data(session, schedule, evnt, ses,EventDate):
    '''get car_data from a session and
       retun as a dataframe
    '''
    for ii in session.car_data:
        car_data = session.car_data[ii].to_dict()
        car_data = pd.DataFrame.from_dict(car_data)
        car_data['driver'] = ii
    car_data['EventName'] =evnt
    car_data['country'] = schedule['Country'][schedule['EventName']==evnt].values[0]
    car_data['session'] = ses
    car_data['EventDate'] = schedule[EventDate][schedule['EventName']==evnt].values[0]
    return car_data

In [7]:
def get_position_data(session, schedule, evnt, ses, EventDate):
    '''get position_data from a session and
       retun as a dataframe
    '''    
    for ii in session.pos_data:
        position = session.pos_data[ii].to_dict()
        position = pd.DataFrame.from_dict(position)
        position['driver'] = ii
    position['EventName'] =evnt
    position['country'] = schedule['Country'][schedule['EventName']==evnt].values[0]
    position['session'] = ses
    position['EventDate'] = schedule[EventDate][schedule['EventName']==evnt].values[0]
    return position

In [9]:
def get_results(session, schedule, evnt, ses, EventDate):
    '''get results data from a session and
       retun as a dataframe
    '''
    result = session.results.to_dict()
    result = pd.DataFrame.from_dict(result).reset_index()
    result['EventName'] =evnt
    result['country'] = schedule['Country'][schedule['EventName']==evnt].values[0]
    result['session'] = ses
    result['EventDate'] = schedule[EventDate][schedule['EventName']==evnt].values[0]
    return result

In [10]:
%%time

sessionDateMap = {'Race': 'Session5Date',
          'Qualifying': 'Session4Date'}

for year in [2022, 2021, 2020, 2019]:
    
    laps =[]
    weathers =[]
    results = []
    car_data_ses =[]
    all_positions =[]
    
    sch = fastf1.get_event_schedule(year).to_dict()
    sch = pd.DataFrame.from_dict(sch)

#     schedule.drop(schedule[schedule['EventName'] == 'Pre-Season Test'].index, inplace = True)
    
    ## year 2022 doesn't have data after Jun
    if year = 2022:
        sch = sch[sch.index<12]
        
    EventName = [s for s in sch['EventName'] if "Grand" in s]
    
    for event in EventName:
        for session_type in ['Qualifying','Race']:
            
            eventDateColName = sessionDateMap[session_type]
            
            session = fastf1.get_session(year, event, session_type)
            session.load()
            
            ## get lap data for a session
            laps.append(get_lap_data(session, sch, event, 
                                     session_type, eventDateColName))
            
            ## get weather data for a session
            weathers.append(get_weather_data(session, sch, event, 
                                             session_type, eventDateColName))

            ## get car_data for a session
            car_data_ses.append(get_car_data(session, sch, event, 
                                             session_type, eventDateColName))
            
            ## get positions for a session
            all_positions.append(get_position_data(session, sch, event, 
                                                   session_type, eventDateColName))
            
            ## get results for a session
            results.append(get_results(session, sch, event, 
                                       session_type, eventDateColName))

    ## save all extracted session data in a year
    file = open(data_path+ 'laps_'+str(year)+'.pkl', 'wb')
    pickle.dump(laps,file)
    file = open(data_path+ 'weathers_'+str(year)+'.pkl', 'wb')
    pickle.dump(weathers,file)
    file = open(data_path+ 'results_'+str(year)+'.pkl', 'wb')
    pickle.dump(results,file)
    file = open(data_path+ 'car_data_ses_'+str(year)+'.pkl', 'wb')
    pickle.dump(car_data_ses,file)
    file = open(data_path+' all_positions_'+str(year)+'.pkl', 'wb')
    pickle.dump(all_positions,file)

core           INFO 	Loading data for Australian Grand Prix - Qualifying [v2.2.8]
api            INFO 	Using cached data for driver_info
api            INFO 	Using cached data for timing_data
api            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
api            INFO 	Using cached data for session_status_data
api            INFO 	Using cached data for track_status_data
api            INFO 	Using cached data for car_data
api            INFO 	Using cached data for position_data
api            INFO 	Using cached data for weather_data
api            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['44', '7', '5', '33', '3', '20', '8', '27', '55', '77', '14', '2', '11', '18', '31', '28', '9', '16', '35', '10']
core           INFO 	Loading data for Australian Grand Prix - Race [v2.2.8]
api            INFO 	No cached data found for driver_info. Loading data...
api            INFO 	Fe

UnboundLocalError: local variable 'car_data' referenced before assignment

## Concat all relevant files

In [32]:
files = os.listdir(data_path)
weather_files = [x for x in files if x.endswith(".pkl") if 'weathers' in x]
car_data_files = [x for x in files if x.endswith(".pkl") if 'car_data' in x]
results_files = [x for x in files if x.endswith(".pkl") if 'results' in x]
lap_files = [x for x in files if x.endswith(".pkl") if 'laps' in x]
position_files = [x for x in files if x.endswith(".pkl") if 'position' in x]

In [40]:
weather_data =[]
for fl in weather_files:
    file = open(data_path+fl,'rb')
    weather_data.extend(pickle.load(file))
weather_data = pd.concat(weather_data, axis=0)

In [45]:
weather_data.sample(10)

Unnamed: 0,Time,AirTemp,Humidity,Pressure,Rainfall,TrackTemp,WindDirection,WindSpeed,EventName,country,session,EventDate
128,0 days 02:08:41.402000,11.6,67.0,1011.0,False,17.7,88,0.5,Emilia Romagna Grand Prix,Italy,Race,2021-04-18 00:00:00
82,0 days 01:22:53.401000,12.2,90.0,998.0,False,13.9,258,1.7,Emilia Romagna Grand Prix,Italy,Qualifying,2022-04-24 17:00:00
45,0 days 00:45:14.438000,19.8,25.7,1008.2,False,26.1,155,1.0,United States Grand Prix,USA,Qualifying,2019-11-03 00:00:00
117,0 days 01:57:31.632000,14.9,89.3,942.6,True,19.3,61,1.0,Styrian Grand Prix,Austria,Qualifying,2020-07-12 00:00:00
68,0 days 01:08:29.520000,21.2,53.6,780.8,False,33.4,352,0.8,Mexican Grand Prix,Mexico,Qualifying,2019-10-27 00:00:00
19,0 days 00:19:27.775000,26.0,46.0,1014.4,False,40.0,293,2.5,Australian Grand Prix,Australia,Race,2022-04-10 17:00:00
73,0 days 01:13:24.078000,20.1,58.4,1006.1,False,33.3,51,1.2,British Grand Prix,UK,Race,2019-07-14 00:00:00
68,0 days 01:08:06.631000,14.0,80.1,1016.3,False,16.1,312,0.1,Russian Grand Prix,Russia,Qualifying,2021-09-26 00:00:00
33,0 days 00:33:10.951000,14.6,92.1,970.8,False,19.5,6,0.3,Belgian Grand Prix,Belgium,Qualifying,2021-08-29 00:00:00
7,0 days 00:07:04.017000,19.3,39.5,1005.4,False,38.9,353,0.4,Portuguese Grand Prix,Portugal,Race,2021-05-02 00:00:00


In [None]:
#     try:
#         ret = json.loads(response.text)
#     except Exception as e:
#         print(e, flush=True)