In [32]:
import pandas as pd    
from os import walk

In [153]:
f1_drivers = {
    'Lewis Hamilton': ["Lewis", "Hamilton", "HAM"],
    'George Russell': ["George", "Russell", "RUS"],
    'Valtteri Bottas': ["Valtteri", "Bottas", "BOT"],
    "Guan Yu Zhou": ["Guan Yu", "Zhou", "ZHO"],
    'Max Verstappen': ["Max", "Verstappen", "VER"],
    'Sergio Perez': ["Sergio", "Perez", "PER", "Checo"],
    'Lando Norris': ["Lando", "Norris", "NOR"],
    'Oscar Piastri': ["Oscar", "Piastri", "PIA"],
    'Daniel Ricciardo': ["Daniel", "Ricciardo", "RIC", "Danny"],
    'Yuko Tsunoda': ["Yuki", "Tsunoda", "TSU"],
    'Carlos Sainz': ["Carlos", "Sainz", "SAI", ],
    'Charles Leclerc': ["Charles", "Leclerc", "LEC"],
    'Fernando Alonso': ["Fernando", "Alonso", "ALO", "Nando"],
    'Lance Stroll': ["Lance", "Stroll", "STR"],
    'Esteban Ocon': ["Esteban", "Ocon", "OCO"],
    'Pierre Gasly': ["Pierre", "Gasly", "GAS"],
    'Alex Albon': ["Alex", "Albon", "ALB"],
    'Logan Sargeant': ["Logan", "Sargeant", "SAR", ],
    'Niko Hulkenberg': ["Nico", "Hulkenberg", "HUL"],
    'Kevin Magnussen': ["Kevin", "Magnussen", "MAG"],
}
def preprocess_drivers(drivers):
    return {driver: {alias.lower() for alias in aliases} for driver, aliases in drivers.items()}

f1_drivers = preprocess_drivers(f1_drivers)
f1_drivers

{'Lewis Hamilton': {'ham', 'hamilton', 'lewis'},
 'George Russell': {'george', 'rus', 'russell'},
 'Valtteri Bottas': {'bot', 'bottas', 'valtteri'},
 'Guan Yu Zhou': {'guan yu', 'zho', 'zhou'},
 'Max Verstappen': {'max', 'ver', 'verstappen'},
 'Sergio Perez': {'checo', 'per', 'perez', 'sergio'},
 'Lando Norris': {'lando', 'nor', 'norris'},
 'Oscar Piastri': {'oscar', 'pia', 'piastri'},
 'Daniel Ricciardo': {'daniel', 'danny', 'ric', 'ricciardo'},
 'Yuko Tsunoda': {'tsu', 'tsunoda', 'yuki'},
 'Carlos Sainz': {'carlos', 'sai', 'sainz'},
 'Charles Leclerc': {'charles', 'lec', 'leclerc'},
 'Fernando Alonso': {'alo', 'alonso', 'fernando', 'nando'},
 'Lance Stroll': {'lance', 'str', 'stroll'},
 'Esteban Ocon': {'esteban', 'oco', 'ocon'},
 'Pierre Gasly': {'gas', 'gasly', 'pierre'},
 'Alex Albon': {'alb', 'albon', 'alex'},
 'Logan Sargeant': {'logan', 'sar', 'sargeant'},
 'Niko Hulkenberg': {'hul', 'hulkenberg', 'nico'},
 'Kevin Magnussen': {'kevin', 'mag', 'magnussen'}}

In [154]:
def read_multiple_jsons(jsons: list[str]) -> pd.DataFrame:
    return pd.concat([pd.read_json(json) for json in jsons])

def get_files_from_dirextory(directory: str) -> list[str]:
    return next(walk("2024_translated"), (None, None, []))[2]

In [155]:
directory = "2024_translated"
json_files = get_files_from_dirextory(directory)
json_files = [f"{directory}/{file}" for file in json_files]
dataset = read_multiple_jsons(json_files)

In [156]:
print(dataset.head(15))
print(dataset.size)

                                                 text
0   Fail of the day: Stroll. AKA Latifi & Magnusse...
1   This why Ricc career is over... we see again t...
2                                                0:19
3          Hall Cynthia Jackson Kimberly Lewis Sharon
4                                                   🔥
5                Lee Ronald Hall Brenda Moore Jeffrey
6   here after singapore gp 2024, oh how the table...
7            Where's the scream of the engine gone? ❤
8              Miller Angela Lewis Donald Davis Jason
9   I'm a Red Bull fan, so when the FIA says nothi...
10  Red Bull Racing's last grand prix dominance in...
11  Magnuson and Hulkenberg are so overrated , the...
12                 I rewatched this only for turn 1 😂
13  Enjoying how Magnusson is such a menace, that ...
14                 huge gap with norris, but now... 🙃
94753


In [157]:
dataset.dropna(inplace=True)
dataset.reset_index(drop=True, inplace=True)
dataset.head()
print(dataset.isnull().sum())
print(dataset.head(15))#
print(dataset.size)

text    0
dtype: int64
                                                 text
0   Fail of the day: Stroll. AKA Latifi & Magnusse...
1   This why Ricc career is over... we see again t...
2                                                0:19
3          Hall Cynthia Jackson Kimberly Lewis Sharon
4                                                   🔥
5                Lee Ronald Hall Brenda Moore Jeffrey
6   here after singapore gp 2024, oh how the table...
7            Where's the scream of the engine gone? ❤
8              Miller Angela Lewis Donald Davis Jason
9   I'm a Red Bull fan, so when the FIA says nothi...
10  Red Bull Racing's last grand prix dominance in...
11  Magnuson and Hulkenberg are so overrated , the...
12                 I rewatched this only for turn 1 😂
13  Enjoying how Magnusson is such a menace, that ...
14                 huge gap with norris, but now... 🙃
93793


In [133]:

def get_driver_names_without_spacy(comment, drivers):
    matched_drivers = list(set([driver for driver, aliases in drivers.items() for alias in aliases if alias in comment.lower()]))
    return matched_drivers

dataset['drivers'] = dataset.apply(lambda row: get_driver_names_without_spacy(row['text'], drivers) if row['contains_driver'] else list(), axis=1)

In [139]:
dataset.to_json("data/dataset_with_drivers.json", orient="records")
