In [3]:
import pandas as pd    

In [31]:
f1_drivers = {
    'Lewis Hamilton': ["Lewis", "Hamilton", "HAM"],
    'George Russell': ["George", "Russell", "RUS"],
    'Valtteri Bottas': ["Valtteri", "Bottas", "BOT"],
    "Guan Yu Zhou": ["Guan Yu", "Zhou", "ZHO"],
    'Max Verstappen': ["Max", "Verstappen", "VER"],
    'Sergio Perez': ["Sergio", "Perez", "PER", "Checo"],
    'Lando Norris': ["Lando", "Norris", "NOR", ],
    'Oscar Piastri': ["Oscar", "Piastri", "PIA"],
    'Daniel Ricciardo': ["Daniel", "Ricciardo", "RIC", "Danny", "Ric"],
    'Yuko Tsunoda': ["Yuki", "Tsunoda", "TSU"],
    'Carlos Sainz': ["Carlos", "Sainz", "SAI", ],
    'Charles Leclerc': ["Charles", "Leclerc", "LEC"],
    'Fernando Alonso': ["Fernando", "Alonso", "ALO", "Nando"],
    'Lance Stroll': ["Lance", "Stroll", "STR"],
    'Esteban Ocon': ["Esteban", "Ocon", "OCO"],
    'Pierre Gasly': ["Pierre", "Gasly", "GAS"],
    'Alex Albon': ["Alex", "Albon", "ALB"],
    'Logan Sargeant': ["Logan", "Sargeant", "SAR", ],
    'Niko Hulkenberg': ["Nico", "Hulkenberg", "HUL"],
    'Kevin Magnussen': ["Kevin", "Magnussen", "MAG"],
}

In [5]:
dataset = pd.read_json("2024_translated/azerbaijan_grand_prix.json")

In [13]:
dataset.head()
print(dataset.isnull().sum())

text    0
dtype: int64


In [15]:
dataset.dropna(inplace=True)
dataset.reset_index(drop=True, inplace=True)
dataset.head()
print(dataset.isnull().sum())

text    0
dtype: int64


In [19]:
def check_driver_in_comment(comment, drivers):
    for driver in drivers:
        if any(word in comment for word in driver.split(" ")):
            return True
    return False

dataset['contains_driver'] = dataset['text'].apply(lambda x: check_driver_in_comment(x, f1_drivers))
print(dataset[['text', 'contains_driver']])

                                                   text  contains_driver
0                                        Kelley Islands            False
1       The only reason Hamilton dominated was the car.             True
2     Should trade Lewis to McLaren and get Lando ðŸ˜® ...             True
3           I feel like this is Carlos' first crash.ðŸ˜­ðŸ˜­ðŸ˜­             True
4                                            Terry Dale            False
...                                                 ...              ...
7421                                               Nope            False
7422                                              Drink            False
7423  A foolish drive again from Crashloss...  just ...            False
7424                                  What do you mean?            False
7425                                             Urge s            False

[7426 rows x 2 columns]


In [20]:
print(dataset['contains_driver'].value_counts())

contains_driver
False    4418
True     3008
Name: count, dtype: int64


In [51]:
import spacy


# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Define a function to find all matching driver names in a comment
def get_driver_names(comment, drivers=f1_drivers):
    doc = nlp(comment)
    matched_drivers = []
    for ent in doc.ents:
        for driver, aliases in drivers.items():
            if ent.text.lower() in [alias.lower() for alias in aliases]:
                matched_drivers.append(driver)
    return set(matched_drivers)

# Apply the function to your dataset
dataset['drivers'] = dataset['text'].apply(lambda x: get_driver_names(x))

# Print the dataset to verify the results
print(dataset[['text', 'drivers']])

                                                   text  \
0                                        Kelley Islands   
1       The only reason Hamilton dominated was the car.   
2     Should trade Lewis to McLaren and get Lando ðŸ˜® ...   
3           I feel like this is Carlos' first crash.ðŸ˜­ðŸ˜­ðŸ˜­   
4                                            Terry Dale   
...                                                 ...   
7421                                               Nope   
7422                                              Drink   
7423  A foolish drive again from Crashloss...  just ...   
7424                                  What do you mean?   
7425                                             Urge s   

                             drivers  
0                                 {}  
1                   {Lewis Hamilton}  
2     {Lewis Hamilton, Lando Norris}  
3                     {Carlos Sainz}  
4                                 {}  
...                              ...  
7421  

In [52]:
print(dataset.drivers.value_counts())
print(dataset.drivers.value_counts().sum() - 4790)

drivers
{}                                                                             4790
{Sergio Perez}                                                                  458
{Oscar Piastri}                                                                 362
{Carlos Sainz, Sergio Perez}                                                    356
{Carlos Sainz}                                                                  353
                                                                               ... 
{Logan Sargeant, Lando Norris, Oscar Piastri, Charles Leclerc}                    1
{Daniel Ricciardo, Oscar Piastri, Charles Leclerc}                                1
{Carlos Sainz, Lance Stroll, Yuko Tsunoda}                                        1
{Lando Norris, Carlos Sainz, George Russell, Lewis Hamilton, Oscar Piastri}       1
{Lewis Hamilton, Pierre Gasly}                                                    1
Name: count, Length: 132, dtype: int64
2636


In [53]:
dataset.head(10)

Unnamed: 0,text,contains_driver,drivers
0,Kelley Islands,False,{}
1,The only reason Hamilton dominated was the car.,True,{Lewis Hamilton}
2,Should trade Lewis to McLaren and get Lando ðŸ˜® ...,True,"{Lewis Hamilton, Lando Norris}"
3,I feel like this is Carlos' first crash.ðŸ˜­ðŸ˜­ðŸ˜­,True,{Carlos Sainz}
4,Terry Dale,False,{}
5,Zander Parkways,False,{}
6,Thomas George Jones Barbara Wilson Kimberly,True,{}
7,Graham Station,False,{}
8,"I race for Ferrari, F1 2023 so much fun, it's ...",False,{}
9,6:09 Accidents happen in milliseconds.,False,{}


In [58]:
# Define the set of drivers to filter
target_drivers = {"Lando Norris", "Carlos Sainz", "George Russell", "Lewis Hamilton", "Oscar Piastri"}

# Filter the dataset using where
filtered_dataset = dataset.where(dataset['drivers'].apply(lambda x: target_drivers == x)).dropna()

# Print the filtered dataset
print(filtered_dataset.text.values)

["Excellent victory by Piastri, he did very well for me as the driver of the day!! Percival once again failed to convert the pole position into a victory, but he finished in P2, that's fine. Russell was there at the right time and won P3!! Vespa and Hamilton's equipment didn't help today, it was a race well below theirs. Norris silenced the critics, good recovery!! The badass P6 had a lonely race but managed to get good points for Aston and even finished ahead of a Mercedes. The Williams were very good, great race by the duo and Bearman was also very good!!! Sainz and Checo Perez for me it was a racing incident, they were both having a good race and unfortunately had this misfortune. The race was very good, I liked it."]
