In [27]:
import sys

# setting path
sys.path.append('../')

import livef1
import pandas as pd

In [None]:
import fastf1

session = fastf1.get_session(2019, 'Monza', 'Q')

In [263]:
from rapidfuzz import fuzz
fuzz.ratio("spa", "spain"), fuzz.ratio("spa", "spa francochamps")

(75.0, 31.57894736842105)

In [29]:
season2021 = livef1.get_season(2021)
season2023 = livef1.get_season(2023)
season2024 = livef1.get_season(2024)

df = season.meetings_table

df = pd.concat([
    season2021.meetings_table,
    season2023.meetings_table,
    season2024.meetings_table
])

In [None]:
from string import punctuation
import re
# re.split(punctuation, "spa-franc sao")
re.split(rf'[\s{punctuation}]+', "spa-franc sao")

['spa', 'franc', 'sao']

In [None]:
from difflib import SequenceMatcher
import pandas as pd
from jellyfish import jaro_similarity, jaro_winkler_similarity

from string import punctuation
import re

stopwords = [
    "formula",
    "1",
    "grand",
    "prix"
]

def identifer_text_format(text):
    querywords = re.split(rf'[\s{punctuation}]+', text.casefold())
    return [word for word in querywords if word not in stopwords]

def find_most_similar_vectorized(df, target):
    """
    Find the most similar string in a Pandas DataFrame using SequenceMatcher.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame to search in.
    target : str
        The string to search for.

    Returns
    -------
    dict
        A dictionary containing the most similar value, its similarity ratio,
        and its location (row and column).
    """

    def jaccard_similarity(cell):
        """ returns the jaccard similarity between two lists """
        intersection_cardinality = len(
            set.intersection(
                *[
                    set(identifer_text_format(target)),
                    set(identifer_text_format(cell))
                ]
            )
        )
        union_cardinality = len(
            set.union(
                *[
                    set(identifer_text_format(target)),
                    set(identifer_text_format(cell))
                ]
            )
        )
        return intersection_cardinality/float(union_cardinality)

    def jarow_similarity(cell):
        # return SequenceMatcher(None, target, str(cell)).ratio()
        return jaro_winkler_similarity(
            " ".join(identifer_text_format(target)),
            " ".join(identifer_text_format(cell))
            )

    def argmax_n(arr: np.array, n: int, axis=None):
        argmaxes = []
        for _ in range(n):
            row, col = divmod(arr.argmax(), arr.shape[1])
            argmaxes.append(row)
            # arr = np.delete(arr, row, axis=0)
            arr[row,:] = 0
            # print(row, col)
        return argmaxes

    print("..:: Search started.")
    similarity_df = df.map(jaccard_similarity)
    jaccard_score = similarity_df.max().max()
    row, col = divmod(similarity_df.values.argmax(), similarity_df.shape[1])
    most_similar = df.iloc[row, col]


    if jaccard_score:
        print("..:: Found.")
        print("..:: Most similar:", most_similar)
        print("..:: Row:", row)
        print("..:: Column:", df.columns[col])

        return {
            "isFound": 1,
            "how" : "jaccard",
            "value": most_similar,
            "similarity": max_similarity,
            "row": row,
            "column": df.columns[col]
        }
    else:
        print("..:: Couldn't find.")
        jaro_df = df.map(jarow_similarity)
        jaro_score = jaro_df.max().max()

        if jaro_score >= 0.9:
            row, col = divmod(jaro_df.values.argmax(), jaro_df.shape[1])
            most_similar = df.iloc[row, col]

            return {
                "isFound": 1,
                "how" : "jaro",
                "value": most_similar,
                "similarity": max_similarity,
                "row": row,
                "column": df.columns[col]
            }

        else:
            possible_df = df.iloc[argmax_n(jaro_df.values, 3, axis=1)]
            print("The searched query not found in the meetings table. Did you mean one of these :")
            for idx, prow in possible_df.iterrows():
                print()
                print("Meeting Official Name :", prow.meeting_offname)
                print("Meeting Name :", prow.meeting_name)
                print("Meeting Circuit Shortname :", prow.meeting_circuit_shortname)
                print("> Suggested search queries :",identifer_text_format(prow.meeting_name) + identifer_text_format(prow.meeting_circuit_shortname))
            print("============================================")
            
            return {
                "isFound": 0,
                "how": None,
                "value": None,
                "similarity": None,
                "row": None,
                "column": None
            }



# df_main = df.reset_index()
# df_main.index = df_main.meeting_name.values
# df_main = df_main[["meeting_offname","meeting_name","meeting_circuit_shortname"]].drop_duplicates()

df_main = df[["meeting_offname","meeting_name","meeting_circuit_shortname"]]
df_main = season2021.meetings_table[["meeting_code","meeting_offname","meeting_name","meeting_circuit_shortname"]].reset_index(drop=True).drop_duplicates().set_index("meeting_code")

# Search for the most similar string
target_string = "emmilian"
result = find_most_similar_vectorized(df_main, target_string)
print(f"Most similar value: {result['value']} (Similarity: {result['similarity'] * 100:.2f}%)")
print(f"Found at row {result['row']} in column '{result['column']}'")
row = df_main.reset_index().iloc[result["row"]]
print(row.meeting_code)
row.meeting_offname, row.meeting_name, row.meeting_circuit_shortname

..:: Search started.
..:: Couldn't find.
0.8333333333333333
The searched query not found in the meetings table. Did you mean one of these :

Meeting Official Name : FORMULA 1 PIRELLI GRAN PREMIO DEL MADE IN ITALY E DELL'EMILIA ROMAGNA 2021
Meeting Name : Emilia Romagna Grand Prix
Meeting Circuit Shortname : Imola
> Suggested search queries : ['emilia', 'romagna', 'imola']

Meeting Official Name : FORMULA 1 HEINEKEN GRAN PREMIO D’ITALIA 2021
Meeting Name : Italian Grand Prix
Meeting Circuit Shortname : Monza
> Suggested search queries : ['italian', 'monza']

Meeting Official Name : FORMULA 1 ROLEX BELGIAN GRAND PRIX 2021
Meeting Name : Belgian Grand Prix
Meeting Circuit Shortname : Spa-Francorchamps
> Suggested search queries : ['belgian', 'spa', 'francorchamps']


TypeError: unsupported operand type(s) for *: 'NoneType' and 'int'

In [291]:
season2023.meetings_table[["meeting_code","meeting_offname","meeting_name","meeting_circuit_shortname"]].reset_index(drop=True).drop_duplicates().set_index("meeting_code").sort_values("meeting_name")

Unnamed: 0_level_0,meeting_offname,meeting_name,meeting_circuit_shortname
meeting_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
UAE0103,FORMULA 1 ETIHAD AIRWAYS ABU DHABI GRAND PRIX ...,Abu Dhabi Grand Prix,Yas Marina Circuit
AUS0204,FORMULA 1 ROLEX AUSTRALIAN GRAND PRIX 2023,Australian Grand Prix,Melbourne
AUT0108,FORMULA 1 ROLEX GROSSER PREIS VON ÖSTERREICH 2023,Austrian Grand Prix,Spielberg
AZE02,FORMULA 1 AZERBAIJAN GRAND PRIX 2023,Azerbaijan Grand Prix,Baku
BRN0104,FORMULA 1 GULF AIR BAHRAIN GRAND PRIX 2023,Bahrain Grand Prix,Sakhir
BEL02012,FORMULA 1 MSC CRUISES BELGIAN GRAND PRIX 2023,Belgian Grand Prix,Spa-Francorchamps
GBR0114,FORMULA 1 ARAMCO BRITISH GRAND PRIX 2023,British Grand Prix,Silverstone
CAN0108,FORMULA 1 PIRELLI GRAND PRIX DU CANADA 2023,Canadian Grand Prix,Montreal
NED0105,FORMULA 1 HEINEKEN DUTCH GRAND PRIX 2023,Dutch Grand Prix,Zandvoort
HUN0109,FORMULA 1 QATAR AIRWAYS HUNGARIAN GRAND PRIX 2023,Hungarian Grand Prix,Hungaroring


In [288]:
df[["meeting_offname","meeting_name","meeting_circuit_shortname"]].reset_index(drop=True).drop_duplicates().sort_values("meeting_name")

Unnamed: 0,meeting_offname,meeting_name,meeting_circuit_shortname
106,FORMULA 1 ETIHAD AIRWAYS ABU DHABI GRAND PRIX ...,Abu Dhabi Grand Prix,Yas Marina Circuit
222,FORMULA 1 ETIHAD AIRWAYS ABU DHABI GRAND PRIX ...,Abu Dhabi Grand Prix,Yas Marina Circuit
124,FORMULA 1 ROLEX AUSTRALIAN GRAND PRIX 2023,Australian Grand Prix,Melbourne
40,FORMULA 1 BWT GROSSER PREIS VON ÖSTERREICH 2021,Austrian Grand Prix,Spielberg
232,FORMULA 1 QATAR AIRWAYS AUSTRIAN GRAND PRIX 2024,Austrian Grand Prix,Spielberg
154,FORMULA 1 ROLEX GROSSER PREIS VON ÖSTERREICH 2023,Austrian Grand Prix,Spielberg
262,FORMULA 1 QATAR AIRWAYS AZERBAIJAN GRAND PRIX ...,Azerbaijan Grand Prix,Baku
129,FORMULA 1 AZERBAIJAN GRAND PRIX 2023,Azerbaijan Grand Prix,Baku
25,FORMULA 1 AZERBAIJAN GRAND PRIX 2021,Azerbaijan Grand Prix,Baku
0,FORMULA 1 GULF AIR BAHRAIN GRAND PRIX 2021,Bahrain Grand Prix,Sakhir


In [265]:
matrix_words = []
for idx, row in df_main.iterrows():
    words = []
    for i in row:
        words += i.split(" ")
    matrix_words += [words]

matrix_words

def addTwo(i):
    return i+2
   
import numpy as np
applyall = np.vectorize(addTwo)
res = applyall(matrix_words)
res

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (58,) + inhomogeneous part.

In [213]:
string = "Spa-Francorchamps"
string = "Spielberg"

SequenceMatcher(None, "belgium", string).ratio()

import jellyfish

jellyfish.jaro_similarity("sao paulo", 'São Paulo'.casefold()), jellyfish.jaro_similarity("sao paulo", 'Monte Carlo'.casefold())


(0.9259259259259259, 0.6700336700336701)

In [44]:
df.reset_index().groupby("meeting_name").first()

Unnamed: 0_level_0,season_year,meeting_location,session_type,meeting_code,meeting_key,meeting_number,meeting_offname,meeting_country_key,meeting_country_code,meeting_country_name,meeting_circuit_key,meeting_circuit_shortname,session_key,session_name,session_startDate,session_endDate,gmtoffset,path
meeting_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Abu Dhabi Grand Prix,2021,Yas Island,Practice 1,UAE0103,1107,22,FORMULA 1 ETIHAD AIRWAYS ABU DHABI GRAND PRIX ...,21,UAE,United Arab Emirates,70,Yas Marina Circuit,7165,Practice 1,2021-12-10T13:30:00,2021-12-10T14:30:00,04:00:00,2021/2021-12-12_Abu_Dhabi_Grand_Prix/2021-12-1...
Australian Grand Prix,2023,Melbourne,Practice 1,AUS0204,1143,3,FORMULA 1 ROLEX AUSTRALIAN GRAND PRIX 2023,5,AUS,Australia,10,Melbourne,7780,Practice 1,2023-03-31T12:30:00,2023-03-31T13:30:00,11:00:00,2023/2023-04-02_Australian_Grand_Prix/2023-03-...
Austrian Grand Prix,2021,Spielberg,Practice 1,AUT0108,1071,9,FORMULA 1 BWT GROSSER PREIS VON ÖSTERREICH 2021,17,AUT,Austria,19,Spielberg,6245,Practice 1,2021-07-02T11:30:00,2021-07-02T12:30:00,02:00:00,2021/2021-07-04_Austrian_Grand_Prix/2021-07-02...
Azerbaijan Grand Prix,2021,Baku,Practice 1,AZE02,1068,6,FORMULA 1 AZERBAIJAN GRAND PRIX 2021,30,AZE,Azerbaijan,144,Baku,6230,Practice 1,2021-06-04T12:30:00,2021-06-04T13:30:00,04:00:00,2021/2021-06-06_Azerbaijan_Grand_Prix/2021-06-...
Bahrain Grand Prix,2021,Sakhir,Practice 1,BRN0104,1064,1,FORMULA 1 GULF AIR BAHRAIN GRAND PRIX 2021,36,BRN,Bahrain,63,Sakhir,6210,Practice 1,2021-03-26T14:30:00,2021-03-26T15:30:00,03:00:00,2021/2021-03-28_Bahrain_Grand_Prix/2021-03-26_...
Belgian Grand Prix,2021,Spa-Francorchamps,Practice 1,BEL02012,1074,12,FORMULA 1 ROLEX BELGIAN GRAND PRIX 2021,16,BEL,Belgium,7,Spa-Francorchamps,6260,Practice 1,2021-08-27T11:30:00,2021-08-27T12:30:00,02:00:00,2021/2021-08-29_Belgian_Grand_Prix/2021-08-27_...
British Grand Prix,2021,Silverstone,Practice 1,GBR0114,1072,10,FORMULA 1 PIRELLI BRITISH GRAND PRIX 2021,2,GBR,Great Britain,2,Silverstone,6250,Practice 1,2021-07-16T14:30:00,2021-07-16T15:30:00,01:00:00,2021/2021-07-18_British_Grand_Prix/2021-07-16_...
Canadian Grand Prix,2023,Montréal,Practice 1,CAN0108,1212,9,FORMULA 1 PIRELLI GRAND PRIX DU CANADA 2023,46,CAN,Canada,23,Montreal,9103,Practice 1,2023-06-16T13:30:00,2023-06-16T14:30:00,-04:00:00,2023/2023-06-18_Canadian_Grand_Prix/2023-06-16...
Dutch Grand Prix,2021,Zandvoort,Practice 1,NED0105,1075,13,FORMULA 1 HEINEKEN DUTCH GRAND PRIX 2021,133,NED,Netherlands,55,Zandvoort,6265,Practice 1,2021-09-03T11:30:00,2021-09-03T12:30:00,02:00:00,2021/2021-09-05_Dutch_Grand_Prix/2021-09-03_Pr...
Emilia Romagna Grand Prix,2021,Imola,Practice 1,ITA0110,1065,2,FORMULA 1 PIRELLI GRAN PREMIO DEL MADE IN ITAL...,13,ITA,Italy,6,Imola,6215,Practice 1,2021-04-16T11:00:00,2021-04-16T12:00:00,02:00:00,2021/2021-04-18_Emilia_Romagna_Grand_Prix/2021...


In [30]:
df.dtypes

meeting_code                 object
meeting_key                   int64
meeting_number                int64
meeting_offname              object
meeting_name                 object
meeting_country_key           int64
meeting_country_code         object
meeting_country_name         object
meeting_circuit_key           int64
meeting_circuit_shortname    object
session_key                   int64
session_name                 object
session_startDate            object
session_endDate              object
gmtoffset                    object
path                         object
dtype: object

In [31]:
for col in df.columns:
    print(df[col].unique())
    print(len(df[col].unique()))


['BRN0104' 'ITA0110' 'POR0401' 'ESP0111' 'MON0112' 'AZE02' 'FRA0603'
 'AUT0108' 'GBR0114' 'HUN0109' 'BEL02012' 'NED0105' 'ITA0311' 'RUS0101'
 'TUR0104' 'USA0101' 'MEX0103' 'BRA0108' 'QAT0101' 'KSA0101' 'UAE0103'
 'AUS0204' 'USA1401' 'ESP0112' 'CAN0108' 'SIN0107' 'JPN0306' 'QAT0102'
 'BRA0109' 'USA1501']
30
[1064 1065 1066 1086 1067 1068 1070 1092 1071 1072 1073 1074 1075 1076
 1077 1078 1102 1103 1104 1105 1106 1107 1140 1141 1142 1143 1207 1208
 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223
 1224 1225 1226 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248
 1249 1250]
58
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]
23
['FORMULA 1 GULF AIR BAHRAIN GRAND PRIX 2021'
 "FORMULA 1 PIRELLI GRAN PREMIO DEL MADE IN ITALY E DELL'EMILIA ROMAGNA 2021"
 'FORMULA 1 HEINEKEN GRANDE PRÉMIO DE PORTUGAL 2021'
 'FORMULA 1 ARAMCO GRAN PREMIO DE ESPAÑA 2021'
 'FORMULA 1 GRAND PRIX DE MONACO 2021'
 'FORMULA 1 AZERBAIJAN GRAND PRIX 2021'
 'FORMULA 1 EMIR