Add new variables to the dataset (**AI Technique**, **Sport**, **AI for Injury Risk**, **AI for Sporting Performance**)

In [10]:
import pandas as pd
import os

# Loading the CSV file with references
ref_abs = pd.read_csv(os.path.join('..','results','refs_abstracts_sys.csv'))
ref_abs.head()

Unnamed: 0,Author,Title,Abstract,Journal,Year
0,"López-Valenciano A, Ayala F, Puerta JM, et al",A preventive model for muscle injuries: a nove...,The application of contemporary statistical ap...,Medicine and science in sports and exercise,2018.0
1,Li C,Predict the neural network mathematical model ...,Deep learning has achieved impressive predicti...,Physical review letters,2020.0
2,Lu G,Evaluation model of young basketball players ’...,,,
3,Wu L,The participating team ’s technical analysis o...,,,
4,Zhang Q,Prediction based on basketball competition vid...,,,


In [11]:
# Define list of AI techniques based on Table 2 and sports for keyword search
ai_techniques = ["Absolute shrinkage and selection operator", "Artificial neural network",
                 "Bayesian logistic", "Bayesian networks", "Decision tree classifier",
                 "Fuzzy clustering", "K-means clustering", "K-nearest neighbor",
                 "Markov process", "Support vector machine",
                 "Support vector machine + decision tree classifier"]

sports = ["basketball", "soccer", "volleyball", "baseball", "handball", 
          "australian football", "ice hockey", "american football", 
          "cricket", "field hockey", "rugby"]

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Function to identify AI technique from text
def identify_ai_technique(text):
    for technique in ai_techniques:
        if technique.lower() in str(text).lower():
            return technique
    return "None"

# Function to identify sport from text
def identify_sport(text):
    for sport in sports:
        if sport in str(text).lower():
            return sport.capitalize()
    return "None"

# Function to identify if AI is used for injury risk prediction
def ai_injury(text):
    keywords = ["injury","injuries","medicine","risk"]
    for keyword in keywords:
        if keyword in str(text).lower():
            return "Yes"
    return "No"

# Function to identify if AI is used for sporting performance prediction
def ai_performance(text):
    keywords = ["performance", "technical", "tactical", "match"]
    for keyword in keywords:
        if keyword in str(text).lower():
            return "Yes"
    return "No"

# Define a function to find the most similar term using TF-IDF and cosine similarity

def find_most_similar(term, text, terms_list):

    # Combine the term and the text for TF-IDF vectorization

    combined_text = [term] + [text]

    

    # Vectorize the text

    vectorizer = TfidfVectorizer().fit(combined_text)

    vectorized_text = vectorizer.transform(combined_text)

    

    # Calculate cosine similarity

    similarity = cosine_similarity(vectorized_text[0:1], vectorized_text[1:])

    

    # Check if similarity is above a threshold, return the term if it is

    if similarity[0, 0] > 0.2:  # Threshold set to 0.2

        return term

    return "None"



# Function to identify AI technique using the most similar term from the list

def identify_ai_technique_tfidf(text):

    for technique in ai_techniques:

        most_similar = find_most_similar(technique, text, ai_techniques)

        if most_similar != "None":

            return most_similar

    return "None"



# Function to identify sport using the most similar term from the list

def identify_sport_tfidf(text):

    for sport in sports:

        most_similar = find_most_similar(sport, text, sports)

        if most_similar != "None":

            return most_similar.capitalize()

    return "None"

# Apply these functions to the dataset

# Apply these functions to the dataset
ref_abs["AI Technique"] = ref_abs.apply(lambda row: identify_ai_technique(row["Title"]) or identify_ai_technique(row["Abstract"]), axis=1)
ref_abs["Sport"] = ref_abs.apply(lambda row: identify_sport(row["Title"]) or identify_sport(row["Abstract"]), axis=1)
ref_abs["AI for Injury Risk"] = ref_abs.apply(lambda row: ai_injury(row["Title"]) or ai_injury(row["Abstract"]), axis=1)
ref_abs["AI for Performance"] = ref_abs.apply(lambda row: ai_performance(row["Title"]) or ai_performance(row["Abstract"]), axis=1)

ref_abs["AI Technique"] = ref_abs.apply(lambda row: identify_ai_technique_tfidf(row["Title"]) or identify_ai_technique_tfidf(row["Abstract"]), axis=1)

ref_abs["Sport"] = ref_abs.apply(lambda row: identify_sport_tfidf(row["Title"]) or identify_sport_tfidf(row["Abstract"]), axis=1)

# Display the updated dataset
ref_abs.head()

Unnamed: 0,Author,Title,Abstract,Journal,Year,AI Technique,Sport,AI for Injury Risk,AI for Performance
0,"López-Valenciano A, Ayala F, Puerta JM, et al",A preventive model for muscle injuries: a nove...,The application of contemporary statistical ap...,Medicine and science in sports and exercise,2018.0,,,Yes,No
1,Li C,Predict the neural network mathematical model ...,Deep learning has achieved impressive predicti...,Physical review letters,2020.0,,,No,No
2,Lu G,Evaluation model of young basketball players ’...,,,,,,No,No
3,Wu L,The participating team ’s technical analysis o...,,,,,,No,Yes
4,Zhang Q,Prediction based on basketball competition vid...,,,,,,No,No
