Add new variables to the dataset (**AI Technique**, **Sport**, **AI for Injury Risk**, **AI for Sporting Performance**)

In [1]:
import pandas as pd
import os

# Loading the CSV file with references
ref_abs = pd.read_csv(os.path.join('..','results','refs_abstracts_sys.csv'))
ref_abs.head()

Unnamed: 0,Author,Title,Abstract,Journal,Year
0,"López-Valenciano A, Ayala F, Puerta JM, et al",A preventive model for muscle injuries: a nove...,The application of contemporary statistical ap...,Medicine and science in sports and exercise,2018.0
1,Li C,Predict the neural network mathematical model ...,Deep learning has achieved impressive predicti...,Physical review letters,2020.0
2,Lu G,Evaluation model of young basketball players ’...,,,
3,Wu L,The participating team ’s technical analysis o...,,,
4,Zhang Q,Prediction based on basketball competition vid...,,,


In [2]:
# Define list of AI techniques based on Table 2
ai_techniques = ["Absolute shrinkage and selection operator", "Artificial neural network",
                 "Bayesian logistic", "Bayesian networks", "Decision tree classifier",
                 "Fuzzy clustering", "K-means clustering", "K-nearest neighbor",
                 "Markov process", "Support vector machine",
                 "Support vector machine + decision tree classifier"]

# Creating a dictionary to store AI techniques and their most plausible corresponding aliases
ai_techniques_aliases = {
    "Absolute Shrinkage and Selection Operator": ["LASSO", "Least Absolute Shrinkage and Selection Operator", "L1 Regularization"],
    "Artificial Neural Network": ["ANN", "Neural Net", "Multi-layer Perceptron", "Deep Neural Net"],
    "Bayesian Logistic": ["Bayesian Regression", "Bayesian Model"],
    "Bayesian Networks": ["Bayesian Belief", "Belief Net", "Bayes Net", 
                          "Probabilistic Directed Acyclic Graphical Model"],
    "Decision Tree Classifier": ["Decision Tree", "Classification Tree", "Tree Classifier", 
                                 "Classification And Regression Tree", "CART"],
    "Fuzzy Clustering": ["Fuzzy C-means", "Soft Clustering", "Fuzzy K-means", "C-Means Clustering"],
    "K-means Clustering": ["K-means", "Lloyd's Algorithm", "Hard Clustering", "Centroid-based Clustering"],
    "K-nearest Neighbor": ["KNN", "K-nearest", "Instance-based Learning", "Lazy Learning"],
    "Markov Process": ["Markov Chain", "Markov Model"],
    "Support Vector Machine": ["SVM", "Support Vector Classifier", "Support Vector Networks", "Kernel Methods"],
    "Support Vector Machine + Decision Tree Classifier": ["SVM Decision Tree", "Hybrid SVM Decision Tree", 
                                                          "Integrated SVM Tree Classifier", 
                                                          "Combined Support Vector Decision Tree Model"]
}

# Define list of sports used in the research
sports = ["basketball", "soccer", "volleyball", "baseball", "handball", 
          "australian football", "ice hockey", "american football", 
          "cricket", "field hockey", "rugby"]

In [28]:
import pandas as pd
import re

# Load the dataset
ref_abs = pd.read_csv(os.path.join('..','results','refs_abstracts_sys.csv'))

def find_ai_techniques(text):
    found_techniques = []
    for technique, aliases in ai_techniques_aliases.items():
        if any(alias.lower() in text.lower() for alias in aliases):
            found_techniques.append(technique)
    return found_techniques

def find_sports(text):
    found_sports = []
    for sport in sports:
        if sport in text.lower():
            found_sports.append(sport)
    return found_sports

def check_for_injury_prediction(text):
    return bool(re.search(r"injury (risk|prediction)", text, re.IGNORECASE))

def check_for_performance_prediction(text):
    return bool(re.search(r"sporting performance|performance analysis", text, re.IGNORECASE))

# Process the dataset
ai_techniques_col = []
sports_col = []
ai_for_injury_risk_col = []
ai_for_sporting_performance_col = []

for index, paper in ref_abs.iterrows():
    text = str(paper['Title']) + ' ' + str(paper['Abstract'])
    ai_techniques_col.append(find_ai_techniques(text))
    sports_col.append(find_sports(text))
    ai_for_injury_risk_col.append(check_for_injury_prediction(text))
    ai_for_sporting_performance_col.append(check_for_performance_prediction(text))

# Adding new columns to the dataset
ref_abs['AI_Techniques'] = ai_techniques_col
ref_abs['Sports'] = sports_col
ref_abs['AI_for_Injury_Risk'] = ai_for_injury_risk_col
ref_abs['AI_for_Sporting_Performance'] = ai_for_sporting_performance_col

# Saving the updated dataset
ref_abs


Unnamed: 0,Author,Title,Abstract,Journal,Year,AI_Techniques,Sports,AI_for_Injury_Risk,AI_for_Sporting_Performance
0,"López-Valenciano A, Ayala F, Puerta JM, et al",A preventive model for muscle injuries: a nove...,The application of contemporary statistical ap...,Medicine and science in sports and exercise,2018.0,[],[],False,False
1,Li C,Predict the neural network mathematical model ...,Deep learning has achieved impressive predicti...,Physical review letters,2020.0,[Artificial Neural Network],[basketball],False,False
2,Lu G,Evaluation model of young basketball players ’...,,,,[Artificial Neural Network],[basketball],False,False
3,Wu L,The participating team ’s technical analysis o...,,,,[Artificial Neural Network],[basketball],False,False
4,Zhang Q,Prediction based on basketball competition vid...,,,,[Artificial Neural Network],[basketball],False,False
5,"Kempe M, Grunz A, Memmert D",Detecting tactical patterns in basketball: com...,"The soaring amount of data, especially spatial...",European journal of sport science,2015.0,[Artificial Neural Network],[basketball],False,False
6,"Bianchi F, Facchinetti T, Zuccolotto P",Role revolution: towards a new meaning of posi...,,,,[],[basketball],False,False
7,"Tilp M, Schrapf N",Analysis of tactical defensive behavior in tea...,,,,[Artificial Neural Network],[handball],False,False
8,"Hassan A, Schrapf N, Ramadan W, et a l",Evaluation of ta ctical training in team handb...,,,,[Artificial Neural Network],[handball],False,False
9,"Hassan A, Schrapf N, Tilp M",The prediction of action positions in team han...,,,,[Artificial Neural Network],[handball],False,False


In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Function to identify AI technique from text
def identify_ai_technique(text):
    for technique in ai_techniques:
        if technique.lower() in str(text).lower():
            return technique
    return "None"

# Function to identify sport from text
def identify_sport(text):
    for sport in sports:
        if sport in str(text).lower():
            return sport.capitalize()
    return "None"

# Function to identify if AI is used for injury risk prediction
def ai_injury(text):
    keywords = ["injury","injuries","medicine","risk"]
    for keyword in keywords:
        if keyword in str(text).lower():
            return "Yes"
    return "No"

# Function to identify if AI is used for sporting performance prediction
def ai_performance(text):
    keywords = ["performance", "technical", "tactical", "match"]
    for keyword in keywords:
        if keyword in str(text).lower():
            return "Yes"
    return "No"

# Define a function to find the most similar term using TF-IDF and cosine similarity

def find_most_similar(term, text, terms_list):

    # Combine the term and the text for TF-IDF vectorization

    combined_text = [term] + [text]

    

    # Vectorize the text

    vectorizer = TfidfVectorizer().fit(combined_text)

    vectorized_text = vectorizer.transform(combined_text)

    

    # Calculate cosine similarity

    similarity = cosine_similarity(vectorized_text[0:1], vectorized_text[1:])

    

    # Check if similarity is above a threshold, return the term if it is

    if similarity[0, 0] > 0.2:  # Threshold set to 0.2

        return term

    return "None"



# Function to identify AI technique using the most similar term from the list

def identify_ai_technique_tfidf(text):

    for technique in ai_techniques:

        most_similar = find_most_similar(technique, text, ai_techniques)

        if most_similar != "None":

            return most_similar

    return "None"



# Function to identify sport using the most similar term from the list

def identify_sport_tfidf(text):

    for sport in sports:

        most_similar = find_most_similar(sport, text, sports)

        if most_similar != "None":

            return most_similar.capitalize()

    return "None"

# Apply these functions to the dataset

# Apply these functions to the dataset
ref_abs["AI Technique"] = ref_abs.apply(lambda row: identify_ai_technique(row["Title"]) or identify_ai_technique(row["Abstract"]), axis=1)
ref_abs["Sport"] = ref_abs.apply(lambda row: identify_sport(row["Title"]) or identify_sport(row["Abstract"]), axis=1)
ref_abs["AI for Injury Risk"] = ref_abs.apply(lambda row: ai_injury(row["Title"]) or ai_injury(row["Abstract"]), axis=1)
ref_abs["AI for Performance"] = ref_abs.apply(lambda row: ai_performance(row["Title"]) or ai_performance(row["Abstract"]), axis=1)

ref_abs["AI Technique"] = ref_abs.apply(lambda row: identify_ai_technique_tfidf(row["Title"]) or identify_ai_technique_tfidf(row["Abstract"]), axis=1)

ref_abs["Sport"] = ref_abs.apply(lambda row: identify_sport_tfidf(row["Title"]) or identify_sport_tfidf(row["Abstract"]), axis=1)

# Display the updated dataset
ref_abs

Unnamed: 0,Author,Title,Abstract,Journal,Year,AI Technique,Sport,AI for Injury Risk,AI for Performance
0,"López-Valenciano A, Ayala F, Puerta JM, et al",A preventive model for muscle injuries: a nove...,The application of contemporary statistical ap...,Medicine and science in sports and exercise,2018.0,,,Yes,No
1,Li C,Predict the neural network mathematical model ...,Deep learning has achieved impressive predicti...,Physical review letters,2020.0,,,No,No
2,Lu G,Evaluation model of young basketball players ’...,,,,,,No,No
3,Wu L,The participating team ’s technical analysis o...,,,,,,No,Yes
4,Zhang Q,Prediction based on basketball competition vid...,,,,,,No,No
5,"Kempe M, Grunz A, Memmert D",Detecting tactical patterns in basketball: com...,"The soaring amount of data, especially spatial...",European journal of sport science,2015.0,,,No,Yes
6,"Bianchi F, Facchinetti T, Zuccolotto P",Role revolution: towards a new meaning of posi...,,,,,Basketball,No,No
7,"Tilp M, Schrapf N",Analysis of tactical defensive behavior in tea...,,,,,,No,Yes
8,"Hassan A, Schrapf N, Ramadan W, et a l",Evaluation of ta ctical training in team handb...,,,,,,No,No
9,"Hassan A, Schrapf N, Tilp M",The prediction of action positions in team han...,,,,,,No,No


In [25]:
import spacy
import pandas as pd
spacy.cli.download("en_core_web_md")
# Load the spaCy model with pre-trained word vectors
nlp = spacy.load("en_core_web_md")
ref_abs = pd.read_csv(os.path.join('..','results','refs_abstracts_sys.csv'))

# Function to find the most similar term using spaCy word vectors
def find_most_similar_spacy(term, text, terms_list):
    term_doc = nlp(preprocess_text(term))
    text_doc = nlp(preprocess_text(text))

    similarity = term_doc.similarity(text_doc)

    if similarity > 0.7:  # Adjust the threshold as needed
        return term

    return "None"

# Function to tokenize and lowercase text
def preprocess_text(text):
    tokens = [token.text.lower() for token in nlp(text) if not token.is_punct]
    return " ".join(tokens)

# Function to identify AI technique using spaCy word vectors
def identify_ai_technique_spacy(text):
    techniques_found = []
    for technique in ai_techniques:
        most_similar = find_most_similar_spacy(technique, text, ai_techniques)
        if most_similar != "None":
            techniques_found.append(most_similar)
    return ", ".join(techniques_found) if techniques_found else "None"

# Function to identify sport using spaCy word vectors
def identify_sport_spacy(text):
    for sport in sports:
        most_similar = find_most_similar_spacy(sport, text, sports)
        if most_similar != "None":
            return most_similar.capitalize()
    return "None"

# Function to identify if AI is used for injury prediction using spaCy
def ai_injury_spacy(text):
    keywords = ["injury", "injuries", "medicine", "risk"]
    for keyword in keywords:
        if keyword in preprocess_text(text):
            return "Yes"
    return "No"

# Function to identify if AI is used for sporting performance prediction using spaCy
def ai_performance_spacy(text):
    keywords = ["performance", "technical", "tactical", "match"]
    for keyword in keywords:
        if keyword in preprocess_text(text):
            return "Yes"
    return "No"

# Apply these functions to the dataset  
  # Replace with your dataset
ref_abs["AI Technique"] = ref_abs.apply(lambda row: identify_ai_technique_spacy(row["Title"]) or identify_ai_technique_spacy(row["Abstract"]), axis=1)
ref_abs["Sport"] = ref_abs.apply(lambda row: identify_sport_spacy(row["Title"]) or identify_sport_spacy(row["Abstract"]), axis=1)
ref_abs["AI for Injury Risk"] = ref_abs.apply(lambda row: ai_injury_spacy(row["Title"]) or ai_injury_spacy(row["Abstract"]), axis=1)
ref_abs["AI for Performance"] = ref_abs.apply(lambda row: ai_performance_spacy(row["Title"]) or ai_performance_spacy(row["Abstract"]), axis=1)
ref_abs["Sport"] = ref_abs.apply(lambda row: identify_sport_spacy(row["Title"]) or identify_sport_spacy(row["Abstract"]), axis=1)
ref_abs


Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


Unnamed: 0,Author,Title,Abstract,Journal,Year,AI Technique,Sport,AI for Injury Risk,AI for Performance
0,"López-Valenciano A, Ayala F, Puerta JM, et al",A preventive model for muscle injuries: a nove...,The application of contemporary statistical ap...,Medicine and science in sports and exercise,2018.0,Absolute shrinkage and selection operator,,Yes,No
1,Li C,Predict the neural network mathematical model ...,Deep learning has achieved impressive predicti...,Physical review letters,2020.0,"Absolute shrinkage and selection operator, Art...",,No,No
2,Lu G,Evaluation model of young basketball players ’...,,,,"Absolute shrinkage and selection operator, Art...",,No,No
3,Wu L,The participating team ’s technical analysis o...,,,,Bayesian networks,,No,Yes
4,Zhang Q,Prediction based on basketball competition vid...,,,,"Absolute shrinkage and selection operator, Art...",,No,No
5,"Kempe M, Grunz A, Memmert D",Detecting tactical patterns in basketball: com...,"The soaring amount of data, especially spatial...",European journal of sport science,2015.0,"Absolute shrinkage and selection operator, Art...",,No,Yes
6,"Bianchi F, Facchinetti T, Zuccolotto P",Role revolution: towards a new meaning of posi...,,,,,,No,No
7,"Tilp M, Schrapf N",Analysis of tactical defensive behavior in tea...,,,,"Absolute shrinkage and selection operator, Bay...",,No,Yes
8,"Hassan A, Schrapf N, Ramadan W, et a l",Evaluation of ta ctical training in team handb...,,,,Bayesian networks,,No,No
9,"Hassan A, Schrapf N, Tilp M",The prediction of action positions in team han...,,,,"Absolute shrinkage and selection operator, Bay...",,No,No
