Add new variables to the dataset (**AI Techniques**, **Sports**, **AI for Injury Risk**, **AI for Performance**)

In [1]:
import pandas as pd
import os

# Read the references/abstracts table; the path is relative to the notebook's folder
refs_csv_path = os.path.join('..', 'results', 'refs_abstracts_sys.csv')
new_variables = pd.read_csv(refs_csv_path)

# Preview the first rows
new_variables.head()

Unnamed: 0,Author,Title,Abstract,Journal,Year
0,"López-Valenciano A, Ayala F, Puerta JM, et al",A preventive model for muscle injuries: a nove...,The application of contemporary statistical ap...,Medicine and science in sports and exercise,2018.0
1,Li C,Predict the neural network mathematical model ...,Deep learning has achieved impressive predicti...,Physical review letters,2020.0
2,Lu G,Evaluation model of young basketball players ’...,,,
3,Wu L,The participating team ’s technical analysis o...,,,
4,Zhang Q,Prediction based on basketball competition vid...,,,


In [2]:
# AI techniques (based on Table 2) mapped to their most plausible aliases.
# Key   -> canonical technique name, also used as the label written to the dataset.
# Value -> list of alternative spellings / acronyms searched for in the paper text.
ai_techniques = {
    "Absolute Shrinkage and Selection Operator": [
        "LASSO",
        "Least Absolute Shrinkage and Selection Operator",
        "L1 Regularization",
    ],
    "Artificial Neural Network": [
        "ANN",
        "Neural Net",
        "Deep Learning",
        "Perceptron",
        "Self-organising map",
        "Self-organizing map",
        "SOM",
    ],
    "Bayesian Logistic": [
        "Bayesian Regression",
        "Bayesian Model",
    ],
    "Bayesian Network": [
        "Bayesian Belief",
        "Belief Net",
        "Bayes Net",
        "Probabilistic Directed Acyclic Graphical Model",
    ],
    "Decision Tree Classifier": [
        "Decision Tree",
        "Classification Tree",
        "Tree Classifier",
        "Classification And Regression Tree",
        "CART",
    ],
    "Fuzzy Clustering": [
        "Fuzzy C-means",
        "Soft Clustering",
        "Fuzzy K-means",
        "C-Means Clustering",
    ],
    "K-means Clustering": [
        "K-means",
        "Lloyd's Algorithm",
        "Hard Clustering",
        "Centroid-based Clustering",
    ],
    "K-nearest Neighbor": [
        "KNN",
        "K-nearest",
        "Instance-based Learning",
        "Lazy Learning",
    ],
    "Markov Process": [
        "Markov Chain",
        "Markov Model",
    ],
    "Support Vector Machine": [
        "SVM",
        "Support Vector Classifier",
        "Support Vector Networks",
        "Kernel Methods",
    ],
}

# Sports covered by the research mapped to their aliases.
# Key   -> sport name, also used as the label written to the dataset.
# Value -> list of alternative names searched for in the paper text
#          (an empty list means only the sport name itself is searched).
sports = {
    "Basketball": [
        "Basket",
    ],
    "Soccer": [
        "Association Football",
    ],
    "Volleyball": [
        "Volley",
    ],
    "Baseball": [
        "MLB",
    ],
    "Handball": [],
    "Australian Football": [
        "AFL Football",
    ],
    "Ice Hockey": [
        "Hockey",
        "NHL",
        "Stanley Cup",
    ],
    "American Football": [
        "NFL Football",
        "College Football",
    ],
    "Cricket": [
        "IPL Bowl",
    ],
    "Field Hockey": [],
    "Rugby": [],
}

# Keyword stems used to decide whether a paper relates to injuries and/or to
# sporting performance. All entries are lowercase.
injury_keywords = [
    "injur",
    "medic",
    "risk",
    "prevent",
    "diagnosis",
    "detection",
]
perf_keywords = [
    "performance",
    "techni",
    "tactical",
    "match",
    "position",
    "possession",
    "ranking",
    "team",
    "skill",
    "game",
    "win",
]

In [3]:
# Read the references/abstracts table that the processing loop scans
ref_abs_path = os.path.join('..', 'results', 'refs_abstracts_sys.csv')
ref_abs = pd.read_csv(ref_abs_path)

# Look up which dictionary entries are mentioned in a piece of text (non-exact matches allowed)
def find_dictionary(text, dictionary):
    """Return the keys of ``dictionary`` that are mentioned in ``text``.

    A key counts as mentioned when the key itself, or any one of its aliases,
    occurs in ``text`` as a case-insensitive substring. Because this is plain
    substring matching, a short alias also matches when it appears inside a
    longer word.

    Parameters
    ----------
    text : str
        Text to search.
    dictionary : dict
        Maps each name (str) to an iterable of alias strings.

    Returns
    -------
    list
        The matching keys, in the dictionary's iteration order.
    """
    return [
        label
        for label, alternatives in dictionary.items()
        if any(term.lower() in text.lower() for term in (label, *alternatives))
    ]

# Flag whether a paper's text touches a topic (injury risk or sporting performance)
def ai_logical(text, keywords):
    """Return "Yes" if any keyword occurs in the lowercased ``text``, else "No".

    The text is lowercased before the check but the keywords are used as given,
    so a keyword containing uppercase letters can never match.

    Parameters
    ----------
    text : str
        Text to search.
    keywords : iterable of str
        Substrings to look for.

    Returns
    -------
    str
        "Yes" or "No".
    """
    return "Yes" if any(key in text.lower() for key in keywords) else "No"

# Process the dataset: derive four new variables for every paper in ref_abs
ai_techniques_col = []
sports_col = []
ai_for_injury_risk_col = []
ai_for_performance_col = []

# Title, abstract and journal are searched together. Missing fields are skipped
# so that the literal string "nan" never becomes part of the searched text.
text_fields = ['Title', 'Abstract', 'Journal']

for fields in ref_abs[text_fields].itertuples(index=False, name=None):
    text = ' '.join(str(value) for value in fields if pd.notna(value))

    # Find AI techniques and sports
    ai_techniques_list = find_dictionary(text, ai_techniques)
    sports_list = find_dictionary(text, sports)

    # Convert to string, or use pd.NA if the list is empty
    ai_techniques_col.append(' + '.join(ai_techniques_list) if ai_techniques_list else pd.NA)
    sports_col.append(', '.join(sports_list) if sports_list else pd.NA)
    ai_for_injury_risk_col.append(ai_logical(text, injury_keywords))
    ai_for_performance_col.append(ai_logical(text, perf_keywords))

# Adding new columns to the dataset. The result is built from ref_abs (the frame
# that was just scanned), so rows and labels are guaranteed to line up, ref_abs
# itself stays unmodified, and re-running this cell gives the same result.
new_variables = ref_abs.assign(**{
    'AI Techniques': ai_techniques_col,
    'Sports': sports_col,
    'AI for Injury Risk': ai_for_injury_risk_col,
    'AI for Performance': ai_for_performance_col,
})

# Exporting the updated dataset to csv. The frame written must be the one that
# carries the new columns (new_variables), not the unmodified ref_abs.
new_variables.to_csv(os.path.join('..','results','new_variables.csv'), index=False)

new_variables.head()


Unnamed: 0,Author,Title,Abstract,Journal,Year,AI Techniques,Sports,AI for Injury Risk,AI for Performance
0,"López-Valenciano A, Ayala F, Puerta JM, et al",A preventive model for muscle injuries: a nove...,The application of contemporary statistical ap...,Medicine and science in sports and exercise,2018.0,,,Yes,No
1,Li C,Predict the neural network mathematical model ...,Deep learning has achieved impressive predicti...,Physical review letters,2020.0,Artificial Neural Network,Basketball,No,Yes
2,Lu G,Evaluation model of young basketball players ’...,,,,Artificial Neural Network,Basketball,No,Yes
3,Wu L,The participating team ’s technical analysis o...,,,,Artificial Neural Network,Basketball,No,Yes
4,Zhang Q,Prediction based on basketball competition vid...,,,,Artificial Neural Network,Basketball,No,No
