### This script will explore topic modelling on the title, abstract and then abstract and title together using 3 different approaches (basic, stemming, and lemmatization).

### 1. Title Only 

In [20]:
import pandas as pd
import os

#Load the CSV file with title only
references = pd.read_csv(os.path.join('..','results','refs_systematic.csv'))
references.head()

Unnamed: 0,Author,Title
0,"López-Valenciano A, Ayala F, Puerta JM, et al",A preventive model for muscle injuries: a nove...
1,Li C,Predict the neural network mathematical model ...
2,Lu G,Evaluation model of young basketball players ’...
3,Wu L,The participating team ’s technical analysis o...
4,Zhang Q,Prediction based on basketball competition vid...


#### 1.1 Finding the Most Common Words in Title (Basic)

In [3]:
#Using the nltk package for topic modeling
import nltk

#Filtering out common words that are "meaningless" using stop words
nltk.download('stopwords', quiet = True)
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

from collections import Counter
import string

#Function for cleaning and preprocessing the titles in a basic way
def basic_preproc(text):
    #Lowercasing, punctuation removal, word tokenization, and stop word filtering
    cleaned_titles = []
    for sentence in text:
        if isinstance(sentence, str):
            #Replace hyphens and slashes with spaces, then split the text into words
            #Lowercasing, punctuation removal, word tokenization, and stop words

            words = sentence.lower().translate(str.maketrans('-/', '  ')).split()
            words = sentence.lower().translate(str.maketrans('', '', string.punctuation)).split()
            #taking into account words with more than 3 characters
            words = [word for word in words if word not in stop_words and len(word) > 3] 
            cleaned_titles.extend(words)

    return cleaned_titles

#Preprocessing titles and counting
cleaned_titles = basic_preproc(references['Title'])
word_counts = Counter(cleaned_titles)
words_basic_T = word_counts.most_common(30)

print(words_basic_T)

[('neural', 17), ('based', 15), ('team', 13), ('basketball', 12), ('network', 11), ('football', 11), ('data', 10), ('model', 9), ('using', 9), ('artificial', 8), ('performance', 8), ('analysis', 7), ('prediction', 7), ('tactical', 7), ('networks', 7), ('training', 7), ('soccer', 7), ('mining', 7), ('technical', 6), ('match', 6), ('volleyball', 6), ('learning', 5), ('application', 5), ('machine', 5), ('injury', 5), ('games', 4), ('handball', 4), ('game', 4), ('elite', 4), ('professional', 4)]


#### 1.2 Finding the Most Common Words in Title (Stemming)

In [4]:
#Import necessary libraties
from nltk.stem import PorterStemmer

#Function for cleaning and preprocessing text with stemming
def stem_preproc(text):
    stemmer = PorterStemmer()
    #Lowercasing, punctuation removal, word tokenization, stemming and stop word filtering
    cleaned_titles = []
    for sentence in text:
        if isinstance(sentence, str):
            words = sentence.lower().translate(str.maketrans('-/', '  ')).split()
            words = sentence.lower().translate(str.maketrans('', '', string.punctuation)).split()
            stemmed_words = [stemmer.stem(word) for word in words if word not in stop_words]
            cleaned_titles.extend(stemmed_words)

    return cleaned_titles

#Preprocessing titles and counting
cleaned_titles = stem_preproc(references['Title'])
word_counts = Counter(cleaned_titles)
words_stemming_T = word_counts.most_common(30)

print(words_stemming_T)

[('network', 18), ('neural', 17), ('base', 15), ('team', 15), ('basketbal', 12), ('footbal', 12), ('model', 11), ('predict', 11), ('data', 10), ('perform', 9), ('use', 9), ('game', 8), ('artifici', 8), ('injuri', 7), ('analysi', 7), ('tactic', 7), ('train', 7), ('soccer', 7), ('match', 7), ('mine', 7), ('learn', 6), ('’s', 6), ('technic', 6), ('volleybal', 6), ('machin', 6), ('player', 5), ('athlet', 5), ('applic', 5), ('outcom', 5), ('evalu', 4)]


#### 1.3 Finding the Most Common Words in Title (Lemmatization)

In [5]:
#Import necessary libraties
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag

#Ensure necessary NLTK resources are downloaded
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('wordnet', quiet=True)

#Function to map NLTK's part of speech tags to those used by WordNet
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

#Function for cleaning and preprocessing text with lemmatization
def lemma_preproc(text):
    lemmatizer = WordNetLemmatizer()
    #Lowercasing, punctuation removal, word tokenization, lemmatizing and stop word filtering
    cleaned_titles = []
    for sentence in text:
        if isinstance(sentence, str):
            words = sentence.lower().translate(str.maketrans('-/', '  ')).split()
            words = sentence.lower().translate(str.maketrans('', '', string.punctuation)).split()
            pos_tags = pos_tag(words)
            lem_words = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags if word not in stop_words]
            cleaned_titles.extend(lem_words)

    return [word for word in cleaned_titles if len(word) > 3]

#Preprocessing titles and counting
cleaned_titles = lemma_preproc(references['Title'])
word_counts = Counter(cleaned_titles)
words_lemm_T = word_counts.most_common(30)

print(words_lemm_T)

[('network', 18), ('neural', 17), ('base', 15), ('team', 15), ('basketball', 12), ('football', 11), ('model', 10), ('data', 10), ('game', 8), ('prediction', 8), ('artificial', 8), ('performance', 8), ('injury', 7), ('analysis', 7), ('tactical', 7), ('soccer', 7), ('match', 7), ('mining', 7), ('technical', 6), ('training', 6), ('volleyball', 6), ('machine', 6), ('player', 5), ('athlete', 5), ('application', 5), ('outcome', 5), ('learn', 4), ('handball', 4), ('elite', 4), ('professional', 4)]


#### 1.4 Displaying All Three Methods Together and Conclusion

In [6]:
def results_df(basic_df, stem_df, lemma_df):

    #Using pandas to display the 10 most common words in a table for ease of comparison
    results_df = pd.DataFrame({
        'Method': ['Basic', 'Stemming', 'Lemmatization'],
        '1st Most Common': [basic_df[0][0], stem_df[0][0], lemma_df[0][0]],
        '2nd Most Common': [basic_df[1][0], stem_df[1][0], lemma_df[1][0]],
        '3rd Most Common': [basic_df[2][0], stem_df[2][0], lemma_df[2][0]],
        '4th Most Common': [basic_df[3][0], stem_df[3][0], lemma_df[3][0]],
        '5th Most Common': [basic_df[4][0], stem_df[4][0], lemma_df[4][0]],
        '6th Most Common': [basic_df[5][0], stem_df[5][0], lemma_df[5][0]],
        '7th Most Common': [basic_df[6][0], stem_df[6][0], lemma_df[6][0]],
        '8th Most Common': [basic_df[7][0], stem_df[7][0], lemma_df[7][0]],
        '9th Most Common': [basic_df[8][0], stem_df[8][0], lemma_df[8][0]],
        '10th Most Common': [basic_df[9][0], stem_df[9][0], lemma_df[9][0]]
    })
    results_df.set_index('Method', inplace=True)

    #Applying basic styling to the table 
    styled_df = results_df.style.set_properties(**{
        'background-color': 'white',  #Background color
        'color': 'black',             #Font color
        'border-color': 'black',      #Border color
        'border-style': 'solid',      #Border style
        'border-width': '1px'         #Border width
    }).set_table_styles([{
        'selector': 'th',
        'props': [('background-color', '#f4f4f4'), ('color', 'black')]  #Header styling
    }])
    return styled_df

titles_results = results_df(words_basic_T, words_stemming_T, words_lemm_T)

titles_results

Unnamed: 0_level_0,1st Most Common,2nd Most Common,3rd Most Common,4th Most Common,5th Most Common,6th Most Common,7th Most Common,8th Most Common,9th Most Common,10th Most Common
Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Basic,neural,based,team,basketball,network,football,data,model,using,artificial
Stemming,network,neural,base,team,basketbal,footbal,model,predict,data,perform
Lemmatization,network,neural,base,team,basketball,football,model,data,game,prediction


***Conclusion***: Lemmatization considers the context and part of speech of a word, leading to more accurate results as it reduces words to their dictionary form, but requires more computational power. Stemming is robust for search and indexing purposes as the exact form of a word is less important and is faster. It's interesting to note that some words such as "team" and "basketball" have drastic differences across the methods, whereas words such as "football" have more consistent positioning.

### 2. Abstract Only 

In [7]:
#Loading the CSV file with references
ref_abs = pd.read_csv(os.path.join('..','results','refs_abstracts_sys.csv'))
ref_abs.head()

Unnamed: 0,Author,Title,Abstract,Journal,Year
0,"López-Valenciano A, Ayala F, Puerta JM, et al",A preventive model for muscle injuries: a nove...,The application of contemporary statistical ap...,Medicine and science in sports and exercise,2018.0
1,Li C,Predict the neural network mathematical model ...,Deep learning has achieved impressive predicti...,Physical review letters,2020.0
2,Lu G,Evaluation model of young basketball players ’...,,,
3,Wu L,The participating team ’s technical analysis o...,,,
4,Zhang Q,Prediction based on basketball competition vid...,,,


 #### 2.1 Finding the Most Common Words in Abstract (Basic)

In [8]:
#Preprocessing abstracts and counting
cleaned_abstracts = basic_preproc(ref_abs['Abstract'])
word_counts = Counter(cleaned_abstracts)
words_basic_A = word_counts.most_common(10)

print(words_basic_A)

[('data', 27), ('team', 27), ('performance', 26), ('classification', 19), ('ball', 18), ('match', 16), ('indicators', 15), ('football', 15), ('learning', 14), ('analysis', 12)]


This Method is limited, as words like "sport" and "sports" will be counted separately. We will further apply lemmatization and stemming to address this.

#### 2.2 Finding the Most Common Words in Abstract (Stemming)

In [9]:
#Preprocessing abstracts and counting
cleaned_abstracts = stem_preproc(ref_abs['Abstract'])
word_counts = Counter(cleaned_abstracts)
words_stemming_A = word_counts.most_common(10)

print(words_stemming_A)

[('team', 34), ('perform', 33), ('use', 32), ('classifi', 28), ('data', 27), ('indic', 22), ('model', 21), ('classif', 19), ('match', 19), ('ball', 18)]


#### 2.3 Most Common words in Abstract (Lematization)

In [10]:
#Preprocessing the abstracts and counting
cleaned_abstracts = lemma_preproc(ref_abs['Abstract'])
word_counts = Counter(cleaned_abstracts)
words_lemm_A = word_counts.most_common(10)

print(words_lemm_A)

[('team', 34), ('data', 27), ('performance', 26), ('model', 20), ('classification', 19), ('match', 19), ('ball', 18), ('indicator', 18), ('classify', 16), ('outcome', 16)]


### 2.4 Displaying All Three Methods Together and Conclusion

In [11]:
abstracts_results = results_df(words_basic_A, words_stemming_A, words_lemm_A)

abstracts_results

Unnamed: 0_level_0,1st Most Common,2nd Most Common,3rd Most Common,4th Most Common,5th Most Common,6th Most Common,7th Most Common,8th Most Common,9th Most Common,10th Most Common
Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Basic,data,team,performance,classification,ball,match,indicators,football,learning,analysis
Stemming,team,perform,use,classifi,data,indic,model,classif,match,ball
Lemmatization,team,data,performance,model,classification,match,ball,indicator,classify,outcome


***Conclusion:***  Lemmatization considers the context and part of speech of a word, leading to more accurate results as it reduces words to their dictionary form, but requires more computational power. Stemming is robust for search and indexing purposes as the exact form of a word is less important and is faster. It's interesting to note that the word "data" is consistently the most common word across all 3 methods in the abstract and some words like "risk" have greatly varying positioning. Overall, compared to titles the differences are less extreme.

Accross both title and abstract, the most common words vary significantly. In the 3rd part we will discuss what are the most common words in both together.

### 3. Title and Abstract Together

#### 3.1 Most Common Words in Title and Abstract (Basic)

In [12]:
#Columns are named 'Title' and 'Abstract' and combined_texts combines them
#Replace NaN values with empty strings
combined_texts = ref_abs['Title'].fillna('') + ' ' + ref_abs['Abstract'].fillna('')

#Preprocessing titles and abstracts together and counting
cleaned_texts = basic_preproc(combined_texts)
word_counts = Counter(cleaned_texts)
words_basic_TA = word_counts.most_common(10)

print(words_basic_TA)

[('team', 40), ('data', 37), ('performance', 34), ('based', 26), ('football', 26), ('classification', 22), ('match', 22), ('neural', 21), ('using', 21), ('model', 20)]


#### 3.2 Most Common Words in  Title and Abstract (Stemming)

In [13]:
#Preprocessing the abstracts and counting
cleaned_texts = stem_preproc(combined_texts)
word_counts = Counter(cleaned_texts)
words_stemming_TA = word_counts.most_common(10)

print(words_stemming_TA)

[('team', 49), ('perform', 42), ('use', 41), ('data', 37), ('model', 32), ('predict', 28), ('classifi', 28), ('footbal', 28), ('base', 26), ('network', 26)]


### 3.3 Most Common Words in  Title and Abstract (Lemmatization)

In [14]:
#Preprocessing the abstracts and counting
cleaned_texts = lemma_preproc(combined_texts)
word_counts = Counter(cleaned_texts)
words_lemm_TA = word_counts.most_common(10)

print(words_lemm_TA)

[('team', 49), ('data', 37), ('performance', 34), ('model', 30), ('base', 26), ('network', 26), ('match', 26), ('football', 26), ('classification', 22), ('analysis', 22)]


#### 3.4 Displaying all Three Methods Together and Conclusion

In [15]:
TA_results = results_df(words_basic_TA, words_stemming_TA, words_lemm_TA)

TA_results

Unnamed: 0_level_0,1st Most Common,2nd Most Common,3rd Most Common,4th Most Common,5th Most Common,6th Most Common,7th Most Common,8th Most Common,9th Most Common,10th Most Common
Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Basic,team,data,performance,based,football,classification,match,neural,using,model
Stemming,team,perform,use,data,model,predict,classifi,footbal,base,network
Lemmatization,team,data,performance,model,base,network,match,football,classification,analysis


### 4. Identifying the Most Common Bigrams and Trigrams in Title, Abstract and then Title and Abstract

#### 4.1 Most Common Bigrams and Trigrams in Title Only

In [16]:
from nltk import bigrams, trigrams

# Download required NLTK data
nltk.download('punkt',quiet = True)

def bitri_preproc(text):
    # Convert to lowercase, remove punctuation, and split into words
    words = text.lower().translate(str.maketrans('-/', '  ')).split()
    # Remove stopwords & return
    return [word for word in words if word not in stop_words]

# Combine and preprocess titles
all_words = []
for title in references['Title'].fillna(''):
    all_words.extend(bitri_preproc(title))

# Find the most common bigrams and trigrams
bigram_counts = Counter(bigrams(all_words))
trigram_counts = Counter(trigrams(all_words))

title_bigrams = bigram_counts.most_common(10)
title_trigrams = trigram_counts.most_common(10)

print("Most common bigrams in Title:", title_bigrams)
print("Most common trigrams in Title:", title_trigrams)

Most common bigrams in Title: [(('neural', 'network'), 11), (('artificial', 'neural'), 8), (('data', 'mining'), 7), (('neural', 'networks'), 6), (('team', 'handball'), 4), (('’s', 'basketball'), 3), (('olympic', 'games'), 3), (('football', 'using'), 3), (('team', 'performance'), 3), (('technical', 'tactical'), 3)]
Most common trigrams in Title: [(('artificial', 'neural', 'network'), 5), (('artificial', 'neural', 'networks'), 3), (('rbf', 'neural', 'network'), 2), (('women', '’s', 'basketball'), 2), (('self', 'organising', 'maps'), 2), (('team', 'handball', 'means'), 2), (('handball', 'means', 'artificial'), 2), (('means', 'artificial', 'neural'), 2), (('explaining', 'match', 'outcome'), 2), (('team', 'performance', 'indicators'), 2)]


#### 4.2 Most Common Bigrams and Trigrams in Abstract Only

In [17]:
#Preprocess and combine all abstracts into one list
all_words = []
for abstract in ref_abs['Abstract'].fillna(''):
    all_words.extend(bitri_preproc(abstract))

#Generate and count bigrams and trigrams
bigram_counts = Counter(bigrams(all_words))
trigram_counts = Counter(trigrams(all_words))

#Get the 10 most common bigrams and trigrams
abs_bigrams = bigram_counts.most_common(10)
abs_trigrams = trigram_counts.most_common(10)

#Display the results
print("Most common bigrams in abstracts in Abstract:", abs_bigrams)
print("Most common trigrams in abstracts in Abstract:", abs_trigrams)

Most common bigrams in abstracts in Abstract: [(('performance', 'indicators'), 12), (('team', 'performance'), 10), (('match', 'outcome'), 7), (('ground', 'reaction'), 7), (('machine', 'learning'), 6), (('ball', 'possession'), 5), (('logistic', 'regression'), 5), (('classification', 'accuracy'), 5), (('mir', '146a'), 5), (('146a', '5p'), 5)]
Most common trigrams in abstracts in Abstract: [(('team', 'performance', 'indicators'), 8), (('mir', '146a', '5p'), 5), (('ground', 'reaction', 'force'), 4), (('class', 'errors', 'ranging'), 3), (('ci', 'classification', 'tree'), 3), (('provided', 'greatest', 'probability'), 3), (('"field', 'goal', 'percentage",'), 3), (('support', 'vector', 'machine'), 3), (('one', 'versus', 'one'), 3), (('time', 'course', 'ground'), 3)]


#### 4.3 Most Common Bigrams and Trigrams in Abstract and Title

In [18]:
#Preprocess and combine all texts into one list
all_words = []
for text in combined_texts:
    all_words.extend(bitri_preproc(text))

#Generate and count bigrams and trigrams
bigram_counts = Counter(bigrams(all_words))
trigram_counts = Counter(trigrams(all_words))

#Get the 10 most common bigrams and trigrams
TA_bigrams = bigram_counts.most_common(10)
TA_trigrams = trigram_counts.most_common(10)

# Display the results
print("Most common bigrams in Title and Abstract:", TA_bigrams)
print("Most common trigrams in Title and Abstract:", TA_trigrams)

Most common bigrams in Title and Abstract: [(('performance', 'indicators'), 14), (('team', 'performance'), 13), (('neural', 'network'), 12), (('match', 'outcome'), 9), (('machine', 'learning'), 8), (('data', 'mining'), 8), (('artificial', 'neural'), 8), (('olympic', 'games'), 7), (('neural', 'networks'), 7), (('ground', 'reaction'), 7)]
Most common trigrams in Title and Abstract: [(('team', 'performance', 'indicators'), 10), (('artificial', 'neural', 'network'), 5), (('support', 'vector', 'machine'), 5), (('mir', '146a', '5p'), 5), (('one', 'versus', 'one'), 4), (('ground', 'reaction', 'force'), 4), (('artificial', 'neural', 'networks'), 3), (('class', 'errors', 'ranging'), 3), (('ci', 'classification', 'tree'), 3), (('provided', 'greatest', 'probability'), 3)]


### 5. Export information of interest to csv

In [19]:
word_counts.most_common(35), bigram_counts.most_common(20), trigram_counts.most_common(10)

#Subsetting the lemmatized word counts to only include frequencies geq 11
subset_word_counts = [item for item in word_counts.most_common() if item[1] >= 11]
#Subsetting the bigram counts to only include frequencies geq 4
subset_bigram_counts = [item for item in bigram_counts.most_common() if item[1] >= 4]
#Subsetting the trigram counts to only include frequencies geq 4
subset_trigram_counts = [item for item in trigram_counts.most_common() if item[1] >= 4]

counts = subset_word_counts + subset_bigram_counts + subset_trigram_counts

counts_df = pd.DataFrame(counts, columns=["String", "Frequency"])
#Concatenate bigrams/trigrams
counts_df['String'] = counts_df['String'].apply(lambda x: ' '.join(x) if isinstance(x, tuple) else x)
#Remove some of the entries that have numerical characters in them stemming from results and references to other papers
counts_df = counts_df[counts_df['String'].apply(lambda x: not any(char.isdigit() for char in x))]

#Export to csv for future use
counts_df.to_csv(os.path.join('..','results','common_strings.csv'), index=False)

#Display a preview of the dataframe
counts_df 

Unnamed: 0,String,Frequency
0,team,49
1,data,37
2,performance,34
3,model,30
4,base,26
...,...,...
77,team performance indicators,10
78,artificial neural network,5
79,support vector machine,5
81,one versus one,4
