### This script will explore topic modelling on the title, abstract and then abstract and title together using 3 different approaches (basic, stemming, and lemmatization).

### 1. Title Only 

In [402]:
#Loading the CSV file with references
import pandas as pd
import os

references = pd.read_csv(os.path.join('..','results','paper_refs.csv'))
references

Unnamed: 0,Author,Title
0,"Russell S, Norvig P",Artificial Intelligence: a modern approach
1,"Witten IH, Frank E, Hall MA, et al",Data Mining: practical Machine Learning tools ...
2,"Zaki MJ, Meira Jr, W",Data Mining and analysis: fundamental concepts...
3,"Passfield L, Hopker JG",A mine of information: can sports analytics pr...
4,"Rein R, Memmert D",Big data and tactical analysis in elite soccer...
...,...,...
98,"Dalton-Barron NE, McLaren SJ, Black CJ, et al",Identifying contextual influences on training ...
99,"McLaren SJ, Weston M, Smith A, et al",Variability of physical performance and player...
100,"Oliveira WK, Jesus K, Andrade AD, et al",Monitoring training load in beach volleyball p...
101,"Düking P, Achtzehn S, Holmberg HC, Sperlich B",Integrated framework of load monitoring by a c...


#### 1.1 Finding the Most Common Words in Title (Basic)

In [403]:
#Using the nltk package for topic modeling
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sonayavrumyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [404]:
from nltk.corpus import stopwords
from collections import Counter
import string

#Function for cleaning and preprocessing the titles
def preprocess_titles(titles):
    prepositions = set([
        
    ])
    #Filtering out common words that are "meaningless" (such as prepositions) using stop words
    stop_words = set(stopwords.words('english')).union(prepositions)
    
    #Lowercasing, punctuation removal, word tokenization, and stop word filtering
    cleaned_titles = []
    for title in titles:
        words = title.lower().translate(str.maketrans('', '', string.punctuation)).split()
        words = [word for word in words if word not in stop_words]
        cleaned_titles.extend(words)

    return cleaned_titles

#Preprocessing titles and counting
cleaned_titles = preprocess_titles(references['Title'])
word_counts = Counter(cleaned_titles)
most_common_words_basic_T = word_counts.most_common(10)

print(most_common_words_basic_T)

[('football', 17), ('neural', 17), ('data', 16), ('training', 16), ('based', 16), ('team', 16), ('performance', 15), ('basketball', 14), ('artificial', 13), ('sports', 13)]


#### 1.2 Finding the Most Common Words in Title (Stemming)

In [405]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from collections import Counter
import string

#Function for cleaning and preprocessing the titles with stemming
def preprocess_titles(titles):
    stemmer = PorterStemmer()
    prepositions = set([
        # ... (list all prepositions here) ...
    ])
    stop_words = set(stopwords.words('english')).union(prepositions)
#Lowercasing, punctuation removal, word tokenization, stemming and stop word filtering
    cleaned_titles = []
    for title in titles:
        words = title.lower().translate(str.maketrans('', '', string.punctuation)).split()
        stemmed_words = [stemmer.stem(word) for word in words if word not in stop_words]
        cleaned_titles.extend(stemmed_words)

    return cleaned_titles

#Preprocessing titles and counting
cleaned_titles = preprocess_titles(references['Title'])
word_counts = Counter(cleaned_titles)
most_common_words_stemming_T = word_counts.most_common(10)

print(most_common_words_stemming_T)


[('footbal', 18), ('network', 18), ('team', 18), ('perform', 17), ('neural', 17), ('data', 16), ('train', 16), ('base', 16), ('sport', 15), ('injuri', 15)]


#### 1.3 Finding the Most Common Words in Title (Lemmatization)

In [406]:
#Import necessary libraties
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.corpus import stopwords
from collections import Counter
import string
import nltk

#Ensure necessary NLTK resources are downloaded
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

#Function to map NLTK's part of speech tags to those used by WordNet
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

#Function for cleaning and preprocessing the titles with lemmatization
def preprocess_titles(titles):
    lemmatizer = WordNetLemmatizer()
    prepositions = set([
        
    ])
    stop_words = set(stopwords.words('english')).union(prepositions)
#Lowercasing, punctuation removal, word tokenization, lemmatizing and stop word filtering
    cleaned_titles = []
    for title in titles:
        words = title.lower().translate(str.maketrans('', '', string.punctuation)).split()
        pos_tags = pos_tag(words)
        lem_words = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags if word not in stop_words]
        cleaned_titles.extend(lem_words)

    return cleaned_titles
#Preprocessing titles and counting
cleaned_titles = preprocess_titles(references['Title'])
word_counts = Counter(cleaned_titles)
most_common_words_lemm_T = word_counts.most_common(10)

print(most_common_words_lemm_T)


[('network', 18), ('team', 18), ('football', 17), ('neural', 17), ('data', 16), ('base', 16), ('sport', 15), ('performance', 15), ('injury', 15), ('basketball', 14)]


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sonayavrumyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sonayavrumyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


#### 1.4 Displaying All Three Methods Together and Conclusion

In [407]:
#Using pandas to display the 10 most common words in a table for ease of comparison
import pandas as pd
results_df = pd.DataFrame({
    'Method': ['Basic', 'Stemming', 'Lemmatization'],
    '1st Most Common': [most_common_words_basic_T[0][0], most_common_words_stemming_T[0][0], most_common_words_lemm_T[0][0]],
    '2nd Most Common': [most_common_words_basic_T[1][0], most_common_words_stemming_T[1][0], most_common_words_lemm_T[1][0]],
    '3rd Most Common': [most_common_words_basic_T[2][0], most_common_words_stemming_T[2][0], most_common_words_lemm_T[2][0]],
    '4th Most Common': [most_common_words_basic_T[3][0], most_common_words_stemming_T[3][0], most_common_words_lemm_T[3][0]],
    '5th Most Common': [most_common_words_basic_T[4][0], most_common_words_stemming_T[4][0], most_common_words_lemm_T[4][0]],
    '6th Most Common': [most_common_words_basic_T[5][0], most_common_words_stemming_T[5][0], most_common_words_lemm_T[5][0]],
    '7th Most Common': [most_common_words_basic_T[6][0], most_common_words_stemming_T[6][0], most_common_words_lemm_T[6][0]],
    '8th Most Common': [most_common_words_basic_T[7][0], most_common_words_stemming_T[7][0], most_common_words_lemm_T[7][0]],
    '9th Most Common': [most_common_words_basic_T[8][0], most_common_words_stemming_T[8][0], most_common_words_lemm_T[8][0]],
    '10th Most Common': [most_common_words_basic_T[9][0], most_common_words_stemming_T[9][0], most_common_words_lemm_T[9][0]]
})
results_df.set_index('Method', inplace=True)

#Applying basic styling to the table 
styled_df = results_df.style.set_properties(**{
    'background-color': 'white',  #Background color
    'color': 'black',             #Font color
    'border-color': 'black',      #Border color
    'border-style': 'solid',      #Border style
    'border-width': '1px'         #Border width
}).set_table_styles([{
    'selector': 'th',
    'props': [('background-color', '#f4f4f4'), ('color', 'black')]  #Header styling
}])

styled_df

Unnamed: 0_level_0,1st Most Common,2nd Most Common,3rd Most Common,4th Most Common,5th Most Common,6th Most Common,7th Most Common,8th Most Common,9th Most Common,10th Most Common
Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Basic,football,neural,data,training,based,team,performance,basketball,artificial,sports
Stemming,footbal,network,team,perform,neural,data,train,base,sport,injuri
Lemmatization,network,team,football,neural,data,base,sport,performance,injury,basketball


***Conclusion***: Lemmatization considers the context and part of speech of a word, leading to more accurate results as it reduces words to their dictionary form, but requires more computational power. Stemming is robust for search and indexing purposes as the exact form of a word is less important and is faster. It's interesting to note that some words such as "team" and "basketball" have drastic differences across the methods, whereas words such as "football" have more consistent positioning.

### 2. Abstract Only 

In [408]:
#Loading the CSV file with references
ref_abs = pd.read_csv(os.path.join('..','results','paper_refs_abstracts.csv'))
ref_abs

Unnamed: 0,Author,Title,Abstract,Year
0,"Russell S, Norvig P",Artificial Intelligence: a modern approach,From the enraged robots in the 1920 play R.U.R...,2015.0
1,"Witten IH, Frank E, Hall MA, et al",Data Mining: practical Machine Learning tools ...,,
2,"Zaki MJ, Meira Jr, W",Data Mining and analysis: fundamental concepts...,,
3,"Passfield L, Hopker JG",A mine of information: can sports analytics pr...,This paper explores the notion that the availa...,2017.0
4,"Rein R, Memmert D",Big data and tactical analysis in elite soccer...,Until recently tactical analysis in elite socc...,2016.0
...,...,...,...,...
98,"Dalton-Barron NE, McLaren SJ, Black CJ, et al",Identifying contextual influences on training ...,"Dalton-Barron, NE, McLaren, SJ, Black, CJ, Gra...",2021.0
99,"McLaren SJ, Weston M, Smith A, et al",Variability of physical performance and player...,The aims of this study were to establish sourc...,2021.0
100,"Oliveira WK, Jesus K, Andrade AD, et al",Monitoring training load in beach volleyball p...,,
101,"Düking P, Achtzehn S, Holmberg HC, Sperlich B",Integrated framework of load monitoring by a c...,Athletes schedule their training and recovery ...,2018.0


 #### 2.1 Finding the Most Common Words in Abstract (Basic)

In [409]:
#Function for cleaning and preprocessing the abstracts still using NLTK
def preprocess_text(texts):
    prepositions = set([
    
    ])
    #Filtering out common words that are "meaningless" (such as prepositions) using stop words
    stop_words = set(stopwords.words('english')).union(prepositions)

    cleaned_texts = []
    for text in texts:
        if isinstance(text, str):
            #Replace hyphens and slashes with spaces, then split the text into words
            #Lowercasing, punctuation removal, word tokenization, and stop word filtering
            words = text.lower().translate(str.maketrans('-/', '  ')).split()
            #Split on other punctuations and filter out stop words
            words = [word for part in words for word in part.translate(str.maketrans('', '', string.punctuation)).split() if word not in stop_words]
            cleaned_texts.extend(words)

    return cleaned_texts
#Preprocessing abstracts and counting
cleaned_abstracts = preprocess_text(ref_abs['Abstract'])
word_counts = Counter(cleaned_abstracts)
most_common_words_basic_A = word_counts.most_common(10)

print(most_common_words_basic_A)


[('data', 66), ('training', 51), ('team', 50), ('performance', 50), ('match', 31), ('risk', 26), ('injury', 26), ('players', 26), ('used', 25), ('sports', 24)]


This Method is limited, as words like "sport" and "sports" will be counted separately. We will further apply lemmatization and stemming to address this.

#### 2.2 Finding the Most Common Words in Abstract (Stemming)

In [410]:
from nltk.stem import PorterStemmer
#Function for cleaning and preprocessing the abstracts
def preprocess_text(texts):
    prepositions = set([
        
    ])
    #Filtering out common words that are "meaningless" (such as prepositions) using stop words
    stop_words = set(stopwords.words('english')).union(prepositions)
    stemmer = PorterStemmer()
    #Using PorterStemmer() for stemming and further lowercasing, punctuation removal, word tokenization, and stop word filtering
    cleaned_texts = []
    for text in texts:
        if isinstance(text, str):
            #Replace hyphens and slashes with spaces
            words = text.lower().translate(str.maketrans('-/', '  ')).split()
            words = [word for part in words for word in part.translate(str.maketrans('', '', string.punctuation)).split() if word not in stop_words]
            stemmed_words = [stemmer.stem(word) for word in words]
            cleaned_texts.extend(stemmed_words)

    return cleaned_texts
#Preprocessing abstracts and counting
cleaned_abstracts = preprocess_text(ref_abs['Abstract'])
word_counts = Counter(cleaned_abstracts)
most_common_words_stemming_A = word_counts.most_common(10)

print(most_common_words_stemming_A)


[('data', 66), ('perform', 62), ('train', 60), ('use', 59), ('team', 58), ('sport', 42), ('player', 41), ('injuri', 36), ('studi', 35), ('model', 35)]


#### 2.3 Most Common words in Abstract (Lematization)

In [411]:
import nltk

nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sonayavrumyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sonayavrumyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [412]:
#Ensure necessary NLTK resources are downloaded
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

#Function to map NLTK's part of speech tags to those used by WordNet
def get_wordnet_pos(word):
    """Map POS tag to the first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

#Function for cleaning and preprocessing the abstracts with lemmatization
def preprocess_text(texts):
    prepositions = set([
       
    ])
    stop_words = set(stopwords.words('english')).union(prepositions)
    lemmatizer = WordNetLemmatizer()
#Lowercasing, punctuation removal, word tokenization, lemmatizing and stop word filtering
    cleaned_texts = []
    for text in texts:
        if isinstance(text, str):
            #Replace hyphens and slashes with spaces
            words = text.lower().translate(str.maketrans('-/', '  ')).split()
            words = [word for part in words for word in part.translate(str.maketrans('', '', string.punctuation)).split() if word not in stop_words]
            lem_words = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in words]
            cleaned_texts.extend(lem_words)

    return cleaned_texts
#Preprocessing the abstracts and counting
cleaned_abstracts = preprocess_text(ref_abs['Abstract'])
word_counts = Counter(cleaned_abstracts)
most_common_words_lemm_A = word_counts.most_common(10)

print(most_common_words_lemm_A)


[('data', 66), ('team', 58), ('use', 51), ('training', 51), ('performance', 51), ('sport', 42), ('player', 41), ('injury', 36), ('study', 35), ('model', 35)]


### 2.4 Displaying All Three Methods Together and Conclusion

In [413]:
#Using pandas for displaying the results side by side in a table for ease of comparison
import pandas as pd
results_df = pd.DataFrame({
    'Method': ['Basic', 'Stemming', 'Lemmatization'],
    '1st Most Common': [most_common_words_basic_A[0][0], most_common_words_stemming_A[0][0], most_common_words_lemm_A[0][0]],
    '2nd Most Common': [most_common_words_basic_A[1][0], most_common_words_stemming_A[1][0], most_common_words_lemm_A[1][0]],
    '3rd Most Common': [most_common_words_basic_A[2][0], most_common_words_stemming_A[2][0], most_common_words_lemm_A[2][0]],
    '4th Most Common': [most_common_words_basic_A[3][0], most_common_words_stemming_A[3][0], most_common_words_lemm_A[3][0]],
    '5th Most Common': [most_common_words_basic_A[4][0], most_common_words_stemming_A[4][0], most_common_words_lemm_A[4][0]],
    '6th Most Common': [most_common_words_basic_A[5][0], most_common_words_stemming_A[5][0], most_common_words_lemm_A[5][0]],
    '7th Most Common': [most_common_words_basic_A[6][0], most_common_words_stemming_A[6][0], most_common_words_lemm_A[6][0]],
    '8th Most Common': [most_common_words_basic_A[7][0], most_common_words_stemming_A[7][0], most_common_words_lemm_A[7][0]],
    '9th Most Common': [most_common_words_basic_A[8][0], most_common_words_stemming_A[8][0], most_common_words_lemm_A[8][0]],
    '10th Most Common': [most_common_words_basic_A[9][0], most_common_words_stemming_A[9][0], most_common_words_lemm_A[9][0]]
})
results_df.set_index('Method', inplace=True)

#Applying basic styling
styled_df = results_df.style.set_properties(**{
    'background-color': 'white',  #Background color
    'color': 'black',             #Font color
    'border-color': 'black',      #Border color
    'border-style': 'solid',      #Border style
    'border-width': '1px'         #Border width
}).set_table_styles([{
    'selector': 'th',
    'props': [('background-color', '#f4f4f4'), ('color', 'black')]  #Header styling
}])

styled_df

Unnamed: 0_level_0,1st Most Common,2nd Most Common,3rd Most Common,4th Most Common,5th Most Common,6th Most Common,7th Most Common,8th Most Common,9th Most Common,10th Most Common
Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Basic,data,training,team,performance,match,risk,injury,players,used,sports
Stemming,data,perform,train,use,team,sport,player,injuri,studi,model
Lemmatization,data,team,use,training,performance,sport,player,injury,study,model


***Conclusion:***  Lemmatization considers the context and part of speech of a word, leading to more accurate results as it reduces words to their dictionary form, but requires more computational power. Stemming is robust for search and indexing purposes as the exact form of a word is less important and is faster. It's interesting to note that the word "data is consistently the most common word across all 3 methods in the abstract and some words like "risk" have greatly varying positioning. Overall, compared to titles the differences are less extreme.

Accross both title and abstract, the most common words vary significantly. In the 3rd part we will discuss what are the most common words in both together.

### 3. Title and Abstract Together