# Building the Comment Scrapper

In the first section, we will build the YouTube Comment Scrapper. Here I will showcase how to build the scrapper, build a DataFrame out of the comments, filter out the wrestler names and add a sentiment to each comment.

The comments were gathered from the YouTube Channel Wrestlelamia's videos.

NOTE: All YouTube comments have been anonymised so that the comments cannot be traced back to any user.

In [4]:
#!pip install selenium
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys

In [5]:
#Variable containing the link to the video to scrape
video_to_scrape = "https://www.youtube.com/watch?v=zjjlRxtcdUE"

#Chrome webdriver has been installed on the system
driver = webdriver.Chrome()
driver.get(video_to_scrape)

#Amount of time on which the code waits to move on to the next comment
scroll_pause_time = 2
#Delay for webdriver in seconds
delay = 5
#True means we are still scrolling; False means we are not scrolling anymore
scrolling = True
#The last current position on the page
last_height = driver.execute_script("return document.documentElement.scrollHeight")
#The list where we will save the comments
all_comments_list = []
#Number of attempts before we switch scrolling to False
scrolling_attempt = 4

def scrape_yt_comments():
    loaded_comments = []
    try:
        all_usernames = WebDriverWait(driver, delay).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#author-text span")))
        all_comments = WebDriverWait(driver, delay).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#content-text")))
        
        for username, comment in zip(all_usernames, all_comments):
            current_comment = {"username": username.text, "comment": comment.text}
            loaded_comments.append(current_comment)
        return loaded_comments
    except Exception as e:
        print(f"Error while scraping comments: {str(e)}")
        return None

while scrolling:
    htmlelement = driver.find_element(By.TAG_NAME, "body")
    htmlelement.send_keys(Keys.END)
    time.sleep(scroll_pause_time)
    new_height = driver.execute_script("return document.documentElement.scrollHeight")
    
    if new_height == last_height:
        scrolling_attempt -= 1
        print(f"Scrolling attempt {scrolling_attempt}")
        if scrolling_attempt == 0:
            scrolling = False
    else:
        scrolling_attempt = 4
    
    last_height = new_height
    
    try:
        last_20_comments = scrape_yt_comments()
        if last_20_comments:
            all_comments_list.extend(last_20_comments)
    except Exception as e:
        print(f"Error while loading comments: {str(e)}")

print("Scraping completed.")

Error while scraping comments: Message: 

Error while scraping comments: Message: 

Error while scraping comments: Message: 

Error while scraping comments: Message: 

Error while scraping comments: Message: 

Error while scraping comments: Message: 

Error while scraping comments: Message: 

Error while scraping comments: Message: 

Error while scraping comments: Message: 

Error while scraping comments: Message: 

Error while scraping comments: Message: 

Scrolling attempt 3
Scrolling attempt 2
Scrolling attempt 1
Scrolling attempt 0
Scraping completed.


In [32]:
#a list with all comments
all_comments_list

[{'comment': 'WWE Fans Who Went Too Far \nWATCH: https://youtu.be/2LFUbtFkOPw'},
 {'comment': "The rain made The Rock's attack on Cody more brutal"},
 {'comment': 'The Rock didn’t steal Roman’s spotlight, Roman lost his spotlight'},
 {'comment': '“Rock Overshadowing Roman” is a great plot twist - and seems intentional. Having those 2 egos being the downfall of the Bloodline feels like a natural (and more compelling) way to put the belt on Cody.'},
 {'comment': 'Maybe if Roman showed up then Rock wouldnt be stealing the spotlight as much.'},
 {'comment': "Man, not only Solo is suffering losses but he's the only Bloodline member without a match at WrestleMania"},
 {'comment': 'the Rock really knew when he claimed that he made pro wrestling fun again'},
 {'comment': 'They got to give Solo a different gimmick because this whole “silent enforcer” thing isn’t doing it.'},
 {'comment': "It was a waste for John Cena to put over Sucko Sikoa and Austin Theory. They've gone no where and done noth

In [33]:
#first loop to enter the list
#for x in all_comments_list:
    #second loop to get rid of the user name
for x in all_comments_list:
    x.pop('username',None)
print(all_comments_list)

[{'comment': 'WWE Fans Who Went Too Far \nWATCH: https://youtu.be/2LFUbtFkOPw'}, {'comment': "The rain made The Rock's attack on Cody more brutal"}, {'comment': 'The Rock didn’t steal Roman’s spotlight, Roman lost his spotlight'}, {'comment': '“Rock Overshadowing Roman” is a great plot twist - and seems intentional. Having those 2 egos being the downfall of the Bloodline feels like a natural (and more compelling) way to put the belt on Cody.'}, {'comment': 'Maybe if Roman showed up then Rock wouldnt be stealing the spotlight as much.'}, {'comment': "Man, not only Solo is suffering losses but he's the only Bloodline member without a match at WrestleMania"}, {'comment': 'the Rock really knew when he claimed that he made pro wrestling fun again'}, {'comment': 'They got to give Solo a different gimmick because this whole “silent enforcer” thing isn’t doing it.'}, {'comment': "It was a waste for John Cena to put over Sucko Sikoa and Austin Theory. They've gone no where and done nothing sign

In [34]:
#here we get the date of video
from bs4 import BeautifulSoup
from bs4 import Comment
import requests

url="https://www.youtube.com/watch?v=zjjlRxtcdUE"
data=requests.get(url).text
soup=BeautifulSoup(data,'html.parser')
test=soup.find('meta', itemprop ='datePublished')

content_value = test.get('content')
content_value.split('T')[0]

'2024-03-27'

In [35]:
#creating a def function to get the data of the video
def get_date(url):
    from bs4 import BeautifulSoup
    from bs4 import Comment
    import requests

    url="https://www.youtube.com/watch?v=zjjlRxtcdUE"
    data=requests.get(url).text
    soup=BeautifulSoup(data,'html.parser')
    test=soup.find('meta', itemprop ='datePublished')

    content_value = test.get('content')
    return content_value.split('T')[0]

In [36]:
import re
#Flatten the list of dictionaries
#flat_list = [item for sublist in all_comments_list for item in sublist] => list comprehension
#here the for-loop
flat_list=[]
#for lst in all_comments_list:
for x in all_comments_list:
    flat_list.append(x)
df=pd.DataFrame(flat_list)
df['comment']=df['comment'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
df.rename(columns={'comment':'comments_{}'.format(get_date("https://www.youtube.com/watch?v=zjjlRxtcdUE"))},inplace=True)
date=pd.to_datetime(get_date("https://www.youtube.com/watch?v=zjjlRxtcdUE"))
#adding the date as the first of the month
df['Date']=date.to_period('M').to_timestamp()
df.drop_duplicates(inplace=True)

In [37]:
df.head()

Unnamed: 0,comments_2024-03-27,Date
0,WWE Fans Who Went Too Far \nWATCH httpsyoutube...,2024-03-01
1,The rain made The Rocks attack on Cody more br...,2024-03-01
2,The Rock didnt steal Romans spotlight Roman lo...,2024-03-01
3,Rock Overshadowing Roman is a great plot twist...,2024-03-01
4,Maybe if Roman showed up then Rock wouldnt be ...,2024-03-01


In [38]:
#stop_words allow us to remove words which are needed to form a proper sentence but do not provide more insight to the text
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words=set(stopwords.words('english'))
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [39]:
#def function to remove lower case words
def remove_lower_case(x):
    filtered_word=[x for x in x if not re.match('^[a-z]+$', x)]
    return filtered_word

In [40]:
test=df['comments_2024-03-27'][0]
words=word_tokenize(test)
#here we apply the stopwords filter on a test set
new_filtered_words=[x for x in words if x.lower() not in stop_words]
new_filtered_words=remove_lower_case(new_filtered_words)
new_filtered_words

['WWE', 'Fans', 'Went', 'Far', 'WATCH', 'httpsyoutube2LFUbtFkOPw']

In [41]:
#here we apply the stopwords filter on a data frame
df['word_comment']=df['comments_2024-03-27'].apply(lambda x: [x for x in word_tokenize(x) if x.lower() not in stop_words])
#here we remove all lower case words
df['word_comment_tag_words']=df['word_comment'].apply(lambda x: remove_lower_case(x))

In [42]:
df['word_comment'][0]

['WWE', 'Fans', 'Went', 'Far', 'WATCH', 'httpsyoutube2LFUbtFkOPw']

In [43]:
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [44]:
#when the compund output is bigger to equal than 0.05, then the comment will be considered as positive
#when the compound output is smaller to equal to -0.05, the the comment will be considered as negative
#otherwise the comment will be considerd as neutral

def comment_sentiment(text):
    nltk.download('vader_lexicon')
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    senti=SentimentIntensityAnalyzer()
    output=senti.polarity_scores(text)
    if output['compound'] >= 0.05:
        return 'Positive'
    elif output['compound'] <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'

In [45]:
df['Sentiment']=[comment_sentiment(x) for x in df.iloc[:,0]]

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-t

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-t

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-t

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-t

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-t

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-t

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gandj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-t

## Crating the Occurance Count

In [46]:
#counter allows us to count the appearance of each word
from collections import Counter
Counter(df['word_comment'][0])

Counter({'WWE': 1,
         'Fans': 1,
         'Went': 1,
         'Far': 1,
         'WATCH': 1,
         'httpsyoutube2LFUbtFkOPw': 1})

Next, we will do that for the whole data set.

In [47]:
step_list=[x for lst in df['word_comment_tag_words'] for x in lst]
df_count=Counter(step_list)
df_count_dict=dict(df_count)
sorted_word_count=dict(sorted(df_count_dict.items(),key=lambda item:item[1],reverse=True))
sorted_word_count

{'Roman': 215,
 'Rock': 175,
 'Cody': 73,
 'Solo': 56,
 'Reigns': 47,
 'WWE': 40,
 'Romans': 33,
 'Bloodline': 22,
 'Ronda': 22,
 'Rocks': 18,
 '2': 17,
 'Punk': 16,
 'Cena': 15,
 'Im': 14,
 'Hes': 13,
 'Wrestlemania': 12,
 'Rousey': 11,
 'WM': 11,
 'WrestleMania': 10,
 'Drew': 10,
 'Ridge': 10,
 'CM': 10,
 'Stephanie': 9,
 '1': 9,
 'Holland': 9,
 'John': 8,
 '3': 8,
 'Rhonda': 8,
 'Thats': 8,
 'Maybe': 7,
 'McMahon': 7,
 'Vince': 7,
 'Seth': 7,
 'Id': 7,
 'Codys': 7,
 'Becky': 7,
 'Rhodes': 6,
 'Rollins': 6,
 'Hollywood': 6,
 'E': 6,
 'Like': 6,
 'Jimmy': 6,
 'Austin': 5,
 'Solos': 5,
 'Big': 5,
 'TV': 5,
 'Dwayne': 5,
 'Paul': 5,
 'Lynch': 5,
 'Theory': 4,
 '034': 4,
 'RAW': 4,
 'Spotlight': 4,
 'McIntyre': 4,
 'Night': 4,
 'NXT': 4,
 'Everyone': 4,
 'Bayley': 4,
 'Jey': 4,
 '34': 4,
 'Heyman': 4,
 'Hogans': 4,
 'Mania': 4,
 'Tribal': 4,
 'AEW': 4,
 'Boss': 4,
 'ROCK': 4,
 'RR': 4,
 'Equal': 4,
 'Sikoa': 3,
 'HHH': 3,
 'Punks': 3,
 'Chief': 3,
 'Women': 3,
 'Raw': 3,
 '6': 3,
 'Funny

In [48]:
df_app=pd.DataFrame(sorted_word_count.items(),columns=['Word','Count'])
date=pd.to_datetime(get_date("https://www.youtube.com/watch?v=zjjlRxtcdUE"))
df_app['Date']=date
#how to filter out non-string values
df_app['Word']=df_app['Word'][df_app['Word'].apply(lambda x: x.isalpha())]
df_app['Word']=df_app['Word'].astype(str)
df_app.head(25)

Unnamed: 0,Word,Count,Date
0,Roman,215,2024-03-27
1,Rock,175,2024-03-27
2,Cody,73,2024-03-27
3,Solo,56,2024-03-27
4,Reigns,47,2024-03-27
5,WWE,40,2024-03-27
6,Romans,33,2024-03-27
7,Bloodline,22,2024-03-27
8,Ronda,22,2024-03-27
9,Rocks,18,2024-03-27


When we leave the data frames as it is, we will run into a big problem. We simply count the occurrences of words/names, for example "Reigns". The wrestler's name is Roman Reigns, but the data frame counts the singal occurrences of Roman, Romans, Reigns etc.

To solve this problem, we will get the full names of the wrestlers and create a dictionary that covers all possible combinations.

In [49]:
#building a webscrapper to get the wrestler names
from bs4 import BeautifulSoup
from bs4 import Comment
import requests

url="https://www.thesmackdownhotel.com/roster/?promotion=wwe&date=all-time"
data=requests.get(url).text
soup=BeautifulSoup(data,'html.parser')
#getting the section where the needed information is loacted
test=soup.find('div', class_ ='roster_section roster2k22')
#list comprehension to get all Names
wrestler=[x.get('title') for x in test.find_all('a')]
wrestler

['Aalyah Mysterio',
 'Abbey Laith',
 'Abraham Washington',
 'Adam Cole',
 'Adam Pearce',
 'Adam Rose',
 'Adnan Virk',
 'Adrian Adonis',
 'Adriana Rizzo',
 'Afa',
 'Ahmed Johnson',
 'Aiden English',
 'AJ Lee',
 'AJ Styles',
 'Aja Kong',
 'Akam',
 'Akebono',
 'Akeem',
 'Akira Maeda',
 'Akira Tozawa',
 'Aksana',
 'Al Snow',
 'Alba Fyre',
 'Alberto Del Rio',
 'Aleah James',
 'Aleister Black',
 'Alex Riley',
 'Alexa Bliss',
 'Alexander Wolfe',
 'Alicia Fox',
 'Alicia Taylor',
 'Aliyah',
 'Alundra Blayze',
 'Alyse Ashton',
 'Amale',
 'Amanpreet Singh',
 'Amari Miller',
 'Amy Weber',
 'Andrade',
 'Andre Chase',
 "Andrea D'Marco",
 'André the Giant',
 'Andy Shepherd',
 'Angel Hayze',
 'Angel',
 'Angelo Dawkins',
 'Angelo Poffo',
 'Antonio',
 'Apollo Crews',
 'Ari Sterling',
 'Arianna Grace',
 'Ariya Daivari',
 'Armando Estrada',
 'Arn Anderson',
 'Arnold Skaaland',
 'Arturo Ruas',
 'Ashante Adonis',
 'Asher Hale',
 'Ashley Massaro',
 'Ashton Smith',
 'Assassin #3',
 'Asuka',
 'Audrey Marie',
 

In [50]:
#splitting each string value by the space
name_split=[x.split(' ') for x in wrestler]
#nested list comprehension
plural_name_split = [[k + 's' for k in x.split(' ')] for x in wrestler]
#splitting each string value by the space and truning all string values into lower case
lower_name_split=[x.lower().split(' ') for x in wrestler]
#nested list comprehension
plural_lower_name_split=[[k + "s" for k in x.lower().split(' ')] for x in wrestler]
#putting each string value from a list into a list of its own
sep_list=[[str(x)] for x in wrestler]
#nested list comprehension
plural_sep_list=[[k + 's' for k in [str(x)]] for x in wrestler]
#putting each string value from a list into a list of its own and transforming all letter to lower case
sep_list_lower=[[str(x.lower())] for x in wrestler]

plural_lower_sep_list=[[k + "s" for k in [str(x.lower())]] for x in wrestler]

dict_value=[]
for x in range(0,len(wrestler)):
    new_input=sep_list[x]+plural_sep_list[x]+sep_list_lower[x]+plural_lower_sep_list[x]+name_split[x]+plural_name_split[x]
    dict_value.append(new_input)
dict_value

[['Aalyah Mysterio',
  'Aalyah Mysterios',
  'aalyah mysterio',
  'aalyah mysterios',
  'Aalyah',
  'Mysterio',
  'Aalyahs',
  'Mysterios'],
 ['Abbey Laith',
  'Abbey Laiths',
  'abbey laith',
  'abbey laiths',
  'Abbey',
  'Laith',
  'Abbeys',
  'Laiths'],
 ['Abraham Washington',
  'Abraham Washingtons',
  'abraham washington',
  'abraham washingtons',
  'Abraham',
  'Washington',
  'Abrahams',
  'Washingtons'],
 ['Adam Cole',
  'Adam Coles',
  'adam cole',
  'adam coles',
  'Adam',
  'Cole',
  'Adams',
  'Coles'],
 ['Adam Pearce',
  'Adam Pearces',
  'adam pearce',
  'adam pearces',
  'Adam',
  'Pearce',
  'Adams',
  'Pearces'],
 ['Adam Rose',
  'Adam Roses',
  'adam rose',
  'adam roses',
  'Adam',
  'Rose',
  'Adams',
  'Roses'],
 ['Adnan Virk',
  'Adnan Virks',
  'adnan virk',
  'adnan virks',
  'Adnan',
  'Virk',
  'Adnans',
  'Virks'],
 ['Adrian Adonis',
  'Adrian Adoniss',
  'adrian adonis',
  'adrian adoniss',
  'Adrian',
  'Adonis',
  'Adrians',
  'Adoniss'],
 ['Adriana Rizzo

In [51]:
wrestler_dict=dict(zip(wrestler,dict_value))
wrestler_dict

{'Aalyah Mysterio': ['Aalyah Mysterio',
  'Aalyah Mysterios',
  'aalyah mysterio',
  'aalyah mysterios',
  'Aalyah',
  'Mysterio',
  'Aalyahs',
  'Mysterios'],
 'Abbey Laith': ['Abbey Laith',
  'Abbey Laiths',
  'abbey laith',
  'abbey laiths',
  'Abbey',
  'Laith',
  'Abbeys',
  'Laiths'],
 'Abraham Washington': ['Abraham Washington',
  'Abraham Washingtons',
  'abraham washington',
  'abraham washingtons',
  'Abraham',
  'Washington',
  'Abrahams',
  'Washingtons'],
 'Adam Cole': ['Adam Cole',
  'Adam Coles',
  'adam cole',
  'adam coles',
  'Adam',
  'Cole',
  'Adams',
  'Coles'],
 'Adam Pearce': ['Adam Pearce',
  'Adam Pearces',
  'adam pearce',
  'adam pearces',
  'Adam',
  'Pearce',
  'Adams',
  'Pearces'],
 'Adam Rose': ['Adam Rose',
  'Adam Roses',
  'adam rose',
  'adam roses',
  'Adam',
  'Rose',
  'Adams',
  'Roses'],
 'Adnan Virk': ['Adnan Virk',
  'Adnan Virks',
  'adnan virk',
  'adnan virks',
  'Adnan',
  'Virk',
  'Adnans',
  'Virks'],
 'Adrian Adonis': ['Adrian Adonis'

Due to many reoccuring names we need to adjust the dictionary a little bit. The following values will be adjusted:

- Aalyah Mysterio
- AJ Lee
- Adam Cole
- Adam Pearce
- Adam Rose
- Angel Hayze
- Austin Aries
- Austin Theory
- Bam Neely
- Big Boss Man
- Big Cass
- Big Daddy V
- Big John Studd
- Big Show
- Billy Kidman
- Bill Watts
- Blake Beverly
- Bobby Fish
- Bobby Heenan
- Chris Benoit
- Chris Candido
- Chris Kanyon
- Chris Masters
- Chris Park
- Dominik Mysterio
- Drake Maverick
- Duke Droese
- Eddie Dennis
- Eddie Gilbert
- Eddie Graham
- Eric Bischoff
- Jeff Hardy
- Jeff Jarrett
- Jimmy Hart
- Jimmy Jacobs
- Jimmy Smith
- Jimmy Snuka
- John Cena
- John Laurinaitis
- John Morrison
- Kevin Nash
- Randy Savage
- Rob Conway
- Rob Gronkowski
- Shawn Daivari
- The Rock
- The Miz
- The Undertaker

In [53]:
del wrestler_dict['Aalyah Mysterio'][5]
    del wrestler_dict['Aalyah Mysterio'][6]
    del wrestler_dict['AJ Lee'][4]
    del wrestler_dict['AJ Lee'][5]
    del wrestler_dict['The Rock'][4]
    del wrestler_dict['The Rock'][5]
    del wrestler_dict['The Miz'][4]
    del wrestler_dict['The Miz'][5]
    del wrestler_dict['The Undertaker'][4]
    del wrestler_dict['The Undertaker'][5]
    del wrestler_dict['John Morrison'][4]
    del wrestler_dict['John Morrison'][5]
    del wrestler_dict['Austin Theory'][4]
    del wrestler_dict['Austin Theory'][5]
    del wrestler_dict['Dominik Mysterio'][5]
    del wrestler_dict['Dominik Mysterio'][6]
    del wrestler_dict['Chris Benoit'][0]
    del wrestler_dict['Chris Benoit'][0]
    del wrestler_dict['Chris Benoit'][0]
    del wrestler_dict['Chris Benoit'][0]
    del wrestler_dict['Chris Benoit'][0]
    del wrestler_dict['Chris Benoit'][1]
    first_val=['Adam Cole','Adam Pearce','Adam Rose','Angel Hayze','Austin Aries','Bam Neely','Big Boss Man','Big Cass','Big Daddy V',
              'Big John Studd','Big Show','Billy Kidman','Bill Watts','Blake Beverly','Bobby Fish','Bobby Heenan','Chris Candido',
              'Chris Kanyon','Chris Masters','Chris Park','Drake Maverick','Duke Droese','Eddie Dennis','Eddie Gilbert','Eddie Graham',
              'Eric Bischoff','Jeff Hardy','Jeff Jarrett','Jimmy Hart','Jimmy Jacobs','Jimmy Smith','Jimmy Snuka','Kevin Nash','Randy Savage',
              'Rob Conway','Rob Gronkowski','Shawn Daivari','Shawn Spears','Shawn Stasiak','Luther Reigns','Rocco Rock','Drew Gulak',
              'John Laurinaitis','Linda McMahon','Mr. Stone']
    for x in first_val:
        del wrestler_dict[x][4:]
    wrestler_dict['WWE']=['WWE','wwe']
    wrestler_dict['Raw']=['RAW','Raw','raw']
    wrestler_dict['Smack Down']=['Smack Down','smack down','Smack down','Smack','smack','Down','down','Smackdown','SmackDown',
                                'smackdown']
    wrestler_dict['WrestleMania']=['WrestleMania','wrestlemania','WM','wm','Wrestlemania','Mania','mania']
    wrestler_dict['Royal Rumble']=['Royal Rumble','royal rumble','RR','rr','Royal rumble']
    wrestler_dict['Elimination Chamber']=['Elimination Chamber','elimination chamber','EC','ec','Elimination chamber']
    wrestler_dict['NXT Vengeance Day']=['Vengeance Day','vengeance day']
    wrestler_dict['NXT Roadblock']=['Roadblock','roadblock']
    wrestler_dict['NXT Stand & Deliver']=['Stand & Deliver','stand & deliver']
    wrestler_dict['Backlash']=['Backlash','backlash']
    wrestler_dict['NXT Battleground']=['Battleground','battleground']
    wrestler_dict['Money in the Bank']=['Money in the Bank','vengeance day','money in the bank','MITB','mitb']
    wrestler_dict['NXT Heatwave']=['Heatwave','heatwave']
    wrestler_dict['Summer Slam']=['Summer Slam','summer slam']
    wrestler_dict['Bash in Berlin']=['Bash in Berlin','bash in berlin','BIB','bib']
    wrestler_dict['Survivor Series']=['Survivor Series','survivor series','Survivor series']
    wrestler_dict['The Bloodline']=['Bloodline','bloodline','The Bloodline','the bloodline']
    wrestler_dict['The Judgment Day']=['The Judgment Day','the judgment day','Judgment Day','judgment day', 'Judgment','judgment',
                                      'Day','day']
    wrestler_dict['Seth Rollins']=['Seth Rollins','Seth Rollinss','Seth','Seth ','Seths','seth','seths','Rollins','rollins','Rollinss','rollinss']
    wrestler_dict['McMahon']=['McMahon','mcmahon','Mcmahon','mcMahon','McMahons','mcmahons','Mcmahons','mcMahons']
    wrestler_dict['Golden Era']=['Golden Era','golden era','Golden','golden']
    wrestler_dict['Attitude Era']=['Attitude Era','attitude era','Attitude','attitude','Era','era']
    wrestler_dict['Ruthless Aggression Era']=['Ruthless Aggression Era','ruthless aggression era',
                                              'Ruthless','ruthless','Aggression','aggression',]
    wrestler_dict['PG Era']=['PG Era','pg era','PG','pg']
    wrestler_dict['New Era']=['New Era','new era','New','new']
    wrestler_dict['The Rock'].append('Dwayne')
    wrestler_dict['The Rock'].append('Dwaynes')
    wrestler_dict['The Rock'].append('dwayne')
    wrestler_dict['The Rock'].append('dwaynes')
    del wrestler_dict['Seth ']
    wrestler_dict['Stone Cold Steve Austin']=wrestler_dict['Steve Austin']
    del wrestler_dict['Steve Austin']
    wrestler_dict['Stone Cold Steve Austin'].append('Stone')
    wrestler_dict['Stone Cold Steve Austin'].append('Stones')
    wrestler_dict['Stone Cold Steve Austin'].append('stone')
    wrestler_dict['Stone Cold Steve Austin'].append('stones')
    wrestler_dict['Stone Cold Steve Austin'].append('Cold')
    wrestler_dict['Stone Cold Steve Austin'].append('Colds')
    wrestler_dict['Stone Cold Steve Austin'].append('cold')
    wrestler_dict['Stone Cold Steve Austin'].append('colds')
    wrestler_dict['Stone Cold Steve Austin'].append('Stone Cold')
    wrestler_dict['Stone Cold Steve Austin'].append('Stone Colds')
    wrestler_dict['Stone Cold Steve Austin'].append('Stone cold')
    wrestler_dict['Stone Cold Steve Austin'].append('Stone colds')
    wrestler_dict['Stone Cold Steve Austin'].append('stone cold')
    wrestler_dict['Stone Cold Steve Austin'].append('stone cold')
    wrestler_dict['Shawn Michaels'].append('HBK')
    wrestler_dict['Shawn Michaels'].append('HBKs')
    wrestler_dict['Shawn Michaels'].append('hbk')
    wrestler_dict['Shawn Michaels'].append('hbks')
    del wrestler_dict["Paul "]
    wrestler_dict['Triple H']=['Triple H','Triple Hs','triple h','triple hs','Triple','Triples','triple','triples','H','Hs',
                              'h','hs','HHH','HHHs','hhh','hhhs','Hunter','Hunters','hunter','hunters']

In [55]:
#def function to get the key for the corrosponding value
def find_key(dictionary, value):
    keys = []
    for key, val in dictionary.items():
        if value in val:
            keys.append(key)
    return keys if keys else None  # If no matching keys found, return None

#mapping the column values to the matching dictionary values
df_app['Wrestler'] = df_app['Word'].map(lambda x: find_key(wrestler_dict, x))
#this line allows us to remove the list from each value, otherwise the "Wrestler" column would have 
df_app['Wrestler'] = df_app['Wrestler'].apply(lambda x: x[0] if isinstance(x, list) else x)

In [None]:
wrestler_dict

In [56]:
df_app.head(75)

Unnamed: 0,Word,Count,Date,Wrestler
0,Roman,215,2024-03-27,Roman Reigns
1,Rock,175,2024-03-27,The Rock
2,Cody,73,2024-03-27,Cody Rhodes
3,Solo,56,2024-03-27,Solo Sikoa
4,Reigns,47,2024-03-27,Roman Reigns
...,...,...,...,...
70,HHH,3,2024-03-27,
71,Punks,3,2024-03-27,CM Punk
72,Chief,3,2024-03-27,Chief Jay Strongbow
73,Women,3,2024-03-27,


In [57]:
df_app.groupby(['Wrestler'])['Count'].sum().sort_values(ascending=False).to_frame().head(20)

Unnamed: 0_level_0,Count
Wrestler,Unnamed: 1_level_1
Roman Reigns,295
The Rock,198
Cody Rhodes,86
Solo Sikoa,64
WWE,40
Ronda Rousey,34
WrestleMania,33
CM Punk,29
John Cena,23
The Bloodline,22
