# All the information is on the task(master fan wiki)

In this notebook we use Python's BeautifulSoup module to scrape the wording of all the tasks from series 4 of the popular TV show Taskmaster from the fan wiki:

https://taskmaster.fandom.com/wiki/Series_4

with a view to analysing them.

## The main tasks

In [None]:
# Usual imports

import string
from pprint import pprint

import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud

In [None]:
# Fan-wiki page that the rest of the notebook downloads and parses
url = 'https://taskmaster.fandom.com/wiki/Series_4'

In [None]:
# Download the wiki page. The timeout stops the cell from hanging forever on a
# stalled connection, and raise_for_status() surfaces HTTP errors (404, 403, ...)
# here instead of letting an error page be parsed as if it held the task tables.
page = requests.get(url, timeout=30)
page.raise_for_status()

In [None]:
# Parse the downloaded HTML with Python's built-in parser
soup = BeautifulSoup(markup=page.text, features='html.parser')

In [None]:
# Collect every <tr> element that carries the 'tmtablerow' class
all_starts = [row for row in soup.find_all("tr", class_='tmtablerow')]
len(all_starts)

In [None]:
# Putting one child node of every row, as an HTML string, in a single list

all_tasks = []
for row in all_starts:
    children = list(row)
    # Rows with more than two child nodes contribute their fourth child;
    # shorter rows contribute their second.
    # NOTE(review): a row with exactly three children would raise IndexError
    # on children[3] — confirm no such row occurs on the page.
    position = 3 if len(children) > 2 else 1
    all_tasks.append(str(children[position]))

pprint(all_tasks)

In [None]:
# Getting rid of the <td>s from the beginnings

td_open_len = len('<td>')  # 4 characters
all_tasks = [cell_html[td_open_len:] for cell_html in all_tasks]
all_tasks

In [None]:
# Remove the label markup from the beginnings

# (label, number of leading characters to drop). The widths are the ones this
# notebook has always used; they fit markup of the form '<b>Prize:</b> ' (14),
# '<b>Team Live:</b> ' (18), '<b>Team:</b> ' and '<b>Live:</b> ' (13).
# NOTE(review): that markup shape is inferred from the widths — confirm against the page.
# 'Team Live' is listed before 'Team' and 'Live' so the longest label wins.
LABEL_WIDTHS = (
    ('Prize:', 14),
    ('Team Live', 18),
    ('Team', 13),
    ('Live', 13),
)


def strip_label(task):
    """Return `task` without its leading label, if it has one.

    A label only counts when it occurs inside the characters that would be
    removed, and at most one label is removed. Searching the whole string and
    falling through every check chopped the start off any task that merely
    mentions "Team" or "Live" in its wording, and could strip a labelled task
    more than once.
    """
    for label, width in LABEL_WIDTHS:
        if label in task[:width]:
            return task[width:]
    return task


all_tasks = [strip_label(task) for task in all_tasks]
all_tasks

In [None]:
# Removing the stuff from the end: the last seven characters of every entry

all_tasks = list(map(lambda entry: entry[:-7], all_tasks))
all_tasks

In [None]:
# Addressing problematic cases: the entry at a fixed index is overwritten
# with hand-written wording, then displayed.

all_tasks[2] = "Find out the person's full name"
all_tasks[2]

In [None]:
# Overwrite entry 8 with hand-written wording, then display it
all_tasks[8] = 'Also, you must smile at the camera with increasing enthusiasm every 30 seconds'
all_tasks[8]

In [None]:
# Overwrite entry 16 with hand-written wording, then display it
all_tasks[16] = 'Persuade three chickens to stand on the red mat at the same time.'
all_tasks[16]

In [None]:
# Overwrite entry 17 with hand-written wording, then display it
all_tasks[17] = 'Without moving the fishbowls, transfer the water from fishbowl A to fishbowl B. You may only use the items on this table. Also, you must commentate on your attempt throughout the task, always referring to yourself in the third person. If you eat any of the chocolate you will be docked five points.'
all_tasks[17]

In [None]:
# Overwrite entry 21 with hand-written wording, then display it
# NOTE(review): "Before your you commence" looks like a typing slip — confirm
# the wording against the wiki.
all_tasks[21] = 'Make the highest splash. Before your you commence the task you must say who you think will win the task. Those who correctly guess the winner will win an extra five points.'
all_tasks[21]

In [None]:
# Overwrite entry 28 with hand-written wording, then display it
all_tasks[28] = "Maintaining constant eye contact and making continuous small talk with this Swedish person, put on the wetsuit, flippers, face mask and snorkel. Your head may not leave the Swedish person's frame at any point. You may not move the laptop from its current position. Whoever puts the wetsuit on best. Best small talk."
all_tasks[28]

In [None]:
# Overwrite entry 30 with hand-written wording, then display it
all_tasks[30] = 'Seal the top of this bathtub with cling film.'
all_tasks[30]

In [None]:
# Overwrite entry 31 with hand-written wording, then display it
all_tasks[31] = 'Fill the bathtub with water.'
all_tasks[31]

In [None]:
# Overwrite entry 33 with hand-written wording, then display it
all_tasks[33] = 'Make the longest continuous noise. Best noise.'
all_tasks[33]

In [None]:
# Overwrite entry 37 with hand-written wording, then display it
# NOTE(review): "handbag the bag" may be a slip for another verb — confirm the
# wording against the wiki, since the word feeds the counts below.
all_tasks[37] = 'Score the best goal with this plastic bag. You may not handbag the bag. Best goal.'
all_tasks[37]

In [None]:
# Overwrite entry 52 with hand-written wording, then display it
all_tasks[52] = 'Make the most exotic sandwich. Eat your exotic sandwich.'
all_tasks[52]

In [None]:
# Overwrite entry 55 with hand-written wording, then display it
all_tasks[55] = 'Draw the median duck. The median duck alone wins. Best duck picture.'
all_tasks[55]

## NLP

In [None]:
# Break each task into its whitespace-separated tokens
tasks_split = list(map(str.split, all_tasks))
pprint(tasks_split)

In [None]:
# Flatten the per-task token lists into one list of words
task_words = [word for tokens in tasks_split for word in tokens]

task_words

In [None]:
# Lower-case every word
tasks_lower = list(map(str.lower, task_words))
tasks_lower

In [None]:
# Delete every ASCII punctuation character from each word in a single pass.
# A token made up only of punctuation (a stray '-' or '&', say) becomes an empty
# string; those are dropped so that '' is not counted as a word further on.
punctuation_table = str.maketrans('', '', string.punctuation)

tasks_no_punc = [
    stripped
    for stripped in (word.translate(punctuation_table) for word in tasks_lower)
    if stripped
]

tasks_no_punc

In [None]:
# English stop words from NLTK. The corpus is downloaded separately from the
# library, so on a machine that has never fetched it the lookup raises
# LookupError; download it once and retry.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

In [None]:
# Keep only the words that are not stop words
tasks_no_stop = list(filter(lambda word: word not in stop_words, tasks_no_punc))

In [None]:
# Number of words before the punctuation and stop-word steps
len(tasks_lower)

In [None]:
# Number of words left once the stop words have been removed
len(tasks_no_stop)

In [None]:
# Lemmatizing

from nltk.stem import WordNetLemmatizer

# One lemmatizer serves the whole list; there is no need to build one per word.
verb_lemmatizer = WordNetLemmatizer()

# The WordNet data is downloaded separately from the library. Probe with one
# word and fetch the corpus if the lookup fails, so the cell runs on a fresh
# setup. omw-1.4 is fetched too because some NLTK releases need it to load WordNet.
try:
    verb_lemmatizer.lemmatize('tasks', pos="v")
except LookupError:
    nltk.download('wordnet')
    nltk.download('omw-1.4')

# Lemmatizing the verbs
task_verb_lem = [
    verb_lemmatizer.lemmatize(word, pos="v")  # v --> verbs
    for word in tasks_no_stop
]

task_verb_lem

In [None]:
# Second pass: lemmatize the verb-lemmatized words as nouns (pos "n")
noun_lemmatizer = WordNetLemmatizer()
tasks_final = [noun_lemmatizer.lemmatize(word, pos="n") for word in task_verb_lem]

tasks_final

## Word count

In [None]:
# One-column frame of the processed words; the column gets the default label 0
df = pd.DataFrame(data=tasks_final)
df

In [None]:
# The 15 most frequent words
word_counts = df[0].value_counts()
word_counts.nlargest(n=15)

## Word cloud

In [None]:
from os import path
from PIL import Image
from wordcloud import STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

In [None]:
# Join the words into one string; every word is followed by a single space
text = ''.join(word + ' ' for word in tasks_final)

text

In [None]:
# Build the word cloud from the joined text. WordCloud places and colours the
# words at random, so a fixed random_state gives the same picture on every run.
wordcloud = WordCloud(background_color="white", random_state=42).generate(text)

In [None]:
# Draw the cloud on its own axes with the axis decorations hidden
fig, ax = plt.subplots()
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off")
plt.show()