# Data Scraping

In this notebook I will explain the process I made to create the pipeline that can be found in `src/pipeline.py`.

In [1]:
# Libraries

from chat_downloader import ChatDownloader # library for live chat data scraping
import requests as req
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.common.by import By

import time
import json
import datetime

from pymongo import MongoClient
from passwords import STR_CONN # a file with my connection string to the mongodb atlas db

from pysentimiento import create_analyzer # sentiment analysis nlp

First, load analyzers and initialize driver's options

In [9]:
analyzer = create_analyzer(task="sentiment", lang="es")

hate_speech_analyzer = create_analyzer(task="context_hate_speech", lang="es")

In [2]:
opciones=Options()

opciones.add_experimental_option('excludeSwitches', ['enable-automation'])
opciones.add_experimental_option('useAutomationExtension', False)
opciones.headless=False
opciones.add_argument('--start-maximized')
opciones.add_argument('--incognito')

The next part of the notebook will be on a `while True` loop, but for the sake of explanation this will only cover one iteration.

In [5]:
driver = webdriver.Chrome(opciones)

time.sleep(2)

url = 'https://twitchtracker.com/channels/live/spanish' # web with top live streams in spanish at the moment

driver.get(url)

table = driver.find_element(By.CSS_SELECTOR, 'table')
top_5 = table.find_elements(By.CSS_SELECTOR, 'tr')[:5]

users = []

for e in top_5:
    users.append(e.find_elements(By.CSS_SELECTOR, 'a')[1].text.lower())

driver.quit()
users

['thegrefg', 'willyrex', 'lolitofdez', 'mixwell', 'agustin51']

Now that we have the top 5 twtich streamers that are live now, we will take their data. Ideally this would work with all of them at the same time, but we'll take only the first one for this explanation.

In [6]:
user = users[0]

url = f'https://www.twitch.tv/{user}'
chat = ChatDownloader().get_chat(url,
                                retry_timeout = -1, # -1 makes the downloader to retreive a message as soon as is published
                                timeout = 150)      # 150 secs of scrapping
temp = []
for message in chat:                        
    temp.append(message)

In [7]:
# load mongo cursor
cursor = MongoClient(STR_CONN)

db = cursor.live_chats

In [11]:
# video data
vid_id = chat.__dict__['id']
vid_url = url
vid_title = chat.__dict__['title']

video_son = {
            '_id': vid_id,
            'title': vid_title,
            'recording': {
                            'start': datetime.datetime.now(),
                            'finish': ''
                            },
            'sample': True
            }

# try pass just in case the author id is already there
try:
    db.video.insert_one(video_son)
except:
    pass
# message and author
messages = []
commentors = []

for samp in temp:
    # message
    mess = samp['message']
    mess_id = samp['message_id']
    sent = analyzer.predict(mess).__dict__['output']
    hate = hate_speech_analyzer.predict(mess).__dict__['output']
    # common
    ts = samp['timestamp']
    # author
    name = samp['author']['name']
    com_id = samp['author']['id']

    mess_son = {
                '_id': mess_id,
                'message': mess,
                'date': datetime.datetime.now(),
                'timestamp': ts,
                'commentator_id': com_id,
                'video_id': vid_id,
                'sentiment_analysis': sent,
                'hat_speech_analysis': hate,
                'sample': True
                }

    auth_son = {
                '_id': com_id,
                'name': name,
                'last_update': datetime.datetime.now(),
                'sample': True
                }
    
    messages.append(mess_son)
    commentors.append(auth_son)

We clear repeated users and insert into mongo.

In [12]:
curated_comms = []

for commentor in commentors:
    flag = 1
    for comm in curated_comms:
        if (comm) and (comm['_id'] == commentor['_id']):
            flag = 0
            continue
    if flag:
        curated_comms.append(commentor)

db.message.insert_many(messages)
db.user.insert_many(curated_comms)

InsertManyResult(['960088216', '879409122', '461907942', '805116643', '19264788', '508687620', '751653653', '799119213', '779249169', '234620237', '524349985', '486312448', '827931197', '980927106', '910753363', '930795865', '897089591', '528487028', '249679139', '936851964', '615911677', '511822498', '944974851', '889741999', '469535618', '922164801', '502675279', '550093633', '825308369', '704199343', '671957599', '230978364', '235059228', '621010473', '214230291', '981607019', '891301133', '965724325', '478005240', '583310583', '222358961', '524328955', '599269764', '698168684', '520245487', '833321014', '731300701', '413293880', '464936415', '982075112', '562336258', '554991770', '468325883', '752566304', '266418541', '944799204', '234066712', '409747483', '218720600', '407746017', '554757200', '755697060', '591534520', '976732393', '580056840', '526150974', '421117308', '515408485', '981776695', '491243567', '573229538', '544409509', '982073726', '709060161', '715951231', '5338353