# **Web Scraping**

## Importing modules

In [None]:
!pip install praw &> /dev/null
!pip install scrubadub &> /dev/null #to autodetect the mail and phone numbers
!pip install spacy &> /dev/null #to identify with NLP parts of the sentence
!pip install tweepy &> /dev/null
!apt-get update &> /dev/null
!apt install chromium-chromedriver &> /dev/null
%pip install selenium &> /dev/null
!cp /usr/lib/chromium-browser/chromedriver /usr/bin &> /dev/null

In [None]:
#Scraper modules
import praw #Reddit
import tweepy #Twitter
#Note: Youtube has special modules listed in its cell.

#Filtering modules
import re
import spacy
import scrubadub

#Data modules
import pandas as pd
from datetime import datetime

## Filtering function

In [None]:
#Filtering function

# Load spaCy's small English pipeline; it is used by filter() for
# part-of-speech and named-entity tagging.
model = spacy.load('en_core_web_sm')

# Drop every tokenizer special-case rule whose key contains an apostrophe
# (straight or curly). Intended to stop spaCy from splitting words such as
# contractions at the apostrophe.
_APOSTROPHES = ("'", "’", "‘")
model.tokenizer.rules = {
    rule_key: rule
    for rule_key, rule in model.tokenizer.rules.items()
    if not any(mark in rule_key for mark in _APOSTROPHES)
}

# Pieces of the character class stripped by remove_emojis(), in the order
# they are concatenated into the pattern.
# NOTE: U+24C2-U+1F251 and U+10000-U+10FFFF are very broad; besides emoji they
# also cover e.g. CJK ideographs, so such text is removed as well.
_EMOJI_CLASS_PARTS = (
    "\U0001F600-\U0001F64F",  # emoticons
    "\U0001F300-\U0001F5FF",  # symbols & pictographs
    "\U0001F680-\U0001F6FF",  # transport & map symbols
    "\U0001F1E0-\U0001F1FF",  # regional indicators (flags)
    "\U00002500-\U00002BEF",  # box drawing .. miscellaneous symbols and arrows
    "\U00002702-\U000027B0",  # dingbats
    "\U00002702-\U000027B0",  # (same range repeated, harmless)
    "\U000024C2-\U0001F251",  # broad catch-all range
    "\U0001f926-\U0001f937",  # gesture emoji (face palm .. shrug)
    "\U00010000-\U0010ffff",  # every supplementary-plane code point
    "\u2640-\u2642",          # gender signs
    "\u2600-\u2B55",          # miscellaneous symbols
    "\u200d",                 # zero width joiner
    "\u23cf",                 # eject symbol
    "\u23e9",                 # fast-forward symbol
    "\u231a",                 # watch
    "\ufe0f",                 # variation selector-16
    "\u3030",                 # wavy dash
)

# Compiled once: one or more consecutive characters from the class above.
_EMOJI_PATTERN = re.compile("[" + "".join(_EMOJI_CLASS_PARTS) + "]+", re.UNICODE)


def remove_emojis(text):
    """
    Strip emoji and other pictographic characters from a string.
    params:
      text(str): Text to clean.
    output:
      (str): Copy of text with every run of matching characters deleted
             (nothing is inserted in their place).
    """
    return _EMOJI_PATTERN.sub('', text)

def filter(text, socialnet="", GPE=True, PERSON=True):
  """
  Filter that removes confidential data from a text.

  Steps, in order: emojis are stripped (remove_emojis), tokens tagged by
  spaCy as one of the selected entity types are dropped, the text is cleaned
  with scrubadub, URLs are deleted and, for Twitter, @user mentions are
  deleted.

  Note: the text is rebuilt token by token (a space before every token that
  is not punctuation), so the spacing of the result can differ from the
  input.

  params:
    text(str): Text to filter.
    socialnet(str): Removes username. "twitter" is supported.
    GPE(bool): If True, removes Geographical locations.
    PERSON(bool): If True, removes personal names.
  output:
    filtstr(str): Filtered string. Dates (DATE entities) are always removed.
  """
  text = remove_emojis(text)
  textspacy = model(text)

  # Entity labels whose tokens are dropped; DATE is not optional.
  entities = {'DATE'}
  if GPE:
    entities.add("GPE")
  if PERSON:
    entities.add("PERSON")

  pieces = []
  for word in textspacy:
    if word.ent_type_ in entities:
      continue  # confidential token: leave it out entirely
    if word.pos_ == 'PUNCT':
      pieces.append(word.text)  # punctuation sticks to the previous token
    else:
      pieces.append(' {}'.format(word.text))
  filtstr = ''.join(pieces)

  # Drop the separator added in front of the first kept token. Only a space
  # is removed: when the text starts with punctuation there is no leading
  # space and the first character must be kept.
  if filtstr.startswith(' '):
    filtstr = filtstr[1:]

  # FilthRemover is used so that detected items are removed rather than
  # replaced by placeholders such as {{EMAIL}}.
  scrubber = scrubadub.Scrubber(post_processor_list=[scrubadub.post_processors.FilthRemover(),])

  filtstr = scrubber.clean(filtstr, replace_with='identifier') #Cleaning mails and phone numbers

  #Cleaning urls: https://stackoverflow.com/questions/11331982/how-to-remove-any-url-within-a-string-in-python
  regex_url=r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'''
  filtstr = re.sub(regex_url, "", filtstr)

  if socialnet == "twitter":
    # Deletes @user mentions, together with a comma directly before or after.
    filtstr = re.sub(r"\@\w+[,]|\@\w+|[,]\@\w+", "", filtstr)

  return filtstr


In [None]:
# Quick check of the @user pattern that filter() applies for socialnet="twitter":
# the mention is deleted but the spaces around it stay, hence the double space
# in the result.
re.sub(r"\@\w+[,]|\@\w+|[,]\@\w+","", "sus @amongus sus us")

'sus  sus us'

## **Reddit API**

In [None]:
import os

import praw

# Credentials are read from environment variables so that no secret is stored
# in the notebook. Set REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USERNAME
# and REDDIT_PASSWORD before running this cell; a missing variable raises a
# KeyError naming it.
# SECURITY: credentials that were previously committed in this cell must be
# considered leaked and should be revoked/rotated.
reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    username=os.environ["REDDIT_USERNAME"],
    password=os.environ["REDDIT_PASSWORD"],
    redirect_uri="http://localhost:8081",
    user_agent="second agent",
)

# Checking everything is ok: prints the name of the authenticated user.
print(reddit.user.me())

It appears that you are using PRAW in an asynchronous environment.
It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



xelastro


### Testing Reddit user output

In [None]:

#Potential edit, change the user you want to see comments from, avoid the /u, only place the username
user = "JohnnyLibRight"

ids = []           # comment IDs, in the order they were retrieved
comment_cont = []  # one row per comment: [raw body, filtered body, date]

# limit=None can retrieve at most 1000 comments due to Reddit's cache;
# 35 is used here to keep the waiting time short.
# The listing is read directly: looking every comment up again by its id
# would cost one extra request per comment.
for comment in reddit.redditor(user).comments.new(limit=35):
    ids.append(str(comment))
    # NOTE(review): fromtimestamp() yields a naive datetime in the machine's
    # local timezone while created_utc is a UTC epoch - confirm this is intended.
    comment_cont.append([
        comment.body,
        filter(comment.body),
        datetime.fromtimestamp(comment.created_utc),
    ])

#Creating the dataframe
data = pd.DataFrame(comment_cont, columns = ['Post', 'Filtered', 'Date'])
filtered_posts = data[['Filtered']]
print(filtered_posts)
print(data.head(15))

#Generating the full .csv
path = user + '_Reddit.csv'
data.to_csv(path)

#Generating the .csv for only the filtered posts for topic modeling
from google.colab import files
path2 = user + '_Filtered_Reddit.csv'
filtered_posts.to_csv(path2)
files.download(path2)

It appears that you are using PRAW in an asynchronous environment.
It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It appears that you are using PRAW in an asynchronous environment.
It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It appears that you are using PRAW in an asynchronous environment.
It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It appears that you are using PRAW in an asynchronous environment.
It is strongly recommended to use Async PRAW: https://asyncpraw.readt

                                             Filtered
0                                                 Mr.
1                                                    
2   The singer commented on a post detailing the s...
3   The singer commented on a post detailing the s...
4                         Ye vs The People underrated
5    I could tell was the mind behind Seinfeld Wan...
6    I could tell was the mind behind Seinfeld \n ...
7                                            Source: 
8                                                  or
9   Let ’s get em the dirt cake It ’s his favorite...
10                                       WestTaleEver
11                           I found this from Edrick
12  A state investigation launched after ’s killin...
13  Rose was allowed to remain on the force for de...
14  An assailant opened fire during a death metal ...
15                                             YZY PF
16                              Oh yeah? Watch this!-
17  Nah this match was Sport

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## **Twitter API**

### Twitter Api declaration

In [None]:
#Run the credentials
import os

import tweepy

# Keys and tokens are read from environment variables so that no secret is
# stored in the notebook. Set TWITTER_CONSUMER_KEY, TWITTER_CONSUMER_SECRET,
# TWITTER_ACCESS_TOKEN and TWITTER_ACCESS_TOKEN_SECRET before running this
# cell; a missing variable raises a KeyError naming it.
# SECURITY: keys that were previously committed in this cell must be
# considered leaked and should be revoked/regenerated.
consumer_key = os.environ["TWITTER_CONSUMER_KEY"]
consumer_secret = os.environ["TWITTER_CONSUMER_SECRET"]
access_token = os.environ["TWITTER_ACCESS_TOKEN"]
access_token_secret = os.environ["TWITTER_ACCESS_TOKEN_SECRET"]

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

### Twitter status by user:

In [None]:
userID = "economics"#<-----User without @ 
maxcount=5000 #<-------------Number of tweets 

#Twitter Api retrieves only a finite amount of tweets per request.
#Therefore the timeline is paged backwards: every request asks for tweets
#older than the oldest tweet received so far.

tweets = []
maxid = None  # no upper bound on the first request: start at the newest tweet
while len(tweets) < maxcount:
  request_args = dict(
      screen_name=userID,
      # 200 is the maximum allowed count. It is applied before replies and
      # retweets are excluded, so a page can hold fewer than 200 tweets.
      count=200,
      include_rts=False,
      exclude_replies=True,
      # Necessary to keep full_text,
      # otherwise only the first 140 characters are extracted.
      tweet_mode='extended',
  )
  if maxid is not None:
    request_args['max_id'] = maxid
  parcial_tweets = api.user_timeline(**request_args)
  tweets.extend(parcial_tweets)
  print(f"progress: {len(tweets)}")
  if len(parcial_tweets) == 0:
    break  # nothing older was returned (also covers an empty timeline)
  # max_id is inclusive; subtract 1 so the oldest tweet is not returned again.
  maxid = parcial_tweets[-1].id - 1

# The last page may overshoot the requested amount.
tweets = tweets[0:maxcount]
filename=userID
print("Total tweets: "+str(len(tweets)))

progress: 150
progress: 282
progress: 408
progress: 589
progress: 728
progress: 857
progress: 1024
progress: 1183
progress: 1333
progress: 1484
progress: 1643
progress: 1769
progress: 1909
progress: 2085
progress: 2237
progress: 2390
progress: 2445
progress: 2445
Total tweets: 2445


### Twitter status by query

In [None]:
query="Monterrey -filter:retweets"
#More info about twitter queries;
#https://developer.twitter.com/en/docs/twitter-api/v1/rules-and-filtering/search-operators

# Tweepy 4 renamed API.search to API.search_tweets; use whichever the
# installed version provides. The query is passed by keyword, which both
# versions accept.
_search_tweets = getattr(api, "search_tweets", None) or api.search

# NOTE: this overwrites the `tweets` and `filename` set by the user-timeline
# cell; the export cell processes whichever of the two cells ran last.
tweets = _search_tweets(q=query,
                        count=100,
                        tweet_mode='extended')
filename="Monterrey-tweets-100"

### Testing Twitter user output: Filter and export

In [None]:
# One row per tweet: original text, filtered text and creation date.
# Locations and personal names are kept (GPE/PERSON disabled); only the
# @user mentions and the always-on filters are applied.
rows = [
    [
        status.full_text,
        filter(status.full_text, socialnet="twitter", GPE=False, PERSON=False),
        status.created_at,
    ]
    for status in tweets
]

df_twitter = pd.DataFrame(rows, columns=["Tweet", "Filtered", "Date"])
filtered_tweets = df_twitter[['Filtered']]
print(df_twitter.head(10))

# Full table -> <filename>_TW.csv
path3 = filename + '_TW.csv'
df_twitter.to_csv(path3)

# Filtered column only (input for topic modeling) -> <filename>_Filtered_TW.csv,
# which is also offered as a browser download through Colab.
from google.colab import files
path4 = filename + '_Filtered_TW.csv'
filtered_tweets.to_csv(path4)
files.download(path4)

                                               Tweet  \
0  Waste-to-hydrogen technology firm Concord Blue...   
1  Bitcoin is struggling to gain traction in El S...   
2  The Star Ferry, an icon of Hong Kong and argua...   
3  Food and energy price surges worsened by the U...   
4  President Joe Biden is considering forgiving a...   
5  Inflation, lingering supply chain disruptions ...   
6  Few places in the U.S. have felt the sting of ...   
7  Warren Buffett’s Berkshire Hathaway pulled bac...   
8  The EU seeks a ban on Russian oil by the end o...   
9  Mexico's fastest inflation in two decades may ...   

                                            Filtered    Date of creation  
0  Waste- to- hydrogen technology firm Concord Bl... 2022-04-30 18:29:21  
1  Bitcoin is struggling to gain traction in El S... 2022-04-30 18:08:05  
2  The Star Ferry, an icon of Hong Kong and argua... 2022-04-30 17:35:35  
3  Food and energy price surges worsened by the U... 2022-04-30 17:08:03  
4  Presi

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## **Youtube API**

In [None]:
#Reference: https://stackoverflow.com/questions/63608189/scraping-youtube-comments-using-selenium-and-google-colab-is-slow
import sys
import time

from selenium import webdriver
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Kept from the referenced recipe.
# NOTE(review): sys.path only affects Python module lookup, so this line is
# probably not what makes the chromedriver binary discoverable.
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')

# Headless Chrome configured for a container without display (Colab).
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

# The driver path is no longer passed positionally: recent Selenium releases
# removed that argument. The install cell copies chromedriver into /usr/bin,
# so the binary is found on PATH.
wd = webdriver.Chrome(options=chrome_options)

def scrapecomments(url, scrolls=200, max_comments=1000):
  """
  Scrape the comments of a YouTube video with the module-level driver `wd`.

  The page is opened, END is sent to the page body `scrolls` times so that
  more comments get loaded, and then author, text and like count of the
  comments present in the page are collected.

  params:
    url(str): Address of the video page.
    scrolls(int): Number of times END is sent to the page.
    max_comments(int): Maximum number of comments to collect.
  output:
    comments(pd.DataFrame): Columns 'user_id', 'comment', 'likes', 'rank'
      (1-based position in the page) and 'source' (channel name).
  """
  wd.get(url)
  tic = time.perf_counter()
  wait = WebDriverWait(wd, 1)  # every wait below times out after 1 second

  for _ in range(scrolls):
    wait.until(EC.visibility_of_element_located((By.TAG_NAME, "body"))).send_keys(Keys.END)

  def collect_texts(css_selector):
    # Text of the first max_comments elements matching the selector.
    elements = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, css_selector)))
    return [element.text for element in elements[:max_comments]]

  authors = collect_texts("#author-text")
  texts = collect_texts("#content-text")
  likes = collect_texts("#vote-count-middle")

  # zip() stops at the shortest list, so a mismatch between the number of
  # authors, texts and like counters cannot raise an IndexError.
  alldata = list(zip(authors, texts, likes))

  comments = pd.DataFrame(alldata, columns=['user_id','comment','likes'])
  comments['rank'] = range(1, len(comments) + 1)
  # find_element(By.ID, ...) replaces find_element_by_id, which recent
  # Selenium releases no longer provide.
  channel_name = wd.find_element(By.ID, 'channel-name').text
  comments['source'] = channel_name
  toc = time.perf_counter()
  print(f"Completed scraping {len(alldata)} comments in {toc - tic:0.4f} seconds from YouTube {channel_name} channel.")
  return comments

# Video whose comments are scraped; df_comments holds the DataFrame returned
# by scrapecomments() and is used by the export cell.
url = 'https://www.youtube.com/watch?v=tH2tKigOPBU&ab_channel=MarkRober'
df_comments = scrapecomments(url)

### Testing Youtube output

In [None]:
# Imported first: files.download() is used for both exports below.
from google.colab import files

# Filter every scraped comment with the default settings of filter().
df_comments['Filtered'] = df_comments['comment'].apply(filter)
# Filtered column of the YouTube comments (not of the Twitter dataframe).
filtered_comments = df_comments[['Filtered']]

#Generating the .csv
path5 = 'youtube' + '_YT.csv'
df_comments.to_csv(path5)
files.download(path5)

#Generating the .csv for only the filtered posts for topic modeling
path6 = 'youtube' + '_Filtered_YT.csv'
filtered_comments.to_csv(path6)
files.download(path6)

Unnamed: 0,user_id,comment,likes,rank,source,Clean Comments
0,,TOUCHDOWN CONFIRMED!! WE ARE SAFE ON MARS! C...,34K,1,Mark Rober,TOUCHDOWN CONFIRMED!! WE ARE SAFE ON MARS! ...
1,,Thanks for spreading the word!! It's go time 😎,29K,2,Mark Rober,Thanks for spreading the word!! It 's go time
2,LordEmor23,My dad helped make the battery for the rover a...,1.3K,3,Mark Rober,My dad helped make the battery for the rover a...
3,Toronto Guy,Never realized how big the rovers were until p...,1.7K,4,Mark Rober,Never realized how big the rovers were until p...
4,Shiza Soomro,Bro this man never fails to make me emotional ...,266,5,Mark Rober,Bro this man never fails to make me emotional ...
...,...,...,...,...,...,...
995,General Usage,Is “Mars Rover” just “Mark Rober” mispronounced?,2,996,Mark Rober,Is“ Mars Rover” just“ Mark Rober” mispronounced?
996,Jacek-Jan,A Schrodinger Mars rover...\nIt would be terri...,,997,Mark Rober,A Schrodinger Mars rover... \n It would be ter...
997,BLIZZARD,This guy smarter then Elbert,,998,Mark Rober,This guy smarter then Elbert
998,Cooper Keel,Touchdown confirmed perseverance is safe on mars,117,999,Mark Rober,Touchdown confirmed perseverance is safe on mars
