# Sample Site Scraper

Max Rizzuto | DFRLAB 2022

------
This notebook uses Selenium and Firefox's Gecko Driver to the posts from a simplified Twitter clone at [maxrizzuto.com/selenium_example.html](https://maxrizzuto.com/selenium_example.html).
This notebook runs on Python 3.7 and uses Selenium

In [151]:
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException

from fake_useragent import UserAgent

from time import sleep
import pandas as pd
import os

### Load the page

In [10]:
def get_site(page):
    """ Takes page sub-directory as input . 
        Loads maxrizzuto.com/{} with a random user agent to get around anti-crawling measures. 
    """
    def make_fake_user():
        """ A applies "fake_useragent" to help us obfuscate our robo-browser."""
        # Generate User Agent
        ua = UserAgent()
        user_agent = ua.random

        # Set up webdriver with user agent... 
        profile = webdriver.FirefoxProfile()
        profile.set_preference("general.useragent.override", user_agent)
        return profile
    
    profile = make_fake_user()
    
    wd = webdriver.Firefox(profile)
    
    # Make URL with parameters:
    query_url = f"http://maxrizzuto.com/{page}"
    
    # Get website with parameters
    wd.get(query_url)
    sleep(2)
    print("Page Loaded")
                    
    return wd

In [11]:
wd = get_site('selenium_example.html')

Page Loaded


### Gather Page Results

In [86]:
def gather_page_results(wd):
    """ Takes most recent webdriver state as input.
        Returns a list of dictionaries as output containing parsed page results.
    """
    # A list of all the post elements on the page
    posts = wd.find_element(By.CLASS_NAME, "timeline").find_elements(By.XPATH, "./*")
    
    # A list to store each posts' data once its been parsed
    post_list = []
    
    # Post Parsing Loop - calls get_post_data, appends extracted data to post_list 
    for post in posts:   
        
        post_data = {"post_url": [v.get_attribute("href") for v in post.find_elements(By.CLASS_NAME, "tweet-link")],
                    "post_author": [v.get_attribute("title") for v in post.find_elements(By.CLASS_NAME, "fullname")],
                    "post_username": [v.get_attribute("title") for v in post.find_elements(By.CLASS_NAME, "username")],
                    "post_date": [v.get_attribute("title") for v in post.find_element(By.CLASS_NAME, "tweet-date").find_elements(By.XPATH, "./*")],
                    "post_content": [v.text for v in post.find_elements(By.CLASS_NAME, "tweet-content")],
                    "post_card": [v.get_attribute("href") for v in post.find_elements(By.CLASS_NAME, "card-container")],
                    "post_comments": [v.text for v in post.find_element(By.CLASS_NAME, "tweet-stats").find_element(By.CLASS_NAME, "icon-comment").find_elements(By.XPATH,"..")],
                    "post_retweets": [v.text for v in post.find_element(By.CLASS_NAME, "tweet-stats").find_element(By.CLASS_NAME, "icon-retweet").find_elements(By.XPATH,"..")],
                    "post_quotes": [v.text for v in post.find_element(By.CLASS_NAME, "tweet-stats").find_element(By.CLASS_NAME, "icon-quote").find_elements(By.XPATH,"..")],
                    "post_likes": [v.text for v in post.find_element(By.CLASS_NAME, "tweet-stats").find_element(By.CLASS_NAME, "icon-heart").find_elements(By.XPATH,"..")]                
                    }        
        post_list.append(post_data)
    
    return post_list  

In [90]:
page_data = gather_page_results(wd)

In [168]:
page_data

[{'post_url': ['https://twitter.com/DFRLab/status/1565710212895580161#m'],
  'post_author': ['DFRLab'],
  'post_username': ['@DFRLab'],
  'post_date': ['Sep 2, 2022 · 2:36 PM UTC'],
  'post_content': ["A @DFRLab study analyzing early online debates about the war in #Ukraine found RT en Español was one of the most shared sites in Spanish-language Twitter discussions. The network's reach was expanded by Russian MFA accounts and, likely, bots.\nbloomberg.com/news/features/…"],
  'post_card': ['https://www.bloomberg.com/news/features/2022-09-01/ukraine-war-propaganda-from-russia-today-rt-thrives-despite-sanctions?srnd=businessweek-v2'],
  'post_comments': [''],
  'post_retweets': ['7'],
  'post_quotes': ['1'],
  'post_likes': ['7']},
 {'post_url': ['https://twitter.com/EtoBuziashvili/status/1565365831491518464#m'],
  'post_author': ['Eto Buziashvili'],
  'post_username': ['@EtoBuziashvili'],
  'post_date': ['Sep 1, 2022 · 3:47 PM UTC'],
  'post_content': ["Yours truly together with @acarvi

### Clean Data
To clean the data we're going to use Python's Pandas data science library.
<br> Like Selenium, Pandas uses a somewhat different syntax paradigm.
<br> We like pandas because it can turn our list of dictionaries into table data in one line:
```pd.DataFrame(our![image.png](attachment:55ac9c91-e5f5-4299-8e3f-1e97d916e0a2.png)_list_of_dictionaries)```

In [98]:
pd.DataFrame(page_data)

Unnamed: 0,post_url,post_author,post_username,post_date,post_content,post_card,post_comments,post_retweets,post_quotes,post_likes
0,[https://twitter.com/DFRLab/status/15657102128...,[DFRLab],[@DFRLab],"[Sep 2, 2022 · 2:36 PM UTC]",[A @DFRLab study analyzing early online debate...,[https://www.bloomberg.com/news/features/2022-...,[],[7],[1],[7]
1,[https://twitter.com/EtoBuziashvili/status/156...,[Eto Buziashvili],[@EtoBuziashvili],"[Sep 1, 2022 · 3:47 PM UTC]",[Yours truly together with @acarvin on Russia'...,[https://link.medium.com/NlZSnR9aYsb],[2],[17],[3],[43]
2,[https://twitter.com/IST_org/status/1565415812...,[Institute for Security and Technology],[@IST_org],"[Sep 1, 2022 · 7:06 PM UTC]",[Image analysis during wartime is critical to ...,[https://securityandtechnology.org/virtual-lib...,[1],[3],[1],[6]
3,[https://twitter.com/DFRLab/status/15654043537...,[DFRLab],[@DFRLab],"[Sep 1, 2022 · 6:20 PM UTC]","[""The available evidence provided above sugges...",[https://medium.com/dfrlab/twitter-campaign-pu...,[],[3],[],[7]
4,[https://twitter.com/DFRLab/status/15649864510...,[DFRLab],[@DFRLab],"[Aug 31, 2022 · 2:40 PM UTC]",[Chinese Students and Scholars Associations (C...,[https://medium.com/dfrlab/wechat-channels-kee...,[1],[26],[],[33]
5,[https://twitter.com/DFRLab/status/15653341266...,[DFRLab],[@DFRLab],"[Sep 1, 2022 · 1:41 PM UTC]",[#Russia has launched its counter to @Wikipedi...,[https://medium.com/dfrlab/pro-kremlin-wikiped...,[1],[12],[2],[26]
6,[https://twitter.com/NSC/status/15650040014139...,[Iria Puyosa],[@NSC],"[Aug 31, 2022 · 3:50 PM UTC]",[CSSA WeChat channels align with the CCP's com...,[https://medium.com/dfrlab/wechat-channels-kee...,[1],[9],[],[13]
7,[https://twitter.com/DFRLab/status/15646454011...,[DFRLab],[@DFRLab],"[Aug 30, 2022 · 4:05 PM UTC]",[A largely inauthentic Twitter campaign appear...,[https://medium.com/dfrlab/twitter-campaign-pu...,[1],[33],[2],[34]


In [208]:
df = pd.DataFrame(page_data)

list_columns = list(df.columns)

formatted_df = (df.explode(list_columns).fillna("")
                   .assign(post_content = lambda x: x['post_content'].str.replace("\n"," "), # replacing line break characters with spaces because line breaks characters are annoying.
                           datetime = lambda x: pd.to_datetime(x['post_date'], format="%b %d, %Y · %H:%M %p %Z", utc=False), # https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
                           hashtags = lambda x: x['post_content'].str.findall(r'#.*?(?=\s|$)'), # I didn't know how to do this, I just googled:"python extract all twitter handles from pandas column"
                           mentions = lambda x: x['post_content'].str.findall(r'@.*?(?=\s|$)')  # Then I just changed the character in the regular expression from a @ to a #. Works decent.
                          )
                )
display(formatted_df)

formatted_df.to_csv("./data/sample_output.csv", encoding='utf-8-sig')

Unnamed: 0,post_url,post_author,post_username,post_date,post_content,post_card,post_comments,post_retweets,post_quotes,post_likes,datetime,hashtags,mentions
0,https://twitter.com/DFRLab/status/156571021289...,DFRLab,@DFRLab,"Sep 2, 2022 · 2:36 PM UTC",A @DFRLab study analyzing early online debates...,https://www.bloomberg.com/news/features/2022-0...,,7,1.0,7,2022-09-02 02:36:00+00:00,[#Ukraine],[@DFRLab]
1,https://twitter.com/EtoBuziashvili/status/1565...,Eto Buziashvili,@EtoBuziashvili,"Sep 1, 2022 · 3:47 PM UTC",Yours truly together with @acarvin on Russia's...,https://link.medium.com/NlZSnR9aYsb,2.0,17,3.0,43,2022-09-01 03:47:00+00:00,[],[@acarvin]
2,https://twitter.com/IST_org/status/15654158129...,Institute for Security and Technology,@IST_org,"Sep 1, 2022 · 7:06 PM UTC",Image analysis during wartime is critical to i...,https://securityandtechnology.org/virtual-libr...,1.0,3,1.0,6,2022-09-01 07:06:00+00:00,[#Ukraine],"[@NataliaAntonova, @r_osadchuk, @LAndriukaitis]"
3,https://twitter.com/DFRLab/status/156540435375...,DFRLab,@DFRLab,"Sep 1, 2022 · 6:20 PM UTC","""The available evidence provided above suggest...",https://medium.com/dfrlab/twitter-campaign-pus...,,3,,7,2022-09-01 06:20:00+00:00,"[#/StopUkrainizacjiPolski.""]",[@GGigitashvili_]
4,https://twitter.com/DFRLab/status/156498645106...,DFRLab,@DFRLab,"Aug 31, 2022 · 2:40 PM UTC",Chinese Students and Scholars Associations (CS...,https://medium.com/dfrlab/wechat-channels-keep...,1.0,26,,33,2022-08-31 02:40:00+00:00,[],"[@WeChatApp, @NSC.]"
5,https://twitter.com/DFRLab/status/156533412663...,DFRLab,@DFRLab,"Sep 1, 2022 · 1:41 PM UTC",#Russia has launched its counter to @Wikipedia...,https://medium.com/dfrlab/pro-kremlin-wikipedi...,1.0,12,2.0,26,2022-09-01 01:41:00+00:00,"[#Russia, #disinformation, #Ukraine]","[@Wikipedia,, @EtoBuziashvili's]"
6,https://twitter.com/NSC/status/156500400141393...,Iria Puyosa,@NSC,"Aug 31, 2022 · 3:50 PM UTC",CSSA WeChat channels align with the CCP's comm...,https://medium.com/dfrlab/wechat-channels-keep...,1.0,9,,13,2022-08-31 03:50:00+00:00,[],[]
7,https://twitter.com/DFRLab/status/156464540114...,DFRLab,@DFRLab,"Aug 30, 2022 · 4:05 PM UTC",A largely inauthentic Twitter campaign appeare...,https://medium.com/dfrlab/twitter-campaign-pus...,1.0,33,2.0,34,2022-08-30 04:05:00+00:00,[#Poland],[@GGigitashvili_]


#### Alternate, arguably more complicated approach that produces identical results but scales better.

This is a kind of syntax I have been using ever since I learned how to use the .pipe function in Pandas.
<br> I believe that this approach allows me to see, more clearly, what transformations are being done on the dataset and for large datasets the progress of 
each transformation is reassuring. Pandas is good, but when you run commands such as these on a million or so rows, it can take a while. I have more faith in this system and it makes debugging easier.  

In [209]:
def explode_list_columns(df):
    """Takes as input a Pandas DataFrame
       Returns a pandas dataframe where all of the columns are "exploded," i.e. no longer lists
    """
    print("Exploding list cols...")
    return(df.explode(list(df.columns))
             .fillna("")
          )

def convert_dates_to_datetime(df):
    """Takes as input a Pandas DataFrame
       Returns a pandas dataframe with a new "datetime" column, a machine readable version of the "post_date" column.
    """
    print("Converting date format...")
    return(df.assign(datetime = lambda x: pd.to_datetime(x['post_date'], format="%b %d, %Y · %H:%M %p %Z")))

def extract_values_from_text(df):
    print("Extracting values from post_content...")
    return(df.assign(post_content = lambda x: x['post_content'].str.replace("\n"," "), # Converting line breaks to spaces because line breaks do not always translate well in table data.
                     hashtags = lambda x: x['post_content'].str.findall(r'#.*?(?=\s|$)'), # I didn't know how to do this, I just googled:"python extract all twitter handles from pandas column"
                     mentions = lambda x: x['post_content'].str.findall(r'@.*?(?=\s|$)') # Then I just changed the character in the regular expression from a @ to a #. Works decent.
                    )
          )

def export_data_as_csv(df):
    """ Takes a Pandas DataFrame as input, creates a csv file as output. 
    """
    file_path = "./data/"
    if os.path.exists(file_path):
        pass
    else:
        print("Creating 'data' folder...")  
        os.mkdir(file_path)
    print("Saving csv in ./data/...")
    return df.to_csv(f"{file_path}output_{pd.Timestamp.now().strftime('%Y-%m-%d_%f')}.csv", encoding='utf-8-sig')

def export_data_as_pickle(df):
    """ Takes a Pandas DataFrame as input, creates a pickle file as output. 
    """
    file_path = "./data/"
    if os.path.exists(file_path):
        pass
    else:
        print("Creating 'data' folder...")  
        os.mkdir(file_path)
    print("Saving pickle in ./data/...")
    return df.to_pickle(f"{file_path}output_{pd.Timestamp.now().strftime('%Y-%m-%d_%f')}.pkl")

In [202]:
# We can use panda's pipe command to apply each of the above functions to the data frame. 
# The result of the formatting related functions will be saved as "formatted_df"
formatted_df = (pd.DataFrame(page_data)
                     .pipe(explode_list_columns)
                     .pipe(convert_dates_to_datetime)
                     .pipe(extract_values_from_text)     
                )

# These functions enable us to save the output in various formats. 
formatted_df.pipe(export_data_as_csv)
formatted_df.pipe(export_data_as_pickle) # pickle files play nice with Pandas. If you intend to re-import data later, this can save time.

Exploding list cols...
Converting date format...
Extracting values from post_content...
Saving csv in ./data/...
Saving pickle in ./data/...


In [203]:
formatted_df

Unnamed: 0,post_url,post_author,post_username,post_date,post_content,post_card,post_comments,post_retweets,post_quotes,post_likes,datetime,hashtags,mentions
0,https://twitter.com/DFRLab/status/156571021289...,DFRLab,@DFRLab,"Sep 2, 2022 · 2:36 PM UTC",A @DFRLab study analyzing early online debates...,https://www.bloomberg.com/news/features/2022-0...,,7,1.0,7,2022-09-02 02:36:00+00:00,[#Ukraine],[@DFRLab]
1,https://twitter.com/EtoBuziashvili/status/1565...,Eto Buziashvili,@EtoBuziashvili,"Sep 1, 2022 · 3:47 PM UTC",Yours truly together with @acarvin on Russia's...,https://link.medium.com/NlZSnR9aYsb,2.0,17,3.0,43,2022-09-01 03:47:00+00:00,[],[@acarvin]
2,https://twitter.com/IST_org/status/15654158129...,Institute for Security and Technology,@IST_org,"Sep 1, 2022 · 7:06 PM UTC",Image analysis during wartime is critical to i...,https://securityandtechnology.org/virtual-libr...,1.0,3,1.0,6,2022-09-01 07:06:00+00:00,[#Ukraine],"[@NataliaAntonova, @r_osadchuk, @LAndriukaitis]"
3,https://twitter.com/DFRLab/status/156540435375...,DFRLab,@DFRLab,"Sep 1, 2022 · 6:20 PM UTC","""The available evidence provided above suggest...",https://medium.com/dfrlab/twitter-campaign-pus...,,3,,7,2022-09-01 06:20:00+00:00,"[#/StopUkrainizacjiPolski.""]",[@GGigitashvili_]
4,https://twitter.com/DFRLab/status/156498645106...,DFRLab,@DFRLab,"Aug 31, 2022 · 2:40 PM UTC",Chinese Students and Scholars Associations (CS...,https://medium.com/dfrlab/wechat-channels-keep...,1.0,26,,33,2022-08-31 02:40:00+00:00,[],"[@WeChatApp, @NSC.]"
5,https://twitter.com/DFRLab/status/156533412663...,DFRLab,@DFRLab,"Sep 1, 2022 · 1:41 PM UTC",#Russia has launched its counter to @Wikipedia...,https://medium.com/dfrlab/pro-kremlin-wikipedi...,1.0,12,2.0,26,2022-09-01 01:41:00+00:00,"[#Russia, #disinformation, #Ukraine]","[@Wikipedia,, @EtoBuziashvili's]"
6,https://twitter.com/NSC/status/156500400141393...,Iria Puyosa,@NSC,"Aug 31, 2022 · 3:50 PM UTC",CSSA WeChat channels align with the CCP's comm...,https://medium.com/dfrlab/wechat-channels-keep...,1.0,9,,13,2022-08-31 03:50:00+00:00,[],[]
7,https://twitter.com/DFRLab/status/156464540114...,DFRLab,@DFRLab,"Aug 30, 2022 · 4:05 PM UTC",A largely inauthentic Twitter campaign appeare...,https://medium.com/dfrlab/twitter-campaign-pus...,1.0,33,2.0,34,2022-08-30 04:05:00+00:00,[#Poland],[@GGigitashvili_]
