In [1]:
from urllib.request import urlopen, urlretrieve
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
from datetime import datetime
import pprint
import re

# Using snscrape to generate a link of statuses

### https://github.com/JustAnotherArchivist/snscrape

In [2]:
# Running snscrape in terminal to get tweets using hashtag

!snscrape --max-results 50 twitter-hashtag GlobalWarming >GlobalWarming_links.csv

In [3]:
# To search for a specific user use the code below instead
# snscrape twitter-user textfiles >twitter-@textfiles

In [6]:
df_links = pd.read_csv('GlobalWarming_links.csv')
df_links.columns = ['link']
df_links.head()

Unnamed: 0,link
0,https://twitter.com/danielchayau/status/121509...
1,https://twitter.com/michaelhoney_/status/13092...
2,https://twitter.com/pipsterb/status/1309222189...
3,https://twitter.com/MyMorningMusing/status/130...
4,https://twitter.com/exilarchy/status/130922143...


# Scraping

In [33]:
df = pd.DataFrame(columns=['username',
                           'fullname',
                            'time',
                            'date',
                            'text',
                            'link'])
error_list = []
error_links = []

counter = 0

for item in df_links['link']:
    sleep(0.2)
    try:
        link = item.split('https://')
        link = 'https://mobile.' +link[1]

        response = urlopen(link)
        html = response.read().decode('utf-8')
        soup = BeautifulSoup(html, 'html.parser')

        tweet = soup.find('div', class_="main-tweet-container")
        username = tweet.find('span', class_="username").get_text().strip()
        fullname = tweet.find('div', class_="fullname").get_text().strip()
        date_time = tweet.find('div', class_="metadata").get_text().strip()
        time = date_time.split('-')[0].strip()
        date = date_time.split('-')[1].strip()
        text = tweet.find('div', class_="tweet-text").get_text().strip()
        link = item
        
        print(counter, username, fullname, date, time, text)

        df1 = pd.DataFrame({'username': username,
                            'fullname': fullname,
                            'time': time,
                            'date': date,
                            'text': text,
                            'link': [item]
                            })

        df = pd.concat([df, df1])
        counter += 1
       
    
    except Exception as e: 
        print(e)
        error_list.append(e)
        error_links.append(item)
        counter += 1
    
df = df.reset_index().drop('index', axis = 1)

0 @danielchayau Author 8 Jan 2020 6:12 PM Check out Elven Magic on Steam for the steam sales(coming soon). store.steampowered.com/app/923140/Elv…
1 @michaelhoney_ Michael Honey 🏴󠁧󠁢󠁷󠁬󠁳󠁿 #FBPE 24 Sep 2020 1:20 PM Court losses are piling up for President Trump's environmental deregulation agenda. But a second term in the White House could help them stick. dlvr.it/RhHmP6 #GlobalWarming pic.twitter.com/fIVceJMBeQ
2 @pipsterb 💧Pippa Bailey 💦🐳🌎 24 Sep 2020 1:04 PM Today is a day of global action @Fridays4future #schoolstrike4climate 

There is no planet B 
& no time to lose 
#wildfires
#sealevelrise
#plasticpollution
#massextinction
#GlobalWarming 

We need #NetZero2035 #renewables #FaceTheClimateEmergency 
For a livable Future
#auspol
3 @MyMorningMusing MorningMusings 24 Sep 2020 1:01 PM How can we stop #globalwarming? Well, 8% of global emissions are caused by food waste. Watch today's video to hear @Hennigs solution to tackling #foodwaste and global warming. #stoppingglobalwarming #climate

In [24]:
print (f' CAPTURED TWEETS: {len(df)+1} \n ERROR: {len(error_links)}')

 CAPTURED TWEETS: 50 
 ERROR: 0


## Cleaning the fields

In [34]:
regex_hashtag = re.compile(r"#[^\W]+")
regex_pic = re.compile(r"pic.twitter[^\s]+")
regex_arroba = re.compile(r"@[^\W]+")

regex_html = re.compile(r"[^\s]+twitter.com[^\s]+")
regex_quebra_linha = re.compile(r"(\n)")

In [35]:
def remover(textos, regex):
    if type(textos) == str:
        return regex.sub("", textos)
    else:
        return [regex.sub("", texto) for texto in textos]
    
def capture(texts, regex):
    if type(texts) == str:
        return regex.findall(texts)
        
    else:
        return [regex.findall("", text) for text in texts]

In [36]:
username_clean = []
time_clean = []
date_clean = []
text_clean = []

for item in df['username']:
    item = remover(item, regex_quebra_linha)
    item = item.strip()
    username_clean.append(item)
df['username'] = username_clean

for item in df['time']:
    item = remover(item, regex_quebra_linha)
    item = item.strip()
    time_clean.append(item)
df['time'] = time_clean

for item in df['date']:
    item = remover(item, regex_quebra_linha)
    item = item.strip()
    date_clean.append(item)
df['date'] = date_clean

for item in df['text']:
    item = remover(item, regex_quebra_linha)
    item = item.strip()
    text_clean.append(item)
df['text'] = text_clean   

In [37]:
df.head()

Unnamed: 0,username,fullname,time,date,text,link
0,@danielchayau,Author,6:12 PM,8 Jan 2020,Check out Elven Magic on Steam for the steam s...,https://twitter.com/danielchayau/status/121509...
1,@michaelhoney_,Michael Honey 🏴󠁧󠁢󠁷󠁬󠁳󠁿 #FBPE,1:20 PM,24 Sep 2020,Court losses are piling up for President Trump...,https://twitter.com/michaelhoney_/status/13092...
2,@pipsterb,💧Pippa Bailey 💦🐳🌎,1:04 PM,24 Sep 2020,Today is a day of global action @Fridays4futur...,https://twitter.com/pipsterb/status/1309222189...
3,@MyMorningMusing,MorningMusings,1:01 PM,24 Sep 2020,"How can we stop #globalwarming? Well, 8% of gl...",https://twitter.com/MyMorningMusing/status/130...
4,@exilarchy,Eshaya,1:01 PM,24 Sep 2020,#GlobalWarming #BobcatFire #mtwilsonBobcat Fir...,https://twitter.com/exilarchy/status/130922143...


## Formating date

In [38]:
months = {
            'Jan' : '01',
            'Feb' : '02',
            'Mar' : '03',
            'Apr' : '04',
            'May' : '05',
            'Jun' : '06',
            'Jul' : '07',
            'Aug' : '08',
            'Sep' : '09', 
            'Oct' : '10',
            'Nov' : '11',
            'Dec' : '12'
    }

new_date_list = []

for item in df['date']:
    date = item.split(' ')
    day = date[0]
    month = months[date[1]]
    year = date[2]
    new_date = f'{year}-{month}-{day}'
    new_date = remover(new_date, regex_quebra_linha)
    new_date = datetime.strptime(new_date, '%Y-%m-%d').date()
    new_date_list.append(new_date)
df['date'] = new_date_list

In [39]:
df.head()

Unnamed: 0,username,fullname,time,date,text,link
0,@danielchayau,Author,6:12 PM,2020-01-08,Check out Elven Magic on Steam for the steam s...,https://twitter.com/danielchayau/status/121509...
1,@michaelhoney_,Michael Honey 🏴󠁧󠁢󠁷󠁬󠁳󠁿 #FBPE,1:20 PM,2020-09-24,Court losses are piling up for President Trump...,https://twitter.com/michaelhoney_/status/13092...
2,@pipsterb,💧Pippa Bailey 💦🐳🌎,1:04 PM,2020-09-24,Today is a day of global action @Fridays4futur...,https://twitter.com/pipsterb/status/1309222189...
3,@MyMorningMusing,MorningMusings,1:01 PM,2020-09-24,"How can we stop #globalwarming? Well, 8% of gl...",https://twitter.com/MyMorningMusing/status/130...
4,@exilarchy,Eshaya,1:01 PM,2020-09-24,#GlobalWarming #BobcatFire #mtwilsonBobcat Fir...,https://twitter.com/exilarchy/status/130922143...


## Extracting hashtags, mentions, and pictures from text

In [40]:
at_list_of_lists = []
hashtag_list_of_lists = []
picture_list_of_lists =[]
html_list_of_lists = []

for item in df['text']:
    at_list = capture(item, regex_arroba)
    at_list_of_lists.append(at_list)
    
    hashtag_list = capture(item, regex_hashtag)
    hashtag_list_of_lists.append(hashtag_list)
    
    picture_list = capture(item, regex_pic)
    picture_list_of_lists.append(picture_list)
    
df['mentions'] = at_list_of_lists
df['hashtags'] = hashtag_list_of_lists
df['photos'] = picture_list_of_lists

In [41]:
df.head(50)

Unnamed: 0,username,fullname,time,date,text,link,mentions,hashtags,photos
0,@danielchayau,Author,6:12 PM,2020-01-08,Check out Elven Magic on Steam for the steam s...,https://twitter.com/danielchayau/status/121509...,[],[],[]
1,@michaelhoney_,Michael Honey 🏴󠁧󠁢󠁷󠁬󠁳󠁿 #FBPE,1:20 PM,2020-09-24,Court losses are piling up for President Trump...,https://twitter.com/michaelhoney_/status/13092...,[],[#GlobalWarming],[pic.twitter.com/fIVceJMBeQ]
2,@pipsterb,💧Pippa Bailey 💦🐳🌎,1:04 PM,2020-09-24,Today is a day of global action @Fridays4futur...,https://twitter.com/pipsterb/status/1309222189...,[@Fridays4future],"[#schoolstrike4climate, #wildfires, #sealevelr...",[]
3,@MyMorningMusing,MorningMusings,1:01 PM,2020-09-24,"How can we stop #globalwarming? Well, 8% of gl...",https://twitter.com/MyMorningMusing/status/130...,[@Hennigs],"[#globalwarming, #foodwaste, #stoppingglobalwa...",[pic.twitter.com/b8pFyX9R2e]
4,@exilarchy,Eshaya,1:01 PM,2020-09-24,#GlobalWarming #BobcatFire #mtwilsonBobcat Fir...,https://twitter.com/exilarchy/status/130922143...,[@Yahoo],"[#GlobalWarming, #BobcatFire, #mtwilsonBobcat]",[]
5,@Frankendr01d,Hell is Expanding,12:59 PM,2020-09-24,The most recognizable word for hell in the new...,https://twitter.com/Frankendr01d/status/130922...,[],[#globalwarming],[]
6,@LeonardoZ,LeonardoZ,12:55 PM,2020-09-24,Ocean Heat Waves Are Directly Linked to Climat...,https://twitter.com/LeonardoZ/status/130921983...,[],"[#GlobalWarming, #Sciencenytimes]",[]
7,@nycpt,EMS,12:52 PM,2020-09-24,#protectkids by voting #prochoice! The world i...,https://twitter.com/nycpt/status/1309219129354...,[],"[#protectkids, #prochoice, #climatechange, #ha...",[]
8,@MisterSalesman,M LeMont,12:52 PM,2020-09-24,pic.twitter.com/rm9VedOQUd RUSSIA: Alarming si...,https://twitter.com/MisterSalesman/status/1309...,[],[#GlobalwarmingGIF],[pic.twitter.com/rm9VedOQUd]
9,@DreyMeans,That Guy,12:52 PM,2020-09-24,"At this point if aliens invaded, I'm sure they...",https://twitter.com/DreyMeans/status/130921907...,[],"[#globalwarming, #blacklivesDOmatter, #wearyou...",[]


In [None]:
df.to_csv('GlobalWarming_tweets.csv', index = False)