In [1]:
import snscrape.modules.twitter as sntwitter
import pandas as pd

# hazard types and keywords to scrape tweets for
hazard_types = ["Blizzard", "Sea level rise", "Flood", "Heatwave"]
hazard_keywords = {"Blizzard":["snowstorm", "freezing"],
                   "Sea level rise": [],
                   "Flood": ["flooding", "river flood", "urban flood"],
                   "Heatwave": ["heatwave", "heat stroke", "heat exhaustion"]
                  }

#"beach", "global warming"
# Using TwitterSearchScraper to scrape data and append tweets to list
def TWTRScrapr(hazard_types):
    tweets_df_list = [] # list to hold the dataframe for each hazard type
    negation_keywords = " -game -movie "
    for hazard_type in hazard_types:
        keywords = ' '.join(hazard_keywords[hazard_type])
        # Created a list to append all tweet attributes(data)
        attributes_container = []
        for i, tweet in enumerate(sntwitter.TwitterSearchScraper(f'{hazard_type} lang:en {keywords} {negation_keywords} -filter:retweets -filter:replies ').get_items()):
            if i > 199: # limit to 200 tweets
                break
            if tweet.place:
                country = tweet.place.country
            else:
                country = None
            attributes_container.append([tweet.date, tweet.likeCount, tweet.sourceLabel, country, tweet.user.username, tweet.user.followersCount, tweet.content])
        # Creating a dataframe from the tweets list above 
        tweets_df = pd.DataFrame(attributes_container, columns=["Date Created", "Number of Likes", "Source of Tweet", "Country", "Username", "Followers Count", "Tweets"])
        tweets_df_list.append(tweets_df.sort_values(by=['Date Created'], ascending = False).reset_index(drop=True))
    return tweets_df_list

#This is a list containing 4 dataframes, tweets_df_list[0] for blizzard tweets, tweets_df_list[1] for sea level rise tweets, etc.
tweets_df_list = TWTRScrapr(hazard_types)

  attributes_container.append([tweet.date, tweet.likeCount, tweet.sourceLabel, country, tweet.user.username, tweet.user.followersCount, tweet.content])


In [2]:
tweets_df_list[1]

Unnamed: 0,Date Created,Number of Likes,Source of Tweet,Country,Username,Followers Count,Tweets
0,2023-03-01 11:14:16+00:00,0,dlvr.it,,JeromeOLLIER,788,Actus Mer/Sea News: Assessment of future floo...
1,2023-03-01 11:02:38+00:00,0,Greenland_SLR,,cryo_data,137,Sea level rise (#SLR: 📈=🌊+🧊💧) from 🇬🇱Greenland...
2,2023-03-01 10:07:04+00:00,0,Twitter for Android,,kobiah,2113,"Warmer temperatures, sea level rise and extrem..."
3,2023-03-01 09:39:00+00:00,0,Sprout Social,,WWF_Arctic,14522,If the outlet glaciers of #Greenland’s ice she...
4,2023-03-01 08:50:17+00:00,0,Twitter for Android,,JeffMNeale,2295,"A prediction of Europe in 2100, after anticipa..."
...,...,...,...,...,...,...,...
195,2023-02-24 18:43:34+00:00,1,Twitter Web App,,scicommlab,13942,Bay Area folks! Check out this FREE @swissnexS...
196,2023-02-24 18:27:32+00:00,0,Twitter Web App,,madmilker,7927,Florida's Projected Sea Level Rise by 2100 Is ...
197,2023-02-24 18:03:04+00:00,1,Hootsuite Inc.,,Earth911,72231,"In this podcast, we hear from Oceanographer Jo..."
198,2023-02-24 18:00:16+00:00,0,Sprout Social,,LexisNexis,59653,This #ClimateChange special edition of the Lex...


In [3]:
import nltk
import re

# Download stopwords if necessary
nltk.download('stopwords')

# Load the stopwords
stop_words = nltk.corpus.stopwords.words('english')

# Define a function to clean the text
def clean_text(text):
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove emoji
    text = re.sub('[^\x00-\x7F]+', ' ', text)
    # Replace hyphens with spaces
    text = re.sub('-', ' ', text)
    # Remove punctuation marks and other unwanted characters
    text = re.sub('[^a-zA-Z0-9\s]', ' ', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove extra spaces
    text = re.sub(' +', ' ', text)
    # Convert to lowercase
    text = text.lower()
    # Split into words
    words = text.split()
    # Remove stop words
    words = [word for word in words if word not in stop_words]
    # Join the words back into a string
    cleaned_text = ' '.join(words)
    return cleaned_text

# Apply the clean_text function to the Tweets column
for df in tweets_df_list:
    df['Cleaned Tweets'] = df['Tweets'].apply(clean_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\timmo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
tweets_df_list[0].head()

Unnamed: 0,Date Created,Number of Likes,Source of Tweet,Country,Username,Followers Count,Tweets,Cleaned Tweets
0,2023-02-27 22:36:41+00:00,2,Twitter Web App,,rvpstudioscana1,77,Don't y'all wish you were in Florida right now...,wish florida right eh major snow storm freezin...
1,2023-02-25 13:42:18+00:00,0,Twitter for iPhone,,BegreenKay,78,“Strongest snowstorm in years” leaves Californ...,strongest snowstorm years leaves californians ...
2,2023-02-23 18:42:58+00:00,22,Twitter for Android,,MissBrandyGreen,3564,Took me 45 mins to #snowblow most of the drive...,took mins snowblow driveway amp main part fron...
3,2023-02-03 07:29:33+00:00,0,Tumblr,,Enfieldfinearts,1893,📹 Intense Snowstorm in a Mountain Village┇Snow...,intense snowstorm mountain village snow ambien...
4,2023-01-26 10:20:25+00:00,2,Twitter Web App,,StormCentar,274,what happens !! 20 degrees below zero! Incredi...,happens degrees zero incredible snow disaster ...


In [5]:
# Sentiment Analysis
from textblob import TextBlob

def fetch_sentiment_using_textblob(text):
    analysis = TextBlob(text)
    return 'pos' if analysis.sentiment.polarity >= 0 else 'neg'

In [6]:
#Applying the function to get sentiment
for df in tweets_df_list:
    df['Sentiment'] = df['Cleaned Tweets'].apply(fetch_sentiment_using_textblob)

In [7]:
tweets_df_list[0]

Unnamed: 0,Date Created,Number of Likes,Source of Tweet,Country,Username,Followers Count,Tweets,Cleaned Tweets,Sentiment
0,2023-02-27 22:36:41+00:00,2,Twitter Web App,,rvpstudioscana1,77,Don't y'all wish you were in Florida right now...,wish florida right eh major snow storm freezin...,pos
1,2023-02-25 13:42:18+00:00,0,Twitter for iPhone,,BegreenKay,78,“Strongest snowstorm in years” leaves Californ...,strongest snowstorm years leaves californians ...,pos
2,2023-02-23 18:42:58+00:00,22,Twitter for Android,,MissBrandyGreen,3564,Took me 45 mins to #snowblow most of the drive...,took mins snowblow driveway amp main part fron...,pos
3,2023-02-03 07:29:33+00:00,0,Tumblr,,Enfieldfinearts,1893,📹 Intense Snowstorm in a Mountain Village┇Snow...,intense snowstorm mountain village snow ambien...,pos
4,2023-01-26 10:20:25+00:00,2,Twitter Web App,,StormCentar,274,what happens !! 20 degrees below zero! Incredi...,happens degrees zero incredible snow disaster ...,pos
...,...,...,...,...,...,...,...,...,...
195,2018-02-05 11:21:01+00:00,0,Twitter for Android,,nlitenmebabe,1546,Snowfall of the century: Record-breaking snow ...,snowfall century record breaking snow freezing...,pos
196,2018-02-04 16:38:07+00:00,7,dlvr.it,,TheWatchers_,13739,Snowfall of the century: Record-breaking snow ...,snowfall century record breaking snow freezing...,pos
197,2018-01-19 00:04:47+00:00,1,Twitter for iPhone,,VeeBee123,422,"*20-below-zero temps for two weeks, batshit cr...",zero temps two weeks batshit crazy bomb cyclon...,neg
198,2018-01-16 23:13:24+00:00,1,Twitter for iPhone,,rebecca_star_04,30,Happy Winter Days❄️ #winter #snow #ice #Januar...,happy winter days winter snow ice january cold...,pos


In [8]:
# Find the maximum number of words in a tweet across all dataframes
max_words = 0
for tweets_df in tweets_df_list:
    max_words = max(max_words, tweets_df['Cleaned Tweets'].apply(lambda x: len(x.split())).max())

# Create new columns for each word for each dataframe
for tweets_df in tweets_df_list:
    for i in range(max_words):
        tweets_df[f'Text Token {i+1}'] = tweets_df['Cleaned Tweets'].apply(lambda x: x.split()[i] if len(x.split()) > i else '')

In [9]:
tweets_df_list[0].head()

Unnamed: 0,Date Created,Number of Likes,Source of Tweet,Country,Username,Followers Count,Tweets,Cleaned Tweets,Sentiment,Text Token 1,...,Text Token 31,Text Token 32,Text Token 33,Text Token 34,Text Token 35,Text Token 36,Text Token 37,Text Token 38,Text Token 39,Text Token 40
0,2023-02-27 22:36:41+00:00,2,Twitter Web App,,rvpstudioscana1,77,Don't y'all wish you were in Florida right now...,wish florida right eh major snow storm freezin...,pos,wish,...,,,,,,,,,,
1,2023-02-25 13:42:18+00:00,0,Twitter for iPhone,,BegreenKay,78,“Strongest snowstorm in years” leaves Californ...,strongest snowstorm years leaves californians ...,pos,strongest,...,,,,,,,,,,
2,2023-02-23 18:42:58+00:00,22,Twitter for Android,,MissBrandyGreen,3564,Took me 45 mins to #snowblow most of the drive...,took mins snowblow driveway amp main part fron...,pos,took,...,,,,,,,,,,
3,2023-02-03 07:29:33+00:00,0,Tumblr,,Enfieldfinearts,1893,📹 Intense Snowstorm in a Mountain Village┇Snow...,intense snowstorm mountain village snow ambien...,pos,intense,...,,,,,,,,,,
4,2023-01-26 10:20:25+00:00,2,Twitter Web App,,StormCentar,274,what happens !! 20 degrees below zero! Incredi...,happens degrees zero incredible snow disaster ...,pos,happens,...,,,,,,,,,,


# <u>Part 2</u>

In [10]:
import numpy as np

# List of countries
countries = ['United States', 'China', 'India', 'Indonesia', 'Brazil', 'Pakistan', 'Bangladesh', 
             'Japan', 'Philippines', 'Vietnam', 'Turkey', 'Iran', 'Thailand', 'Myanmar', 'South Korea', 
             'Iraq', 'Afghanistan', 'Saudi Arabia', 'Malaysia', 'North Korea', 'Yemen', 'Nepal', 
             'North Macedonia', 'Kazakhstan', 'Syria', 'Jordan', 'Azerbaijan', 'United Arab Emirates', 
             'Tajikistan', 'Israel', 'Laos', 'Lebanon', 'Kyrgyzstan', 'Turkmenistan', 'Oman', 'State of Palestine', 
             'Kuwait', 'Georgia', 'Armenia', 'Bahrain', 'Cyprus', 'Mongolia', 'Qatar', 'Timor-Leste', 
             'Bahamas', 'Bhutan', 'Maldives', 'Iceland', 'Brunei']

# Function to randomly assign a country to a row
def random_country(row):
    if row['Country'] == None:
        return np.random.choice(countries)
    else:
        return row['Country']

In [11]:
# Apply the function to the dataframe
for df in tweets_df_list:
    df['Country'] = df.apply(random_country, axis=1)

In [12]:
# Create an empty dataframe to store the merged data
merged_df = pd.DataFrame()

# Iterate over each dataframe in tweets_df_list
for i in range(len(tweets_df_list)):
    # Add a column to each dataframe to indicate the type
    tweets_df_list[i]['Type'] = hazard_types[i]
    # Append the dataframe to the merged_df
    merged_df = merged_df.append(tweets_df_list[i])

In [13]:
merged_df.to_csv("Tweets2.csv", index=False)