# **Installing and Importing**

In [74]:
# Install and import spacy, plotly, nbformat, emoji, en_core_web_sm
!pip install spaCy
!pip install plotly
!pip install emoji
!spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m73.2 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [75]:
# Import spacy
import spacy

# Import os to upload documents and metadata
import os

# Import pandas DataFrame packages
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

# Import packages for cleaning of data
import re
import emoji
import nltk
import numpy as np

# Load spaCy model
nlp = spacy.load('en_core_web_sm')

# Also make sure we have the right nltk models
nltk.download('words')
words = set(nltk.corpus.words.words())

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


# **Merging Datasets**

In [76]:
# Mount Google Colab to Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [77]:
# Load in the first two CSV files, we will be working with dataframes in this Notebook

# Kaggle Dataset by user 'codebreaker619'
# License: Data files © Original Authors
# https://www.kaggle.com/datasets/codebreaker619/donald-trump-tweets-dataset
tweets_trump_kaggle = pd.read_csv('/content/drive/MyDrive/Data/tweets_trump_kaggle.csv')

# Dataset by the Trump Twitter Archive
# Data is freely usable as the creator aims to “provide a public resource”
# https://www.thetrumparchive.com/
tweets_trump_twitter_archive = pd.read_csv ('/content/drive/MyDrive/Data/tweets_trump_twitter_archive.csv')

In [78]:
# Get an overview about the datasets, such as the columns and different datatypes in the dataframe
tweets_trump_kaggle.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56571 entries, 0 to 56570
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         56571 non-null  int64 
 1   text       56571 non-null  object
 2   isRetweet  56571 non-null  object
 3   isDeleted  56571 non-null  object
 4   device     56571 non-null  object
 5   favorites  56571 non-null  int64 
 6   retweets   56571 non-null  int64 
 7   date       56571 non-null  object
 8   isFlagged  56571 non-null  object
dtypes: int64(3), object(6)
memory usage: 3.9+ MB


In [79]:
# Show the first few lines of the dataset for inspection
tweets_trump_kaggle.head()

Unnamed: 0,id,text,isRetweet,isDeleted,device,favorites,retweets,date,isFlagged
0,98454970654916608,Republicans and Democrats have both created ou...,f,f,TweetDeck,49,255,2011-08-02 18:07:48,f
1,1234653427789070336,I was thrilled to be back in the Great city of...,f,f,Twitter for iPhone,73748,17404,2020-03-03 01:34:50,f
2,1218010753434820614,RT @CBS_Herridge: READ: Letter to surveillance...,t,f,Twitter for iPhone,0,7396,2020-01-17 03:22:47,f
3,1304875170860015617,The Unsolicited Mail In Ballot Scam is a major...,f,f,Twitter for iPhone,80527,23502,2020-09-12 20:10:58,f
4,1218159531554897920,RT @MZHemingway: Very friendly telling of even...,t,f,Twitter for iPhone,0,9081,2020-01-17 13:13:59,f


In [80]:
# Repeat these steps
tweets_trump_twitter_archive.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56571 entries, 0 to 56570
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         56571 non-null  int64 
 1   text       56571 non-null  object
 2   isRetweet  56571 non-null  object
 3   isDeleted  56571 non-null  object
 4   device     56571 non-null  object
 5   favorites  56571 non-null  int64 
 6   retweets   56571 non-null  int64 
 7   date       56571 non-null  object
 8   isFlagged  56571 non-null  object
dtypes: int64(3), object(6)
memory usage: 3.9+ MB


In [81]:
tweets_trump_twitter_archive.head()

Unnamed: 0,id,text,isRetweet,isDeleted,device,favorites,retweets,date,isFlagged
0,98454970654916608,Republicans and Democrats have both created ou...,f,f,TweetDeck,49,255,2011-08-02 18:07:48,f
1,1234653427789070336,I was thrilled to be back in the Great city of...,f,f,Twitter for iPhone,73748,17404,2020-03-03 01:34:50,f
2,1218010753434820614,RT @CBS_Herridge: READ: Letter to surveillance...,t,f,Twitter for iPhone,0,7396,2020-01-17 03:22:47,f
3,1304875170860015617,The Unsolicited Mail In Ballot Scam is a major...,f,f,Twitter for iPhone,80527,23502,2020-09-12 20:10:58,f
4,1218159531554897920,RT @MZHemingway: Very friendly telling of even...,t,f,Twitter for iPhone,0,9081,2020-01-17 13:13:59,f


In [82]:
# Merge both datasets together to create one big dataset
merged_trump_tweets = pd.concat([tweets_trump_twitter_archive, tweets_trump_kaggle])

In [83]:
# Get an overview about the datasets we just created, such as the columns and different datatypes in the dataframe
merged_trump_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Index: 113142 entries, 0 to 56570
Data columns (total 9 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   id         113142 non-null  int64 
 1   text       113142 non-null  object
 2   isRetweet  113142 non-null  object
 3   isDeleted  113142 non-null  object
 4   device     113142 non-null  object
 5   favorites  113142 non-null  int64 
 6   retweets   113142 non-null  int64 
 7   date       113142 non-null  object
 8   isFlagged  113142 non-null  object
dtypes: int64(3), object(6)
memory usage: 8.6+ MB


In [84]:
# The total lines in the dataset is now 113142

In [85]:
# Delete duplicates from the merged dataset, filtering on the column 'id' and save this to the dataset
merged_trump_tweets.drop_duplicates(subset='id', inplace=True)

# Reset the index after dropping rows
merged_trump_tweets.reset_index(drop=True, inplace=True)

In [86]:
# Get an overview about the dataset with the dropped duplicates
merged_trump_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56571 entries, 0 to 56570
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         56571 non-null  int64 
 1   text       56571 non-null  object
 2   isRetweet  56571 non-null  object
 3   isDeleted  56571 non-null  object
 4   device     56571 non-null  object
 5   favorites  56571 non-null  int64 
 6   retweets   56571 non-null  int64 
 7   date       56571 non-null  object
 8   isFlagged  56571 non-null  object
dtypes: int64(3), object(6)
memory usage: 3.9+ MB


In [87]:
# The total lines in the dataset has now dropped back to 56571

In [88]:
# Show the first few lines of the combined dataset for inspection
merged_trump_tweets.head()

Unnamed: 0,id,text,isRetweet,isDeleted,device,favorites,retweets,date,isFlagged
0,98454970654916608,Republicans and Democrats have both created ou...,f,f,TweetDeck,49,255,2011-08-02 18:07:48,f
1,1234653427789070336,I was thrilled to be back in the Great city of...,f,f,Twitter for iPhone,73748,17404,2020-03-03 01:34:50,f
2,1218010753434820614,RT @CBS_Herridge: READ: Letter to surveillance...,t,f,Twitter for iPhone,0,7396,2020-01-17 03:22:47,f
3,1304875170860015617,The Unsolicited Mail In Ballot Scam is a major...,f,f,Twitter for iPhone,80527,23502,2020-09-12 20:10:58,f
4,1218159531554897920,RT @MZHemingway: Very friendly telling of even...,t,f,Twitter for iPhone,0,9081,2020-01-17 13:13:59,f


In [89]:
# For further use, we are now going to convert the date column, which also includes time, to only include the date
merged_trump_tweets['date'] = pd.to_datetime(merged_trump_tweets['date'])
merged_trump_tweets['date'] = merged_trump_tweets['date'].dt.date

In [90]:
# Rename columns in merged_trump_tweets for better readability
merged_trump_tweets.rename(columns={
    'isRetweet': 'is_retweet',
    'isDeleted': 'is_deleted'
}, inplace=True)

In [91]:
# Also make sure all columns have a clearer true and false annotation
merged_trump_tweets['is_retweet'] = merged_trump_tweets['is_retweet'].replace({'f': False, 't': True})
merged_trump_tweets['is_deleted'] = merged_trump_tweets['is_deleted'].replace({'f': False, 't': True})

In [92]:
# Drop unneccesary columns
columns_to_drop = ['device', 'isFlagged']
merged_trump_tweets.drop(columns=columns_to_drop, inplace=True)

In [93]:
# Get overview of the dataset
merged_trump_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56571 entries, 0 to 56570
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          56571 non-null  int64 
 1   text        56571 non-null  object
 2   is_retweet  56571 non-null  bool  
 3   is_deleted  56571 non-null  bool  
 4   favorites   56571 non-null  int64 
 5   retweets    56571 non-null  int64 
 6   date        56571 non-null  object
dtypes: bool(2), int64(3), object(2)
memory usage: 2.3+ MB


In [None]:
# We can move on to cleaning the data.

# **Cleaning**

In [94]:
# Delete the Retweets from the dataset
merged_trump_tweets_clean = merged_trump_tweets[merged_trump_tweets['is_retweet'] == False]

In [95]:
# Get overview of the dataset without retweets
merged_trump_tweets_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 46694 entries, 0 to 56570
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          46694 non-null  int64 
 1   text        46694 non-null  object
 2   is_retweet  46694 non-null  bool  
 3   is_deleted  46694 non-null  bool  
 4   favorites   46694 non-null  int64 
 5   retweets    46694 non-null  int64 
 6   date        46694 non-null  object
dtypes: bool(2), int64(3), object(2)
memory usage: 2.2+ MB


In [97]:
def cleaner(tweet):
    original_tweet = tweet  # Save the original tweet
    tweet = re.sub('@[A-Za-z0-9]+', '', tweet)  # Remove @ sign
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", '', tweet)  # Remove http links
    tweet = re.sub("amp", '', tweet) # Remove 'amp'
    tweet = " ".join(tweet.split())
    tweet = ''.join(c for c in tweet if c not in emoji.EMOJI_DATA)  # Remove Emojis
    tweet = tweet.replace("#", "").replace("_", " ")  # Remove hashtag sign but keep the text
    return tweet, original_tweet

# Apply the clean function to our text and store it in a seperate column named 'text_clean'
merged_trump_tweets_clean['text_clean'], merged_trump_tweets_clean['text'] = zip(*merged_trump_tweets_clean['text'].apply(cleaner))

In [98]:
# Show the first few lines of the dataset for inspection
merged_trump_tweets_clean.head()

Unnamed: 0,id,text,is_retweet,is_deleted,favorites,retweets,date,text_clean
0,98454970654916608,Republicans and Democrats have both created ou...,False,False,49,255,2011-08-02,Republicans and Democrats have both created ou...
1,1234653427789070336,I was thrilled to be back in the Great city of...,False,False,73748,17404,2020-03-03,I was thrilled to be back in the Great city of...
3,1304875170860015617,The Unsolicited Mail In Ballot Scam is a major...,False,False,80527,23502,2020-09-12,The Unsolicited Mail In Ballot Scam is a major...
6,1223640662689689602,Getting a little exercise this morning! https:...,False,False,285863,30209,2020-02-01,Getting a little exercise this morning!
7,1319501865625784320,https://t.co/4qwCKQOiOw,False,False,130822,19127,2020-10-23,


In [99]:
# Replace empty strings in the 'text_clean' column with NaN
merged_trump_tweets_clean['text_clean'].replace('', np.nan, inplace=True)
# Drop rows with NaN values in the 'text_clean' column
merged_trump_tweets_clean.dropna(subset=['text_clean'], inplace=True)
# Reset the index after dropping rows
merged_trump_tweets_clean.reset_index(drop=True, inplace=True)

In [100]:
# Show the first few lines of the dataset to see if it worked
merged_trump_tweets_clean.head()

Unnamed: 0,id,text,is_retweet,is_deleted,favorites,retweets,date,text_clean
0,98454970654916608,Republicans and Democrats have both created ou...,False,False,49,255,2011-08-02,Republicans and Democrats have both created ou...
1,1234653427789070336,I was thrilled to be back in the Great city of...,False,False,73748,17404,2020-03-03,I was thrilled to be back in the Great city of...
2,1304875170860015617,The Unsolicited Mail In Ballot Scam is a major...,False,False,80527,23502,2020-09-12,The Unsolicited Mail In Ballot Scam is a major...
3,1223640662689689602,Getting a little exercise this morning! https:...,False,False,285863,30209,2020-02-01,Getting a little exercise this morning!
4,1215247978966986752,Thank you Elise! https://t.co/Y4Hb0zf5jk,False,False,48510,11608,2020-01-09,Thank you Elise!


In [101]:
# Show the final number of lines that remain in our dataset
merged_trump_tweets_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45375 entries, 0 to 45374
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          45375 non-null  int64 
 1   text        45375 non-null  object
 2   is_retweet  45375 non-null  bool  
 3   is_deleted  45375 non-null  bool  
 4   favorites   45375 non-null  int64 
 5   retweets    45375 non-null  int64 
 6   date        45375 non-null  object
 7   text_clean  45375 non-null  object
dtypes: bool(2), int64(3), object(3)
memory usage: 2.2+ MB


In [102]:
# Save our dataset as CSV
merged_trump_tweets_clean.to_csv('merged_trump_tweets_clean.csv')

In [103]:
# We now want to specify the specific time-range this research is interested in
# This means we want to select Donald Trump's Tweets from the day of the 2020-election results (November 3rd, 2020) until the day his account got suspensed following the Capitol Riots

# Convert 'date' column to datetime format
merged_trump_tweets_clean['date'] = pd.to_datetime(merged_trump_tweets_clean['date'])

# Specify the time range
start_date = pd.to_datetime('2020-11-03')
end_date = pd.to_datetime('2021-01-06')

# Filter tweets from November 3rd, 2020, and onward
filtered_trump_tweets = merged_trump_tweets_clean[(merged_trump_tweets_clean['date'] >= start_date) & (merged_trump_tweets_clean['date'] <= end_date)]

In [104]:
# Save our dataset as CSV
filtered_trump_tweets.to_csv('filtered_trump_tweets.csv')

In [105]:
# We now want to save the content of the Tweet to txt-files for further analysis
# We want to seperate these by date

# Specify the folder path in Google Drive where to save the files to
folder_path = '/content/drive/MyDrive/Data/trump_tweets_txts/'

In [106]:
# Ensure 'date' column is in datetime format
filtered_trump_tweets['date'] = pd.to_datetime(filtered_trump_tweets['date'])

# Extract the date
filtered_trump_tweets['date'] = filtered_trump_tweets['date'].dt.date

# Group tweets by date
grouped_trump_tweets = filtered_trump_tweets.groupby('date')

In [107]:
# Iterate over each group and save tweets to separate text files and store these for further use
for date, group in grouped_trump_tweets:
    # Create a file name based on the date
    date_str = date.strftime('%Y-%m-%d')
    file_name = f'{date_str}.txt'
    file_path = folder_path + file_name

    # Open the file in write mode
    with open(file_path, 'w') as file:
        # Write each tweet to the file
        for tweet in group['text_clean']:
            file.write(tweet + '\n')

In [108]:
# Define a function that runs the nlp pipeline on any given input text
def process_text(text):
    return nlp(text)

# Apply the function to the "text_clean" column, so that the nlp pipeline is called on each tweet
filtered_trump_tweets['doc'] = filtered_trump_tweets['text_clean'].apply(process_text)

In [109]:
# For an easier way to create a new dataset containing the keywords I am interested in, we will perform the spaCy function for lemmetization
# This can also be understood as the retrieval of the dictionary root word of each word. For researchers using keyword searches, lemmatization reduces noise and improves results.

# Define a function to retrieve lemmas and turn them into lowercase for the keyword search
def get_lemma(text):
    doc = nlp(text)
    return [token.lemma_.lower() for token in doc]


# Run the lemma retrieval function on the text_clean column in our dataframe
filtered_trump_tweets['lemmas'] = filtered_trump_tweets['doc'].apply(get_lemma)

In [110]:
# Show the first few lines of the dataset to see if it worked
filtered_trump_tweets.head()

Unnamed: 0,id,text,is_retweet,is_deleted,favorites,retweets,date,text_clean,doc,lemmas
6,1325884977112883200,The threshold identification of Ballots is tur...,False,False,493076,100609,2020-11-09,The threshold identification of Ballots is tur...,"(The, threshold, identification, of, Ballots, ...","[the, threshold, identification, of, ballot, b..."
11,1325889532840062976,Nevada is turning out to be a cesspool of Fake...,False,False,363489,78378,2020-11-09,Nevada is turning out to be a cesspool of Fake...,"(Nevada, is, turning, out, to, be, a, cesspool...","[nevada, be, turn, out, to, be, a, cesspool, o..."
12,1325891490636320768,Wisconsin is looking very good. Needs a little...,False,False,347994,61006,2020-11-09,Wisconsin is looking very good. Needs a little...,"(Wisconsin, is, looking, very, good, ., Needs,...","[wisconsin, be, look, very, good, ., need, a, ..."
14,1325895380983275524,Pennsylvania prevented us from watching much o...,False,False,479292,81458,2020-11-09,Pennsylvania prevented us from watching much o...,"(Pennsylvania, prevented, us, from, watching, ...","[pennsylvania, prevent, we, from, watch, much,..."
16,1325896369534607360,"Georgia will be a big presidential win, as it ...",False,False,637719,94570,2020-11-09,"Georgia will be a big presidential win, as it ...","(Georgia, will, be, a, big, presidential, win,...","[georgia, will, be, a, big, presidential, win,..."


In [111]:
# Define the keywords to filter the tweets
keywords = [
    'antifa', 'ballot', 'biden', 'collapse', 'certify', 'conspiracy', 'dead', 'digital','dominion',
    'election','enemy','evidence', 'fake', 'fbi', 'flip', 'fraud', 'glitch', 'hack', 'harvest',
    'hoax', 'illegal', 'integrity', 'justice', 'landslide', 'legal', 'lie','observe', 'overturn',
    'poll', 'power', 'radical', 'reject', 'rig', 'steal', 'stop', 'trump', 'undermine', 'verify',
    'vote', 'win', 'arizona', 'georgia', 'michigan', 'pennsylvania', 'wisconsin'
    ]

# Function to check if a list of lemmas contains any of the keywords
def contains_keywords(tweet_lemmas, keywords):
    tweet = ' '.join(tweet_lemmas)  # Join the lemmatized words into a single string
    for keyword in keywords:
        if keyword.lower() in tweet.lower():
            return True
    return False

# Filter tweets containing the keywords based on lemmatized words
filtered_trump_tweets_keywords = filtered_trump_tweets[filtered_trump_tweets['lemmas'].apply(lambda tweet_lemmas: contains_keywords(tweet_lemmas, keywords))]

In [112]:
# Show the first few lines of the dataset to see if it worked
filtered_trump_tweets_keywords.head()

Unnamed: 0,id,text,is_retweet,is_deleted,favorites,retweets,date,text_clean,doc,lemmas
6,1325884977112883200,The threshold identification of Ballots is tur...,False,False,493076,100609,2020-11-09,The threshold identification of Ballots is tur...,"(The, threshold, identification, of, Ballots, ...","[the, threshold, identification, of, ballot, b..."
11,1325889532840062976,Nevada is turning out to be a cesspool of Fake...,False,False,363489,78378,2020-11-09,Nevada is turning out to be a cesspool of Fake...,"(Nevada, is, turning, out, to, be, a, cesspool...","[nevada, be, turn, out, to, be, a, cesspool, o..."
12,1325891490636320768,Wisconsin is looking very good. Needs a little...,False,False,347994,61006,2020-11-09,Wisconsin is looking very good. Needs a little...,"(Wisconsin, is, looking, very, good, ., Needs,...","[wisconsin, be, look, very, good, ., need, a, ..."
14,1325895380983275524,Pennsylvania prevented us from watching much o...,False,False,479292,81458,2020-11-09,Pennsylvania prevented us from watching much o...,"(Pennsylvania, prevented, us, from, watching, ...","[pennsylvania, prevent, we, from, watch, much,..."
16,1325896369534607360,"Georgia will be a big presidential win, as it ...",False,False,637719,94570,2020-11-09,"Georgia will be a big presidential win, as it ...","(Georgia, will, be, a, big, presidential, win,...","[georgia, will, be, a, big, presidential, win,..."


In [113]:
# Save our dataset as CSV
filtered_trump_tweets_keywords.to_csv('filtered_trump_tweets_keywords.csv')