# **Installing and Importing**

In [210]:
# Install and import spacy and plotly.
!pip install spaCy
!pip install plotly
!pip install nbformat==5.1.2
!pip install emoji



In [211]:
# Import spacy
import spacy

# Install English language model
!spacy download en_core_web_sm

# Import os to upload documents and metadata
import os

# Load spaCy visualizer
from spacy import displacy

# Import pandas DataFrame packages
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

# Import graphing packages
import plotly.graph_objects as go
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob

# Import packages for cleaning of data
import re
import emoji
import nltk
nltk.download('words')
words = set(nltk.corpus.words.words())
import numpy as np

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m46.8 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


# **Merging Datasets**

In [212]:
# Mount Google Colab to Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [213]:
# Load in the first two CSV files

# Kaggle Dataset by user 'codebreaker619'
# License: Data files © Original Authors
# https://www.kaggle.com/datasets/codebreaker619/donald-trump-tweets-dataset
tweets_kaggle = pd.read_csv('/content/drive/MyDrive/Data/tweets_kaggle.csv')

# Dataset by the Trump Twitter Archive
# Data is freely usable as the creator aims to “provide a public resource”
# https://www.thetrumparchive.com/
tweets_trump_twitter_archive = pd.read_csv ('/content/drive/MyDrive/Data/tweets_trump_twitter_archive.csv')

In [214]:
# Get an overview about the datasets, such as the columns and different datatypes in the dataframe
tweets_kaggle.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56571 entries, 0 to 56570
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         56571 non-null  int64 
 1   text       56571 non-null  object
 2   isRetweet  56571 non-null  object
 3   isDeleted  56571 non-null  object
 4   device     56571 non-null  object
 5   favorites  56571 non-null  int64 
 6   retweets   56571 non-null  int64 
 7   date       56571 non-null  object
 8   isFlagged  56571 non-null  object
dtypes: int64(3), object(6)
memory usage: 3.9+ MB


In [215]:
# Show the first few lines of the dataset for inspection
tweets_kaggle.head()

Unnamed: 0,id,text,isRetweet,isDeleted,device,favorites,retweets,date,isFlagged
0,98454970654916608,Republicans and Democrats have both created ou...,f,f,TweetDeck,49,255,2011-08-02 18:07:48,f
1,1234653427789070336,I was thrilled to be back in the Great city of...,f,f,Twitter for iPhone,73748,17404,2020-03-03 01:34:50,f
2,1218010753434820614,RT @CBS_Herridge: READ: Letter to surveillance...,t,f,Twitter for iPhone,0,7396,2020-01-17 03:22:47,f
3,1304875170860015617,The Unsolicited Mail In Ballot Scam is a major...,f,f,Twitter for iPhone,80527,23502,2020-09-12 20:10:58,f
4,1218159531554897920,RT @MZHemingway: Very friendly telling of even...,t,f,Twitter for iPhone,0,9081,2020-01-17 13:13:59,f


In [216]:
# Repeat these steps
tweets_trump_twitter_archive.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56571 entries, 0 to 56570
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         56571 non-null  int64 
 1   text       56571 non-null  object
 2   isRetweet  56571 non-null  object
 3   isDeleted  56571 non-null  object
 4   device     56571 non-null  object
 5   favorites  56571 non-null  int64 
 6   retweets   56571 non-null  int64 
 7   date       56571 non-null  object
 8   isFlagged  56571 non-null  object
dtypes: int64(3), object(6)
memory usage: 3.9+ MB


In [217]:
tweets_trump_twitter_archive.head()

Unnamed: 0,id,text,isRetweet,isDeleted,device,favorites,retweets,date,isFlagged
0,98454970654916608,Republicans and Democrats have both created ou...,f,f,TweetDeck,49,255,2011-08-02 18:07:48,f
1,1234653427789070336,I was thrilled to be back in the Great city of...,f,f,Twitter for iPhone,73748,17404,2020-03-03 01:34:50,f
2,1218010753434820614,RT @CBS_Herridge: READ: Letter to surveillance...,t,f,Twitter for iPhone,0,7396,2020-01-17 03:22:47,f
3,1304875170860015617,The Unsolicited Mail In Ballot Scam is a major...,f,f,Twitter for iPhone,80527,23502,2020-09-12 20:10:58,f
4,1218159531554897920,RT @MZHemingway: Very friendly telling of even...,t,f,Twitter for iPhone,0,9081,2020-01-17 13:13:59,f


In [218]:
# Merge both datasets together to create one big dataset
merged_tweets_df = pd.concat([tweets_trump_twitter_archive, tweets_kaggle])

In [219]:
# Get an overview about the datasets we just created, such as the columns and different datatypes in the dataframe
merged_tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 113142 entries, 0 to 56570
Data columns (total 9 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   id         113142 non-null  int64 
 1   text       113142 non-null  object
 2   isRetweet  113142 non-null  object
 3   isDeleted  113142 non-null  object
 4   device     113142 non-null  object
 5   favorites  113142 non-null  int64 
 6   retweets   113142 non-null  int64 
 7   date       113142 non-null  object
 8   isFlagged  113142 non-null  object
dtypes: int64(3), object(6)
memory usage: 8.6+ MB


In [220]:
# The total lines in the dataset is now 113142

In [221]:
# Delete duplicates from the merged dataset, filtering on the column 'id' and save this to the dataset
merged_tweets_df.drop_duplicates(subset='id', inplace=True)
# Reset the index after dropping rows
merged_tweets_df.reset_index(drop=True, inplace=True)

In [222]:
# Get an overview about the dataset with the dropped duplicates
merged_tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56571 entries, 0 to 56570
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         56571 non-null  int64 
 1   text       56571 non-null  object
 2   isRetweet  56571 non-null  object
 3   isDeleted  56571 non-null  object
 4   device     56571 non-null  object
 5   favorites  56571 non-null  int64 
 6   retweets   56571 non-null  int64 
 7   date       56571 non-null  object
 8   isFlagged  56571 non-null  object
dtypes: int64(3), object(6)
memory usage: 3.9+ MB


In [223]:
# The total lines in the dataset has now dropped back to 56571

In [224]:
# Show the first few lines of the combined dataset for inspection
merged_tweets_df.head()

Unnamed: 0,id,text,isRetweet,isDeleted,device,favorites,retweets,date,isFlagged
0,98454970654916608,Republicans and Democrats have both created ou...,f,f,TweetDeck,49,255,2011-08-02 18:07:48,f
1,1234653427789070336,I was thrilled to be back in the Great city of...,f,f,Twitter for iPhone,73748,17404,2020-03-03 01:34:50,f
2,1218010753434820614,RT @CBS_Herridge: READ: Letter to surveillance...,t,f,Twitter for iPhone,0,7396,2020-01-17 03:22:47,f
3,1304875170860015617,The Unsolicited Mail In Ballot Scam is a major...,f,f,Twitter for iPhone,80527,23502,2020-09-12 20:10:58,f
4,1218159531554897920,RT @MZHemingway: Very friendly telling of even...,t,f,Twitter for iPhone,0,9081,2020-01-17 13:13:59,f


In [225]:
# Load in the third and final CSV file
trump_tweets_archive_kaggle = pd.read_csv ('/content/drive/MyDrive/Data/trump_tweets_archive_kaggle.csv')

In [226]:
# Get an overview about this dataset as well
trump_tweets_archive_kaggle.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56571 entries, 0 to 56570
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          56571 non-null  float64
 1   text        56571 non-null  object 
 2   is_retweet  56571 non-null  bool   
 3   is_deleted  56571 non-null  bool   
 4   device      56571 non-null  object 
 5   favorites   56571 non-null  int64  
 6   retweets    56571 non-null  int64  
 7   datetime    56571 non-null  object 
 8   is_flagged  56571 non-null  bool   
 9   date        56571 non-null  object 
dtypes: bool(3), float64(1), int64(2), object(4)
memory usage: 3.2+ MB


In [227]:
# Show the first few lines of the dataset for inspection
trump_tweets_archive_kaggle.head()

Unnamed: 0,id,text,is_retweet,is_deleted,device,favorites,retweets,datetime,is_flagged,date
0,9.845497e+16,Republicans and Democrats have both created ou...,False,False,TweetDeck,49,255,2011-08-02T18:07:48Z,False,2011-08-02
1,1.234653e+18,I was thrilled to be back in the Great city of...,False,False,Twitter for iPhone,73748,17404,2020-03-03T01:34:50Z,False,2020-03-03
2,1.218011e+18,RT @CBS_Herridge: READ: Letter to surveillance...,True,False,Twitter for iPhone,0,7396,2020-01-17T03:22:47Z,False,2020-01-17
3,1.304875e+18,The Unsolicited Mail In Ballot Scam is a major...,False,False,Twitter for iPhone,80527,23502,2020-09-12T20:10:58Z,False,2020-09-12
4,1.21816e+18,RT @MZHemingway: Very friendly telling of even...,True,False,Twitter for iPhone,0,9081,2020-01-17T13:13:59Z,False,2020-01-17


In [228]:
# Merge this dataset with the earlier merged dataset to create a dataset which includes three different sources to make sure all Trump Tweets are included
# Also drop the duplicates again
final_merged_tweets_df = pd.concat([trump_tweets_archive_kaggle, merged_tweets_df])
final_merged_tweets_df.drop_duplicates(subset='id', inplace=True)
# Reset the index after dropping rows
final_merged_tweets_df.reset_index(drop=True, inplace=True)

In [229]:
# Show the first few lines of the dataset for inspection
final_merged_tweets_df.head()

Unnamed: 0,id,text,is_retweet,is_deleted,device,favorites,retweets,datetime,is_flagged,date,isRetweet,isDeleted,isFlagged
0,9.845497e+16,Republicans and Democrats have both created ou...,False,False,TweetDeck,49,255,2011-08-02T18:07:48Z,False,2011-08-02,,,
1,1.234653e+18,I was thrilled to be back in the Great city of...,False,False,Twitter for iPhone,73748,17404,2020-03-03T01:34:50Z,False,2020-03-03,,,
2,1.218011e+18,RT @CBS_Herridge: READ: Letter to surveillance...,True,False,Twitter for iPhone,0,7396,2020-01-17T03:22:47Z,False,2020-01-17,,,
3,1.304875e+18,The Unsolicited Mail In Ballot Scam is a major...,False,False,Twitter for iPhone,80527,23502,2020-09-12T20:10:58Z,False,2020-09-12,,,
4,1.21816e+18,RT @MZHemingway: Very friendly telling of even...,True,False,Twitter for iPhone,0,9081,2020-01-17T13:13:59Z,False,2020-01-17,,,


In [230]:
# Get an overview about this dataset as well
final_merged_tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64936 entries, 0 to 64935
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          64936 non-null  float64
 1   text        64936 non-null  object 
 2   is_retweet  56570 non-null  object 
 3   is_deleted  56570 non-null  object 
 4   device      64936 non-null  object 
 5   favorites   64936 non-null  int64  
 6   retweets    64936 non-null  int64  
 7   datetime    56570 non-null  object 
 8   is_flagged  56570 non-null  object 
 9   date        64936 non-null  object 
 10  isRetweet   8366 non-null   object 
 11  isDeleted   8366 non-null   object 
 12  isFlagged   8366 non-null   object 
dtypes: float64(1), int64(2), object(10)
memory usage: 6.4+ MB


In [231]:
# Combine the 'isDeleted' column with the 'is_deleted' column
final_merged_tweets_df.loc[final_merged_tweets_df['is_deleted'].isnull(), 'is_deleted'] = final_merged_tweets_df['isDeleted']
# Combine the 'isRetweet' column with the 'is_retweet' column
final_merged_tweets_df.loc[final_merged_tweets_df['is_retweet'].isnull(), 'is_retweet'] = final_merged_tweets_df['isRetweet']

In [232]:
# Get an overview about this dataset to see if it worked
final_merged_tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64936 entries, 0 to 64935
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          64936 non-null  float64
 1   text        64936 non-null  object 
 2   is_retweet  64936 non-null  object 
 3   is_deleted  64936 non-null  object 
 4   device      64936 non-null  object 
 5   favorites   64936 non-null  int64  
 6   retweets    64936 non-null  int64  
 7   datetime    56570 non-null  object 
 8   is_flagged  56570 non-null  object 
 9   date        64936 non-null  object 
 10  isRetweet   8366 non-null   object 
 11  isDeleted   8366 non-null   object 
 12  isFlagged   8366 non-null   object 
dtypes: float64(1), int64(2), object(10)
memory usage: 6.4+ MB


In [233]:
# Delete unneccesary columns
columns_to_drop = ['device', 'isFlagged', 'isRetweet', 'isDeleted', 'is_flagged', 'datetime']
final_merged_tweets_df.drop(columns=columns_to_drop, inplace=True)

In [234]:
# Show the first few lines of the dataset for inspection
final_merged_tweets_df.head()

Unnamed: 0,id,text,is_retweet,is_deleted,favorites,retweets,date
0,9.845497e+16,Republicans and Democrats have both created ou...,False,False,49,255,2011-08-02
1,1.234653e+18,I was thrilled to be back in the Great city of...,False,False,73748,17404,2020-03-03
2,1.218011e+18,RT @CBS_Herridge: READ: Letter to surveillance...,True,False,0,7396,2020-01-17
3,1.304875e+18,The Unsolicited Mail In Ballot Scam is a major...,False,False,80527,23502,2020-09-12
4,1.21816e+18,RT @MZHemingway: Very friendly telling of even...,True,False,0,9081,2020-01-17


In [235]:
# Make sure all columns have the same true and false annotation
final_merged_tweets_df['is_retweet'] = final_merged_tweets_df['is_retweet'].replace({'f': False, 't': True})
final_merged_tweets_df['is_deleted'] = final_merged_tweets_df['is_retweet'].replace({'f': False, 't': True})

In [236]:
# Get overview of the dataset
final_merged_tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64936 entries, 0 to 64935
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          64936 non-null  float64
 1   text        64936 non-null  object 
 2   is_retweet  64936 non-null  bool   
 3   is_deleted  64936 non-null  bool   
 4   favorites   64936 non-null  int64  
 5   retweets    64936 non-null  int64  
 6   date        64936 non-null  object 
dtypes: bool(2), float64(1), int64(2), object(2)
memory usage: 2.6+ MB


In [237]:
# Save the csv to computer's working directory
final_merged_tweets_df.to_csv('final_merged_tweets.csv')

# **Cleaning**

In [238]:
# Delete the Retweets from the dataset
final_merged_tweets_df_clean = final_merged_tweets_df[final_merged_tweets_df['is_retweet'] == False]

In [239]:
# Get overview of the dataset without retweets
final_merged_tweets_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 53938 entries, 0 to 64934
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          53938 non-null  float64
 1   text        53938 non-null  object 
 2   is_retweet  53938 non-null  bool   
 3   is_deleted  53938 non-null  bool   
 4   favorites   53938 non-null  int64  
 5   retweets    53938 non-null  int64  
 6   date        53938 non-null  object 
dtypes: bool(2), float64(1), int64(2), object(2)
memory usage: 2.6+ MB


In [240]:
def cleaner(tweet):
    original_tweet = tweet  # Save the original tweet
    tweet = re.sub("@[A-Za-z0-9]+", "", tweet)  # Remove @ sign
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet)  # Remove http links
    tweet = " ".join(tweet.split())
    tweet = ''.join(c for c in tweet if c not in emoji.EMOJI_DATA)  # Remove Emojis
    tweet = tweet.replace("#", "").replace("_", " ")  # Remove hashtag sign but keep the text
    tweet = tweet.lower()  # Convert text to lowercase
    return tweet, original_tweet

# Apply the clean function to our text and store it in a seperate column named 'text_clean'
final_merged_tweets_df_clean['text_clean'], final_merged_tweets_df_clean['text'] = zip(*final_merged_tweets_df_clean['text'].apply(cleaner))

In [241]:
# Show the first few lines of the dataset for inspection
final_merged_tweets_df_clean.head()

Unnamed: 0,id,text,is_retweet,is_deleted,favorites,retweets,date,text_clean
0,9.845497e+16,Republicans and Democrats have both created ou...,False,False,49,255,2011-08-02,republicans and democrats have both created ou...
1,1.234653e+18,I was thrilled to be back in the Great city of...,False,False,73748,17404,2020-03-03,i was thrilled to be back in the great city of...
3,1.304875e+18,The Unsolicited Mail In Ballot Scam is a major...,False,False,80527,23502,2020-09-12,the unsolicited mail in ballot scam is a major...
6,1.223641e+18,Getting a little exercise this morning! https:...,False,False,285863,30209,2020-02-01,getting a little exercise this morning!
7,1.319502e+18,https://t.co/4qwCKQOiOw,False,False,130822,19127,2020-10-23,


In [242]:
# Replace empty strings in the 'text_clean' column with NaN
final_merged_tweets_df_clean['text_clean'].replace('', np.nan, inplace=True)
# Drop rows with NaN values in the 'text_clean' column
final_merged_tweets_df_clean.dropna(subset=['text_clean'], inplace=True)
# Reset the index after dropping rows
final_merged_tweets_df_clean.reset_index(drop=True, inplace=True)

In [243]:
# Show the first few lines of the dataset to see if it worked
final_merged_tweets_df_clean.head()

Unnamed: 0,id,text,is_retweet,is_deleted,favorites,retweets,date,text_clean
0,9.845497e+16,Republicans and Democrats have both created ou...,False,False,49,255,2011-08-02,republicans and democrats have both created ou...
1,1.234653e+18,I was thrilled to be back in the Great city of...,False,False,73748,17404,2020-03-03,i was thrilled to be back in the great city of...
2,1.304875e+18,The Unsolicited Mail In Ballot Scam is a major...,False,False,80527,23502,2020-09-12,the unsolicited mail in ballot scam is a major...
3,1.223641e+18,Getting a little exercise this morning! https:...,False,False,285863,30209,2020-02-01,getting a little exercise this morning!
4,1.215248e+18,Thank you Elise! https://t.co/Y4Hb0zf5jk,False,False,48510,11608,2020-01-09,thank you elise!


In [244]:
# Show the final number of lines that remain in our dataset
final_merged_tweets_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52481 entries, 0 to 52480
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          52481 non-null  float64
 1   text        52481 non-null  object 
 2   is_retweet  52481 non-null  bool   
 3   is_deleted  52481 non-null  bool   
 4   favorites   52481 non-null  int64  
 5   retweets    52481 non-null  int64  
 6   date        52481 non-null  object 
 7   text_clean  52481 non-null  object 
dtypes: bool(2), float64(1), int64(2), object(3)
memory usage: 2.5+ MB


In [245]:
# Save our dataset as CSV
final_merged_tweets_df_clean.to_csv('final_merged_tweets_clean.csv')
# Save only the 'text_clean' column for further analysis
clean_text = "clean_text.txt"

# Open the file in write mode
with open(file_path, 'w') as file:
    # Write each line of the text_clean column to the file
    for text in final_merged_tweets_df_clean['text_clean']:
        file.write(text + '\n')