In [1]:
#####################################################################
#          Author:  Amy Louise Lang                                 #
#            Date:  05/29/2021                                      #
#     Data source:  https://www.kaggle.com/zusmani/trumps-legacy    #
#####################################################################

In [None]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import matplotlib.pyplot as plt
import re
import seaborn as sns
from textblob import TextBlob
from wordcloud import WordCloud
import plotly.express as px

In [None]:
# Diagnostic cell: the CSV import initially failed, so this prints the kernel's
# current working directory and its contents (it was not the expected folder).
# The relative CSV filename used by the load cell resolves against this directory.
import os

cwd = os.getcwd()  # directory the notebook kernel is running in
files = os.listdir(cwd)  # names of all entries in that directory
print("Files in %r: %s" % (cwd, files))

In [None]:
# Load the tweet export and discard every row that has a missing value in any column.
df = (
    pd.read_csv("TrumpsLegcy.csv", sep=',', quotechar='"', encoding='utf-8')
    .dropna()
)
df

In [None]:
# Parse the 'date' column into pandas datetimes. This step is needed: the
# first-100-days filter compares this column against date strings, which only
# behaves chronologically when the column holds real datetimes.
df['date'] = pd.to_datetime(df['date'])
df

In [None]:
# Date window for Trump's first 100 days of tweets, written as MM-DD-YYYY strings.
# Both values are compared against the parsed 'date' column when the filter is built.
# NOTE(review): a bare date is presumably read as midnight, so tweets sent later
# on 04-30 would fall outside the window — confirm this is the intended cut-off.
start_date = '01-20-2017'
end_date = '04-30-2017'

In [None]:
# Boolean mask: True for tweets dated within [start_date, end_date], both ends included
filter100 = df['date'].between(start_date, end_date)

In [None]:
# Keep only the rows selected by the first-100-days mask
df = df[filter100]
df

In [None]:
# Column labels, then the (rows, columns) shape of the filtered frame
print(df.columns, df.shape, sep='\n')

In [None]:
# Overview of the filtered frame: column dtypes and non-null counts
df.info()

# Data cleaning and formatting

In [None]:
# Drop the 'id' column: every tweet here is attributed to Donald Trump.
# TODO: the column holds multiple distinct ids — check whether they map to
# separate Twitter accounts. If they do, counting them (and comparing with
# Biden) would show how many staff take part in the messaging.
df = df.drop(columns=['id'])

In [None]:
# Remove repeated tweets: rows sharing the same 'text' are collapsed to their
# first occurrence, then the new (rows, columns) shape is reported.
df = df.drop_duplicates(subset='text', keep='first')
print(df.shape)

In [None]:
# use regular expressions to strip each tweet of mentions, hashtags, retweet information, and links
def clean_tweet_text(text):
    """Return a lower-cased copy of a tweet with Twitter markup removed.

    Removes, in this order: @mentions, '#' symbols (the hashtag word itself is
    kept), the 'RT' retweet marker with its trailing whitespace, and
    http/https links.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned, lower-cased tweet. Whitespace or punctuation left behind
        by the removals (e.g. the ':' after a retweeted handle) is not trimmed.
    """
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#', '', text)
    # \b anchors 'RT' to the start of a word; without it, capitalised words
    # ending in "RT" (e.g. "COURT ", "SUPPORT ") were truncated.
    text = re.sub(r'\bRT[\s]+', '', text)
    text = re.sub(r'https?:\/\/\S+', '', text)
    text = text.lower()
    return text

In [None]:
# Run clean_tweet_text on every element of the 'text' column.
# Series.transform returns a new Series and leaves the column untouched, so the
# result has to be assigned back for the cleaning to take effect.
df['text'] = df['text'].transform(clean_tweet_text)
df.head()

In [None]:
# Reduce each timestamp to its calendar date (time of day dropped) so that
# tweets can later be grouped per day.
tweet_timestamps = pd.to_datetime(df['date'])
df['date'] = tweet_timestamps.dt.date
df.head()

# Data exploration

In [None]:
# Summary statistics for the cleaned first-100-days frame
df.describe()

In [None]:
# which device is he tweeting from the most? (bar chart of the 5 most frequent devices)
device_counts = df['device'].value_counts()
device_counts.head(5).plot(kind='bar')

In [None]:
# what are the top 10 most retweeted tweets?
# Widen the column display first so long tweet text is not cut off in the table.
pd.set_option('display.max_colwidth', 400)
(
    df[['text', 'date', 'favorites', 'retweets']]
    .sort_values(by='retweets', ascending=False)
    .head(10)
)

In [None]:
# what are the top 10 favorited tweets?
# Rank by 'favorites' alone: sorting by date first merely listed the earliest
# tweets of the period instead of the most favorited ones.
df.sort_values(by='favorites', ascending=False)[['text', 'date', 'favorites', 'retweets']].head(n=10)

# Sentiment analysis

We will be using the TextBlob library to perform sentiment analysis on the tweets in our dataset. 
TextBlob provides a simple API for diving into common natural language processing (NLP) tasks such as 
part-of-speech tagging, noun phrase extraction, sentiment analysis, classification, translation, and more. 

In [None]:
# Sentiment of top 5 retweets
# The tweets are listed once and scored in a loop instead of repeating the same
# two statements five times. The fourth tweet uses a single-quoted literal so
# the double quotes around "wires tapped" stay part of the text (doubled quotes
# inside a double-quoted literal are implicit concatenation and drop them).
top_retweets = [
    "Peaceful protests are a hallmark of our democracy. Even if I don't always agree, I recognize the rights of people to express their views.",
    "What an amazing comeback and win by the Patriots. Tom Brady, Bob Kraft and Coach B are total winners. Wow!",
    "It all begins today! I will see you at 11:00 A.M. for the swearing-in. THE MOVEMENT CONTINUES - THE WORK BEGINS!",
    'Terrible! Just found out that Obama had my "wires tapped" in Trump Tower just before the victory. Nothing found. This is McCarthyism!',
    "January 20th 2017, will be remembered as the day the people became the rulers of this nation again.",
]
for tweet_text in top_retweets:
    tweet = TextBlob(tweet_text)
    print(tweet.sentiment)


In [None]:
# Applying the TextBlob API onto our data to perform sentiment analysis for each tweet.
# Each tweet is analysed once and both scores are read from that single result,
# rather than building a second TextBlob per tweet for the subjectivity score.
sentiment_scores = [TextBlob(tweet_text).sentiment for tweet_text in df['text']]
df['polarity'] = [score.polarity for score in sentiment_scores]
df['subjectivity'] = [score.subjectivity for score in sentiment_scores]
df.head(10)

In [None]:
# Histogram of the per-tweet polarity scores, written to Trump_polarity_hist.png
fig, ax = plt.subplots(figsize=(10, 6))
df['polarity'].hist(ax=ax)
ax.set_xlabel('Polarity Score', fontsize=18)
ax.set_ylabel('Frequency', fontsize=18)
ax.tick_params(axis='both', labelsize=16)
fig.savefig("Trump_polarity_hist.png")

In [None]:
# Histogram of the per-tweet subjectivity scores, written to Trump_subjectivity_hist.png
fig, ax = plt.subplots(figsize=(10, 6))
df['subjectivity'].hist(ax=ax)
ax.set_xlabel('Subjectivity Score', fontsize=18)
ax.set_ylabel('Frequency', fontsize=18)
ax.tick_params(axis='both', labelsize=16)
fig.savefig("Trump_subjectivity_hist.png")

In [None]:
# inspect the most negatively charged tweets (lowest polarity first)
(
    df[['text', 'polarity', 'subjectivity']]
    .sort_values(by='polarity')
    .head(10)
    .reset_index(drop=True)
)

In [None]:
# inspect the most positively charged tweets (highest polarity first)
(
    df[['text', 'polarity', 'subjectivity']]
    .sort_values(by='polarity', ascending=False)
    .head(10)
    .reset_index(drop=True)
)

In [None]:
# inspect the most subjective tweets (NOTE: subjectivity scale ranges from 0 to 1)
# 1 is the subjective end of the scale, so the highest scores must come first;
# an ascending sort would list the most objective tweets instead.
df.sort_values(by='subjectivity', ascending=False)[['text', 'polarity', 'subjectivity']].reset_index(drop=True).head(n=10)

In [None]:
# inspect the most objective tweets
# 0 is the objective end of the subjectivity scale, so the lowest scores must
# come first; a descending sort would list the most subjective tweets instead.
df.sort_values(by='subjectivity', ascending=True)[['text', 'polarity', 'subjectivity']].reset_index(drop=True).head(n=10)

In [None]:
# Number of tweets per day over the first 100 days, shown as an interactive bar chart
timeline = df.groupby('date')['text'].count().reset_index(name='count')
fig = px.bar(timeline, x='date', y='count', labels={'date': 'Date', 'count': 'Tweet Count'})

fig.show()
#fig.write_image("Trump_tweet_freq_.png")

# Time series sentiment analysis

In [None]:
# polarity values ranging from -1 to 1 are used for sentiment analysis
# We will categorize by grouping our data into 3 classes (negative, neutral, and positive) for visualization
# The ranges touch at -0.01 and 0.01; np.select uses the first matching
# condition, so -0.01 is labelled Negative and 0.01 is labelled Neutral.
criteria = [df['polarity'].between(-1, -0.01), df['polarity'].between(-0.01, 0.01), df['polarity'].between(0.01, 1)]
values = ['Negative', 'Neutral', 'Positive']
# The fallback has to be a string like the labels: an integer default cannot be
# combined with string choices and makes np.select raise on current NumPy.
df['sentiment'] = np.select(criteria, values, default='Unknown')

# plot sentiment counts
fig = plt.figure(figsize=(10, 6))
df['sentiment'].value_counts().sort_index().plot.bar()
plt.xlabel('Sentiment Label', fontsize=18)
plt.ylabel('Tweet Count', fontsize=18)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
# Lay out and save BEFORE showing: once plt.show() has run, pyplot no longer
# targets this figure, so a later tight_layout() would not affect it.
plt.tight_layout()
fig.savefig("Trump_sentiment_value_counts", bbox_inches='tight')
plt.show()

In [None]:
# Daily aggregates: number of tweets plus the mean polarity, retweets,
# favorites and subjectivity for each date.
# Named aggregation limits the means to these numeric columns; averaging the
# whole frame also touched text columns such as 'text' and 'device', which
# recent pandas versions reject with a TypeError. 'mean' skips NaN values, and
# 'count' is the number of non-null 'retweets' entries per date.
timeline = (
    df.groupby('date')
    .agg(
        count=('retweets', 'count'),
        polarity=('polarity', 'mean'),
        retweets=('retweets', 'mean'),
        favorites=('favorites', 'mean'),
        subjectivity=('subjectivity', 'mean'),
    )
    .reset_index()
    .astype({'polarity': float, 'subjectivity': float})
)
timeline

In [None]:
# Rank the days from highest to lowest value in the timeline's 'polarity' column
timeline.sort_values(by='polarity', ascending=False)

In [None]:
# Polarity scores over first 100 days:
# one bar per date, bar height taken from 'count', bar colour from 'polarity'.
fig = px.bar(timeline, x='date', y='count', color='polarity')
fig.show()
# NOTE(review): static PNG export presumably needs plotly's image-export
# dependency (e.g. kaleido) installed in the kernel environment — confirm.
fig.write_image("Trump_polarity_100.png")

In [None]:
# Subjectivity scores over first 100 days:
# one bar per date, bar height taken from 'count', bar colour from 'subjectivity'.
fig = px.bar(timeline, x='date', y='count', color='subjectivity')
fig.show()
# NOTE(review): static PNG export presumably needs plotly's image-export
# dependency (e.g. kaleido) installed in the kernel environment — confirm.
fig.write_image("Trump_subjectivity_100.png")

In [None]:
# Extra dependencies and stop-word setup for the word-cloud section.
# NOTE(review): WordCloud is already imported in the notebook's first import
# cell, and within the cells visible here ImageColorGenerator, wordninja,
# SpellChecker, Counter, math, random and WordNetLemmatizer are not referenced
# afterwards — confirm against the rest of the notebook before pruning.
from wordcloud import WordCloud, ImageColorGenerator
import wordninja
from spellchecker import SpellChecker
from collections import Counter
import nltk
import math
import random
nltk.download('wordnet')  # fetch the NLTK 'wordnet' resource
nltk.download('stopwords')  # fetch the NLTK 'stopwords' resource read below
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords  # corpus reader that provides the stop-word lists
stop_words = set(stopwords.words('english'))  # English stop words as a set
# presumably "amp" is what remains of HTML-escaped '&amp;' in tweet text — verify
stop_words.add("amp")

In [None]:
# Word cloud of all tweets from the first 100 days, saved as Trump_cloud_100.
tweet_df = df  # alias kept so any later cell that refers to tweet_df still works
# Joining the column directly yields one space-separated corpus string.
words = ' '.join(tweet_df['text'])
# Hand the stop-word set built in the setup cell (English stop words plus "amp")
# to WordCloud; it used to be constructed but never applied, so filler words
# and the "amp" token could show up in the cloud.
word_cloud = WordCloud(
    width=1000,
    height=500,
    random_state=20,  # fixed seed so the layout is reproducible
    max_font_size=120,
    background_color='white',
    stopwords=stop_words,
).generate(words)

fig, ax = plt.subplots(figsize=(12,6))
ax.imshow(word_cloud, interpolation='bilinear')
ax.axis('off')
fig.savefig("Trump_cloud_100", bbox_inches='tight');