In [96]:
# Import necessary packages
import pandas as pd
import re
from ast import literal_eval
from collections import Counter

In [97]:
# Mount Google Colab to Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [98]:
# Open keyword csv-file
filtered_trump_tweets_keywords = pd.read_csv('/content/drive/MyDrive/Data/filtered_trump_tweets_keywords.csv')

In [99]:
# Show the first few lines to inspect the data
filtered_trump_tweets_keywords.head()

Unnamed: 0.1,Unnamed: 0,id,text,is_retweet,is_deleted,favorites,retweets,date,text_clean,doc,lemmas
0,6,1.325885e+18,The threshold identification of Ballots is tur...,False,False,493076,100609,2020-11-09,The threshold identification of Ballots is tur...,The threshold identification of Ballots is tur...,"['the', 'threshold', 'identification', 'of', '..."
1,11,1.32589e+18,Nevada is turning out to be a cesspool of Fake...,False,False,363489,78378,2020-11-09,Nevada is turning out to be a cesspool of Fake...,Nevada is turning out to be a cesspool of Fake...,"['nevada', 'be', 'turn', 'out', 'to', 'be', 'a..."
2,14,1.325895e+18,Pennsylvania prevented us from watching much o...,False,False,479292,81458,2020-11-09,Pennsylvania prevented us from watching much o...,Pennsylvania prevented us from watching much o...,"['pennsylvania', 'prevent', 'we', 'from', 'wat..."
3,16,1.325896e+18,"Georgia will be a big presidential win, as it ...",False,False,637719,94570,2020-11-09,"Georgia will be a big presidential win, as it ...","Georgia will be a big presidential win, as it ...","['georgia', 'will', 'be', 'a', 'big', 'preside..."
4,18,1.346111e+18,How can you certify an election when the numbe...,False,False,211508,56945,2021-01-04,How can you certify an election when the numbe...,How can you certify an election when the numbe...,"['how', 'can', 'you', 'certify', 'an', 'electi..."


In [100]:
# Drop the unneccesary unnamed column
filtered_trump_tweets_keywords.drop(columns=['Unnamed: 0'], inplace=True)

In [101]:
# Lists of common metaphorical keywords and expressions sampled from Trump's tweets to search in lemmas
keywords_metaphors = ["abuse", "attack", "battle", "betray", "chapter", "cesspool", "crooked", "cure",
    "defend", "destroy", "drench", "dump", "fight", "flood", "game", "garbage", "goldmine", "harvesting", "hill", "hustle",
    "inundate", "kill", "landslide", "lose", "play", "plague", "pour", "run", "race", "save", "shatter", "sick",
    "silent", "sleepy", "spearhead", "steal", "stuff", "tank", "thin", "toss", "turtle", "undermine", "victory",
    "war", "witch"]

In [102]:
# Convert the lemmas from a string of list to an actual list
filtered_trump_tweets_keywords['lemmas'] = filtered_trump_tweets_keywords['lemmas'].apply(literal_eval)

In [103]:
#  Check if any metaphor keywords are in the lemmas
def contains_keywords_metaphors(lemmas):
    return any(keyword in lemmas for keyword in keywords_metaphors)

# Apply the function to identify rows containing metaphor keywords
filtered_trump_tweets_keywords['contains_metaphor'] = filtered_trump_tweets_keywords['lemmas'].apply(contains_keywords_metaphors)

# Filter tweets that contain metaphor keywords
candidate_metaphor_trump_tweets = filtered_trump_tweets_keywords[filtered_trump_tweets_keywords['contains_metaphor']]

# Show the results
candidate_metaphor_trump_tweets.head()

Unnamed: 0,id,text,is_retweet,is_deleted,favorites,retweets,date,text_clean,doc,lemmas,contains_metaphor
1,1.32589e+18,Nevada is turning out to be a cesspool of Fake...,False,False,363489,78378,2020-11-09,Nevada is turning out to be a cesspool of Fake...,Nevada is turning out to be a cesspool of Fake...,"[nevada, be, turn, out, to, be, a, cesspool, o...",True
10,1.328329e+18,Dominion is running our Election. Rigged! http...,False,False,163578,40680,2020-11-16,Dominion is running our Election. Rigged!,Dominion is running our Election. Rigged!,"[dominion, be, run, our, election, ., rig, !]",True
13,1.325961e+18,"As I have long said, @Pfizer and the others wo...",False,False,292245,59870,2020-11-10,"As I have long said, and the others would only...","As I have long said, and the others would only...","[as, i, have, long, say, ,, and, the, other, w...",True
16,1.346469e+18,Pleased to announce that @KLoeffler & @sendavi...,False,False,177332,42347,2021-01-05,Pleased to announce that & have just joined ou...,Pleased to announce that & have just joined ou...,"[pleased, to, announce, that, &, have, just, j...",True
17,1.325962e+18,"If Joe Biden were President, you wouldn’t have...",False,False,332731,56327,2020-11-10,"If Joe Biden were President, you wouldn’t have...","If Joe Biden were President, you wouldn’t have...","[if, joe, biden, be, president, ,, you, would,...",True


In [104]:
# Turn ID scientific notation to full notation so we can drop columns based on IDs from the tweets we concluded contained no metaphor later
candidate_metaphor_trump_tweets.loc[:, 'id'] = candidate_metaphor_trump_tweets['id'].apply(lambda x: f"{x:.0f}")

In [105]:
# Gather some information on the data
candidate_metaphor_trump_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Index: 160 entries, 1 to 584
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 160 non-null    object
 1   text               160 non-null    object
 2   is_retweet         160 non-null    bool  
 3   is_deleted         160 non-null    bool  
 4   favorites          160 non-null    int64 
 5   retweets           160 non-null    int64 
 6   date               160 non-null    object
 7   text_clean         160 non-null    object
 8   doc                160 non-null    object
 9   lemmas             160 non-null    object
 10  contains_metaphor  160 non-null    bool  
dtypes: bool(3), int64(2), object(6)
memory usage: 15.8+ KB


In [106]:
# Save our dataframe as CSV
candidate_metaphor_trump_tweets.to_csv('candidate_metaphor_trump_tweets.csv')

In [107]:
# Extract the context of the metaphor keyword, set window to 8 for complete picture
def extract_context(text, keyword, window=8):
    words = text.split()
    context_words = []
    for i, word in enumerate(words):
        if word.lower() == keyword:
            start = max(0, i - window)
            end = min(len(words), i + window + 1)
            context = words[start:end]
            context_words.append(' '.join(context))
    return context_words


# Now extract contexts for each metaphor keyword
def extract_all_contexts_from_df(df, metaphor_keywords, window=5):
    contexts = []
    for idx, row in df.iterrows():
        for keyword in metaphor_keywords:
            if keyword in row['lemmas']:
                extracted_contexts = extract_context(row['text'], keyword, window)
                for context in extracted_contexts:
                    if context:
                        context_entry = {
                            'id': row['id'],
                            'date': row['date'],
                            'text': row['text'],
                            'keyword': keyword,
                            'context': context,
                            'is_retweet':row['is_retweet'],
                            'is_deleted':	row['is_deleted'],
                            'favorites':row['favorites'],
                            'retweets':row['retweets'],
                            'lemmas': row['lemmas']
                          }
                        contexts.append(context)
    return pd.DataFrame(contexts)

# Apply function: extract contexts of the tweets in our dataset
context_candidate_metaphor_trump_tweets = extract_all_contexts(candidate_metaphor_trump_tweets,keywords_metaphors, window=8)

# Show a few examples for inspection
context_candidate_metaphor_trump_tweets.head()

Unnamed: 0,id,date,text,keyword,context,is_retweet,is_deleted,favorites,retweets,lemmas
0,1325889532840062976,2020-11-09,Nevada is turning out to be a cesspool of Fake...,cesspool,Nevada is turning out to be a cesspool of Fake...,False,False,363489,78378,"[nevada, be, turn, out, to, be, a, cesspool, o..."
1,1346469204740902912,2021-01-05,Pleased to announce that @KLoeffler & @sendavi...,fight,our great #StopTheSteal group of Senators. The...,False,False,177332,42347,"[pleased, to, announce, that, &, have, just, j..."
2,1346578706437963776,2021-01-05,Washington is being inundated with people who ...,victory,people who don’t want to see an election victo...,False,False,328137,82574,"[washington, be, be, inundate, with, people, w..."
3,1346580318745206784,2021-01-05,"I hope the Democrats, and even more importantl...",landslide,pouring into D.C. They won’t stand for a lands...,False,False,210988,55852,"[i, hope, the, democrats, ,, and, even, more, ..."
4,1346580318745206784,2021-01-05,"I hope the Democrats, and even more importantl...",victory,D.C. They won’t stand for a landslide election...,False,False,210988,55852,"[i, hope, the, democrats, ,, and, even, more, ..."


In [108]:
# Save our dataframe as CSV to check the contexts manually
context_candidate_metaphor_trump_tweets.to_csv('context_candidate_metaphor_trump_tweets.csv')

In [109]:
# We will now drop the rows which do not contain metaphorical language, but rather literal language.

# List of IDs to drop
ids_to_drop = [
    '1327824600760274944', '1327828007311073280', '1327824600760274944',
    '1328089401008807936', '1336691515398250496', '1337195257776238592',
    '1337494429221916672', '1344160786252525568', '1335351633459310592',
    '1342832582606597888', '1340306154031857664', '1328483862490574848',
    '1329871775707107328', '1334711509977608192', '1340520653653458944',
    '1341043285368909824']

# Filter rows where id is not in ids_to_drop
metaphor_trump_tweets = candidate_metaphor_trump_tweets[~candidate_metaphor_trump_tweets['id'].isin(ids_to_drop)]

In [110]:
# Check if dropping the IDs was succesfull
metaphor_trump_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Index: 144 entries, 1 to 584
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 144 non-null    object
 1   text               144 non-null    object
 2   is_retweet         144 non-null    bool  
 3   is_deleted         144 non-null    bool  
 4   favorites          144 non-null    int64 
 5   retweets           144 non-null    int64 
 6   date               144 non-null    object
 7   text_clean         144 non-null    object
 8   doc                144 non-null    object
 9   lemmas             144 non-null    object
 10  contains_metaphor  144 non-null    bool  
dtypes: bool(3), int64(2), object(6)
memory usage: 10.5+ KB


In [111]:
# Drop the duplicate rows
metaphor_trump_tweets.loc[:, :] = metaphor_trump_tweets.drop_duplicates(subset=['id'], inplace=False)

In [112]:
# Check if dropping the duplicates was succesfull
metaphor_trump_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Index: 144 entries, 1 to 584
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 136 non-null    object 
 1   text               136 non-null    object 
 2   is_retweet         136 non-null    object 
 3   is_deleted         136 non-null    object 
 4   favorites          136 non-null    float64
 5   retweets           136 non-null    float64
 6   date               136 non-null    object 
 7   text_clean         136 non-null    object 
 8   doc                136 non-null    object 
 9   lemmas             136 non-null    object 
 10  contains_metaphor  136 non-null    object 
dtypes: float64(2), object(9)
memory usage: 13.5+ KB


In [113]:
# The rows were dropped succesfully, so we can now save the dataset for further analysis
metaphor_trump_tweets.to_csv('metaphor_trump_tweets.csv')

In [114]:
# Now calculate the frequency of each metaphor keyword in the tweets
from collections import Counter

# First ensure 'lemmas' is a list for all rows
metaphor_trump_tweets.loc[:, 'lemmas'] = metaphor_trump_tweets['lemmas'].apply(lambda x: x if isinstance(x, list) else [])

# Flatten the list of lemmas
all_lemmas = [lemma for lemmas in metaphor_trump_tweets['lemmas'] for lemma in lemmas]

# Count the frequency of each keyword
keyword_counts = Counter(all_lemmas)

# Filter to only include the metaphor keywords
metaphor_keyword_counts = {keyword: keyword_counts[keyword] for keyword in keywords_metaphors if keyword in keyword_counts}

# Convert to a DataFrame for clearer results
metaphor_keyword_counts_df = pd.DataFrame(list(metaphor_keyword_counts.items()), columns=['Keyword', 'Frequency'])

# Show the list of keyword frequencies
metaphor_keyword_counts_df.sort_values(by='Frequency', ascending=False)

Unnamed: 0,Keyword,Frequency
34,steal,31
12,fight,26
22,lose,18
21,landslide,11
28,save,9
11,dump,8
26,run,7
41,victory,6
23,play,6
35,stuff,5
