# **Installing and Importing**


In [55]:
# Import necessary packages
import pandas as pd
import re
import ast
from ast import literal_eval
from collections import Counter

In [56]:
# Mount Google Colab to Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Identify Candidate Metaphors**

In [57]:
# Open keyword csv-file
filtered_trump_tweets_keywords = pd.read_csv('/content/drive/MyDrive/Data/filtered_trump_tweets_keywords.csv')

In [58]:
# Show the first few lines to inspect the data
filtered_trump_tweets_keywords.head()

Unnamed: 0.1,Unnamed: 0,id,text,is_retweet,is_deleted,favorites,retweets,date,text_clean,doc,lemmas
0,6,1325884977112883200,The threshold identification of Ballots is tur...,False,False,493076,100609,2020-11-09,The threshold identification of Ballots is tur...,The threshold identification of Ballots is tur...,"['the', 'threshold', 'identification', 'of', '..."
1,11,1325889532840062976,Nevada is turning out to be a cesspool of Fake...,False,False,363489,78378,2020-11-09,Nevada is turning out to be a cesspool of Fake...,Nevada is turning out to be a cesspool of Fake...,"['nevada', 'be', 'turn', 'out', 'to', 'be', 'a..."
2,12,1325891490636320768,Wisconsin is looking very good. Needs a little...,False,False,347994,61006,2020-11-09,Wisconsin is looking very good. Needs a little...,Wisconsin is looking very good. Needs a little...,"['wisconsin', 'be', 'look', 'very', 'good', '...."
3,14,1325895380983275524,Pennsylvania prevented us from watching much o...,False,False,479292,81458,2020-11-09,Pennsylvania prevented us from watching much o...,Pennsylvania prevented us from watching much o...,"['pennsylvania', 'prevent', 'we', 'from', 'wat..."
4,16,1325896369534607360,"Georgia will be a big presidential win, as it ...",False,False,637719,94570,2020-11-09,"Georgia will be a big presidential win, as it ...","Georgia will be a big presidential win, as it ...","['georgia', 'will', 'be', 'a', 'big', 'preside..."


In [59]:
# And gather information on the data
filtered_trump_tweets_keywords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 563 entries, 0 to 562
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  563 non-null    int64 
 1   id          563 non-null    int64 
 2   text        563 non-null    object
 3   is_retweet  563 non-null    bool  
 4   is_deleted  563 non-null    bool  
 5   favorites   563 non-null    int64 
 6   retweets    563 non-null    int64 
 7   date        563 non-null    object
 8   text_clean  563 non-null    object
 9   doc         563 non-null    object
 10  lemmas      563 non-null    object
dtypes: bool(2), int64(4), object(5)
memory usage: 40.8+ KB


In [60]:
# Drop the unneccesary unnamed column
filtered_trump_tweets_keywords.drop(columns=['Unnamed: 0'], inplace=True)


In [61]:
keywords_metaphors = [
    "abuse", "ascend", "attack", "battle", "betray", "bleed", "blockbuster", "chapter", "cesspool", "charge",
    "clean", "concede","country", "coup", "crooked", "crush", "cure", "death", "defend", "destroy", "destruction",
    "destruct", "drench", "dump", "fail", "fight", "float", "flow", "flood", "game", "garbage", "goldmine", "harvesting",
    "hill", "hide","hiding", "history", "home", "hu stle", "inundate", "kill", "killer", "landslide", "lose", "magic",
    "media", "mountain" "pack", "play","plague", "pour", "protect", "put", "puppet", "race", "rant", "rot", "route", "run", "save",
    "shatter", "shielding", "sick", "silent", "sleepy", "soar", "spearhead", "speed", "steal", "stuff", "suit", "surrender",
    "swindle", "tank", "thin", "toss", "turtle", "undermine", "usa", "victory", "voice", "war", "waste", "witch"
    ]

keyword_phrases = ['long way to go', 'machines are corrupt', 'dominion-izing', "world is watching", "america great", "spirits will soar", "give an election", "sit back"]

# is watching, magic, rotten, razor thin, waste time, sick joke, ascend, mountain, rock, receive
# remove; history, lose, harvest, stuff?

In [62]:
# Convert the lemmas from a string of list to an actual list
filtered_trump_tweets_keywords['lemmas'] = filtered_trump_tweets_keywords['lemmas'].apply(literal_eval)

In [63]:
#  Check if any metaphor keywords are in the lemmas
def contains_keywords_metaphors(lemmas):
    return any(keyword in lemmas for keyword in keywords_metaphors)

# Filter tweets that contain metaphor keywords directly
candidate_single_metaphor_trump_tweets = filtered_trump_tweets_keywords[filtered_trump_tweets_keywords['lemmas'].apply(contains_keywords_metaphors)]

# Gather information on the data
candidate_single_metaphor_trump_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Index: 237 entries, 1 to 559
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          237 non-null    int64 
 1   text        237 non-null    object
 2   is_retweet  237 non-null    bool  
 3   is_deleted  237 non-null    bool  
 4   favorites   237 non-null    int64 
 5   retweets    237 non-null    int64 
 6   date        237 non-null    object
 7   text_clean  237 non-null    object
 8   doc         237 non-null    object
 9   lemmas      237 non-null    object
dtypes: bool(2), int64(3), object(5)
memory usage: 17.1+ KB


In [64]:
# Now we want to add in the metaphor phrases which we found manually
# Create an empty DataFrame to store the filtered keyword phrases
candidate_metaphor_trump_tweets_phrases = pd.DataFrame()

# Iterate over each keyword phrase and filter the DataFrame
for phrase in keyword_phrases:
  candidate_metaphor_trump_tweets_phrases = pd.concat([candidate_metaphor_trump_tweets_phrases,filtered_trump_tweets_keywords[filtered_trump_tweets_keywords['text_clean'].str.contains(phrase, case=False, na=False)]])

In [65]:
# Gather some information on the data
candidate_metaphor_trump_tweets_phrases.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15 entries, 264 to 435
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          15 non-null     int64 
 1   text        15 non-null     object
 2   is_retweet  15 non-null     bool  
 3   is_deleted  15 non-null     bool  
 4   favorites   15 non-null     int64 
 5   retweets    15 non-null     int64 
 6   date        15 non-null     object
 7   text_clean  15 non-null     object
 8   doc         15 non-null     object
 9   lemmas      15 non-null     object
dtypes: bool(2), int64(3), object(5)
memory usage: 1.1+ KB


In [66]:
# Merge both datasets
all_candidate_metaphors_trump_tweets = pd.concat([candidate_metaphor_trump_tweets_phrases, candidate_single_metaphor_trump_tweets])

In [67]:
# Check if it worked by looking into the info
all_candidate_metaphors_trump_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Index: 252 entries, 264 to 559
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          252 non-null    int64 
 1   text        252 non-null    object
 2   is_retweet  252 non-null    bool  
 3   is_deleted  252 non-null    bool  
 4   favorites   252 non-null    int64 
 5   retweets    252 non-null    int64 
 6   date        252 non-null    object
 7   text_clean  252 non-null    object
 8   doc         252 non-null    object
 9   lemmas      252 non-null    object
dtypes: bool(2), int64(3), object(5)
memory usage: 18.2+ KB


In [68]:
# Save our data as CSV
all_candidate_metaphors_trump_tweets.to_csv('all_candidate_metaphors_trump_tweets.csv')

In [69]:
# We now want to create a dataset which includes the keywords and context around the keyword for easier manual reading

In [70]:
# Function to extract context of the metaphor keyword in a tweet
def extract_context(text, keyword, window=8):
    words = text.split()
    context_words = []
    for i, word in enumerate(words):
        if re.search(re.escape(keyword), word, re.IGNORECASE):
            start = max(0, i - window)
            end = min(len(words), i + window + 1)
            context = words[start:end]
            context_words.append(' '.join(context))
    return context_words

# Extract contexts for each metaphor keyword in the tweets
def extract_all_contexts_from_df(df, metaphor_keywords, window=5):
    contexts = []
    for idx, row in df.iterrows():
        for keyword in metaphor_keywords:
            if keyword in row['lemmas']:
                extracted_contexts = extract_context(row['text'], keyword, window)
                for context in extracted_contexts:
                    if context:
                        context_entry = {
                            'id': row['id'],
                            'date': row['date'],
                            'text': row['text'],
                            'keyword': keyword,
                            'context': context,
                            'is_retweet': row['is_retweet'],
                            'is_deleted': row['is_deleted'],
                            'favorites': row['favorites'],
                            'retweets': row['retweets'],
                            'lemmas': row['lemmas']
                        }
                        contexts.append(context_entry)
    return pd.DataFrame(contexts)

# Convert contexts to a DataFrame
context_single_candidate_metaphor_trump_tweets = extract_all_contexts_from_df(candidate_single_metaphor_trump_tweets, keywords_metaphors, window=8)

# Display a few examples for inspection
context_single_candidate_metaphor_trump_tweets.head()

Unnamed: 0,id,date,text,keyword,context,is_retweet,is_deleted,favorites,retweets,lemmas
0,1325889532840062976,2020-11-09,Nevada is turning out to be a cesspool of Fake...,cesspool,Nevada is turning out to be a cesspool of Fake...,False,False,363489,78378,"[nevada, be, turn, out, to, be, a, cesspool, o..."
1,1325895380983275524,2020-11-09,Pennsylvania prevented us from watching much o...,country,the Ballot count. Unthinkable and illegal in t...,False,False,479292,81458,"[pennsylvania, prevent, we, from, watch, much,..."
2,1346120645613150208,2021-01-04,The “Surrender Caucus” within the Republican P...,surrender,The “Surrender Caucus” within the Republican P...,False,False,235516,60242,"[the, "", surrender, caucus, "", within, the, re..."
3,1329054683441278977,2020-11-18,...AND I WON THE ELECTION. VOTER FRAUD ALL OVE...,country,WON THE ELECTION. VOTER FRAUD ALL OVER THE COU...,False,False,246973,54880,"[..., and, i, won, the, election, ., voter, fr..."
4,1328328547598000130,2020-11-16,Dominion is running our Election. Rigged! http...,run,Dominion is running our Election. Rigged! http...,False,False,163578,40680,"[dominion, be, run, our, election, ., rig, !]"


In [71]:
# Function to extract context of the metaphor keyword phrases in a tweet
def extract_context(text, keyword, window=15):
    context_words = []
    clean_text = text.lower()

    pattern = re.compile(keyword)
    matches = list(pattern.finditer(clean_text))

    for match in matches:
        start = max(0, match.start() - window)
        end = min(len(clean_text), match.end() + window)
        context = clean_text[start:end]
        context_words.append(''.join(context))
    return context_words

def extract_all_contexts_from_df(df, keyword_phrases, window=15):
    contexts = []
    for idx, row in df.iterrows():
        for phrase in keyword_phrases:
            if phrase.lower() in row['text_clean'].lower():
                extracted_contexts = extract_context(row['text_clean'], phrase, window)
                for context in extracted_contexts:
                    context_entry = {
                        'id': row['id'],
                        'date': row['date'],
                        'text': row['text'],
                        'keyword': phrase,
                        'context': context,
                        'is_retweet': row['is_retweet'],
                        'is_deleted': row['is_deleted'],
                        'favorites': row['favorites'],
                        'retweets': row['retweets'],
                        'lemmas': row['lemmas']  # Assuming 'lemmas' is a list of strings
                    }
                    contexts.append(context_entry)
    return pd.DataFrame(contexts)

# Convert contexts to a DataFrame
context_candidate_metaphor_phrases_trump_tweets = extract_all_contexts_from_df(candidate_metaphor_trump_tweets_phrases, keyword_phrases, window=15)

# Display a few examples for inspection
context_candidate_metaphor_phrases_trump_tweets.head()

Unnamed: 0,id,date,text,keyword,context,is_retweet,is_deleted,favorites,retweets,lemmas
0,1327979630477922304,2020-11-15,He only won in the eyes of the FAKE NEWS MEDIA...,long way to go,ing! we have a long way to go. this was a ri,False,False,572034,109164,"[he, only, win, in, the, eye, of, the, fake, n..."
1,1325442336957018112,2020-11-08,“We believe these people are thieves. The big ...,machines are corrupt,. the big city machines are corrupt. this was ...,False,False,516040,101484,"["", we, believe, these, people, be, thief, ., ..."
2,1330368206968643584,2020-11-22,“Dominion-izing the Vote”\nPart Two via @OANN ...,dominion-izing,“dominion-izing the vote” part,False,False,73209,21095,"["", dominion, -, ize, the, vote, "", part, two,..."
3,1330367988621594625,2020-11-22,“Dominion-izing the Vote”\nPart One via @OANN ...,dominion-izing,“dominion-izing the vote” part,False,False,136743,41797,"["", dominion, -, ize, the, vote, "", part, one,..."
4,1330368448552169474,2020-11-22,“Dominion-izing the Vote”\nPart Three via @OAN...,dominion-izing,“dominion-izing the vote” part,False,False,70478,20578,"["", dominion, -, ize, the, vote, "", part, thre..."


In [72]:
# Merge both datasets
all_context_candidate_metaphors = pd.concat([context_candidate_metaphor_phrases_trump_tweets, context_single_candidate_metaphor_trump_tweets])

In [73]:
all_context_candidate_metaphors.info()

<class 'pandas.core.frame.DataFrame'>
Index: 367 entries, 0 to 351
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          367 non-null    int64 
 1   date        367 non-null    object
 2   text        367 non-null    object
 3   keyword     367 non-null    object
 4   context     367 non-null    object
 5   is_retweet  367 non-null    bool  
 6   is_deleted  367 non-null    bool  
 7   favorites   367 non-null    int64 
 8   retweets    367 non-null    int64 
 9   lemmas      367 non-null    object
dtypes: bool(2), int64(3), object(5)
memory usage: 26.5+ KB


In [74]:
# Save our dataframe as CSV to check the contexts manually
all_context_candidate_metaphors.to_csv('all_context_candidate_metaphors.csv')

# **Filter to Identified Metaphors**

In [75]:
# After manually reviewing all the candidate metaphors, we will now only keep the tweets which contain a metaphorical construction of disinformation and conspiracy theories
# List of Tweet IDs to keep
tweet_ids_to_keep = [
    1323422929850372098, 1323424722911170560, 1323425208255086592, 1323430341512622080,
    1323864823680126977, 1324004491612618752, 1324007806694023169, 1324108206801563650,
    1324730186798161925, 1324752659996397575, 1325065540390559745, 1325442336957018112,
    1325889532840062976, 1325961672343875585, 1326186126709821440, 1326186297157963777,
    1326194143132082178, 1326342742801326083, 1326519025552265216, 1326525851752656898,
    1327436963658469376, 1327715767916392449, 1327811527123103746, 1327836002631766016,
    1327836670742450177, 1327840088731029504, 1327956491056279552, 1327958640309972992,
    1327979630477922304, 1328093294782509060, 1328100945570500608, 1328152462331699202,
    1328328547598000130, 1328361451497664512, 1329087255168708608, 1329233502139715586,
    1329424134166687744, 1329602736053252107, 1330248268656357376, 1330278689200857088,
    1330288302663069701, 1330368206968643584, 1330374020613758977, 1330487246236028935,
    1331086969183621120, 1331214247955738624, 1331219093563781122, 1331404288149643264,
    1332169753004224515, 1332337895584264197, 1332552283553476608, 1332778938121203720,
    1333215466022727686, 1333243285389914112, 1333405854297632770, 1333410418119864320,
    1333610260381175813, 1333856259662077954, 1334573529107460096, 1334711509977608192,
    1334949289160740866, 1335351629810286592, 1335351633459310593, 1335413770139549697,
    1336114633485266944, 1336404093028478976, 1336407510069161988, 1336730906107768842,
    1337040387349893121, 1337040883988959232, 1337041669368258565, 1337042201738612736,
    1337042714924380166, 1337385736530780161, 1337617458962817028, 1337629306919538694,
    1337745268591259648, 1338246163368046593, 1338347204440952832, 1338483200046354434,
    1338715842931023873, 1338851758819389441, 1338718081443622912, 1338871862315667456,
    1339198867275980802, 1339709639584337920, 1339748940380577792, 1339937091707351046,
    1340333619299147781, 1340376176695631872, 1341043284542713857, 1341138407460925440,
    1341138409092509696, 1342209260026023940, 1342245390540804096, 1342817496924086278,
    1342821189077622792, 1342826019745640450, 1342830505163706369, 1342974370822692867,
    1342974377218994181, 1343919652125241345, 1343998076084748288, 1344036020162093058,
    1344036020162093056, 1344160787384971264, 1344400646066331648, 1345498987617509378,
    1345503150745247746, 1345508977031974918, 1346120645613150208, 1346469204740902915,
    1346578706437963777, 1346580318745206785, 1346659909492998146, 1346809349214248962,
    1346818855298072576, 1346900434540240897, 1346954970910707712
    ]

# Filter the DataFrame to keep only rows with the specified IDs
all_metaphors_trump_tweets = all_candidate_metaphors_trump_tweets[all_candidate_metaphors_trump_tweets['id'].isin(tweet_ids_to_keep)]

In [76]:
# Check if only keeping the selected IDs was succesfull
all_metaphors_trump_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Index: 124 entries, 264 to 553
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          124 non-null    int64 
 1   text        124 non-null    object
 2   is_retweet  124 non-null    bool  
 3   is_deleted  124 non-null    bool  
 4   favorites   124 non-null    int64 
 5   retweets    124 non-null    int64 
 6   date        124 non-null    object
 7   text_clean  124 non-null    object
 8   doc         124 non-null    object
 9   lemmas      124 non-null    object
dtypes: bool(2), int64(3), object(5)
memory usage: 9.0+ KB


In [77]:
# Make sure duplicate IDs are dropped
all_metaphors_trump_tweets.drop_duplicates(subset='id', inplace=False)

Unnamed: 0,id,text,is_retweet,is_deleted,favorites,retweets,date,text_clean,doc,lemmas
264,1327979630477922304,He only won in the eyes of the FAKE NEWS MEDIA...,False,False,572034,109164,2020-11-15,He only won in the eyes of the FAKE NEWS MEDIA...,He only won in the eyes of the FAKE NEWS MEDIA...,"[he, only, win, in, the, eye, of, the, fake, n..."
507,1325442336957018112,“We believe these people are thieves. The big ...,False,False,516040,101484,2020-11-08,“We believe these people are thieves. The big ...,“We believe these people are thieves. The big ...,"["", we, believe, these, people, be, thief, ., ..."
142,1330368206968643584,“Dominion-izing the Vote”\nPart Two via @OANN ...,False,False,73209,21095,2020-11-22,“Dominion-izing the Vote” Part Two via,“Dominion-izing the Vote” Part Two via,"["", dominion, -, ize, the, vote, "", part, two,..."
109,1332552283553476608,"The 1,126,940 votes were created out of thin a...",False,False,264054,73949,2020-11-28,"The 1,126,940 votes were created out of thin a...","The 1,126,940 votes were created out of thin a...","[the, 1,126,940, vote, be, create, out, of, th..."
28,1346578706437963777,Washington is being inundated with people who ...,False,False,328137,82574,2021-01-05,Washington is being inundated with people who ...,Washington is being inundated with people who ...,"[washington, be, be, inundate, with, people, w..."
...,...,...,...,...,...,...,...,...,...,...
525,1344160787384971264,"....that, quite frankly, didn’t have much of a...",False,False,128681,25801,2020-12-30,"....that, quite frankly, didn’t have much of a...","....that, quite frankly, didn’t have much of a...","[...., that, ,, quite, frankly, ,, do, not, ha..."
535,1344400646066331648,We now have far more votes than needed to flip...,False,False,348110,85675,2020-12-30,We now have far more votes than needed to flip...,We now have far more votes than needed to flip...,"[we, now, have, far, more, vote, than, need, t..."
550,1345498987617509378,Civil War: Tucker Carlson Hits His Own Network...,False,False,92281,24596,2021-01-02,Civil War: Tucker Carlson Hits His Own Network...,Civil War: Tucker Carlson Hits His Own Network...,"[civil, war, :, tucker, carlson, hit, his, own..."
552,1345503150745247746,"...And after they see the facts, plenty more t...",False,False,147990,35976,2021-01-02,"...And after they see the facts, plenty more t...","...And after they see the facts, plenty more t...","[..., and, after, they, see, the, fact, ,, ple..."


In [78]:
# Check if dropping duplicates was succesfull
all_metaphors_trump_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Index: 124 entries, 264 to 553
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          124 non-null    int64 
 1   text        124 non-null    object
 2   is_retweet  124 non-null    bool  
 3   is_deleted  124 non-null    bool  
 4   favorites   124 non-null    int64 
 5   retweets    124 non-null    int64 
 6   date        124 non-null    object
 7   text_clean  124 non-null    object
 8   doc         124 non-null    object
 9   lemmas      124 non-null    object
dtypes: bool(2), int64(3), object(5)
memory usage: 9.0+ KB


In [79]:
# Inspect the data
all_metaphors_trump_tweets.head()

Unnamed: 0,id,text,is_retweet,is_deleted,favorites,retweets,date,text_clean,doc,lemmas
264,1327979630477922304,He only won in the eyes of the FAKE NEWS MEDIA...,False,False,572034,109164,2020-11-15,He only won in the eyes of the FAKE NEWS MEDIA...,He only won in the eyes of the FAKE NEWS MEDIA...,"[he, only, win, in, the, eye, of, the, fake, n..."
507,1325442336957018112,“We believe these people are thieves. The big ...,False,False,516040,101484,2020-11-08,“We believe these people are thieves. The big ...,“We believe these people are thieves. The big ...,"["", we, believe, these, people, be, thief, ., ..."
142,1330368206968643584,“Dominion-izing the Vote”\nPart Two via @OANN ...,False,False,73209,21095,2020-11-22,“Dominion-izing the Vote” Part Two via,“Dominion-izing the Vote” Part Two via,"["", dominion, -, ize, the, vote, "", part, two,..."
109,1332552283553476608,"The 1,126,940 votes were created out of thin a...",False,False,264054,73949,2020-11-28,"The 1,126,940 votes were created out of thin a...","The 1,126,940 votes were created out of thin a...","[the, 1,126,940, vote, be, create, out, of, th..."
28,1346578706437963777,Washington is being inundated with people who ...,False,False,328137,82574,2021-01-05,Washington is being inundated with people who ...,Washington is being inundated with people who ...,"[washington, be, be, inundate, with, people, w..."


In [80]:
# The rows were dropped succesfully, so we metaphor_trump_tweets now save the dataset for further analysis
all_metaphors_trump_tweets.to_csv('all_metaphors_trump_tweets.csv')