# Data Collection

## Set Up

In [None]:
import os
# Bearer token for the API, read back later via os.environ['TOKEN'].
# It is blank here on purpose: fill it in at runtime and never commit a real token.
os.environ['TOKEN'] = ""
# Mount Google Drive at /content/drive (Colab only).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
path = "/content/"
error_log_path = "/content/"

In [None]:
import requests 
import pandas as pd 
import time

## Auth

In [None]:
def create_headers(bearer_token):
    """Return the HTTP headers dict carrying `bearer_token` as a Bearer Authorization value."""
    return {"Authorization": f"Bearer {bearer_token}"}
headers = create_headers(os.environ['TOKEN'])

## Search Query

In [None]:
def create_url(query, start_time, end_time, max_results, expansions, tweet_fields, user_fields, place_fields, endpoint):
    """Pair the endpoint URL with the query parameters for one search request.

    Returns:
        A 2-tuple ``(search_url, query_params)`` where ``search_url`` is
        ``endpoint`` unchanged and ``query_params`` is a new dict holding the
        remaining arguments under the parameter names sent to the API.
    """
    # Adjust the parameter set if you point this at a different endpoint.
    query_params = dict(
        query=query,
        end_time=end_time,
        start_time=start_time,
        max_results=max_results,
        expansions=expansions,
    )
    # Dotted parameter names cannot be spelled as keyword arguments.
    query_params['tweet.fields'] = tweet_fields
    query_params['user.fields'] = user_fields
    query_params['place.fields'] = place_fields

    return (endpoint, query_params)

In [None]:
def connect_to_endpoint(url, headers, params, next_token = None, timeout = 30):
    """Send one GET request and return the decoded JSON payload.

    Args:
        url: Endpoint to call.
        headers: HTTP headers (including authorization) sent with the request.
        params: Query parameters. The mapping is copied, never modified.
        next_token: Pagination token. It is only sent when it is neither
            None nor the empty string.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the notebook indefinitely.

    Returns:
        The response body parsed with ``response.json()``.

    Raises:
        Exception: with ``(status_code, response_text)`` as arguments when the
            server answers with anything other than HTTP 200.
    """
    # Copy first so adding the pagination token does not leak into the caller's dict.
    request_params = dict(params) if params else {}
    if next_token is not None and next_token != '':
        request_params['next_token'] = next_token
    response = requests.request("GET", url, headers = headers, params = request_params, timeout = timeout)
    if response.status_code != 200:
        # Surface the status and body so the caller can log what went wrong.
        raise Exception(response.status_code, response.text)
    return response.json()

In [None]:
def get_data(query, start_time, end_time, max_results, expansions, tweet_fields, user_fields, place_fields, endpoint, next_token=""):
  """Download every page of results for one query and time window.

  Pages are requested until the response no longer carries a ``next_token``.
  The module-level ``headers`` are used for authentication.

  Args:
      query, start_time, end_time, max_results, expansions, tweet_fields,
      user_fields, place_fields, endpoint: forwarded to ``create_url``.
      next_token: Token to resume from; the default empty string starts at
          the first page.

  Returns:
      A tuple ``(results, next_token)``. ``results`` is the list of items
      collected from the ``data`` field of each page. ``next_token`` is None
      after a complete download, or the token of the page that failed.

  On any exception the error is printed, a one-column CSV describing it is
  written under the module-level ``error_log_path``, and the partial results
  are returned instead of raising.
  """
  results = []

  while next_token is not None:
    try:
      url = create_url(query, start_time, end_time, max_results, expansions, tweet_fields, user_fields, place_fields, endpoint)
      json_response = connect_to_endpoint(url[0], headers, url[1], next_token)
      # Results, when present, live in the 'data' field of the response.
      if "data" in json_response:
        results.extend(json_response["data"])
        print(str(len(json_response["data"])) + " Tweets downloaded in this batch.")
      # The token for the following page is in 'meta'; its absence ends the loop.
      next_token = json_response.get("meta", {}).get("next_token")

      # Throttle to respect the rate limit, but only when another request follows.
      if next_token is not None:
        time.sleep(3)

    except Exception as e:
      print("Error occured", e)
      print("Next token value", next_token)
      error_log = {"Error":e, "Next token":next_token, "Day":start_time,
                   "Downloaded":len(results)}
      # Formatting (rather than '+') tolerates a None token, and replacing '/'
      # keeps a query containing a path separator from breaking the file name.
      log_name = "{}_{}_{}.csv".format(query, start_time, next_token).replace("/", "_")
      pd.DataFrame.from_dict(error_log, orient="index").to_csv(error_log_path+log_name)
      return(results, next_token)

  print("Done")

  return (results, next_token)

## Download And Save

In [None]:
# Boundaries of the search window handed to get_data; here they are 20 seconds apart.
start_time = "2022-11-25T13:00:00.000Z"
end_time = "2022-11-25T13:00:20.000Z"
# Search expression sent as the 'query' parameter.
query_text = "#Qatar2022"
endpoint = "https://api.twitter.com/2/tweets/search/recent/"
# Same value as the `path` assigned in the Set Up section.
path = "/content/"
# Forwarded to the API as 'max_results'.
max_results = 100
# NOTE(review): no_days is not referenced anywhere else in the visible notebook — confirm it is still needed.
no_days = 15

In [None]:
# Field lists requested from the API, gathered in one place so the call stays short.
request_fields = dict(
    expansions='author_id,in_reply_to_user_id,geo.place_id',
    tweet_fields='id,text,author_id,in_reply_to_user_id,geo,conversation_id,created_at,lang,public_metrics,referenced_tweets,reply_settings,source,entities',
    user_fields='id,name,username,created_at,description,public_metrics,verified',
    place_fields='full_name,id,country,country_code,geo,name,place_type',
)
# get_data returns (results, next_token); only the results are kept in `tweets`.
tweets, _last_token = get_data(query_text, start_time=start_time, end_time=end_time,
                               max_results=max_results, endpoint=endpoint, **request_fields)
tweets_df = pd.DataFrame(tweets)
# NOTE(review): this saves "_tweets.pkl", while the "Working with Data" section loads
# "tweets.pkl" — confirm which file name is intended.
tweets_df.to_pickle(path+"_tweets.pkl")

# Working with Data

In [None]:
# NOTE(review): the download cell saves path+"_tweets.pkl" but this loads path+"tweets.pkl" —
# confirm which file is intended. Unpickling can execute code, so only load files you created.
tweets_df = pd.read_pickle(path+"tweets.pkl")

In [None]:
tweets_df

Unnamed: 0,created_at,edit_history_tweet_ids,reply_settings,public_metrics,text,lang,id,author_id,conversation_id,entities,referenced_tweets,geo,withheld
0,2022-12-21T23:59:57.000Z,[1605714721721577472],everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",nothing like sexting a hot girl while in the c...,en,1605714721721577472,170108614,1605714721721577472,,,,
1,2022-12-21T23:59:55.000Z,[1605714715631439872],everyone,"{'retweet_count': 101, 'reply_count': 18, 'lik...",it is honestly extremely funny that anyone has...,en,1605714715631439872,165944767,1605714715631439872,"{'urls': [{'start': 122, 'end': 145, 'url': 'h...",,,
2,2022-12-21T23:59:52.000Z,[1605714703820390404],everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",#亚博体育 #百家乐 https://t.co/lGLCl4yWWN\nHave agre...,en,1605714703820390404,4817660103,1605714703820390404,"{'urls': [{'start': 12, 'end': 35, 'url': 'htt...",,,
3,2022-12-21T23:59:52.000Z,[1605714702583189505],everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",Rich argue question large anything Republican ...,en,1605714702583189505,1598002435674472452,1605714702583189505,"{'urls': [{'start': 83, 'end': 106, 'url': 'ht...","[{'type': 'quoted', 'id': '1605714694051835904'}]",,
4,2022-12-21T23:59:52.000Z,[1605714701228425216],everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",Debt-ridden Stacey Abrams panned by Democrats ...,en,1605714701228425216,1192607550585344000,1605714701228425216,"{'urls': [{'start': 86, 'end': 109, 'url': 'ht...",,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
25050,2022-12-21T15:50:28.000Z,[1605591539643621376],everyone,"{'retweet_count': 1, 'reply_count': 0, 'like_c...","You mean to tell me, if some republicans senat...",en,1605591539643621376,1585951248628359168,1605591539643621376,,,,
25051,2022-12-21T15:50:26.000Z,[1605591533209587717],everyone,"{'retweet_count': 100, 'reply_count': 20, 'lik...",Mitch McConnell attacks Marjorie Taylor Green ...,en,1605591533209587717,3751750334,1605591533209587717,"{'annotations': [{'start': 0, 'end': 14, 'prob...",,,
25052,2022-12-21T15:50:26.000Z,[1605591532819472384],everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",#反差婊 #校服 #裸聊 #调教 #女S.https://t.co/2l3oIna6Ha\n...,en,1605591532819472384,1566261995946807296,1605591532819472384,"{'hashtags': [{'start': 0, 'end': 4, 'tag': '反...",,,
25053,2022-12-21T15:50:26.000Z,[1605591531213422592],everyone,"{'retweet_count': 6, 'reply_count': 3, 'like_c...",SS and Medicare \nis another scare\nrepublican...,en,1605591531213422592,1226578309707776000,1605591531213422592,"{'annotations': [{'start': 0, 'end': 1, 'proba...",,,


## Preprocessing

In [None]:
# Columns kept for the analysis; everything else in tweets_df is dropped.
column_list = ["id","author_id","created_at", "text","entities","public_metrics", "lang"]
# Work on an independent copy so tweets_df stays available unchanged.
tweets_filtered = tweets_df[column_list].copy()

In [None]:
tweets_filtered

Unnamed: 0,id,author_id,created_at,text,entities,public_metrics,lang
0,1605714721721577472,170108614,2022-12-21T23:59:57.000Z,nothing like sexting a hot girl while in the c...,,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",en
1,1605714715631439872,165944767,2022-12-21T23:59:55.000Z,it is honestly extremely funny that anyone has...,"{'urls': [{'start': 122, 'end': 145, 'url': 'h...","{'retweet_count': 101, 'reply_count': 18, 'lik...",en
2,1605714703820390404,4817660103,2022-12-21T23:59:52.000Z,#亚博体育 #百家乐 https://t.co/lGLCl4yWWN\nHave agre...,"{'urls': [{'start': 12, 'end': 35, 'url': 'htt...","{'retweet_count': 0, 'reply_count': 0, 'like_c...",en
3,1605714702583189505,1598002435674472452,2022-12-21T23:59:52.000Z,Rich argue question large anything Republican ...,"{'urls': [{'start': 83, 'end': 106, 'url': 'ht...","{'retweet_count': 0, 'reply_count': 0, 'like_c...",en
4,1605714701228425216,1192607550585344000,2022-12-21T23:59:52.000Z,Debt-ridden Stacey Abrams panned by Democrats ...,"{'urls': [{'start': 86, 'end': 109, 'url': 'ht...","{'retweet_count': 0, 'reply_count': 0, 'like_c...",en
...,...,...,...,...,...,...,...
25050,1605591539643621376,1585951248628359168,2022-12-21T15:50:28.000Z,"You mean to tell me, if some republicans senat...",,"{'retweet_count': 1, 'reply_count': 0, 'like_c...",en
25051,1605591533209587717,3751750334,2022-12-21T15:50:26.000Z,Mitch McConnell attacks Marjorie Taylor Green ...,"{'annotations': [{'start': 0, 'end': 14, 'prob...","{'retweet_count': 100, 'reply_count': 20, 'lik...",en
25052,1605591532819472384,1566261995946807296,2022-12-21T15:50:26.000Z,#反差婊 #校服 #裸聊 #调教 #女S.https://t.co/2l3oIna6Ha\n...,"{'hashtags': [{'start': 0, 'end': 4, 'tag': '反...","{'retweet_count': 0, 'reply_count': 0, 'like_c...",en
25053,1605591531213422592,1226578309707776000,2022-12-21T15:50:26.000Z,SS and Medicare \nis another scare\nrepublican...,"{'annotations': [{'start': 0, 'end': 1, 'proba...","{'retweet_count': 6, 'reply_count': 3, 'like_c...",en


## Sentiment Analysis Addition

In [None]:
import nltk
import re
import string
# Fetch the lexicon required by the VADER analyzer imported below.
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Single analyzer instance reused for every tweet.
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
def cleaner_sentiment(tweet):
    """Strip mentions, hashtags and links from a tweet before sentiment scoring.

    Case, punctuation and spacing are otherwise left as they are, so gaps
    remain where the removed items stood.

    Args:
        tweet: Raw tweet text.

    Returns:
        The text with ``@mentions``, ``#hashtags`` and URL-like tokens removed.
    """
    # Raw strings keep \w and \S as regex escapes rather than (invalid) string escapes.
    tweet = re.sub(r"@\w+", "", tweet)  # remove mentions
    tweet = re.sub(r"#\w+", "", tweet)  # remove hashtags
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet)  # remove http links
    return tweet

In [None]:
# Score each tweet with the analyzer's 'compound' value, computed on the text after
# cleaner_sentiment has removed mentions, hashtags and links.
tweets_filtered['sentiment'] = tweets_filtered['text'].apply(lambda testo: sid.polarity_scores(cleaner_sentiment(testo))['compound'])

## Extract Words/Hashtags

In [None]:
# NLTK tools
nltk.download('stopwords')
# English stop-word list; cleaner() drops tokens whose lowercase form appears in it.
stop_words = nltk.corpus.stopwords.words("english")
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
from collections import defaultdict
# Maps the first character of a POS tag to a WordNet part of speech for the lemmatizer;
# any other first character falls back to noun.
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
# Tokenizer that keeps runs of word characters and discards everything between them.
tokenizer = nltk.RegexpTokenizer(r'\w+')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
def cleaner(tweet):
    """Normalise a tweet into a space-separated string of lowercase lemmas.

    Steps, in order: remove mentions, hashtags and links; collapse whitespace;
    tokenize with the module-level ``tokenizer``; drop stop words and
    one-character tokens; lemmatize each remaining token using the part of
    speech looked up in ``tag_map``; lowercase the result.

    Args:
        tweet: Raw tweet text.

    Returns:
        The cleaned text; an empty string when nothing survives the filters.
    """
    # Raw strings keep \w and \S as regex escapes rather than (invalid) string escapes.
    tweet = re.sub(r"@\w+", "", tweet)  # remove mentions
    tweet = re.sub(r"#\w+", "", tweet)  # remove hashtags
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet)  # remove http links
    tweet = " ".join(tweet.split())  # collapse runs of whitespace
    # Remove stop words and single-character tokens.
    kept_tokens = [w for w in tokenizer.tokenize(tweet) if w.lower() not in stop_words and len(w) > 1]
    tweet = " ".join(kept_tokens)
    lemma_function = WordNetLemmatizer()
    # The first letter of each POS tag selects the WordNet part of speech for lemmatizing.
    tagged_tokens = nltk.pos_tag(nltk.wordpunct_tokenize(tweet))
    tweet = " ".join(lemma_function.lemmatize(token, tag_map[tag[0]]) for token, tag in tagged_tokens)
    return tweet.lower()

In [None]:
tweets_filtered["clean_text"] = tweets_filtered["text"].map(cleaner)

In [None]:
tweets_filtered

Unnamed: 0,id,author_id,created_at,text,entities,public_metrics,lang,sentiment,clean_text
0,1605714721721577472,170108614,2022-12-21T23:59:57.000Z,nothing like sexting a hot girl while in the c...,,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",en,-0.2047,nothing like sexting hot girl chick fil drive ...
1,1605714715631439872,165944767,2022-12-21T23:59:55.000Z,it is honestly extremely funny that anyone has...,"{'urls': [{'start': 122, 'end': 145, 'url': 'h...","{'retweet_count': 101, 'reply_count': 18, 'lik...",en,-0.4228,honestly extremely funny anyone say keep repub...
2,1605714703820390404,4817660103,2022-12-21T23:59:52.000Z,#亚博体育 #百家乐 https://t.co/lGLCl4yWWN\nHave agre...,"{'urls': [{'start': 12, 'end': 35, 'url': 'htt...","{'retweet_count': 0, 'reply_count': 0, 'like_c...",en,0.4939,agreement thing republican become
3,1605714702583189505,1598002435674472452,2022-12-21T23:59:52.000Z,Rich argue question large anything Republican ...,"{'urls': [{'start': 83, 'end': 106, 'url': 'ht...","{'retweet_count': 0, 'reply_count': 0, 'like_c...",en,0.2960,rich argue question large anything republican ...
4,1605714701228425216,1192607550585344000,2022-12-21T23:59:52.000Z,Debt-ridden Stacey Abrams panned by Democrats ...,"{'urls': [{'start': 86, 'end': 109, 'url': 'ht...","{'retweet_count': 0, 'reply_count': 0, 'like_c...",en,-0.5849,debt ridden stacey abrams pan democrats incred...
...,...,...,...,...,...,...,...,...,...
25050,1605591539643621376,1585951248628359168,2022-12-21T15:50:28.000Z,"You mean to tell me, if some republicans senat...",,"{'retweet_count': 1, 'reply_count': 0, 'like_c...",en,0.7633,mean tell republicans senator start read omnib...
25051,1605591533209587717,3751750334,2022-12-21T15:50:26.000Z,Mitch McConnell attacks Marjorie Taylor Green ...,"{'annotations': [{'start': 0, 'end': 14, 'prob...","{'retweet_count': 100, 'reply_count': 20, 'lik...",en,-0.9422,mitch mcconnell attack marjorie taylor green a...
25052,1605591532819472384,1566261995946807296,2022-12-21T15:50:26.000Z,#反差婊 #校服 #裸聊 #调教 #女S.https://t.co/2l3oIna6Ha\n...,"{'hashtags': [{'start': 0, 'end': 4, 'tag': '反...","{'retweet_count': 0, 'reply_count': 0, 'like_c...",en,0.0000,democrat short watch major need answer situation
25053,1605591531213422592,1226578309707776000,2022-12-21T15:50:26.000Z,SS and Medicare \nis another scare\nrepublican...,"{'annotations': [{'start': 0, 'end': 1, 'proba...","{'retweet_count': 6, 'reply_count': 3, 'like_c...",en,-0.7003,ss medicare another scare republicans threaten...


In [None]:
# Replace missing clean_text values with "" so the later `!= ""` checks and string splits work.
tweets_filtered.loc[tweets_filtered["clean_text"].isnull(),"clean_text"] = ""

In [None]:
# Per-word tallies over all tweets:
#   unique_words[word]   -> number of occurrences
#   unique_weights[word] -> sum of the sentiment of the tweets the word occurs in
unique_words = {}
unique_weights = {}

for clean_text, sentiment in zip(tweets_filtered["clean_text"], tweets_filtered["sentiment"]):
  if clean_text == "":
    continue
  for token in tokenizer.tokenize(clean_text):
    unique_words[token] = unique_words.get(token, 0) + 1
    unique_weights[token] = unique_weights.get(token, float(0)) + float(sentiment)

In [None]:
# One row per word with its occurrence count.
uw_df = (
    pd.DataFrame.from_dict(unique_words, orient='index')
    .reset_index()
    .rename(columns={'index': 'Word', 0: 'Count'})
)
# Mean sentiment of the tweets each word appears in (summed sentiment / occurrences).
uw_df['Sentiment'] = uw_df['Word'].apply(unique_weights.__getitem__) / uw_df['Count']
# Most frequent words first, renumbered from 0.
uw_df = uw_df.sort_values(by=['Count'], ascending=False).reset_index(drop=True)

In [None]:
uw_df

Unnamed: 0,Word,Count,Sentiment
0,republican,7929,-0.001767
1,republicans,6779,-0.085274
2,democrat,5659,-0.008159
3,democrats,5454,-0.109007
4,trump,2830,-0.109518
...,...,...,...
18139,ushering,1,-0.792500
18140,persecuting,1,-0.792500
18141,colon,1,-0.863800
18142,magician,1,0.250000


### Extract Hashtags

In [None]:
# Store None wherever 'entities' is missing. The mask is computed on tweets_df, whose
# rows carry the same index labels as tweets_filtered.
tweets_filtered.loc[tweets_df["entities"].isnull(), "entities"] = None
# Placeholder; rows that do have hashtags receive a list of tags in the next cell.
tweets_filtered["hashtags"] = ""

In [None]:
# Per-hashtag tallies over all tweets:
#   unique_hashtags[tag] -> number of occurrences
#   unique_hweights[tag] -> sum of the sentiment of the tweets carrying the tag
# Each tweet's own list of lowercase '#tag' strings is stored in its 'hashtags' cell.
unique_hashtags = {}
unique_hweights = {}

for idx, row in tweets_filtered.iterrows():
  entities = row["entities"]
  # A missing value may surface as NaN (a float) instead of None, and `in` on a
  # float raises TypeError, so only dict payloads are inspected.
  if not isinstance(entities, dict) or "hashtags" not in entities:
    continue
  hl = []
  for hashtag in entities["hashtags"]:
    tag = '#' + hashtag["tag"].lower()
    unique_hashtags[tag] = unique_hashtags.get(tag, 0) + 1
    hl.append(tag)
    unique_hweights[tag] = unique_hweights.get(tag, float(0)) + float(row["sentiment"])

  tweets_filtered.at[idx,"hashtags"] = hl

In [None]:
# Rebuild the dict with the most frequent hashtags first (dicts keep insertion order).
unique_hashtags = dict(sorted(unique_hashtags.items(), key=lambda item: item[1], reverse=True))

In [None]:
# One row per hashtag with its occurrence count, in the order of unique_hashtags.
uh_df = (
    pd.DataFrame.from_dict(unique_hashtags, orient='index')
    .reset_index()
    .rename(columns={'index': 'Hashtag', 0: 'Count'})
)
# Mean sentiment of the tweets each hashtag appears in (summed sentiment / occurrences).
uh_df['Sentiment'] = uh_df['Hashtag'].apply(unique_hweights.__getitem__) / uh_df['Count']

In [None]:
uh_df

Unnamed: 0,Hashtag,Count,Sentiment
0,#百家乐,1284,0.094329
1,#世界杯,394,0.098389
2,#democrats,363,-0.115708
3,#republicans,282,-0.072347
4,#nft,220,0.100510
...,...,...,...
2992,#criminal,1,0.000000
2993,#demonic,1,-0.677600
2994,#infanticide,1,-0.677600
2995,#governmentwaste,1,-0.750600


## Build Network

In [None]:
import itertools
import networkx as nx

In [None]:
# Key views over the tally dicts; `uw` is used for membership tests when the networks are built.
uh = unique_hashtags.keys()
uw = unique_words.keys()

In [None]:
def _add_cooccurrences(counts, items):
    """Add one count per unordered pair of distinct values in `items` to `counts`.

    The graph is undirected, so (a, b) and (b, a) are the same edge: a pair is
    stored under whichever orientation was seen first, and self-pairs are skipped.
    Visiting each position pair once with combinations() yields the same counts,
    keys and insertion order as walking the full Cartesian product and discarding
    the mirrored half.
    """
    for first, second in itertools.combinations(items, 2):
        if first == second:
            continue
        mirrored = (second, first)
        key = mirrored if mirrored in counts else (first, second)
        counts[key] = counts.get(key, 0) + 1


# Edge weights keyed by (node, node) tuples:
#   network  -> hashtags and words together
#   hnetwork -> hashtags only
#   wnetwork -> words only
network = {}
hnetwork = {}
wnetwork = {}
for index, row in tweets_filtered.iterrows():
    # 'hashtags' is either a list of tags or the "" placeholder, which yields no items.
    hashtag_list = list(row["hashtags"])
    word_list = [word for word in str.split(row["clean_text"], " ") if word in uw]
    _add_cooccurrences(network, hashtag_list + word_list)
    _add_cooccurrences(hnetwork, hashtag_list)
    _add_cooccurrences(wnetwork, word_list)

network_df = pd.DataFrame.from_dict(network, orient="index")
hnetwork_df = pd.DataFrame.from_dict(hnetwork, orient="index")
wnetwork_df = pd.DataFrame.from_dict(wnetwork, orient="index")

In [None]:
# Move the pair tuples out of the index into a column and list the heaviest edges first.
network_df = network_df.reset_index()
network_df.columns = ["pair","weight"]
network_df = network_df.sort_values(by="weight", ascending=False)
network_df

Unnamed: 0,pair,weight
6633,"(republican, party)",1452
642,"(trump, tax)",1358
1008,"(democrats, republicans)",1051
4089,"(tax, return)",970
4078,"(trump, democrats)",892
...,...,...
481585,"(michael, woke)",1
481586,"(michael, hell)",1
481587,"(michael, bill)",1
481588,"(michael, includes)",1


In [None]:
# Move the hashtag pairs out of the index into a column and list the heaviest edges first.
hnetwork_df = hnetwork_df.reset_index()
hnetwork_df.columns = ["pair","weight"]
hnetwork_df = hnetwork_df.sort_values(by="weight", ascending=False)
hnetwork_df

Unnamed: 0,pair,weight
134,"(#足球, #世界杯)",85
30,"(#democrats, #republicans)",81
205,"(#开云体育, #世界杯)",73
108,"(#原味, #女s)",60
112,"(#女s, #恋足)",57
...,...,...
5658,"(#gdpr, #trump)",1
5659,"(#gdpr, #facts)",1
5660,"(#gdp, #income)",1
5661,"(#gdp, #stockmarket)",1


In [None]:
# Move the word pairs out of the index into a column and list the heaviest edges first.
wnetwork_df = wnetwork_df.reset_index()
wnetwork_df.columns = ["pair","weight"]
wnetwork_df = wnetwork_df.sort_values(by="weight", ascending=False)
wnetwork_df

Unnamed: 0,pair,weight
6312,"(republican, party)",1452
617,"(trump, tax)",1358
983,"(democrats, republicans)",1051
3853,"(tax, return)",970
3842,"(trump, democrats)",892
...,...,...
137810,"(mcconnell, failed)",1
508397,"(dc, ffs)",1
508398,"(dc, robbery)",1
508399,"(dc, plain)",1


In [None]:
# A weighted graph is built from 3-element tuples (u, v, w): u and v are the nodes, w the weight.
# To drop weak edges, append a condition to a comprehension, e.g. `if weight > 1`.
up_weighted = [(source, target, weight) for (source, target), weight in network.items()]
hup_weighted = [(source, target, weight) for (source, target), weight in hnetwork.items()]
wup_weighted = [(source, target, weight) for (source, target), weight in wnetwork.items()]

# G: hashtags and words together; hG: hashtags only; wG: words only.
G = nx.Graph()
G.add_weighted_edges_from(up_weighted)
hG = nx.Graph()
hG.add_weighted_edges_from(hup_weighted)
wG = nx.Graph()
wG.add_weighted_edges_from(wup_weighted)

In [None]:
# Node count then edge count for the combined, hashtag-only and word-only graphs, in that order.
for graph in (G, hG, wG):
    print(len(graph.nodes()))
    print(len(graph.edges()))

21137
1093113
2607
14052
18141
986426


In [None]:
import pickle

# nx.write_gpickle is no longer available in NetworkX 3.x; pickling the graph object
# directly with the highest protocol produces the equivalent file.
for graph, pickle_name in ((G, "network.pkl"), (hG, "hnetwork.pkl"), (wG, "wnetwork.pkl")):
    with open(path+pickle_name, "wb") as pickle_file:
        pickle.dump(graph, pickle_file, pickle.HIGHEST_PROTOCOL)

In [None]:
filename = os.path.join(path, "edgelist.csv")
# Write the header and the edges through one binary handle. This replaces the `sed`
# call, which patched ./edgelist.csv in the working directory rather than `filename`,
# and no longer leaves a .bak copy behind.
with open(filename, "wb") as edge_file:
    edge_file.write(b"Source,Target,Weight\n")
    nx.write_weighted_edgelist(G, edge_file, delimiter=",")

In [None]:
filename = os.path.join(path, "hedgelist.csv")
# Write the header and the edges through one binary handle. This replaces the `sed`
# call, which patched ./hedgelist.csv in the working directory rather than `filename`,
# and no longer leaves a .bak copy behind.
with open(filename, "wb") as edge_file:
    edge_file.write(b"Source,Target,Weight\n")
    nx.write_weighted_edgelist(hG, edge_file, delimiter=",")

In [None]:
filename = os.path.join(path, "wedgelist.csv")
# Write the header and the edges through one binary handle. This replaces the `sed`
# call, which patched ./wedgelist.csv in the working directory rather than `filename`,
# and no longer leaves a .bak copy behind.
with open(filename, "wb") as edge_file:
    edge_file.write(b"Source,Target,Weight\n")
    nx.write_weighted_edgelist(wG, edge_file, delimiter=",")

## Political Affiliation Addition

In [None]:
# Every word and hashtag starts neutral (0).
uw_df['Political'] = float(0)
uh_df['Political'] = float(0)
# Seed the party hashtags: -1 for the democrat tags, +1 for the republican tags.
# .loc assigns into uh_df itself; the chained form df[col][mask] = value may write to
# a temporary copy and triggers SettingWithCopyWarning.
uh_df.loc[uh_df['Hashtag'].isin(['#democrats', '#democrat']), 'Political'] = -1
uh_df.loc[uh_df['Hashtag'].isin(['#republicans', '#republican']), 'Political'] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uw_df['Political'][uw_df['Word']=='republican'] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uw_df['Political'][uw_df['Word']=='republicans'] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uw_df['Political'][uw_df['Word']=='democrat'] = -1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uw_df['Polit

In [None]:
# Node table for words: 'Id' and 'Label' both hold the word itself.
word_nodes = (
    uw_df
    .rename(columns={"Word": "Id"})
    .assign(Label=lambda frame: frame["Id"])
    .loc[:, ['Id','Label','Count','Sentiment','Political']]
)

word_nodes

Unnamed: 0,Id,Label,Count,Sentiment,Political
0,republican,republican,7929,-0.001767,1.0
1,republicans,republicans,6779,-0.085274,1.0
2,democrat,democrat,5659,-0.008159,-1.0
3,democrats,democrats,5454,-0.109007,-1.0
4,trump,trump,2830,-0.109518,0.0
...,...,...,...,...,...
18139,ushering,ushering,1,-0.792500,0.0
18140,persecuting,persecuting,1,-0.792500,0.0
18141,colon,colon,1,-0.863800,0.0
18142,magician,magician,1,0.250000,0.0


In [None]:
# Node table for hashtags: 'Id' and 'Label' both hold the hashtag itself.
hashtag_nodes = (
    uh_df
    .rename(columns={"Hashtag": "Id"})
    .assign(Label=lambda frame: frame["Id"])
    .loc[:, ['Id','Label','Count','Sentiment','Political']]
)

hashtag_nodes

Unnamed: 0,Id,Label,Count,Sentiment,Political
0,#百家乐,#百家乐,1284,0.094329,0.0
1,#世界杯,#世界杯,394,0.098389,0.0
2,#democrats,#democrats,363,-0.115708,-1.0
3,#republicans,#republicans,282,-0.072347,1.0
4,#nft,#nft,220,0.100510,0.0
...,...,...,...,...,...
2992,#criminal,#criminal,1,0.000000,0.0
2993,#demonic,#demonic,1,-0.677600,0.0
2994,#infanticide,#infanticide,1,-0.677600,0.0
2995,#governmentwaste,#governmentwaste,1,-0.750600,0.0


In [None]:
# Row label of each hashtag, looked up once instead of scanning the whole frame per node.
# setdefault keeps the first row for an Id, matching the previous `[0]` selection.
row_label_by_id = {}
for row_label, node_id in zip(hashtag_nodes.index, hashtag_nodes['Id']):
  row_label_by_id.setdefault(node_id, row_label)

# Row labels of hashtag_nodes arranged in the order the nodes appear in hG.
hindex_order = [row_label_by_id[node] for node in hG.nodes()]

In [None]:
# Reorder the rows to follow hindex_order, which was built by walking hG.nodes().
# Hashtags that are not nodes of hG are not in hindex_order and therefore drop out here.
hashtag_nodes = hashtag_nodes.reindex(hindex_order)

In [None]:
hashtag_nodes

Unnamed: 0,Id,Label,Count,Sentiment,Political
77,#亚博体育,#亚博体育,30,0.215300,0.0
0,#百家乐,#百家乐,1284,0.094329,0.0
1,#世界杯,#世界杯,394,0.098389,0.0
157,#世界杯总决赛,#世界杯总决赛,17,0.170606,0.0
112,#trump2024,#trump2024,24,0.144104,0.0
...,...,...,...,...,...
2992,#criminal,#criminal,1,0.000000,0.0
2993,#demonic,#demonic,1,-0.677600,0.0
2994,#infanticide,#infanticide,1,-0.677600,0.0
2995,#governmentwaste,#governmentwaste,1,-0.750600,0.0


## PLMP

In [None]:
import numpy as np
import scipy
from networkx.linalg.graphmatrix import adjacency_matrix
# Adjacency matrix of the hashtag graph.
# NOTE(review): later cells multiply this with hashtag_nodes['Political'], which assumes the
# matrix rows follow the same hG.nodes() order used to reindex hashtag_nodes — confirm.
hM = adjacency_matrix(hG)

  hM = adjacency_matrix(hG)


In [None]:
from sklearn.preprocessing import normalize
# Scale each row (axis=1) by its L1 norm, then convert the result to a dense matrix.
rnhM = normalize(hM, axis=1, norm='l1').todense()

In [None]:
#nodelist.to_csv("nodelist.csv",index=False)

In [None]:
#hrnM.toarray()

In [None]:
#np.savetxt("adj_matrix.csv", rnM.toarray(), delimiter=",")

In [None]:
# Iteratively spread the seed 'Political' labels over the hashtag graph:
#   new = NEIGHBOUR_SHARE * (row-normalised adjacency @ current) + SEED_SHARE * seed
# Stops once the summed absolute change drops below TOLERANCE or after more than MAX_STEPS updates.
NEIGHBOUR_SHARE = 0.9
SEED_SHARE = 0.1
MAX_STEPS = 1000
TOLERANCE = 10**-6

# Column vector of the seed labels; the restart term never changes, so it is computed once.
seed_term = SEED_SHARE*np.transpose(np.matrix(hashtag_nodes['Political']))
hcopy = np.transpose(np.matrix(hashtag_nodes.copy()['Political']))
i = 0
diff = 1
while i <= MAX_STEPS and not diff < TOLERANCE:
  updated = NEIGHBOUR_SHARE*np.dot(rnhM,hcopy)+seed_term
  diff = np.nansum(np.abs(updated-hcopy), dtype=np.float64)
  hcopy = updated
  i+=1
hcopy

matrix([[-0.00026584],
        [-0.00031943],
        [-0.00010112],
        ...,
        [-0.04476829],
        [-0.01873908],
        [-0.01873908]])

In [None]:
a = hashtag_nodes.copy()

In [None]:
# Flatten the column vector back into a plain list and store it row by row, so this relies
# on `a` still being in the same row order that was used to build hcopy.
a['Political'] = np.transpose(hcopy).tolist()[0]

In [None]:
a['Political'].max()

0.11323219187263261

In [None]:
a.sort_values(by="Political",inplace=True, ascending=False)

In [None]:
# Rescale so the most positive score becomes +1 and the most negative becomes -1.
# The extremes are taken once instead of being recomputed for every row, and each row is
# divided only by the scale that matches its sign, so a zero minimum can no longer turn
# the non-negative rows into NaN.
positive_scale = hcopy.max()
negative_scale = -hcopy.min()
political = a['Political']
a['Political'] = (political / positive_scale).where(political >= 0, political / negative_scale)

In [None]:
a['Political'].max()

1.0

In [None]:
a['Alignment'] = a['Political']*a['Sentiment']

In [None]:
a.sort_values(by="Count",inplace=True, ascending=False)

In [None]:
# hashtag_nodes now refers to the same object as `a` (no copy is made): it is sorted by Count
# and carries the rescaled 'Political' plus 'Alignment'. Re-running the propagation cell after
# this point would therefore start from these values rather than from the original seeds.
hashtag_nodes = a

## Expansion to tweets and words

In [None]:
# Tweet score = sum of the 'Political' values of its hashtags that are present in `a`,
# divided by the number of hashtags on the tweet (tags absent from `a` contribute 0).
tweets_filtered['political'] = float(0)
htlist = a['Id'].tolist()
# One dict lookup per hashtag replaces a list membership test plus a frame scan, and avoids
# calling float() on a one-row Series.
political_by_tag = dict(zip(a['Id'], a['Political']))
for index, row in tweets_filtered.iterrows():
  tags = row['hashtags']
  if tags:
    algn = float(0)
    for ht in tags:
      if ht in political_by_tag:
        algn += float(political_by_tag[ht])
    tweets_filtered.loc[index,'political'] = algn/len(tags)

In [None]:
# unique_political[word] -> sum of the 'political' score of every tweet occurrence of the word.
unique_political = {}
for clean_text, political in zip(tweets_filtered["clean_text"], tweets_filtered["political"]):
  if clean_text == "":
    continue
  for word in tokenizer.tokenize(clean_text):
    unique_political[word] = unique_political.get(word, float(0)) + float(political)
# Mean political score per word (summed score / occurrences).
word_nodes['Political'] = word_nodes['Id'].apply(unique_political.__getitem__)/word_nodes['Count']

In [None]:
tweets_filtered['alignment'] = tweets_filtered['political']*tweets_filtered['sentiment']

In [None]:
word_nodes['Alignment'] = word_nodes['Political']*word_nodes['Sentiment']

## nodelist output

In [None]:
# Stack hashtag rows on top of word rows with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append no longer exists in pandas 2.x.
nodelist = pd.concat([hashtag_nodes, word_nodes], ignore_index=True)

In [None]:
# Write the combined, hashtag-only and word-only node tables without the index column.
# These are relative names, so the files land in the current working directory rather
# than under `path`.
nodelist.to_csv("nodelist.csv",index=False)
hashtag_nodes.to_csv("hashtaglist.csv",index=False)
word_nodes.to_csv("wordlist.csv",index=False)

In [None]:
nodelist

Unnamed: 0,Id,Label,Count,Sentiment,Political,Alignment
0,#百家乐,#百家乐,1284,0.094329,-0.003100,-0.000292
1,#世界杯,#世界杯,394,0.098389,-0.000982,-0.000097
2,#democrats,#democrats,363,-0.115708,-1.000000,0.115708
3,#republicans,#republicans,282,-0.072347,0.916451,-0.066303
4,#nft,#nft,220,0.100510,0.009505,0.000955
...,...,...,...,...,...,...
20746,mambo,mambo,1,-0.750600,-1.000000,0.750600
20747,stalked,stalked,1,0.077200,-1.000000,-0.077200
20748,countrywide,countrywide,1,-0.985200,-1.000000,0.985200
20749,pwotest,pwotest,1,0.000000,-1.000000,-0.000000
