# Updating libraries to avoid compatibility issues
Always run these cells at the start of the runtime.

In [None]:
!pip install scipy==1.8.0

In [None]:
!pip install networkx==2.8.8

# Data Collection

## Set Up

In [None]:
import os
os.environ['TOKEN'] = ""

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
path = "/content/"
error_log_path = "/content/"

In [None]:
import requests 
import pandas as pd 
import time

## Auth

In [None]:
def create_headers(bearer_token):
    """Return the HTTP headers that authenticate a request with a bearer token."""
    return {"Authorization": f"Bearer {bearer_token}"}
headers = create_headers(os.environ['TOKEN'])

## Search Query

In [None]:
def create_url(query, start_time, end_time, max_results, expansions, tweet_fields, user_fields, place_fields, endpoint):
    """Pair the endpoint URL with the query parameters of one search request.

    Returns a ``(search_url, query_params)`` tuple. ``search_url`` is the
    ``endpoint`` argument unchanged; ``query_params`` maps the API parameter
    names to the values passed in. Adjust the parameter set here when
    targeting a different endpoint or requesting other fields.
    """
    query_params = dict(
        query=query,
        end_time=end_time,
        start_time=start_time,
        max_results=max_results,
        expansions=expansions,
    )
    # These parameter names contain a dot, so they cannot be keyword arguments.
    query_params['tweet.fields'] = tweet_fields
    query_params['user.fields'] = user_fields
    query_params['place.fields'] = place_fields

    return (endpoint, query_params)

In [None]:
def connect_to_endpoint(url, headers, params, next_token = None, timeout = 30):
    """Send one GET request to ``url`` and return the decoded JSON payload.

    Parameters:
        url: endpoint to query.
        headers: HTTP headers sent with the request.
        params: mapping of query parameters; it is copied, never modified.
        next_token: pagination token; only sent when it is neither ``None``
            nor the empty string.
        timeout: seconds to wait for the server before giving up, so a
            stalled connection cannot block the download forever.

    Raises:
        Exception: with ``(status_code, response_text)`` when the response
            status is not 200. Errors raised by ``requests`` itself
            (including timeouts) propagate unchanged.
    """
    # Work on a copy so the caller's dict is left untouched between pages.
    request_params = dict(params) if params is not None else {}
    #only add next_token if it is a real value returned in a previous response
    if next_token is not None and next_token != '':
        request_params['next_token'] = next_token
    #create a "GET" request to the specified url, add headers and parameters
    response = requests.request("GET", url, headers = headers, params = request_params, timeout = timeout)
    if response.status_code != 200:
        #if something goes wrong, we need to know
        raise Exception(response.status_code, response.text)
    #otherwise, we want the payload of our response, which contains our tweet(s)
    return response.json()

In [None]:
def get_data(query, start_time, end_time, max_results, expansions, tweet_fields, user_fields, place_fields, endpoint, next_token=""):
  """Page through the search endpoint and collect every returned tweet.

  Requests are built with create_url and sent with connect_to_endpoint using
  the module-level ``headers``. Paging continues until a response carries no
  ``next_token``. If anything raises, the error is printed, written as a CSV
  under ``error_log_path`` and the tweets gathered so far are returned.

  Returns a ``(results, next_token)`` tuple; ``next_token`` is ``None`` after
  a complete download, otherwise the token that was in use when the error hit.
  """
  collected = []

  while next_token is not None:
    try:
      search_url, query_params = create_url(query, start_time, end_time, max_results, expansions, tweet_fields, user_fields, place_fields, endpoint)
      payload = connect_to_endpoint(search_url, headers, query_params, next_token)
      #tweets, when there are any, sit under the 'data' key of the payload
      if "data" in payload:
        collected.extend(payload["data"])
        print(str(len(payload["data"])) + " Tweets downloaded in this batch.")
      #the pagination token sits under 'meta'; without one this was the last page
      if "meta" in payload and "next_token" in payload["meta"].keys():
        next_token = payload["meta"]["next_token"]
      else:
        next_token = None

      #pause between requests to stay under the rate limit
      time.sleep(3)

    except Exception as e:
      print("Error occured", e)
      print("Next token value", next_token)
      error_log = {"Error":e, "Next token":next_token, "Day":start_time,
                   "Downloaded":len(collected)}
      error_frame = pd.DataFrame.from_dict(error_log, orient="index")
      error_frame.to_csv(error_log_path+query+"_"+start_time+"_"+next_token+".csv")
      return(collected, next_token)

  print("Done")

  return (collected, next_token)

## Download And Save

In [None]:
start_time = "2022-11-25T13:00:00.000Z"
end_time = "2022-11-25T13:00:20.000Z"
query_text = "(#democrat OR #democrats OR #republican OR #republicans) -is:reply -is:retweet"
endpoint = "https://api.twitter.com/2/tweets/search/all/"
path = "/content/"
max_results = 500
no_days = 15

In [None]:
# Download every tweet matching the query inside the configured time window.
# get_data returns (tweets, next_token); only the list of tweets is kept.
tweets = get_data(query_text, start_time = start_time, end_time = end_time, 
          max_results=max_results, expansions='author_id,in_reply_to_user_id,geo.place_id', 
          tweet_fields='id,text,author_id,in_reply_to_user_id,geo,conversation_id,created_at,lang,public_metrics,referenced_tweets,reply_settings,source,entities',
          user_fields='id,name,username,created_at,description,public_metrics,verified',
          place_fields='full_name,id,country,country_code,geo,name,place_type',
          endpoint=endpoint)[0]          
tweets_df = pd.DataFrame(tweets)
# Save under the name the "Upload" section reads back (path+"tweets.pkl");
# a "_tweets.pkl" file would never be found by that read_pickle call.
tweets_df.to_pickle(path+"tweets.pkl")

# Working with Data

## Upload

In [None]:
import pandas as pd
path = "/content/"
error_log_path = "/content/"

In [None]:
tweets_df = pd.read_pickle(path+"tweets.pkl")

In [None]:
tweets_df

## Preprocessing

In [None]:
# Work on a copy restricted to the columns used downstream, so the original
# tweets_df stays available if something goes wrong later.
column_list = ["id","author_id","created_at", "text","entities","public_metrics", "lang"]
tweets_filtered = tweets_df.copy()[column_list]

In [None]:
tweets_filtered

## Sentiment Analysis Addition

In [None]:
import nltk
import re
import string
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

In [None]:
def cleaner_sentiment(tweet):
    """Strip mentions, hashtags and links from a tweet before sentiment scoring.

    Only the matched tokens are removed; surrounding whitespace, casing and
    punctuation are left as they are. Returns the cleaned string.
    """
    # Raw strings keep "\w" / "\S" as regex escapes instead of invalid
    # Python string escapes (which trigger a warning at compile time).
    tweet = re.sub(r"@\w+", "", tweet) # remove mentions
    tweet = re.sub(r"#\w+", "", tweet) # remove hashtags
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet) # remove http links
    return tweet

In [None]:
tweets_filtered['sentiment'] = tweets_filtered['text'].apply(lambda testo: sid.polarity_scores(cleaner_sentiment(testo))['compound'])

## Extract Words/Hashtags

In [None]:
# NLTK tools
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words("english")
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
from collections import defaultdict
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
tokenizer = nltk.RegexpTokenizer(r'\w+')

In [None]:
stop_words.append('amp')

In [None]:
def cleaner(tweet):
    """Normalise a tweet into a space-separated string of lowercase lemmas.

    Steps: remove mentions, hashtags and links; collapse whitespace; keep the
    tokens produced by the module-level ``tokenizer`` that are longer than one
    character and not in ``stop_words``; lemmatize each remaining token using
    the part-of-speech mapping in ``tag_map``; lowercase the result.
    """
    # Raw strings keep "\w" / "\S" as regex escapes rather than invalid
    # Python string escapes.
    tweet = re.sub(r"@\w+", "", tweet) # remove mentions
    tweet = re.sub(r"#\w+", "", tweet) # remove hashtags
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet) # remove http links
    tweet = " ".join(tweet.split())
    # Set lookup instead of scanning the stop-word list once per token.
    stop_set = set(stop_words)
    #remove stop words and single-character tokens
    tweet = " ".join(w for w in tokenizer.tokenize(tweet) if w.lower() not in stop_set and len(w) > 1)
    lemma_function = WordNetLemmatizer()
    # The first letter of the POS tag selects the WordNet category via tag_map.
    tweet = " ".join(lemma_function.lemmatize(token, tag_map[tag[0]]) for token, tag in nltk.pos_tag(nltk.wordpunct_tokenize(tweet))) #lemmatize
    return tweet.lower() #to lowercase

In [None]:
tweets_filtered["clean_text"] = tweets_filtered["text"].map(cleaner)

In [None]:
tweets_filtered

In [None]:
tweets_filtered.loc[tweets_filtered["clean_text"].isnull(),"clean_text"] = ""

In [None]:
# Per-word tallies over all cleaned tweets:
#   unique_words[word]   -> number of occurrences
#   unique_weights[word] -> sum of the sentiment of the tweets it occurs in
unique_words = {}
unique_weights = {}

for _, tweet_row in tweets_filtered.iterrows():
  cleaned = tweet_row["clean_text"]
  if cleaned == "":
    continue
  for token in tokenizer.tokenize(cleaned):
    unique_words[token] = unique_words.get(token, 0) + 1
    unique_weights[token] = unique_weights.get(token, float(0)) + float(tweet_row["sentiment"])

In [None]:
# One row per word: occurrence count and mean sentiment of the tweets using it.
uw_df = pd.DataFrame.from_dict(unique_words, orient='index').reset_index()
uw_df = uw_df.rename(columns={'index': 'Word', 0: 'Count'})
summed_sentiment = uw_df['Word'].apply(lambda word: unique_weights[word])
uw_df['Sentiment'] = summed_sentiment / uw_df['Count']
# Most frequent words first, with a fresh 0..n-1 index.
uw_df = uw_df.sort_values(by=['Count'], ascending=False).reset_index(drop=True)
# Signed square root: the root of a negative number is NaN, so each half
# contributes 0 for the values of the opposite sign.
positive_root = (uw_df['Sentiment'] ** (1/2)).fillna(0)
negative_root = ((-uw_df['Sentiment']) ** (1/2)).fillna(0)
uw_df['Sentiment'] = positive_root - negative_root

In [None]:
uw_df

### Extract Hashtags

In [None]:
tweets_filtered.loc[tweets_df["entities"].isnull(), "entities"] = None
tweets_filtered["hashtags"] = ""

In [None]:
# Per-hashtag tallies (count and summed tweet sentiment), plus the list of
# lowercased '#tag' strings stored on each tweet in the "hashtags" column.
unique_hashtags = {}
unique_hweights = {}

for idx, row in tweets_filtered.iterrows():
  entities = row["entities"]
  # A missing value may show up as None or as NaN; only a real dict can hold
  # hashtags, and testing `in` on a float NaN would raise a TypeError.
  if not isinstance(entities, dict) or "hashtags" not in entities:
    continue
  hl = []
  for hashtag in entities["hashtags"]:
    tag = '#' + hashtag["tag"].lower()
    unique_hashtags[tag] = unique_hashtags.get(tag, 0) + 1
    hl.append(tag)
    unique_hweights[tag] = unique_hweights.get(tag, float(0)) + float(row["sentiment"])

  tweets_filtered.at[idx,"hashtags"] = hl

In [None]:
unique_hashtags = dict(sorted(unique_hashtags.items(), key=lambda item: item[1], reverse=True))

In [None]:
# One row per hashtag: occurrence count and mean sentiment of its tweets.
uh_df = pd.DataFrame.from_dict(unique_hashtags, orient='index').reset_index()
uh_df = uh_df.rename(columns={'index': 'Hashtag', 0: 'Count'})
summed_hsentiment = uh_df['Hashtag'].apply(lambda tag: unique_hweights[tag])
uh_df['Sentiment'] = summed_hsentiment / uh_df['Count']
# Signed square root: each half is NaN (replaced by 0) for the opposite sign.
positive_root = (uh_df['Sentiment'] ** (1/2)).fillna(0)
negative_root = ((-uh_df['Sentiment']) ** (1/2)).fillna(0)
uh_df['Sentiment'] = positive_root - negative_root

In [None]:
uh_df

## Build Network

In [None]:
import itertools
import networkx as nx

In [None]:
uh = unique_hashtags.keys()
uw = unique_words.keys()

In [None]:
def _add_cooccurrences(items, edge_counts):
    """Add the co-occurrences among ``items`` to ``edge_counts`` in place.

    ``edge_counts`` maps an (u, v) tuple to a weight. The graph is undirected
    and has no self-loops, so a pair of equal items is ignored and a pair
    whose reverse is already a key is skipped (the mirrored pair produced by
    the Cartesian product carries the count instead).
    """
    #itertools product creates the Cartesian product of the list with itself
    for pair in itertools.product(items, items):
        if pair[0] != pair[1] and pair[::-1] not in edge_counts:
            edge_counts[pair] = edge_counts.get(pair, 0) + 1


# network: hashtags and words together; hnetwork: hashtags only; wnetwork: words only
network = {}
hnetwork = {}
wnetwork = {}
for index, row in tweets_filtered.iterrows():
    # "hashtags" is either a list of tags or the empty string (no hashtags)
    hashtag_list = list(row["hashtags"])
    word_list = [word for word in row["clean_text"].split(" ") if word in uw]
    _add_cooccurrences(hashtag_list + word_list, network)
    _add_cooccurrences(hashtag_list, hnetwork)
    _add_cooccurrences(word_list, wnetwork)

network_df = pd.DataFrame.from_dict(network, orient="index")
hnetwork_df = pd.DataFrame.from_dict(hnetwork, orient="index")
wnetwork_df = pd.DataFrame.from_dict(wnetwork, orient="index")

In [None]:
# Move the pair out of the index, name both columns and rank edges by weight.
network_df = network_df.reset_index()
network_df.columns = ["pair","weight"]
network_df = network_df.sort_values(by="weight", ascending=False)
network_df

In [None]:
# Move the pair out of the index, name both columns and rank edges by weight.
hnetwork_df = hnetwork_df.reset_index()
hnetwork_df.columns = ["pair","weight"]
hnetwork_df = hnetwork_df.sort_values(by="weight", ascending=False)
hnetwork_df

In [None]:
# Move the pair out of the index, name both columns and rank edges by weight.
wnetwork_df = wnetwork_df.reset_index()
wnetwork_df.columns = ["pair","weight"]
wnetwork_df = wnetwork_df.sort_values(by="weight", ascending=False)
wnetwork_df

In [None]:
#a weighted graph is built from 3-element tuples (u,v,w): u and v are the nodes, w is the numeric weight
#to filter edges by weight, add a condition such as `if weight > 1` at the end of a comprehension
up_weighted = [(edge[0], edge[1], weight) for edge, weight in network.items()]
hup_weighted = [(edge[0], edge[1], weight) for edge, weight in hnetwork.items()]
wup_weighted = [(edge[0], edge[1], weight) for edge, weight in wnetwork.items()]


#G: hashtags and words, hG: hashtags only, wG: words only
G = nx.Graph()
G.add_weighted_edges_from(up_weighted)
hG = nx.Graph()
hG.add_weighted_edges_from(hup_weighted)
wG = nx.Graph()
wG.add_weighted_edges_from(wup_weighted)

In [None]:
# Node count then edge count, for the combined, hashtag and word graphs.
for graph in (G, hG, wG):
    print(len(graph.nodes()))
    print(len(graph.edges()))

In [None]:
# Persist the combined, hashtag-only and word-only graphs as pickles under `path`.
# NOTE(review): nx.write_gpickle is not available in networkx 3.x, so this cell
# relies on the networkx==2.8.8 pin installed at the top of the notebook.
nx.write_gpickle(G,path+"network.pkl")
nx.write_gpickle(hG,path+"hnetwork.pkl")
nx.write_gpickle(wG,path+"wnetwork.pkl")

In [None]:
filename = path+"edgelist.csv"
# Write the header and the edges through one file handle, so the header always
# lands in the file under `path` (not in the current working directory) and no
# shell tool is needed. networkx expects a handle opened in binary mode.
with open(filename, "wb") as edge_file:
    edge_file.write(b"Source,Target,Weight\n")
    nx.write_weighted_edgelist(G, edge_file, delimiter=",")

In [None]:
filename = path+"hedgelist.csv"
# Write the header and the edges through one file handle, so the header always
# lands in the file under `path` (not in the current working directory) and no
# shell tool is needed. networkx expects a handle opened in binary mode.
with open(filename, "wb") as edge_file:
    edge_file.write(b"Source,Target,Weight\n")
    nx.write_weighted_edgelist(hG, edge_file, delimiter=",")

In [None]:
filename = path+"wedgelist.csv"
# Write the header and the edges through one file handle, so the header always
# lands in the file under `path` (not in the current working directory) and no
# shell tool is needed. networkx expects a handle opened in binary mode.
with open(filename, "wb") as edge_file:
    edge_file.write(b"Source,Target,Weight\n")
    nx.write_weighted_edgelist(wG, edge_file, delimiter=",")

## Political Affiliation Addition

In [None]:
# Every word and hashtag starts neutral (0.0); only the four party hashtags
# used in the search query get a seed label: -1 for democrat(s), +1 for
# republican(s).
uw_df['Political'] = float(0)
uh_df['Political'] = float(0)
# A single .loc assignment writes into uh_df itself; chained indexing
# (uh_df['Political'][mask] = ...) may write to a temporary copy instead.
uh_df.loc[uh_df['Hashtag'].isin(['#democrats', '#democrat']), 'Political'] = -1
uh_df.loc[uh_df['Hashtag'].isin(['#republicans', '#republican']), 'Political'] = 1

In [None]:
# Node table for words: "Id" and "Label" both carry the word itself.
word_nodes = uw_df.rename(columns={"Word":"Id"})
word_nodes["Label"] = word_nodes["Id"]
word_nodes = word_nodes[['Id','Label','Count','Sentiment','Political']]

word_nodes

In [None]:
# Node table for hashtags: "Id" and "Label" both carry the hashtag itself.
hashtag_nodes = uh_df.rename(columns={"Hashtag":"Id"})
hashtag_nodes["Label"] = hashtag_nodes["Id"]
hashtag_nodes = hashtag_nodes[['Id','Label','Count','Sentiment','Political']]

hashtag_nodes

In [None]:
# Row label of each hashtag, built once so that each node is a dictionary
# lookup instead of a scan over the whole table.
row_of_hashtag = dict(zip(hashtag_nodes['Id'], hashtag_nodes.index))
# Row labels listed in the node order of hG, used to reorder hashtag_nodes so
# that its rows line up with the graph's nodes.
hindex_order = [row_of_hashtag[node] for node in hG.nodes()]

In [None]:
hashtag_nodes = hashtag_nodes.reindex(hindex_order)

In [None]:
hashtag_nodes

## PLMP

In [None]:
import numpy as np
import scipy
from networkx.linalg.graphmatrix import adjacency_matrix
hM = adjacency_matrix(hG)

In [None]:
from sklearn.preprocessing import normalize
rnhM = normalize(hM, axis=1, norm='l1').todense()

In [None]:
#nodelist.to_csv("nodelist.csv",index=False)

In [None]:
#hrnM.toarray()

In [None]:
#np.savetxt("adj_matrix.csv", rnM.toarray(), delimiter=",")

In [None]:
# Iteratively spread the seed political labels over the hashtag graph: each
# step mixes the neighbours' current scores (through the row-normalised
# adjacency matrix rnhM) with the original seed labels.
propagation_weight = 0.80   # share taken from the neighbours' scores
seed_weight = 0.2           # share taken from the fixed seed labels
max_iterations = 1000
tolerance = 10**-6          # stop once the total absolute change is below this

# The seed column never changes, so it is built once outside the loop.
seed = np.transpose(np.matrix(hashtag_nodes['Political']))
hcopy = np.transpose(np.matrix(hashtag_nodes.copy()['Political']))
i = 0
diff = 1
while not(i>max_iterations or diff<tolerance):
  temphcopy = propagation_weight*np.dot(rnhM,hcopy)+seed_weight*seed
  diff = np.nansum(np.abs(temphcopy-hcopy), dtype=np.float64)
  hcopy = temphcopy
  i+=1
hcopy

In [None]:
a = hashtag_nodes.copy()

In [None]:
a['Political'] = np.transpose(hcopy).tolist()[0]

In [None]:
a['Political'].max()

In [None]:
a['Political'].mean()

In [None]:
a.sort_values(by="Political",inplace=True, ascending=False)

In [None]:
wm = (a['Political']*a['Count']).sum()/(a['Count'].sum())

In [None]:
wm

In [None]:
# Rescale each side of the spectrum separately: positive scores are divided by
# the largest propagated score, negative ones by the magnitude of the smallest.
pos_scale = hcopy.max()
neg_scale = -hcopy.min()

def _rescale_political(value):
    """Scale one score by the extreme of its own sign; zero and NaN pass through.

    Branching on the sign keeps the unused scale out of the computation, so a
    scale of 0 (no score on that side) cannot produce inf/NaN for the others.
    """
    if value > 0:
        return value / pos_scale
    if value < 0:
        return value / neg_scale
    return value

a['Political'] = a['Political'].apply(_rescale_political)

In [None]:
a['Political'].mean()

In [None]:
a['Political'].min()

In [None]:
a['Political']= ((a['Political'])**(1/2)).fillna(0)-((-a['Political'])**(1/2)).fillna(0)

In [None]:
a['Alignment'] = a['Political']*a['Sentiment']

In [None]:
a.sort_values(by="Count",inplace=True, ascending=False)

In [None]:
hashtag_nodes = a

In [None]:
(a['Political']*a['Count']).mean()/(a['Count'].mean())

In [None]:
a[0:50]

In [None]:
a['Political'].plot(kind='hist')

## Expansion to tweets and words

In [None]:
# Political score of a tweet = sum of the scores of its known hashtags divided
# by the number of hashtags on the tweet; tweets without hashtags get 0.0.
htlist = a['Id'].tolist()
# Hashtag -> score lookup built once, replacing a filtered-DataFrame lookup
# (and float() of a one-element Series) for every hashtag of every tweet.
political_by_hashtag = dict(zip(a['Id'], a['Political']))

def _tweet_political(hashtags):
    """Return the score of one tweet given its hashtag list (or "" if none)."""
    if not hashtags:
        return float(0)
    # Hashtags absent from the node table contribute nothing to the sum but
    # still count in the denominator.
    total = sum((float(political_by_hashtag.get(ht, 0.0)) for ht in hashtags), float(0))
    return total / len(hashtags)

tweets_filtered['political'] = tweets_filtered['hashtags'].map(_tweet_political).astype(float)

In [None]:
# Sum, for every word, the political score of the tweets it occurs in, then
# divide by the word's count to get its mean score.
unique_political = {}
for _, tweet_row in tweets_filtered.iterrows():
  cleaned = tweet_row["clean_text"]
  if cleaned == "":
    continue
  for token in tokenizer.tokenize(cleaned):
    unique_political[token] = unique_political.get(token, float(0)) + float(tweet_row["political"])
summed_political = word_nodes['Id'].apply(lambda word: unique_political[word])
word_nodes['Political'] = summed_political / word_nodes['Count']

In [None]:
tweets_filtered['alignment'] = tweets_filtered['political']*tweets_filtered['sentiment']

In [None]:
word_nodes['Alignment'] = word_nodes['Political']*word_nodes['Sentiment']

## nodelist output

In [None]:
# Stack hashtag nodes on top of word nodes with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append no longer exists in pandas 2.x.
nodelist = pd.concat([hashtag_nodes, word_nodes], ignore_index=True)

In [None]:
nodelist.to_csv("nodelist.csv",index=False)
hashtag_nodes.to_csv("hashtaglist.csv",index=False)
word_nodes.to_csv("wordlist.csv",index=False)

In [None]:
nodelist

## Output tweets_filtered

In [None]:
tweets_filtered.to_csv(path+"tweets_filtered.csv")

# Replies collection
Uses functions from Data Collection and Working with Data

In [None]:
# [tweet id, author id] of every tweet with more than 5 replies.
tweets_ids = [
    [tweet_row['id'], tweet_row['author_id']]
    for _, tweet_row in tweets_filtered.iterrows()
    if tweet_row['public_metrics']['reply_count'] > 5
]

In [None]:
def get_replies(query, start_time, end_time, max_results, expansions, tweet_fields, user_fields, place_fields, endpoint, next_token=""):
  """Page through the search endpoint and collect every tweet matching ``query``.

  Requests are built with create_url and sent with connect_to_endpoint using
  the module-level ``headers``, one second apart. Paging stops when a response
  carries no ``next_token``. If anything raises, the error is printed, written
  as a CSV under ``error_log_path`` and the partial download is returned.

  Returns a ``(results, next_token)`` tuple; ``next_token`` is ``None`` after
  a complete download, otherwise the token that was in use when the error hit.
  """
  collected = []

  while next_token is not None:
    try:
      search_url, query_params = create_url(query, start_time, end_time, max_results, expansions, tweet_fields, user_fields, place_fields, endpoint)
      payload = connect_to_endpoint(search_url, headers, query_params, next_token)
      #tweets, when there are any, sit under the 'data' key of the payload
      if "data" in payload:
        collected.extend(payload["data"])
        print(str(len(payload["data"])) + " Tweets downloaded in this batch.")
      #the pagination token sits under 'meta'; without one this was the last page
      if "meta" in payload and "next_token" in payload["meta"].keys():
        next_token = payload["meta"]["next_token"]
      else:
        next_token = None

      #pause between requests to stay under the rate limit
      time.sleep(1)

    except Exception as e:
      print("Error occured", e)
      print("Next token value", next_token)
      error_log = {"Error":e, "Next token":next_token, "Day":start_time,
                   "Downloaded":len(collected)}
      error_frame = pd.DataFrame.from_dict(error_log, orient="index")
      error_frame.to_csv(error_log_path+query+"_"+start_time+"_"+next_token+".csv")
      return(collected, next_token)

  return (collected, next_token)

In [None]:
start_time = "2022-02-01T13:00:00.000Z"
end_time = "2023-01-11T13:00:20.000Z"
query_text = "conversation_id:"
endpoint = "https://api.twitter.com/2/tweets/search/all/"
path = "/content/"
max_results = 100
no_days = 15

# Collect one frame per tweet and concatenate once at the end: DataFrame.append
# no longer exists in pandas 2.x, and appending inside the loop re-copied all
# rows on every iteration. The leading empty frame guarantees the tweets_df
# columns are present even when no reply is found.
reply_frames = [pd.DataFrame(columns=tweets_df.columns)]
for tweet_id in tweets_ids:
  # direct replies to the tweet, excluding those written by its own author
  query_text = "in_reply_to_status_id:"+str(tweet_id[0])+" -from:"+str(tweet_id[1])
  replies_single = get_replies(query_text, start_time = start_time, end_time = end_time, 
          max_results=max_results, expansions='author_id,in_reply_to_user_id,geo.place_id', 
          tweet_fields='id,text,author_id,in_reply_to_user_id,geo,conversation_id,created_at,lang,public_metrics,referenced_tweets,reply_settings,source,entities',
          user_fields='id,name,username,created_at,description,public_metrics,verified',
          place_fields='full_name,id,country,country_code,geo,name,place_type',
          endpoint=endpoint)[0]          
  reply_frames.append(pd.DataFrame(replies_single))
replies = pd.concat(reply_frames, ignore_index=True)

print("Done")
replies.to_pickle(path+"replies.pkl")

In [None]:
replies

# Working with Replies Data

In [None]:
import pandas as pd
path = "/content/"
error_log_path = "/content/"

In [None]:
replies = pd.read_pickle(path+"replies.pkl")

In [None]:
# Work on a copy restricted to the columns used downstream, so the original
# replies frame stays available if something goes wrong later.
column_list = ["id","conversation_id","author_id","created_at", "text","entities","public_metrics", "lang"]
replies_filtered = replies.copy()[column_list]

In [None]:
replies_filtered['sentiment'] = replies_filtered['text'].apply(lambda testo: sid.polarity_scores(cleaner_sentiment(testo))['compound'])

In [None]:
replies_filtered["clean_text"] = replies_filtered["text"].map(cleaner)
replies_filtered.loc[replies_filtered["clean_text"].isnull(),"clean_text"] = ""

In [None]:
replies_filtered.loc[replies["entities"].isnull(), "entities"] = None
replies_filtered["hashtags"] = ""

In [None]:
# Count every hashtag used in the replies and store each reply's list of
# lowercased '#tag' strings in its "hashtags" column.
unique_h = {}

for idx, row in replies_filtered.iterrows():
  entities = row["entities"]
  # A missing value may show up as None or as NaN; only a real dict can hold
  # hashtags, and testing `in` on a float NaN would raise a TypeError.
  if not isinstance(entities, dict) or "hashtags" not in entities:
    continue
  hl = []
  for hashtag in entities["hashtags"]:
    tag = '#' + hashtag["tag"].lower()
    unique_h[tag] = unique_h.get(tag, 0) + 1
    hl.append(tag)

  replies_filtered.at[idx,"hashtags"] = hl

In [None]:
# Score lookups built once; the per-word `list(word_nodes['Id'])` rebuild and
# the filtered-DataFrame lookups made this step quadratic in the vocabulary.
hashtag_political = dict(zip(a['Id'], a['Political']))
word_political = dict(zip(word_nodes['Id'], word_nodes['Political']))

def _reply_political(hashtags, clean_text):
    """Return the mean political score of one reply.

    The known hashtags of the reply are averaged; when it has none, the known
    words of its cleaned text are averaged instead. A reply with neither gets
    0.0.
    """
    scores = []
    if hashtags:
        scores = [float(hashtag_political[ht]) for ht in hashtags if ht in hashtag_political]
    if not scores and clean_text != "":
        scores = [float(word_political[word]) for word in clean_text.split(" ") if word in word_political]
    if not scores:
        return float(0)
    return sum(scores, float(0)) / len(scores)

replies_filtered['political'] = [
    _reply_political(hashtags, clean_text)
    for hashtags, clean_text in zip(replies_filtered['hashtags'], replies_filtered['clean_text'])
]

In [None]:
replies_filtered['alignment'] = replies_filtered['political']*replies_filtered['sentiment']

In [None]:
replies_filtered

In [None]:
replies_filtered.to_csv(path+"replies_filtered.csv")

## Average measurement for replies to tweets (neighbors)

In [None]:
# [tweet id, author id] of every tweet with more than 5 replies.
tweets_ids = [
    [tweet_row['id'], tweet_row['author_id']]
    for _, tweet_row in tweets_filtered.iterrows()
    if tweet_row['public_metrics']['reply_count'] > 5
]

In [None]:
# For every selected tweet, average the sentiment, political score and
# alignment of the replies whose conversation_id equals the tweet id.
# Rows are gathered in a list and the frame is built once, because
# DataFrame.append no longer exists in pandas 2.x.
metric_columns = ['sentiment', 'political', 'alignment']
neigh_rows = []
for tweet_id in tweets_ids:
  tweet_replies = replies_filtered[replies_filtered['conversation_id']==str(tweet_id[0])]
  # the mean of an empty selection is NaN, which marks tweets without collected replies
  neigh_rows.append([tweet_id[0]] + list(tweet_replies[metric_columns].mean()))
neigh = pd.DataFrame(neigh_rows, columns=['id'] + metric_columns)

In [None]:
neigh

In [None]:
neigh.to_csv(path+"neigh.csv")