# Data collection

### Import libraries and initial setup

In [None]:
import tweepy
import numpy as np
import pandas as pd
import time
import datetime
from tqdm import tqdm

# Import user specific keys to access twitter
from app_cred import CONSUMER_KEY, CONSUMER_SECRET
# Import user specific keys to access twitter
from app_cred import ACCESS_TOKEN, ACCESS_TOKEN_SECRET 

# Authenticate with the app credentials and the user access token
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

# API client used by all scraping functions below
api = tweepy.API(
    auth,
    timeout=900,
    wait_on_rate_limit=True,
    wait_on_rate_limit_notify=True,
)

# Do not truncate columns when displaying DataFrames
pd.set_option("display.max_columns", None)

### Split actorlist into four parts

In [None]:
# Read the actor list and keep only the column holding the Twitter handles
handles = pd.read_excel("actor_list.xlsx")
handles = handles["Twitter Handle (uden @)"]

# Split the handles into four parts of (almost) equal size
handle1, handle2, handle3, handle4 = np.array_split(handles, 4)

# Write the parts to handle1.csv ... handle4.csv
for i, handle in enumerate((handle1, handle2, handle3, handle4), start=1):
    handle.to_csv(f"handle{i}.csv", index=False)

### Define functions

In [None]:

# Keys read from the "user" sub-dict of every tweet by get_all_tweets();
# each one is stored in the output under a "user_" prefix
list_of_keys_user=[
    "id","name","screen_name","location",
    "description","followers_count","friends_count",
    "statuses_count","created_at"
]

# Keys read from the tweet itself by get_all_tweets();
# each one is stored in the output under a "tweet_" prefix
list_of_keys_tweet=[
    "created_at","id","lang","full_text",
    "retweeted","retweeted_status","retweet_count",
    "is_quote_status","quoted_status","quote_count",
    "entities"
]

def limit_handled(cursor, max_retries=3):
    """Generator to throttle scraping of a Twitter-user timeline.

    cursor: iterator to pull items from, e.g. tweepy.Cursor(...).items()
    max_retries: number of consecutive TweepErrors (other than rate limit
        errors) that are retried before the generator gives up. Without
        this bound an error that never goes away would be retried forever.

    Yields the next item from the cursor until it is exhausted.
    """
    failures = 0
    while True:
        try:
            item = next(cursor)
        except StopIteration:
            return
        # If rate limit is reached sleep for 15 minutes
        except tweepy.RateLimitError as r:
            print(r.reason)
            time.sleep(900)
        # Any other API error: wait briefly and retry a limited number of times
        except tweepy.TweepError as e:
            print(e.reason)
            failures += 1
            if failures > max_retries:
                print(f"Giving up after {max_retries} retries")
                return
            time.sleep(5)
        else:
            failures = 0
            yield item

def get_all_tweets(handle):
    """Return the tweets on a user's timeline in a df.

    handle: the handle (screen name) of the account.

    Each row holds the values of list_of_keys_user (prefixed "user_") and
    list_of_keys_tweet (prefixed "tweet_"); keys missing from a tweet
    are stored as None.
    """
    # NOTE(review): start_date is not defined in the visible part of the
    # notebook - it has to exist at module level before this is called.
    timeline = tweepy.Cursor(
        api.user_timeline,
        screen_name=handle,
        tweet_mode="extended",
        since=start_date
    )

    tweets = []
    for status in limit_handled(timeline.items()):
        tweet = status._json
        user = tweet.get("user") or {}

        # One flat record per tweet, built once
        row = {"user_" + key: user.get(key) for key in list_of_keys_user}
        row.update(
            {"tweet_" + key: tweet.get(key) for key in list_of_keys_tweet}
        )
        tweets.append(row)

    df = pd.DataFrame(tweets)
    df = df.fillna(value=np.nan)
    return df

def get_tweets_from_handles(handlefile,print_handle=False):
    """Scrape the timelines of all handles listed in a csv file.

    handlefile: path to a csv file with a "twitter_handle" column.
    print_handle: if True, print each handle before it is scraped.

    Returns one df with the tweets from all the handles (an empty df if
    the file lists no handles).
    """
    # NOTE(review): the split step above writes the handles under the Excel
    # column name "Twitter Handle (uden @)" - confirm that the handle files
    # actually have a "twitter_handle" column.
    handles = pd.read_csv(handlefile)
    handles = handles["twitter_handle"].to_list()

    # Collect the per-handle frames and concatenate once at the end
    frames = []
    for handle in tqdm(handles):
        if print_handle:
            print(handle)
        frames.append(get_all_tweets(handle))

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

### Scrape timelines

In [None]:
# Scrape the timelines of the handles in the first handle file
df = get_tweets_from_handles("handle1.csv")

# Save as handle1.zip, the file name the preprocessing step reads back in
compression_options = dict(method="zip", archive_name="handle1.csv")
df.to_csv("handle1.zip", compression=compression_options, index=False)

# Preprocessing

### Import libraries and initial setup

In [None]:
%reset
import numpy as np
import pandas as pd
import re
import spacy
import tqdm;tqdm.tqdm.pandas()

# Load the spaCy pipeline "da_core_news_lg" and take its default stop words;
# both are used as globals by the preprocessing functions below
nlp = spacy.load("da_core_news_lg")
all_stopwords = nlp.Defaults.stop_words

# Show all columns and at most 100 rows when displaying DataFrames
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 100)

### Define functions

In [None]:
def remove_stopwords(sentence):
    """
    Split the sentence on single spaces, drop every token found in
    all_stopwords and return the remaining tokens joined by spaces
    """
    kept = []
    for token in sentence.split(" "):
        if token in all_stopwords:
            continue
        kept.append(token)
    return " ".join(kept)

def preproccessor(string, verb_noun_only=False):
    """
    Helper function for lemmatizer().
    Preprocesses the string and returns it as an nlp() doc object by:
    1) lowercasing string
    2) removing urls (and any leftover "http"/"https")
    3) removing mentions, hashtags, and a leading "rt" retweet marker
    4) replacing non-alphanumerical characters with a space
    5) collapsing multiple whitespaces
    6) stripping leading and trailing whitespace
    7) removing stopwords

    verb_noun_only is kept for backward compatibility; it is not used.
    """
    # Lowercase
    string = string.lower()

    # Remove url
    string = re.sub(
        r"https?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b",
        "",
        string)

    # Remove weird remaining http
    string = re.sub(r'https?', '', string)

    # Remove mentions, hashtags, and RT. The word boundary keeps words that
    # merely start with "rt" intact.
    string = re.sub(r"@\w+|#\w+|^rt\b", "", string)

    # Remove non-alphanumerical values
    string = re.sub(r"\W", " ", string)

    # Remove more than one whitespace
    string = re.sub(r"\s{2,}", " ", string)

    # Remove leading and trailing whitespaces
    string = string.strip()

    # Remove stopwords
    string = remove_stopwords(string)

    # Create and return doc object
    return nlp(string)
   
def lemmatizer(string):
    """
    Run the string through preproccessor() and return the lemmas of the
    resulting tokens joined by single spaces
    """
    lemmas = []
    for token in preproccessor(string):
        lemmas.append(token.lemma_)
    return " ".join(lemmas)

def lemmatizer_reduced(string):
    """
    Reduce the already lemmatized string: keep only lemmas longer than three
    characters whose token is a proper noun, noun, or verb
    """
    wanted_pos = ["PROPN","NOUN","VERB"]
    kept = []
    for token in nlp(string):
        if len(str(token.lemma_)) <= 3:
            continue
        if token.pos_ not in wanted_pos:
            continue
        kept.append(token.lemma_)
    return " ".join(kept)

### Concatenate data into one DataFrame

In [None]:
# Zipped csv files to combine
all_files = ["handle1.zip", "handle2.zip", "handle3.zip", "handle4.zip"]

# Read every archive into its own DataFrame
all_data = [
    pd.read_csv(filename, compression="zip")
    for filename in all_files
]

# Stack the frames row-wise with a fresh index
df = pd.concat(all_data, axis=0, ignore_index=True)

# Delete all_data from memory
del all_data

### Preprocess tweets

In [None]:
# Lemmatize the raw tweet text
df["tweet_text_lemma"] = df["tweet_full_text"].progress_apply(lemmatizer)

# lemmatizer_reduced() works on the already lemmatized string, so it is
# applied to tweet_text_lemma (the reduced column does not exist yet)
df["tweet_text_lemma_reduced"] = df["tweet_text_lemma"].progress_apply(lemmatizer_reduced)

### Save data

In [None]:
# Write the preprocessed data to data.zip (holding data.csv), without the index
compression_options = {"method": "zip", "archive_name": "data.csv"}
df.to_csv("data.zip", index=False, compression=compression_options)

# Keyword generator

### Import libraries and initial setup

In [None]:
%reset
import graph_tool.all as gt
import pandas as pd
import regex as re
import numpy as np

import nltk
import string
import ast

from spacy.lang.da.stop_words import STOP_WORDS
from hSBM_Topicmodel.sbmtm import sbmtm
from gensim.models import KeyedVectors
from collections import Counter

### Define functions

In [None]:
def get_topic_df(topics):
    '''Flatten a topic dict into a df with one row per (topic, word).

    topics maps a topic key to a sequence of entries whose first element is
    the word and whose second element is its weight. The returned df has the
    columns topic_nr ("topic <key>"), words, and weights.'''
    pairs = [(topic, entry) for topic in topics for entry in topics[topic]]

    return pd.DataFrame(
        {'topic_nr': ['topic ' + str(topic) for topic, _ in pairs],
         'words': [entry[0] for _, entry in pairs],
         'weights': [entry[1] for _, entry in pairs]}
    )

def add_words(topic_df):
    '''Ask the user which words of a topic are related to the green debate.

    Prints every word in topic_df.words with its position, reads a line of
    space separated positions from the user and returns the selected words
    as a list. Entries that are not digits or that point outside the
    printed list are ignored.'''
    for i, word in enumerate(topic_df.words):
        print('\n', i, word, '\n')
    inp = input(
        '''Are any of these words related to the green debate?
        Input index numbers of the word seperated by spaces:\n''')
    indices = [int(i) for i in inp.split() if i.isdigit()]
    n_words = len(topic_df.words)
    # The printed numbers are positions, so look the words up by position
    return [topic_df.words.iloc[i] for i in indices if i < n_words]

def check_topics(n_topics, df):
    '''Iterate over the topics and collect the words the user marks as green.

    n_topics: number of topics to go through; topic i is looked up in
        df.topic_nr as "topic <i+1>".
    df: topic df with the columns topic_nr and words.

    Returns a list with all selected words. The question is repeated until
    the user answers y or n, so a mistyped answer does not skip the topic.'''
    # NOTE(review): topics are looked up as "topic 1".."topic n" - confirm
    # that the keys in df.topic_nr start at 1 and not at 0.
    all_words = []
    for i in range(n_topics):
        topic_df = df.loc[df.topic_nr == f'topic {i+1}'].reset_index()
        print(topic_df)
        while True:
            inp = input('\n\nDoes this topic include a green word? y or n?\n')
            if inp == 'y':
                all_words += add_words(topic_df)
                break
            if inp == 'n':
                break
            print('I did not understand that!')
    return all_words

def expand_words(words, n):
    '''Takes in a set of words and shows potential words to expand the set.

    For every word the n most similar words according to the global w2v
    model are shown (minus words already in words or already selected) and
    the user picks the ones to keep. Returns the list of selected words.'''
    extra_words = []
    for word in words:
        try:
            w2v_words = w2v.most_similar(positive=word, topn=n)
        except KeyError:
            # presumably raised when the word is not in the w2v vocabulary -
            # skip it
            continue
        # also removes words that are already in extra words!
        similar_words = [
            pair[0] for pair in w2v_words
            if pair[0] not in words and pair[0] not in extra_words
        ]
        for i, candidate in enumerate(similar_words):
            print('\n', i, candidate, '\n')
        inp = input(
            '''Are any of these words related to the green debate?
            Input index numbers of the word seperated by spaces:\n''')
        indices = [int(i) for i in inp.split() if i.isdigit()]
        # Ignore numbers that do not point at one of the printed words
        extra_words.extend(
            similar_words[i] for i in indices if i < len(similar_words)
        )
    return extra_words

def extract_list(string):
    """Helper function to extract a list from its string representation.

    Returns the evaluated literal, or an empty list if the input cannot be
    evaluated (e.g. NaN, None, or malformed text)."""
    try:
        out = ast.literal_eval(string)
    # Errors literal_eval raises on input that is not a valid literal
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        out = list()
    return out

### Import data and clean up

In [None]:
# Import preprocessed data (the preprocessing step saves it as data.zip)
df = pd.read_csv("data.zip", parse_dates=['tweet_created_at'], compression='zip')

# Removing duplicates (drop_duplicates returns a new df)
df = df.drop_duplicates(subset='tweet_id')

# Only take tweets after 5 June 2019; the ISO date avoids day/month
# ambiguity. Copy so that the new columns are not set on a slice.
df = df.loc[df.tweet_created_at > '2019-06-05'].copy()

# Aggregating tweets for actors for each month
# Creating new columns with month and year to aggregate on
df['year'] = df['tweet_created_at'].dt.year
df['month'] = df['tweet_created_at'].dt.month

# Removing NaNs from tweet_text_lemma
#--> viewing them df.loc[df.tweet_text_lemma.isna()]
df = df.dropna(subset=['tweet_text_lemma'])

# Aggregating the dataframe: one row per actor and month holding all
# lemmatized tweets joined by spaces
tweets_agg = df.groupby(
    ['user_screen_name','year','month'],
    as_index = False
).agg(
    {'tweet_text_lemma': ' '.join}
)

# Now we just need to tokenize the tweets!
tokenizer = nltk.tokenize.casual.TweetTokenizer()
tweets_agg['tokens'] = tweets_agg.tweet_text_lemma.apply(tokenizer.tokenize)

### Taking the data that we need for the topic model

In [None]:
# Token lists and the matching account handles, one entry per row of tweets_agg
tokens = tweets_agg['tokens']
handles = tweets_agg['user_screen_name']

### Getting the data in a format usable for the hSBM

In [None]:
# Remove infrequent words; Snorres solution
# NOTE(review): cutoff is defined but the filter below uses "count > 1" -
# confirm which threshold is intended.
cutoff = 5

# Count how often every token occurs across all documents
c = Counter()
for doc in tokens:
    c.update(doc)

# Vocabulary: the 40000 most common words that occur more than once
vocab = c.most_common(40000)
vocab = {word for word, count in vocab if count > 1}

# Build the documents from the token lists, keeping only vocabulary words
docs = [[w for w in doc if w in vocab] for doc in tokens]

### Training the hSBM model

In [None]:
# Initialise model
model = sbmtm()

# We have to create the document network from the corpus
model.make_graph(docs)

# Seed for graph-tool's random number generator (set before fitting)
gt.seed_rng(42) 

# Fit model and save the graph to disk
model.fit()
model.save_graph(filename = 'graph_full_dataset_grouped.xml.gz')

# Flatten the fitted topics into a df with the columns topic_nr, words, weights
topic_df = get_topic_df(model.topics())

### Finding the green words in the topics from hSBM

In [None]:
# Interactively go through 373 topics and collect the green words
all_words = check_topics(373, topic_df)

# Lowercase the words and keep each of them once
all_words_unique = {word.lower() for word in all_words}
all_words_unique

In [None]:
# Save green keywords: the set is written as its str() representation,
# which is read back with extract_list() in the next step
with open("green_words.txt", "w") as output:
    output.write(str(all_words_unique))

### Finding additional keywords with W2V

In [None]:
# Read the saved green keywords back in and evaluate the string
with open('green_words.txt', 'r') as f:
    green_words = f.read()
green_words = extract_list(green_words)

# Load the word2vec vectors (binary format) and interactively pick, for each
# green word, additional words among its 10 most similar words
w2v = KeyedVectors.load_word2vec_format('w2v/dsl_skipgram_2020_m5_f500_epoch2_w5.model.w2v.bin', binary=True)
extra_words = expand_words(green_words, 10)

In [None]:
# Union of the keywords found with W2V and the keywords found in the topics
all_words_combined = set(extra_words + list(green_words))

### Adding keywords from hSBM and W2V to the keywords from qualitative methods 

In [None]:
# Keywords found with qualitative methods (stored in the column named '0')
former_words = pd.read_csv('qualitative_keywords.csv', index_col=0)
former_words_set = set(former_words['0'].values)

# Union with the hSBM/W2V keywords collected in all_words_combined
all_words_final = list(all_words_combined | former_words_set)
all_final_words = pd.DataFrame({'words': all_words_final})

# NOTE(review): the weak labeller reads 'final_keywords.csv' - confirm which
# file name is the intended one.
all_final_words.to_csv('final_final_keywords.csv')

# Weak labeller

### Import libraries

In [None]:
%reset
import pandas as pd
import re
import numpy as np

### Define functions

In [None]:
def weak_label(df,word_list):
    """Keep the tweets that contain at least two of the given keywords.

    df: df with a tweet_full_text column. A boolean column 'klimarel' is
        added to it (True for tweets labelled as climate related).
    word_list: keywords; a keyword counts when it equals one of the
        whitespace separated tokens of the tweet (case sensitive).

    Returns a df holding only the rows labelled as climate related. Tweets
    whose text is not a string (e.g. NaN) are never labelled.
    """
    climate_tweets = set()
    for tweet in df.tweet_full_text.to_list():
        if not isinstance(tweet, str):
            continue
        tokens = set(tweet.split())  # split tweet into words
        # Number of entries in word_list that occur in the tweet
        hits = sum(1 for word in word_list if word in tokens)
        if hits >= 2:
            climate_tweets.add(tweet)

    # Create column with True for tweets in the set of climate tweets
    df['klimarel'] = df.tweet_full_text.isin(climate_tweets)
    # Only keep climate tweets
    klima_df = df.loc[df['klimarel']]
    return klima_df

### Import data

In [None]:
# Preprocessed tweets (the preprocessing step saves them as data.zip)
df = pd.read_csv('data.zip', compression="zip")

# Keywords: the first column is the saved index, the second holds the words
climate_words = pd.read_csv('final_keywords.csv')
climate_words.columns = ['0','keywords']
label_words = climate_words.keywords.to_list()

### Apply weak labeller

In [None]:
# df holding only the tweets that weak_label() marks as climate related
labeller=weak_label(df,label_words)

### Save data

In [None]:
# Write the climate tweets to only_climate_tweets.zip (holding
# only_climate_tweets.csv), without the index
compression=dict(method="zip",archive_name="only_climate_tweets.csv")
labeller.to_csv("only_climate_tweets.zip",compression=compression,index=False)

# Mention-network

### Import libraries

In [None]:
%reset
import numpy as np
import pandas as pd
import networkx as nx

import re
import ast
import tqdm;tqdm.tqdm.pandas()

from collections import defaultdict

### Import data and clean-up

In [None]:
# Read dataset of climate tweets (saved as only_climate_tweets.zip by the
# weak labeller) and the list of our actors
df = pd.read_csv('only_climate_tweets.zip', compression="zip")
df2 = pd.read_excel('actor_list.xlsx')

# Remove duplicates
df = df.drop_duplicates(subset='tweet_id')

# NOTE(review): tweet_created_at is read without parse_dates, so this looks
# like a string comparison, and '05-06-2020' is ambiguous - confirm the
# intended cut-off date and parse the column before filtering.
df = df.loc[df.tweet_created_at > '05-06-2020']

### Define functions

In [None]:
#define extract functions
def extract_list(string):
    """Helper function to extract list from string if string exists.
    Else, return empty list"""
    try:
        out = ast.literal_eval(string)
    # Errors literal_eval raises on input that is not a valid literal
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        out = list()
    return out


def extract_dict(string):
    """Helper function to extract dict from string if string exists.
    Else, return empty dict"""
    try:
        out = ast.literal_eval(string)
    # Errors literal_eval raises on input that is not a valid literal
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        out = dict()
    return out

def extract_from_entities(tweet,ent_key,tag_key):
    """Helper function to extract information from tweet_entities.
    tweet_entities is a dict-of-dicts containing all information on
    twitter entities from a given tweet.
    ent_key: key used to access the entries of interest e.g. "hashtags"
    tag_key: key used to access value of interest e.g. "text"

    Returns a list with the tag_key value of every entry under ent_key, or
    an empty list if the keys are missing or tweet is not a dict.
    """
    try:
        out = [tag[tag_key] for tag in tweet[ent_key]]
    # Missing key, or tweet/entry is not subscriptable with these keys
    except (KeyError, TypeError, IndexError):
        out = list()
    return out

### Extract tweet entities dict into separate columns

In [None]:
# Turn the stringified dict columns back into dicts
df["tweet_retweeted_status"] = df["tweet_retweeted_status"].progress_apply(extract_dict)
df["tweet_quoted_status"] = df["tweet_quoted_status"].progress_apply(extract_dict)
df["tweet_entities"] = df["tweet_entities"].progress_apply(extract_dict)

# Hashtag texts of every tweet
df["tweet_hashtags"] = df["tweet_entities"].progress_apply(
    lambda entities: extract_from_entities(entities, "hashtags", "text")
)

# Screen names mentioned in every tweet
df["tweet_mentions"] = df["tweet_entities"].progress_apply(
    lambda entities: extract_from_entities(entities, "user_mentions", "screen_name")
)

### Create dict

In [None]:
# Create dict of accounts and who they mention (one list per tweet)
mention_dict = defaultdict(list)
for screen_name, mentions in zip(df['user_screen_name'], df['tweet_mentions']):
    mention_dict[screen_name].append(mentions)

# Keep only accounts whose handle consists of letters, digits and
# underscores, and flatten the per-tweet lists into one list per account.
# isalpha() would also drop handles containing digits or underscores.
mention_dict2 = {
    key: [item for sublist in value for item in sublist]
    for key, value in mention_dict.items()
    if isinstance(key, str) and key.replace("_", "").isalnum()
}

### Network

In [None]:
# Create network from dict: every account in mention_dict2 is linked to the
# accounts in its list of mentions
mention_network=nx.from_dict_of_lists(mention_dict2)

In [None]:
# Nodes to keep: handles in our actor list (as written and lowercased) and
# accounts that authored tweets in df. Built once as a set.
# NOTE(review): the data collection step reads the handles from the column
# "Twitter Handle (uden @)" of actor_list.xlsx - confirm that the file also
# has an 'actors' column.
relevant = (
    set(df2['actors'])
    | set(df2['actors'].str.lower())
    | set(df.user_screen_name.unique())
)

# Create list of nodes that are not relevant
not_rel = [node for node in mention_network.nodes() if node not in relevant]

# Remove them from the network
for node in not_rel:
    mention_network.remove_node(node)

# Hashtag-network

### Import libraries

In [None]:
%reset
import numpy as np
import pandas as pd
import networkx as nx

import re
import ast
import tqdm;tqdm.tqdm.pandas()

from collections import defaultdict

# PCA

### Import libraries and initial setup

In [None]:
%reset
import factor_analyzer
import tqdm;tqdm.tqdm.pandas()
import exam_utils

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler

from adjustText import adjust_text
from collections import Counter

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
plt.style.use("ggplot")
plt.rcParams["figure.figsize"] = (16,9)

### Define functions

In [None]:
def cronbach_alpha(df):
    """
    Compute Cronbach's Alpha for the columns of a pandas.DataFrame().
    """
    corr = df.corr()

    # N: the number of variables, i.e. the number of columns in the df
    n_items = df.shape[1]

    # For every column take the correlations below the diagonal, i.e. each
    # pair of variables exactly once
    below_diagonal = [
        corr.iloc[pos + 1:, pos].to_numpy()
        for pos in range(corr.shape[1])
    ]
    pairwise_r = np.concatenate(below_diagonal[::-1] + [np.array([])])

    # R: the mean of the pairwise correlations
    mean_r = np.mean(pairwise_r)

    # Cronbach's Alpha from N and the mean correlation
    return (n_items * mean_r) / (1 + (n_items - 1) * mean_r)


### Import data

In [None]:
# Read the climate tweets and the actor types
climate=pd.read_csv("only_climate_tweets.zip",compression="zip")
index=climate.tweet_id.to_list()
actor_types=pd.read_excel("initial_network.xlsx")
actor_types=actor_types[["user_screen_name","actor_type"]]

#Create df with data of interest
df=climate[["user_screen_name","tweet_created_at","tweet_full_text","tweet_text_lemma_reduced"]]
# NOTE(review): tweet_created_at is read without parse_dates, so this looks
# like a string comparison against "26-02-2021" - confirm that the filter
# selects the intended period.
df=df[df["tweet_created_at"]>="26-02-2021"].reset_index(drop=True)

# Remove climate dataframe from memory
del climate

# Aggregate tweets on user: one row per user with all reduced lemma strings
# joined by spaces
tweets_agg = df.groupby(
    "user_screen_name", 
    as_index = False
).agg(
    {"tweet_text_lemma_reduced": " ".join}
)

# Merge tweets_agg with actor_types (default merge: only users present in
# both frames are kept)
tweets_agg=tweets_agg.merge(
    actor_types,
    on="user_screen_name"
).reset_index(drop=True)

### Create count matrix

In [None]:
# Store attributes in variables: document-frequency bounds (as proportions
# of the documents) and the n-gram range (unigrams only)
min_df=.05
max_df=0.95
ngram_range=(1,1)

# Initialise CountVectorizer object
vectorizer = CountVectorizer(
    ngram_range=ngram_range,
    min_df=min_df,
    max_df=max_df
)

# Create sparse count matrix: one row per user, one column per term
sparse_count = vectorizer.fit_transform(
    tweets_agg.tweet_text_lemma_reduced.to_numpy(),
    tweets_agg.user_screen_name.to_numpy()
)

# Cast data in a pandas.DataFrame() indexed by user
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() - confirm against the installed version.
count_matrix = pd.DataFrame(
    data=sparse_count.toarray(), 
    columns=vectorizer.get_feature_names(),
    index=tweets_agg.user_screen_name
)

# Show shape
# count_matrix.drop(
#     ["000"],axis=1,inplace=True)

# Keep the term names for labelling loadings and plots later on
feature_names=count_matrix.columns
print(count_matrix.shape)
count_matrix.sample(5)

### Standardise the data

In [None]:
# Initialise StandardScaler()
sc = StandardScaler()

# Standardise every column (term) of the count matrix
X_std = sc.fit_transform(
    count_matrix.to_numpy()
)

X_std.shape

### Compute principal components

In [None]:
# Use as many components as there are rows (actors) in X_std and name them
# PC1..PCn
# NOTE(review): this presumably requires at least as many terms as actors -
# confirm for the data at hand.
n_components= X_std.shape[0]
columns=[f"PC{d}" for d in range(1,n_components+1)]

# Initialise PCA()
pca = PCA(
    n_components=n_components,
    svd_solver="auto"
)

pca.fit(X_std)

# Principal component scores: one row per actor, one column per component
X=pca.transform(X_std)

pc_df=pd.DataFrame(
    X,
    columns=columns
)

### Loadings Matrix

In [None]:
pca_components = pca.components_

# Every row of pca.components_ is one principal component, so the rows are
# labelled PC1..PCn and the columns with the terms
loadings = pd.DataFrame(
    pca_components,
    index=columns,
    columns=feature_names
)

# Compute loadings scores: component weights scaled by the square root of
# the explained variance (terms in rows, components in columns)
L = pca_components.T * np.sqrt(pca.explained_variance_)

# Scale PC scores: divide the scores in X (one row per actor) by the largest
# absolute score of each component, so that they can be indexed by actor
scaled_PC = X / np.max(abs(X), axis=0)

loading_matrix = pd.DataFrame(
    L,
    index=feature_names,
    columns=columns
)

print(L.max())
print(L.min())
print(L.shape)

### Eigenvalues

In [None]:
# The eigenvalues are the explained variances of the components
eigenvalues = pca.explained_variance_

# Identify the first component with an eigenvalue below 1
eigen_below_1 = np.where(eigenvalues < 1)[0][0]

fig, ax = plt.subplots(1,1)

ax.set_title("Eigenvalues",fontsize=20)
ax.set_xlabel("Principal component")
ax.set_ylabel("Eigenvalue")

# Mark the first component below 1
ax.axvline(
    x=eigen_below_1,
    c="grey",
    linestyle="--"
)

# Plot the eigenvalues themselves (not their square roots)
ax.plot(eigenvalues)
plt.show()

### Variance explained

In [None]:
# Diagonal matrix holding the singular values of the fitted PCA
singular_values=np.diag(pca.singular_values_)

covariance_principal_compenents=pd.DataFrame(
    singular_values,
    index=columns,
    columns=columns
)

# Share of variance captured by each component and the running total
explained_variance=pca.explained_variance_ratio_
cumulative_variance=np.cumsum(explained_variance)

# How much does the first two PC's explain?
round(cumulative_variance[1]*100,2)

# How many PC's are needed to explain 90% of the total variation?
# (zero-based position of the first component reaching 90 %)
variance_above_90=np.where(cumulative_variance>=.9)[0][0]

### Plot variance

In [None]:
# Initialise subplots
fig,ax=plt.subplots(1,1)

# Lower limit used for both y-axes
ymin=0
ax.set_title(
    "Variance captured by principal components\n",
    fontsize = 20
)
# Plot the amount of variance captured by each principal components
plot1 = ax.scatter(
    range(n_components), 
    explained_variance, 
    color="royalblue",
    label="Variance captured by each principal components"
)
ax.set_xlabel("Principal Component")
ax.set_ylabel("Variance captured")
ax.set_ylim(ymin=ymin)

# Plot the cumulative amount of variance captured on second y-axis
ax2 = ax.twinx()  # Create second y-axis
plot2=ax2.scatter(
    range(n_components),
    cumulative_variance,
    label="Cumulative variance captured"
)
ax2.set_ylabel("Cumulative variance captured")
ax2.set_ylim(ymin=ymin)

# Mark the first component at which 90 % of the variance is captured
ax2.axvline(
    x=variance_above_90,
    c="grey", 
    linestyle="--"
)

# One legend for the series of both axes, placed in the lower right corner
plt.legend(
    handles = [plot1, plot2],
    loc=4
)

plt.show()

### Socio-symbolic constellation

In [None]:
# Importance of features for PC's

# The 10 words with the largest absolute weight on PC1
loadings_scores_PC1 = pd.Series(pca.components_[0], index=feature_names)
sorted_loadings_scores_PC1 = (
    loadings_scores_PC1
    .abs()
    .sort_values(ascending=False)
)
sorted_loadings_scores_PC1[:10]

In [None]:
# The 10 words with the largest absolute weight on PC2
loadings_scores_PC2 = pd.Series(pca.components_[1], index=feature_names)
sorted_loadings_scores_PC2 = (
    loadings_scores_PC2
    .abs()
    .sort_values(ascending=False)
)
sorted_loadings_scores_PC2[:10]

In [None]:
# Number of words to plot in each direction of each component
n = 10

# Term indices ordered by their weight on the first two components
PC1_loadings = pca_components[0].argsort()
PC2_loadings = pca_components[1].argsort()

# Terms with the n lowest and n highest weights on the first component
PC1_plot_indicies = np.concatenate((PC1_loadings[:n], PC1_loadings[-n:]))

# Terms not already picked for the first component, in PC2 order
already_picked = set(PC1_plot_indicies.tolist())
remain_indicies = np.array(
    [term for term in PC2_loadings if term not in already_picked]
)

# Of those, the n lowest and n highest on the second component
PC2_plot_indicies = np.concatenate((remain_indicies[:n], remain_indicies[-n:]))

# All selected term indices, sorted and without duplicates
PC_plot_indicies = np.unique(
    np.concatenate((PC1_plot_indicies, PC2_plot_indicies))
)

# Names and loadings of the selected terms
PC_plot_names = feature_names[PC_plot_indicies]
PC_plot_load = L[PC_plot_indicies]

In [None]:
# Actors to show in the PCA-plot
actor_names = [
    "klimaraadet", "KlimaMin", "Spolitik", "venstredk",
    "DanskEnergi", "DanskIndustri", "winddenmark", "biogasdanmark",
    "EuropeanEnergy_", "DenGroenneStud", "spisekammeret", "FrederikSandby",
    "NOAH_dk", "greenpeacedk", "Klimabev", "SorenHave", "larskohler",
    "concitoinfo", "DanJoergensen", "ExtinctionRDK",
]

# Row position of every actor in count_matrix
matrix_actors = count_matrix.index.to_list()
index = [matrix_actors.index(actor) for actor in actor_names]

### Plot the socio-symbolic constellation

In [None]:
fig, ax = plt.subplots(1,1)

# Set title
ax.set_title(
    f"PCA of word matrix ({count_matrix.shape[0]} actors, {count_matrix.shape[1]} words)",
    fontsize = 20
)

# Set x label
ax.set_xlabel(
    f"Principal Component 1 ({round(100*pca.explained_variance_ratio_[0],2)} % of variation)",
    fontsize = 15
)

# Set y label
ax.set_ylabel(
    f"Principal Component 2 ({round(100*pca.explained_variance_ratio_[1],2)} % of variation)",
    fontsize = 15
)

# Mark 0,0 on the coordinate system
ax.axvline(x=0, c="grey", linestyle="--")
ax.axhline(y=0, c="grey", linestyle="--")

# Plot word loadings
ax.scatter(
    PC_plot_load[:,0],
    PC_plot_load[:,1],
    marker = "o",
    label="Word",
    alpha=.75
)

# Coordinates of the selected actors; the same values are used for the
# markers and for their labels so that both end up at the same position
actor_x = scaled_PC[index,0]
actor_y = scaled_PC[index,1]

# Plot standardized principal component scores
ax.scatter(
    actor_x,
    actor_y,
    marker = "x",
    label="Actor",
    alpha=0.9
)

# Annotate the plot
texts = []

# Words
for x, y, txt in zip(PC_plot_load[:,0], PC_plot_load[:,1], PC_plot_names):
    texts.append(
        plt.text(x, y, txt, size=10, weight="bold")
    )

# Actors
for x, y, txt in zip(actor_x, actor_y, actor_names):
    texts.append(
        plt.text(x, y, txt, size=12)
    )

# Move overlapping labels apart and connect them to their points with arrows
adjust_text(
    texts,
    arrowprops=dict(
        arrowstyle="->",
        color="grey"
    )
)

plt.show()

### Calculate test-statistics

In [None]:
# Second element returned by calculate_kmo for the correlation matrix of X
# NOTE(review): X holds the PC scores and is passed through np.corrcoef
# first - confirm that calculate_kmo expects this input rather than the
# raw data matrix.
kmo=factor_analyzer.factor_analyzer.calculate_kmo(np.corrcoef(X))[1]
# Cronbach's Alpha with the columns of X as the variables
cronbach_a=cronbach_alpha(pd.DataFrame(X))

### Applying k-cluster algorithm

In [None]:
# Elbow plot: inertia of k-means for k = 1..48
pc_comp=pd.DataFrame(pca_components)
ks = range(1, 49)
inertias = []
for k in ks:
    # Create a KMeans instance with k clusters: model
    model = KMeans(n_clusters=k)
    
    # Fit model to samples
    # NOTE(review): iloc[:,:1] selects only the first column of pc_comp -
    # confirm that clustering on this single column is intended.
    model.fit(pc_comp.iloc[:,:1])
    
    # Append the inertia to the list of inertias
    inertias.append(model.inertia_)
    
# Plot inertia against the number of clusters
plt.plot(ks, inertias, '-o')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.xticks(ks)
plt.show()

In [None]:
# Two-column array with the loadings of the selected words on PC1 and PC2
data=np.array([PC_plot_load[:,0]
               ,PC_plot_load[:,1]]).T

# Cluster the selected words into four groups
# NOTE(review): no random_state is set, so the cluster labels may differ
# between runs.
model = KMeans(n_clusters = 4, init = "k-means++")

label = model.fit_predict(
    data
)

# Plot every cluster as its own series
uniq = np.unique(label)
for i in uniq:
    plt.scatter(
        data[label == i , 0],
        data[label == i , 1],
        label = i
    )

# Annotate every point with its word
texts=[]    
for x, y, txt in zip(
    PC_plot_load[:,0], 
    PC_plot_load[:,1], 
    PC_plot_names):
    texts.append(
        plt.text(
            x,
            y,
            txt,
            size=10,
            weight="bold"
        )
    )    
# Move overlapping labels apart and connect them to their points with arrows
adjust_text(
    texts, 
    arrowprops=dict(
        arrowstyle="->", 
        color="grey"
    )
)    
plt.legend()
plt.show()

# Active learning

### Import libraries and initial setup

In [None]:
%reset
# utils and general stuff
import pandas as pd
import numpy as np
import re
import datetime

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import cohen_kappa_score
from dateutil.parser import parse

#Packages to create DFM
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.tokenize import TweetTokenizer

#Models to train
from sklearn.linear_model import LogisticRegression

#Packages for cross-validation and parameter tuning
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.metrics import confusion_matrix, classification_report

sns.set_theme(style="whitegrid")

### Define functions

In [None]:
def time_parser(string):
    """
    Parse a date string and return a datetime for the same day, month and
    year with the time of day set to midnight.
    """
    parsed = parse(string)
    day_only = parsed.strftime("%d-%m-%Y")
    result = datetime.datetime.strptime(day_only, "%d-%m-%Y")
    return result

def sample_dataset(df, n=100, random_state=42):
    '''Draw n random rows (100 by default) from df to be labelled.

    The returned sample has a 'label' column filled with NaN.'''
    sampled = df.sample(n, random_state=random_state)
    sampled.loc[:, 'label'] = np.nan
    return sampled

def split_data(path, test_data=False):
    '''Read the latest labelled data set (csv) from path.

    With test_data=False: returns X, y, and the df, where X is the
    tweet_text_lemma column and y the label column.
    With test_data=True: returns X_train, X_test, y_train, y_test, and the
    df, using a random 80/20 split.'''
    labelled = pd.read_csv(path)
    features = labelled.tweet_text_lemma
    target = labelled.label
    parts = train_test_split(features, target, test_size=0.2)
    if not test_data:
        return features, target, labelled
    return (*parts, labelled)

def get_unlabelled(new_df, old_df):
    '''Return the rows of old_df whose index does not occur in new_df.'''
    already_labelled = old_df.index.isin(new_df.index)
    return old_df.loc[~already_labelled]

def predict_unlabelled(pipeline, unlabelled_df):
    '''Add the highest predicted class probability to a copy of the df.

    pipeline: fitted object with a predict_proba method
    unlabelled_df: df with a tweet_text_lemma column

    Returns a copy of unlabelled_df with a max_proba column; the rows are
    not sorted here and the input df is left untouched.'''
    # class probabilities for all entries in the dataset
    probabilities = pipeline.predict_proba(unlabelled_df.tweet_text_lemma)
    scored = unlabelled_df.copy()
    scored.loc[:, 'max_proba'] = list(map(max, probabilities))
    return scored

def label_new_set(unlabelled_df, labelled_df, new_name):
    '''Save the next batch of tweets to label together with the labelled ones.

    Sorts unlabelled_df in place by max_proba (ascending), takes the first
    100 rows, gives them an empty label and writes them, followed by
    labelled_df, to "<new_name>.xlsx". Returns None.'''
    unlabelled_df.sort_values(by='max_proba', inplace=True)
    to_label = unlabelled_df.iloc[:100].copy()
    to_label.loc[:, 'label'] = np.nan
    combined = pd.concat([to_label, labelled_df])
    combined.to_excel(f'{new_name}.xlsx')

def cohens_kappa(path_to_labelled, full_df):
    '''Track how stable the model's predictions are as labelled data grows.

    The labelled Excel file at ``path_to_labelled`` is read (first column as
    index). The module-level ``pipeline`` is refitted on the last 100, 200, ...
    rows of it, and each fit predicts the rows of ``full_df`` that are not in
    the labelled file.

    Returns a list with Cohen's kappa between each pair of consecutive
    prediction rounds (one entry fewer than the number of rounds).
    '''
    labelled = pd.read_excel(path_to_labelled, index_col=0)
    X_test = get_unlabelled(labelled, full_df).tweet_text_lemma

    predictions = []
    total_rows = labelled.shape[0]
    for size in range(100, total_rows + 100, 100):
        # tail(size) keeps the last `size` rows of the labelled file
        subset = labelled.tail(size).copy()
        pipeline.fit(subset.tweet_text_lemma, subset.label)
        predictions.append(pipeline.predict(X_test))

    # Compare every round with the one that follows it.
    return [
        cohen_kappa_score(earlier, later)
        for earlier, later in zip(predictions, predictions[1:])
    ]

def heatmap(confusion_df, title):
    '''Plot an annotated heatmap of ``confusion_df`` and save it as a PNG.

    ``title`` is used both as the plot title and as the output file name.
    '''
    palette = sns.dark_palette('seagreen', as_cmap=True)
    plt.figure(figsize=(10, 7))
    sns.heatmap(confusion_df, cmap=palette, annot=True)

    plt.title(title)
    plt.xlabel('Predicted label')
    plt.ylabel('Coded label')
    plt.yticks(rotation=90)

    # The title doubles as the file name; the format is forced to PNG.
    plt.savefig(title, format='png')
    
# `logreg_eval` is only built in the "Visualizing" section further down, so on
# a fresh top-to-bottom run this call raised NameError. Draw here only when the
# frame already exists (e.g. when this cell is re-run later in a session).
if 'logreg_eval' in globals():
    heatmap(logreg_eval, 'Logistic regression confusion-matrix')

### Import data and clean-up

In [None]:
df = pd.read_csv('only_climate_tweets.csv', compression='zip')
# Replace the column outright: on pandas >= 2 a `.loc[:, col] = ...` assignment
# writes into the existing object column, which would break the date comparison.
df['tweet_created_at'] = df.tweet_created_at.apply(time_parser)

# Keep only tweets created after the election (2019-06-05); .copy() avoids the
# SettingWithCopyWarning on the assignments below.
df_ = df.loc[df.tweet_created_at > '2019-06-05'].copy()
# Cast the ids of the frame being built. The original read them from `df_ae`,
# a name that is not defined in this section.
df_['tweet_id'] = df_['tweet_id'].astype(int)

# removing all retweets (text that starts with "RT")
df_ = df_.loc[~df_.tweet_full_text.str.contains('^RT')]

# dropping rows without lemmatised text
df_ = df_.dropna(subset=['tweet_text_lemma'])

### Create label-sample

In [None]:
# Getting the first dataset to label!
# sample_dataset draws its default of 100 rows (seed 42) from df_ and adds an
# empty `label` column; the result is written to Excel for manual labelling.
label = sample_dataset(df_)
label.to_excel('label_this.xlsx')

### Active learning loop

#### importing data and splitting into X and y

In [None]:
# Load the labelled CSV with an 80/20 train/test split, then treat every row of
# df_ whose index is not in the labelled frame as still unlabelled.
# NOTE(review): split_data reads the CSV without index_col, so labelled_df gets
# a fresh 0..n-1 index, while get_unlabelled matches on index values - confirm
# these line up with df_'s index.
X_train, X_test, y_train, y_test, labelled_df = split_data('label1.csv', test_data=True)
unlabelled_df = get_unlabelled(labelled_df, df_)

In [None]:
# pipeline to train on: token counts -> term-frequency weighting -> logistic regression
tokenizer = TweetTokenizer()

pipeline = Pipeline([ 
    # Count unigrams and bigrams produced by the tweet tokenizer.
    # max_df / min_df presumably drop terms found in more than 99.9 % or fewer
    # than 1 % of the documents - confirm against the CountVectorizer docs.
    ('cv', CountVectorizer(
        tokenizer=tokenizer.tokenize,
        ngram_range = (1, 2),
        max_df=0.999,
        min_df=0.01)
    ),
    # use_idf=False: no inverse-document-frequency weighting at this stage
    # (the grid search below also tries use_idf=True).
    ('tfidf', TfidfTransformer(use_idf = False)),
    ('logreg', LogisticRegression())
])

In [None]:
# Fit the whole pipeline on the training split of the labelled data.
pipeline.fit(X_train, y_train)

In [None]:
# Score the unlabelled tweets with the fitted pipeline, then export the 100
# rows with the lowest max_proba plus the already-labelled rows to label1.xlsx.
unlabelled_df = predict_unlabelled(pipeline, unlabelled_df)
label_new_set(unlabelled_df, labelled_df, 'label1')

### Checking the current score

In [None]:
# One kappa per pair of consecutive training rounds (see cohens_kappa).
kappas = cohens_kappa('label1.xlsx', df_)
# NOTE(review): `index` is not defined anywhere in this section - it must hold
# one entry per kappa value; confirm where it is meant to come from.
kappa_df = pd.DataFrame(kappas, columns=["Cohen's kappa score"], index=index)
# Raw string: `\ ` is not a valid escape sequence and triggers a warning on
# current Python versions. The file name written is unchanged (it contains a
# literal backslash followed by a space).
kappa_df.to_csv(r'cohens\ kappas.csv')

### Grid search

In [None]:
# Parameter grid for the search; keys use the `<step>__<param>` form that
# addresses the 'tfidf' and 'logreg' steps of the pipeline.
# NOTE(review): the pipeline's LogisticRegression() uses its default solver;
# 'l1' and 'elasticnet' presumably need a solver that supports them (and
# elasticnet an l1_ratio) - confirm these candidates actually fit.
parameter_grid = {
    'tfidf__use_idf': [False, True],
    'logreg__penalty': ['l1', 'l2', 'elasticnet'],
    'logreg__C': [0.1, 0.5, 1],
}

# Stratified 5-fold cross-validation
cv = StratifiedKFold(n_splits=5)

# Grid search over the pipeline; n_jobs = -1 and verbose=10 are passed through
# to GridSearchCV (parallelism and progress output).
search = GridSearchCV(
    pipeline, 
    parameter_grid,
    cv=cv, 
    n_jobs = -1,
    verbose=10
)

In [None]:
# Run the grid search on the training split.
search.fit(X_train, y_train)
# NOTE(review): this scores `pipeline`, not `search`. Presumably GridSearchCV
# fits clones, which would leave `pipeline` as fitted in the earlier cell -
# confirm whether search.score / search.best_estimator_ was intended.
pipeline.score(X_train, y_train)

In [None]:
# Confusion matrix on the training split (predictions come from `pipeline`).
y_pred = pipeline.predict(X_train)
conf = confusion_matrix(y_train, y_pred)

### Visualizing

In [None]:
# Predict the held-out split; this overwrites the training-split y_pred above.
y_pred = pipeline.predict(X_test)
y_test = np.array(y_test)
confusion_matrix(y_test, y_pred)

In [None]:
# Wrap the test-split confusion matrix in a DataFrame with readable labels.
# NOTE(review): assumes the order of `categories` matches the order in which
# confusion_matrix arranges the label values - confirm against the coding scheme.
categories = ['Neutral', 'Negative', 'Positive', 'Not climate']
logreg_eval = pd.DataFrame(
    confusion_matrix(
        y_test,
        y_pred
    ), 
    columns=categories, 
    index=categories
)

In [None]:
# Plot the confusion matrix; heatmap() also saves the figure as a PNG under
# the title string.
heatmap(
    logreg_eval, 
    'Logistic regression confusion-matrix'
)

In [None]:
# Classification report as a DataFrame (output_dict=True returns a dict that
# pandas turns into a table).
class_report_logreg = pd.DataFrame(
    classification_report(
        y_test, 
        y_pred, 
        target_names=categories, 
        output_dict=True
    )
)

# Keep only precision / recall / F1 for the four classes and print the table
# as markdown.
logreg_scores = class_report_logreg.loc[
    ['precision', 'recall', 'f1-score'],
    ['Neutral', 'Negative', 'Positive', 'Not climate']
]
print(logreg_scores.to_markdown())

### Cohen's kappa: intercoder reliability

In [None]:
# Labelled files from the two coders; the first column is the index.
df_1 = pd.read_excel('label7_1.xlsx', index_col=0)
df_2 = pd.read_excel('label7_2.xlsx', index_col=0)

# Compare the first 200 rows of each file.
labels_1 = df_1.head(200).label
labels_2 = df_2.head(200).label

# Print the score: a bare expression in the middle of a cell is not displayed.
print(cohen_kappa_score(labels_1, labels_2))

# Put coder 2's labels next to coder 1's (aligned on the index) and export
# the rows where the two disagree.
df_1.loc[:, '2_labels'] = df_2.loc[:, 'label'].copy()
# Bracket access is required: `df_1.2_labels` is a syntax error because an
# attribute name cannot start with a digit.
df_1.loc[df_1.label != df_1['2_labels']].to_csv('disagree_1_2.csv')

# Supervised learning

### Import libraries and initial setup

In [None]:
%reset
# for loading in data and splitting into test and train
import pandas as pd
from datasets import load_dataset
import numpy as np
from datasets import load_metric
import datasets
import tqdm

from transformers import AutoTokenizer
from attack.model_def import ElectraClassifier

# for fine tuning in pytorch with transformers trainer api
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
from transformers import EarlyStoppingCallback

#from transformers import ElectraModel
import torch
#import torch.nn.functional as F
#import torch.nn as nn
#from torch.utils.data import Dataset, RandomSampler, DataLoader

import os

### Define functions

In [None]:
def time_parser(string):
    """
    Parse a timestamp string and reduce it to its date (day-month-year).

    The value is formatted as ``%d-%m-%Y`` and parsed back with the same
    format, so the returned ``datetime.datetime`` keeps only the date.

    NOTE(review): relies on ``parse`` and ``datetime`` being in scope; neither
    is imported in this section's import cell after ``%reset`` - confirm.
    """
    day_only = parse(string).strftime("%d-%m-%Y")
    return datetime.datetime.strptime(day_only, "%d-%m-%Y")


def tokenize_function(examples):
    '''
    Tokenize the ``tweet_full_text`` field of ``examples`` with the
    module-level ``tokenizer``.

    Texts are padded or truncated to ``max_length=512`` so every encoded
    sequence fits the maximum input length the network accepts.
    '''
    texts = examples['tweet_full_text']
    return tokenizer(texts, max_length=512, padding='max_length', truncation=True)

def compute_metrics(eval_pred):
    '''
    Score one evaluation pass with the module-level ``metric``.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class is the
    argmax over the last axis of the logits.
    '''
    logits, references = eval_pred
    predicted = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predicted, references=references)

def get_labels(trainer, dataset):
    '''
    Predict ``dataset`` with ``trainer`` and return one class index per row.

    The class index is the argmax of each entry of ``predictions`` on the
    object returned by ``trainer.predict``. A progress message is printed once
    the prediction call has returned.
    '''
    output = trainer.predict(dataset)
    print('Done with the first part')
    return [np.argmax(row) for row in output.predictions]

def load_model(
    model_path='saving_models_attempt/nearly_done_full_model.pt',
    model_checkpoint='Maltehb/-l-ctra-danish-electra-small-cased',
):
    '''
    Load a saved model in evaluation mode together with its tokenizer.

    Args:
        model_path: File written by ``torch.save(model, ...)``. The default is
            the path this function always used.
        model_checkpoint: Hub id of the checkpoint whose tokenizer is loaded.

    Returns:
        ``(model, tokenizer)``.
    '''
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

    # torch.load unpickles the file: only load checkpoints you created or trust.
    model = torch.load(model_path)
    # Switch to evaluation mode before the model is used for prediction.
    model.eval()

    return model, tokenizer

def make_prediction(dataset):
    '''
    Run the module-level ``model`` on ``dataset`` and return the predicted
    class as an ``int``.

    NOTE(review): ``int(preds)`` presumably only works when ``preds`` holds a
    single element, i.e. one example per call - confirm against the caller.
    '''
    logits = model(dataset['input_ids'], dataset['attention_mask'])

    # torch.max along dim 1 returns (values, indices); only the indices
    # (the winning class per row) are used.
    _, preds = torch.max(logits, dim=1)
    return int(preds)

### Import data and clean-up

In [None]:
df_all = pd.read_csv('only_climate_tweets.csv', compression='zip')
# Replace the column outright: on pandas >= 2 a `.loc[:, col] = ...` assignment
# writes into the existing object column, which would break the date comparison.
df_all['tweet_created_at'] = df_all.tweet_created_at.apply(time_parser)
# Keep only tweets created after 2019-06-05.
df_all = df_all.loc[df_all.tweet_created_at > '2019-06-05']
# NOTE(review): no random_state is passed, so the 59,000-row sample differs
# between runs - confirm that is intended.
df_sample = df_all.sample(59000)
df_sample.to_csv('sample_for_prediction.csv')

### Import dataset, using the Hugging Face `datasets` loader

In [None]:
# Load the sampled CSV through datasets.load_dataset; its rows are read from
# the 'train' split in the cells below.
dataset = load_dataset('csv', data_files = ['sample_for_prediction.csv'])

### Initialise Ælæctra model and tokenize data sample

In [None]:
# Initialise the model for finetuning (classification head with 4 labels)
model = AutoModelForSequenceClassification.from_pretrained('Maltehb/-l-ctra-danish-electra-small-cased', num_labels=4)

# Initialise tokenizer and tokenize every split in batches
tokenizer = AutoTokenizer.from_pretrained("Maltehb/-l-ctra-danish-electra-small-cased")
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# The sampled tweets that get predicted after training
sample_predict_dataset = tokenized_datasets['train']

# NOTE(review): `dataset` above is loaded from a single CSV and only its
# 'train' split is used elsewhere; a 'test' split presumably does not exist
# for it - confirm which labelled train/test data these lines are meant to use.
# Small 8-example subsets (seed 42) for quick experiments, plus the full splits.
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(8))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(8))
full_train_dataset = tokenized_datasets['train']
full_eval_dataset = tokenized_datasets['test']

### Create Trainer object and train on training data

In [None]:
# Training configuration.
# NOTE(review): load_best_model_at_end=True with evaluation_strategy='epoch'
# presumably needs a matching save strategy in some transformers versions -
# confirm against the installed version.
training_args = TrainingArguments(
    output_dir='final_results',
    num_train_epochs=30,
    evaluation_strategy='epoch',     # computes metrics every epoch!
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.02,               # strength of weight decay higher means less overfitting
    logging_dir='./logs',            # directory for storing logs
    load_best_model_at_end=True,
    logging_steps=10,
    metric_for_best_model='accuracy'
)

# `metric` is read as a global by compute_metrics.
metric = load_metric("accuracy")
# Early stopping with a patience of 5.
cb = [EarlyStoppingCallback(early_stopping_patience=5)]


# Train on the full training split and evaluate on the full evaluation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=full_train_dataset,
    eval_dataset=full_eval_dataset,
    compute_metrics=compute_metrics,
    callbacks = cb
)

trainer.train()

In [None]:
# Predict a class index for every sampled tweet, attach it as `label_pred`
# and write the result to CSV.
labels = get_labels(trainer, sample_predict_dataset)
sample_predicted = sample_predict_dataset.add_column('label_pred', labels)
sample_predicted.to_csv('full_59000_predicted.csv')

In [None]:
# Save the whole model object (not just a state dict) to disk.
torch.save(model, 'saving_models_attempt/full_model.pt')

### Evaluate on test data

In [None]:
# Reload a saved model and tokenizer, then predict on the small eval subset.
# NOTE(review): load_model() reads 'nearly_done_full_model.pt' while the save
# cell writes 'full_model.pt' - confirm which checkpoint is intended.
# NOTE(review): make_prediction returns int(preds), which presumably expects a
# single example, while small_eval_dataset holds 8 - confirm.
model, tokenizer = load_model()
make_prediction(small_eval_dataset)

### Visualization