# Semantic network analysis

This notebook exemplifies how we build the semantic networks. The notebook was run on all three datasets.

In [None]:
# download the spaCy language model that is loaded for the POS tagging further down
# (swap in the line matching the dataset's language)
# for Danish: !python -m spacy download da_core_news_sm
# for Polish: !python -m spacy download pl_core_news_sm
!python -m spacy download de_core_news_sm

In [None]:
import json
import networkx as nx
import pandas as pd
import ast
import spacy
from collections import Counter
from collections import defaultdict
from community import community_louvain
import os

## Preparing the data

In [None]:
# set working directory; the relative data path used when loading the CSV resolves against it
# NOTE(review): absolute, machine-specific Windows path - adjust before running elsewhere
os.chdir(r'C:\Users\maril\Documents\20-21 KU\block 4\DM\twitter')

In [None]:
# load the preprocessed German dataset (path is relative to the current working directory)
# NOTE(review): the backslash path separator only resolves on Windows
df = pd.read_csv(r'final_data_preprocess\de_preprocess.csv')

In [None]:
# function to turn the tokenized/lemmatized list into a readable format
def string_list(text):
    """Turn the string representation of a list into an actual list.

    The CSV stores list columns as their ``repr`` (e.g. "['a', 'b']"), which
    pandas reads back as plain strings.

    Args:
        text: A string holding a Python list literal, or a list that has
            already been parsed.

    Returns:
        The parsed list. A value that is already a list is returned unchanged,
        so applying the function twice (e.g. re-running the cell) is harmless.

    Raises:
        ValueError, SyntaxError: If a string is not a valid Python literal.
    """
    # already parsed: ast.literal_eval would raise on a list, so pass it through
    if isinstance(text, list):
        return text

    # literal_eval only evaluates literals, it never executes arbitrary code
    return ast.literal_eval(text)

In [None]:
# parse the stringified list columns: YOU NEED TO SPECIFY ALL RELEVANT COLUMNS HERE
for column in ('token', 'lemma', 'token_no_mention', 'lemma_no_mention'):
    df[column] = df[column].apply(string_list)

# show the size and the first rows of the dataframe
print(df.shape)
df.head(3)

## Part of Speech Tagging

In [None]:
# spacy model: load the language pipeline that 'pos_tagging' applies to every text

# for Danish: nlp = spacy.load('da_core_news_sm')
# for Polish: nlp = spacy.load('pl_core_news_sm')
nlp = spacy.load('de_core_news_sm')

In [None]:
# function to conduct the POS tagging

def pos_tagging(keep_types, texts=None, model=None):
    """Collect the lemmas of all non-stopword tokens that have a wanted POS type.

    Args:
        keep_types: Collection of POS tags to keep; compared against each
            token's ``pos_`` attribute.
        texts: Iterable of text strings to tag. Defaults to the
            'preprocess_no_mention' column of the global ``df``.
        model: Callable that turns one text into an iterable of tokens exposing
            ``is_stop``, ``pos_`` and ``lemma_``. Defaults to the global ``nlp``.

    Returns:
        A list of (lowercased lemma, POS type) tuples, one per kept token, in
        the order the tokens were encountered. Duplicates are preserved.
    """
    # fall back to the notebook-level dataframe and pipeline when nothing is passed in
    if texts is None:
        texts = df['preprocess_no_mention']
    if model is None:
        model = nlp

    # words to keep
    keep = []

    for text in texts:

        # apply the pipeline and walk through the resulting tokens
        for token in model(text):

            # remove stopwords
            if token.is_stop:
                continue

            # keep the token only if its POS type is one we asked for
            if token.pos_ in keep_types:
                keep.append((token.lemma_.lower(), token.pos_))

    return keep

In [None]:
# run the POS tagging and keep only nouns, verbs and adjectives

# for Danish: only keep nouns and adjectives

keep_output = pos_tagging(keep_types={'NOUN', 'VERB', 'ADJ'})

In [None]:
# collect the distinct lemmas we want to keep (dropping the POS tag) in a set
keep_output_words = {lemma for lemma, _pos in keep_output}
print(keep_output_words)

In [None]:
# word frequencies: an overview of the most frequent words lets us adjust for
# very rare and very frequent words

# NOTE: the lemma lists come from an earlier preprocessing step in a different notebook and
# therefore contain more words than the ones we want to keep - that is handled in the cells below

# count every lemma occurrence across all tweets
count = Counter(
    lemma
    for lemmas in df['lemma_no_mention']
    for lemma in lemmas
)

# print the 60 most frequent words
print(count.most_common(60))

In [None]:
# remove very rare words

# keep only the words that occur at least 20 times; 'count' itself stays untouched
count_final = Counter({word: freq for word, freq in count.items() if freq >= 20})

In [None]:
# create the co-occurrence matrix (here: in the form of a dict of dicts)

# initiate defaultdict: com[w1][w2] -> number of co-occurrences of w1 and w2
com = defaultdict(lambda: defaultdict(int))

# iterate through the list of lemmas in the 'lemma_no_mention' column
for line in df['lemma_no_mention']:

    # filter once per tweet instead of once per pair: a pair only counts if both words passed
    # the POS filter and the minimum-frequency filter
    kept = [w for w in line if w in keep_output_words and w in count_final]

    # pair every remaining word with every word after it, so each pair of positions is
    # counted exactly once and a position is never paired with itself
    # NOTE(review): a word that appears twice in one tweet still yields a (word, word) pair
    for first_pos, first in enumerate(kept):
        for second in kept[first_pos + 1:]:

            # sorting makes the storage triangular: a pair always lives under its
            # alphabetically smaller word, i.e. com[w1][w2] with w1 <= w2
            w1, w2 = sorted([first, second])
            com[w1][w2] += 1

## Non-symmetric association values

This section works with the raw (unnormalized) co-occurrence counts. The symmetric association values inspired by Fuhse et al. (2020) are created further down.

In [None]:
# create an independent copy of 'com' holding the raw counts, because 'com' itself is
# normalized into symmetric association values further down
# the inner dicts are copied as well: a plain com.copy() is shallow and would share them, so
# the in-place normalization of 'com' would silently overwrite the raw counts too
com_uns = defaultdict(
    lambda: defaultdict(int),
    {word: defaultdict(int, row) for word, row in com.items()},
)

### Co-occurring words

In [None]:
# we are interested in the words that most frequently co-occur with the word 'impfung' (ENG: 'vaccine')
# retrieve the 60 most co-occurring words

term_of_interest = 'impfung'

# the co-occurrence dict is triangular (each pair is stored only under its alphabetically
# smaller word), so the partners of a term are spread over its own row AND over the rows of
# all words that sort before it - both have to be collected
neighbour_counts = dict(com_uns.get(term_of_interest, {}))
for other, row in com_uns.items():
    if other != term_of_interest and term_of_interest in row:
        neighbour_counts[other] = row[term_of_interest]

# rank the partners by co-occurrence count and keep only the words
co_occur_uns = sorted(neighbour_counts.items(), key=lambda x: x[1], reverse=True)[:60]
co_occur_uns = [tup[0] for tup in co_occur_uns]
print(co_occur_uns)

### Edgelist

In [None]:
# for the network, we need a submatrix for these terms and their respective co-occurrences
# among each other

# one [node, node, weight] entry for every stored pair whose two words are both in the
# co_occur_uns list
edgelist_uns = [
    [term, node, weight]
    for term in co_occur_uns
    for node, weight in com_uns[term].items()
    if node in co_occur_uns
]

In [None]:
# turn the edgelist into a dataframe with one row per edge (source, target, weight)
edge_df_uns = pd.DataFrame(edgelist_uns, columns=['source', 'target', 'weight'])
# preview the first rows
edge_df_uns.head()

### Graph

In [None]:
# create an undirected, weighted graph; the 'weight' column becomes the edge attribute
G_uns = nx.from_pandas_edgelist(edge_df_uns, source='source', target='target', edge_attr='weight')

### Community detection (Louvain algorithm)

In [None]:
# Louvain community detection on weighted graph
# a fixed random_state is passed (presumably so the partition is reproducible - confirm in the library docs)
partition_uns = community_louvain.best_partition(G_uns, weight='weight', random_state=40)

# check the communities (the loops below use this as a mapping node -> community id)
partition_uns

In [None]:
# store each node's community id as the node attribute 'community'
for node, community_id in partition_uns.items():
    G_uns.nodes[node]['community'] = community_id

In [None]:
# change wd to the output folder; the files written below end up here
# NOTE(review): absolute, machine-specific Windows path - adjust before running elsewhere
os.chdir(r'C:\Users\maril\Documents\20-21 KU\block 4\DM\twitter\semantic_net')

In [None]:
# export to Gephi (GEXF file, written to the current working directory)
nx.write_gexf(G_uns, 'semantic_net_unsymmetric.gexf')

## Sampling tweets from the clusters

In [None]:
# group the terms by the community they were assigned to

# for Danish: there were six clusters, so c5 needs to be added here

# communities 0-3 each get their own list; every other community id is collected in c4
clusters = defaultdict(list)
for term, community_id in partition_uns.items():
    slot = community_id if community_id in (0, 1, 2, 3) else 4
    clusters[slot].append(term)

c0, c1, c2, c3, c4 = [clusters[slot] for slot in range(5)]

In [None]:
# get tweets from all the clusters

# for every cluster, collect the row positions of the tweets that contain the term of interest
# together with at least one word of that cluster
cluster_terms = (c0, c1, c2, c3, c4)
cluster_indices = [set() for _ in cluster_terms]

# enumerate yields row positions, which is what the later df.iloc lookup expects
for position, lemmas in enumerate(df['lemma_no_mention']):

    # a set makes the repeated membership tests below cheap
    lemma_set = set(lemmas)

    # only consider tweets that mention the term of interest ('impfung' for the German data)
    if term_of_interest not in lemma_set:
        continue

    # a tweet can belong to several clusters at once
    for terms, indices in zip(cluster_terms, cluster_indices):
        if any(word in lemma_set for word in terms):
            indices.add(position)

c0_indices, c1_indices, c2_indices, c3_indices, c4_indices = cluster_indices

In [None]:
# subset the dataframe by these indices and draw a random sample of up to 25 tweets per cluster
sample_size = 25

cluster_samples = []
for indices in (c0_indices, c1_indices, c2_indices, c3_indices, c4_indices):

    # all tweets matched for this cluster (the indices are row positions)
    matches = df.iloc[list(indices)]

    # sampling more rows than exist raises a ValueError, so small clusters are taken completely
    # NOTE: no random_state is set, so the drawn tweets differ between runs
    cluster_samples.append(matches.sample(min(sample_size, len(matches))))

c0_df, c1_df, c2_df, c3_df, c4_df = cluster_samples

In [None]:
# show the first three rows of every sampled dataframe
for cluster_df in (c0_df, c1_df, c2_df, c3_df, c4_df):
    display(cluster_df.head(3))

In [None]:
# save the tweet texts of every sample to its own Excel file
cluster_exports = {
    'c0_cluster_de.xlsx': c0_df,
    'c1_cluster_de.xlsx': c1_df,
    'c2_cluster_de.xlsx': c2_df,
    'c3_cluster_de.xlsx': c3_df,
    'c4_cluster_de.xlsx': c4_df,
}
for filename, cluster_df in cluster_exports.items():
    cluster_df['text'].to_excel(filename, index=False)

## Symmetric association values

Fuhse et al. (2020) create "symmetric association values": the number of co-occurrences $C_{ij}$ of two terms divided by the product of their overall frequencies $F_i$ and $F_j$, i.e. $A_{ij} = \frac{C_{ij}}{F_i \cdot F_j}$.

In [None]:
# calculate symmetric association values
# NOTE: the values in 'com' are overwritten in place, so running this cell a second time
# divides the already normalized values again

# walk through every row of the co-occurrence dict
for row_term, row in com.items():

    # rows whose term has no overall frequency are left untouched
    if row_term not in count_final:
        continue

    row_freq = count_final[row_term]

    for col_term in row:

        # same check for the second term of the pair
        if col_term in count_final:

            # co-occurrence count divided by the product of the two overall frequencies
            row[col_term] = row[col_term] / (row_freq * count_final[col_term])

## Co-occurring words

In [None]:
# we are interested in the words with the highest association to the word "impfung" (ENG: vaccine)
# retrieve the 35 most strongly associated words

term_of_interest = 'impfung'

# the co-occurrence dict is triangular (each pair is stored only under its alphabetically
# smaller word), so the partners of a term are spread over its own row AND over the rows of
# all words that sort before it - both have to be collected
neighbour_values = dict(com.get(term_of_interest, {}))
for other, row in com.items():
    if other != term_of_interest and term_of_interest in row:
        neighbour_values[other] = row[term_of_interest]

# rank the partners by association value and keep only the words
co_occur = sorted(neighbour_values.items(), key=lambda x: x[1], reverse=True)[:35]
co_occur = [tup[0] for tup in co_occur]

## Network edgelist

In [None]:
# for the network, we need a submatrix for these terms and their respective co-occurrences
# among each other

# one [node, node, weight] entry for every stored pair whose two words are both in the
# co_occur list
edgelist = [
    [term, node, weight]
    for term in co_occur
    for node, weight in com[term].items()
    if node in co_occur
]

In [None]:
# turn the edgelist into a dataframe with one row per edge (source, target, weight)
edge_df = pd.DataFrame(edgelist, columns=['source', 'target', 'weight'])
# preview the first rows
edge_df.head()

## Creating the network

In [None]:
# create an undirected, weighted graph; the 'weight' column becomes the edge attribute
G = nx.from_pandas_edgelist(edge_df, source='source', target='target', edge_attr='weight')

In [None]:
# check how many terms ended up as nodes in the graph
G.number_of_nodes()

In [None]:
# Louvain community detection on weighted graph
# the seed is fixed (same value as for the unnormalized network above) so that the partition,
# and with it the exported communities, is reproducible across runs
partition = community_louvain.best_partition(G, weight='weight', random_state=40)

In [None]:
# check the communities (the loop below uses this as a mapping node -> community id)
partition

In [None]:
# store each node's community id as the node attribute 'community'
for node, community_id in partition.items():
    G.nodes[node]['community'] = community_id

In [None]:
# export to Gephi (GEXF file, written to the current working directory)
nx.write_gexf(G, 'semantic_net_symmetric.gexf')