Use this code to generate "liwc_meta_with_keyword_distances.csv" used for regression analysis. 


In [3]:
import numpy as np
import pandas as pd
import os
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import euclidean

# Load GloVe 300-dimensional embeddings
glove = KeyedVectors.load_word2vec_format('~/Downloads/glove.6B/glove.6B.300d.txt', 
                                           binary=False, 
                                           no_header=True)

# Load the input 
df = pd.read_csv('~/Downloads/liwc_meta.csv')

# Function to get document embedding
def get_document_embedding(text, glove):
    words = text.split()  # Split text into words
    word_embeddings = []
    
    for word in words:
        if word in glove.key_to_index:
            word_embeddings.append(glove[word])
    
    if word_embeddings:
        return np.mean(word_embeddings, axis=0)  # Average the embeddings
    else:
        return np.zeros(glove.vector_size)  # Return a zero vector if no words are found

# Generate document embeddings
embeddings = np.array([get_document_embedding(text, glove) for text in df['text']])

# Define keyword pairs
keyword_pairs = [
    ("sovereign", "territory"),
    ("sovereign", "responsibility"),
    ("sovereign", "interven"),
    ("sovereign", "democra"),
    ("sovereign", "liberal"),
    ("sovereign", "colonial"), 
    ("sovereign", "westphalia"),
    ("sovereign", "rights"),
    ("sovereign", "war"),
    ("rights", "responsibility"),
    ("rights", "interven"),
    ("sovereign", "law")
]

# Create a new DataFrame to hold the distances
if keyword_pairs:
    for keyword1, keyword2 in keyword_pairs:
        if keyword1 in glove.key_to_index and keyword2 in glove.key_to_index:
            embedding1 = glove[keyword1].reshape(1, -1)
            embedding2 = glove[keyword2].reshape(1, -1)

            # Initialize lists for each keyword pair
            distances_cosine = []
            distances_euclidean = []

            for emb in embeddings:  # Now embeddings is defined
                # Calculate cosine similarity and convert to distance
                similarity1 = cosine_similarity(embedding1, emb.reshape(1, -1))[0][0]
                similarity2 = cosine_similarity(embedding2, emb.reshape(1, -1))[0][0]
                distance_cosine = 1 - (similarity1 + similarity2) / 2
                distances_cosine.append(distance_cosine)

                # Calculate Euclidean distance
                distance1 = euclidean(embedding1.flatten(), emb.flatten())
                distance2 = euclidean(embedding2.flatten(), emb.flatten())
                distance_euclidean = (distance1 + distance2) / 2
                distances_euclidean.append(distance_euclidean)

            # Append distances as new columns to the original DataFrame
            df[f'distance_{keyword1}_{keyword2}_cosine'] = distances_cosine
            df[f'distance_{keyword1}_{keyword2}_euclidean'] = distances_euclidean

    # Prepare output folder and file path
    output_folder = "~/Downloads"
    os.makedirs(output_folder, exist_ok=True)
    output_path = os.path.join(output_folder, "liwc_meta_with_keyword_distances_euclidean.csv")
    
    # Save the DataFrame to CSV
    df.to_csv(output_path, index=False)
    print(f"Distances between keywords computed and saved successfully in '{output_path}'.")
else:
    print("One or more keywords are not in the vocabulary.")


Distances between keywords computed and saved successfully in '~/Downloads/liwc_meta_with_keyword_distances.csv'.


In [4]:
print(df.head)


<bound method NDFrame.head of        Unnamed: 0 ccode_iso  year  session  \
0               1       AFG  1952        7   
1               2       AFG  1957       12   
2               3       AFG  1958       13   
3               4       AFG  1959       14   
4               5       AFG  1960       15   
...           ...       ...   ...      ...   
10563       10564       ZWE  2018       73   
10564       10565       ZWE  2019       74   
10565       10566       ZWE  2020       75   
10566       10567       ZWE  2021       76   
10567       10568       ZWE  2022       77   

                                                    text  Segment    WC  \
0      I consider it a great honour and privilege to ...        1   669   
1      It is my privilege to express to you, Mr. Pres...        1  1010   
2      Before using this opportunity to make a statem...        1  2775   
3      I speak for a small country, Afghanistan, whic...        1  3921   
4      At the outset, Mr. President, allow