# **word2vec implementation for political leanings**

https://rare-technologies.com/deep-learning-with-word2vec-and-gensim/

https://www.kaggle.com/datasets/umbertogriffo/googles-trained-word2vec-model-in-python/code?datasetId=12162&sortBy=voteCount

In [4]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.matutils import corpus2csc
from sklearn.feature_extraction.text import CountVectorizer

from wordcloud import WordCloud
import matplotlib.pyplot as plt

import re
import contractions
import numpy as np
import pandas as pd

In [5]:
# Load dataset
df = pd.read_csv("./data/2019_2.csv")
print(df.shape)
df.head()

(50269, 10)


Unnamed: 0,id,date_publish,outlet,headline,lead,body,authors,domain,url,political_leaning
0,52972702,2019-07-01 00:00:00,NPR,Iran Says It Exceeded Enriched Uranium Cap,Iran's foreign minister confirms that his coun...,Iran Says It Exceeded Enriched Uranium Cap\nIr...,,www.npr.org,https://www.npr.org/2019/07/01/737600999/iran-...,LEFT
1,4037078,2019-07-01 00:00:00,The New York Times,Bombing Kills Dozens in Kabul as Taliban Talks...,Attackers set off bombs that wrecked a war mus...,"KABUL, Afghanistan — A complex attack includin...",Thomas Gibbons-Neff;Rod Nordland,www.nytimes.com,https://www.nytimes.com/2019/07/01/world/asia/...,LEFT
2,52904870,2019-07-01 00:00:00,NPR,How Long Will The Current U.S. Economic Expans...,"NPR's Steve Inskeep talks to David Wessel, dir...",How Long Will The Current U.S. Economic Expans...,,www.npr.org,https://www.npr.org/2019/07/01/737535414/how-l...,LEFT
3,18311035,2019-07-01 00:00:00,BBC,Black girls 'perceived as less innocent by US ...,New research from The Georgetown Law Center on...,Video\nNew research from The Georgetown Law Ce...,,www.bbc.com,http://www.bbc.com/news/av/world-us-canada-404...,UNDEFINED
4,4061861,2019-07-01 00:00:00,The New York Times,Pete Buttigieg Raised $24.8 Million in Second ...,Mr. Buttigieg’s total is the latest evidence t...,WASHINGTON — Mayor Pete Buttigieg of South Ben...,Thomas Kaplan,www.nytimes.com,https://www.nytimes.com/2019/07/01/us/politics...,LEFT


In [6]:
#lemmatization and removing stopwords
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def preprocess(text):
    def is_english_word(word):
        """Function to filter out non-English words."""
        return bool(re.match(r'^[a-zA-Z]+$', word))
    text = text.lower()
    editorials_to_exclude = ["cnn", "fox", "reuters"]
    for editorial in editorials_to_exclude:
        text = re.sub(r"\b" + re.escape(editorial) + r"\b", "", text, flags=re.IGNORECASE)
    text = contractions.fix(text)
    words = word_tokenize(text)
    words = [
            lemmatizer.lemmatize(word) 
            for word in words 
            if word not in stop_words and is_english_word(word)
        ]
    words = [re.sub(r'[^\w\s]', '', token) for token in words if re.sub(r'[^\w\s]', '', token)]
    words = [word for word in words if word]
    return words[:5000]

# Create Word2Vec Corpus for headline and body
class MyCorpus:
    def __init__(self, texts):
        self.texts = texts

    def __iter__(self):
        for text in self.texts:
            yield preprocess(text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ALEJANDRO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ALEJANDRO\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ALEJANDRO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
headline = df['headline']#.apply(preprocess)
body = df['body']#.apply(preprocess)

In [8]:
# Create Word2Vec model for headline and body separately
headline_corpus = MyCorpus(headline)
body_corpus = MyCorpus(body)

In [9]:
headline_model = Word2Vec(sentences=headline_corpus) #,vector_size=100, window=5, min_count=1, workers=4)
body_model = Word2Vec(sentences=body_corpus) #,vector_size=100, window=5, min_count=1, workers=4)

In [10]:
for index, word in enumerate(headline_model.wv.index_to_key):
    if index == 10:
        break
    print(f"word #{index}/{len(headline_model.wv.index_to_key)} is {word}")

word #0/7511 is trump
word #1/7511 is say
word #2/7511 is new
word #3/7511 is police
word #4/7511 is u
word #5/7511 is china
word #6/7511 is democrat
word #7/7511 is shooting
word #8/7511 is house
word #9/7511 is johnson


In [11]:
vector_Trump = headline_model.wv['trump']
print(vector_Trump)

vector_Abortion = headline_model.wv['abortion']
print(vector_Trump)

[-0.2367585   0.55916655  1.0645618   0.7336459  -1.3328441  -0.7641349
  1.0223736   1.0448487  -1.1596005  -1.4593971  -0.02156663 -0.6333029
 -0.29212296  0.39364266 -0.06683157 -0.6610192  -0.18043107 -0.8112339
 -0.96861386 -1.0873138   0.56825334 -0.10102818  0.38629338 -0.11525298
  0.91786367 -0.27058426 -0.6660524  -0.78718555 -0.42906767 -0.02413273
  0.5271197   0.08535359  0.9772652  -0.40858665 -0.31021193 -0.43167362
  0.08019218 -0.13841344  0.6650526  -1.6351038  -0.65190774  0.42510575
 -0.33795014 -1.0885165   0.40009487 -0.96452665 -0.64715     0.9959647
  0.28871265  0.9041853  -0.23219971  0.28751174 -0.8142752  -0.12368076
  1.4399587   0.04220992  0.62613404 -0.29919705 -0.0623568  -0.5649454
 -0.08017629  0.28489992  0.81683815 -0.37241918 -1.0314667   0.0226124
  0.11279571  0.1579838  -1.7159396   0.33597183  0.04047381  0.5213772
 -0.8490265  -0.19223702  0.8210401   0.46775627  0.3600397   0.59452283
 -1.1771429  -0.3192185   0.38618296 -0.35849229  0.277752

In [12]:
import tempfile

headline_model.save('./models/headline_model')
body_model.save('./models/body_model')

In [13]:
from sklearn.manifold import TSNE
import random

# Retrieve the weights from the model. This is used for initializing the weights
# in a Keras Embedding layer later
w2v_weights = headline_model.wv.vectors
vocab_size, embedding_size = w2v_weights.shape

print("Vocabulary Size: {} - Embedding Dim: {}".format(vocab_size, embedding_size))

# Some validation on the quality of the Word2Vec model
print(headline_model.wv.most_similar('trump', topn=3))
print(headline_model.wv.most_similar('biden', topn=3))
print(headline_model.wv.most_similar('abortion', topn=3))
print(headline_model.wv.most_similar(positive=['trump', 'biden'], topn=3))

Vocabulary Size: 7511 - Embedding Dim: 100
[('pelosi', 0.7955394983291626), ('lawmaker', 0.747930109500885), ('dems', 0.7463365197181702)]
[('walsh', 0.9306197762489319), ('arpaio', 0.9298338890075684), ('kamala', 0.8947131633758545)]
[('temporarily', 0.9445393681526184), ('allows', 0.9336333274841309), ('enforcement', 0.9268770217895508)]
[('walsh', 0.9102801084518433), ('arpaio', 0.8938741683959961), ('scarborough', 0.8577998280525208)]


In [14]:
def word2token(word):
    try:
        return headline_model.wv.vocab[word].index
    except KeyError:
        return 0
def token2word(token):
    return headline_model.wv.index_to_key[token]

In [19]:
import random
n_samples = 500
# Sample random words from model dictionary
random_i = random.sample(range(vocab_size), n_samples)
random_w = [token2word(i) for i in random_i]

# Generate Word2Vec embeddings of each word
word_vecs = np.array([[headline_model.wv.get_index(w)] for w in random_w])

# Apply t-SNE to Word2Vec embeddings, reducing to 2 dims
tsne = TSNE()
tsne_e = tsne.fit_transform(word_vecs)

# Plot t-SNE result
plt.figure(figsize=(32, 32))
plt.scatter(tsne_e[:, 0], tsne_e[:, 1], marker='o', c=range(len(random_w)), cmap=plt.get_cmap('Spectral'))

for label, x, y, in zip(random_w, tsne_e[:, 0], tsne_e[:, 1]):
    plt.annotate(label,
                 xy=(x, y), xytext=(0, 15),
                 textcoords='offset points', ha='right', va='bottom',
                 bbox=dict(boxstyle='round, pad=0.2', fc='yellow', alpha=0.1))

ValueError: n_components=2 must be between 1 and min(n_samples, n_features)=1 with svd_solver='randomized'

### **Text classification**

https://towardsdatascience.com/google-news-and-leo-tolstoy-visualizing-word2vec-word-embeddings-with-t-sne-11558d8bd4d


https://stackoverflow.com/questions/49643974/how-to-do-text-classification-using-word2vec

https://stackoverflow.com/questions/57525190/text-classification-with-word2vec

https://radimrehurek.com/gensim/similarities/docsim.html#gensim.similarities.docsim.Similarity

https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html

https://www.kaggle.com/code/guichristmann/lstm-classification-model-with-word2vec

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim

from torch.nn.utils.rnn import pad_sequence
import torch

# Convert text to sequences of Word2Vec embeddings
def get_word2vec_sequence(text, model):
    words = preprocess(text)
    word_vectors = [torch.tensor(model.wv[word], dtype=torch.float32) for word in words if word in model.wv]
    return torch.stack(word_vectors) if word_vectors else torch.zeros((1, model.vector_size))

# Convert the corpus into padded sequences
def get_padded_sequences(corpus, model, max_length=500):
    sequences = [get_word2Svec_sequence(text, model) for text in corpus]
    padded_sequences = pad_sequence(sequences, batch_first=True)  # Pad to the longest sequence
    return padded_sequences[:, :max_length]  # Optional: Truncate to `max_length`

In [None]:
class PoliticalLeaningDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = sequences
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx]

In [None]:
# Prepare sequences
headline_sequences = get_padded_sequences(headline_corpus, headline_model)
body_sequences = get_padded_sequences(body_corpus, body_model)

# Encode labels
label_encoder = LabelEncoder()
political_leaning_encoded = label_encoder.fit_transform(df['political_leaning'])

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    headline_sequences, political_leaning_encoded, test_size=0.2, random_state=42
)

# Create PyTorch datasets
train_dataset = PoliticalLeaningDataset(X_train, y_train)
test_dataset = PoliticalLeaningDataset(X_test, y_test)

In [None]:
# Create DataLoader instances for batching
train_dataset = PoliticalLeaningDataset(X_train, y_train)
test_dataset = PoliticalLeaningDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [None]:
# Define the neural network model
class PoliticalLeaningModel(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(PoliticalLeaningModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, output_dim)  # Output layer

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        x = self.fc4(x)
        return x

In [None]:
# Instantiate the model
input_dim = embeddings.shape[1]  # This is the size of the feature vector (headline + body)
output_dim = len(label_encoder.classes_)  # Number of political leaning classes
model = PoliticalLeaningModel(input_dim, output_dim)

# Set up the loss function and optimizer
criterion = nn.CrossEntropyLoss()  # Cross-entropy loss for multi-class classification
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# Training the model
def train_model(model, train_loader, criterion, optimizer, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            running_loss += loss.item()

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / len(train_loader):.4f}")

# Test the model
def test_model(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = correct / total
    print(f"Test Accuracy: {accuracy:.4f}")

# Train and evaluate the model
train_model(model, train_loader, criterion, optimizer, num_epochs=10)
test_model(model, test_loader)

In [None]:
# Predict political leaning on new data
def predict(model, data):
    model.eval()
    with torch.no_grad():
        outputs = model(torch.tensor(data, dtype=torch.float32))
        _, predicted = torch.max(outputs, 1)
    return predicted.numpy()

# Predict political leaning for test data
predicted_labels = predict(model, X_test)

# Decode the predicted labels back to original political leaning classes
predicted_political_leaning = label_encoder.inverse_transform(predicted_labels)

# Print first 10 predictions
for i in range(10):
    print(f"Predicted: {predicted_political_leaning[i]}, Actual: {label_encoder.inverse_transform([y_test[i]])[0]}")

# **Pre trained word2vec**

https://radimrehurek.com/gensim/similarities/docsim.html

In [None]:
import gensim

model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

### document2word

In [None]:
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Sample documents
documents = [
    TaggedDocument("I love machine learning", [0]),
    TaggedDocument("Machine learning is the future", [1]),
    TaggedDocument("Deep learning is a subset of machine learning", [2])
]

# Train Doc2Vec model
model = Doc2Vec(documents, vector_size=100, window=5, min_count=2, epochs=40)

# Infer vector for a new document
new_document = "I enjoy natural language processing"
new_vector = model.infer_vector(new_document.split())

# Find most similar documents
similar_docs = model.dv.most_similar([new_vector])

print(similar_docs)