In [17]:
import pandas as pd

# Read the headlines/texts CSV (path is relative to the notebook's working directory)
file_path = 'allsides_balanced_news_headlines-texts.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows
data.head(n=5)


Unnamed: 0.1,Unnamed: 0,title,tags,heading,source,text,bias_rating
0,0,Gun Violence Over Fourth of July Weekend,"['Protests', 'Fourth Of July', 'Gun Control An...",Chicago Gun Violence Spikes and Increasingly F...,New York Times (News),As Yasmin Miller drove home from a laundromat ...,left
1,1,Gun Violence Over Fourth of July Weekend,"['Protests', 'Fourth Of July', 'Gun Control An...",‘Bullets just came from nowhere’: Fourth of Ju...,Chicago Tribune,As many Chicagoans were celebrating the Fourth...,center
2,2,Gun Violence Over Fourth of July Weekend,"['Protests', 'Fourth Of July', 'Gun Control An...",Dozens of shootings across US mark bloody July...,New York Post (News),The nation’s 4th of July weekend was marred by...,right
3,3,Yellen Warns Congress of 'Economic Recession' ...,"['Janet Yellen', 'Debt Ceiling', 'Economic Pol...",Federal Government Will Run Out of Cash on Oct...,The Epoch Times,Treasury Secretary Janet Yellen on Tuesday war...,right
4,4,Yellen Warns Congress of 'Economic Recession' ...,"['Janet Yellen', 'Debt Ceiling', 'Economic Pol...",Yellen tells Congress that U.S. will run out o...,Washington Post,Treasury Secretary Janet Yellen on Tuesday tol...,left


In [18]:
import re

# Step 1: Clean the text column
def clean_text(text):
    """Normalise one raw article body for tokenisation.

    Non-string input (for example a missing value) yields an empty string.
    For strings, the text is lower-cased, every character that is not an
    ASCII letter ``a``-``z`` or whitespace is dropped (digits and punctuation
    included), and runs of whitespace are collapsed to a single space with
    leading/trailing whitespace removed.
    """
    # Guard clause: anything that is not a str is treated as "no text".
    if not isinstance(text, str):
        return ""

    lowered = text.lower()
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    single_spaced = re.sub(r'\s+', ' ', letters_only)
    return single_spaced.strip()

# Build the normalised version of every article body and store it next to the raw text
cleaned_column = data['text'].apply(clean_text)
data['clean_text'] = cleaned_column

# Show raw and cleaned text side by side for the first rows
data[['text', 'clean_text']].head()


Unnamed: 0,text,clean_text
0,As Yasmin Miller drove home from a laundromat ...,as yasmin miller drove home from a laundromat ...
1,As many Chicagoans were celebrating the Fourth...,as many chicagoans were celebrating the fourth...
2,The nation’s 4th of July weekend was marred by...,the nations th of july weekend was marred by t...
3,Treasury Secretary Janet Yellen on Tuesday war...,treasury secretary janet yellen on tuesday war...
4,Treasury Secretary Janet Yellen on Tuesday tol...,treasury secretary janet yellen on tuesday tol...


In [19]:
# Install spaCy and download the English model
!pip install spacy
!python -m spacy download en_core_web_sm

# Load spaCy's English tokenizer
import spacy
nlp = spacy.load("en_core_web_sm")

# Tokenize using spaCy
def spacy_tokenizer(text):
    """Split ``text`` into a list of token strings using spaCy's tokenizer.

    Punctuation and whitespace tokens are discarded.

    Only the tokenizer is run (``nlp.make_doc``) rather than the full
    pipeline (``nlp(text)``). The token boundaries and the ``is_punct`` /
    ``is_space`` flags used below come from the tokenizer and the vocabulary,
    so the tagger, parser and NER components would only add run time per
    article without changing the returned list.

    Parameters
    ----------
    text : str
        Text to tokenise.

    Returns
    -------
    list of str
        Token texts in document order.
    """
    doc = nlp.make_doc(text)
    return [token.text for token in doc if not (token.is_punct or token.is_space)]

# Tokenise every cleaned article and keep the token lists in a new column
token_lists = data['clean_text'].apply(spacy_tokenizer)
data['spacy_tokens'] = token_lists

# Preview cleaned text next to its tokens
data[['clean_text', 'spacy_tokens']].head()


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m100.5 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.




Unnamed: 0,clean_text,spacy_tokens
0,as yasmin miller drove home from a laundromat ...,"[as, yasmin, miller, drove, home, from, a, lau..."
1,as many chicagoans were celebrating the fourth...,"[as, many, chicagoans, were, celebrating, the,..."
2,the nations th of july weekend was marred by t...,"[the, nations, th, of, july, weekend, was, mar..."
3,treasury secretary janet yellen on tuesday war...,"[treasury, secretary, janet, yellen, on, tuesd..."
4,treasury secretary janet yellen on tuesday tol...,"[treasury, secretary, janet, yellen, on, tuesd..."


In [20]:
# Small hand-picked set of English stopwords
manual_stopwords = {
    "the", "a", "an", "is", "it", "to", "and", "in", "on", "for",
    "with", "of", "at", "by", "from", "as", "were", "was", "this",
}


def _without_stopwords(tokens):
    """Return ``tokens`` minus every entry found in ``manual_stopwords``, order preserved."""
    return [tok for tok in tokens if tok not in manual_stopwords]


# Filter each row's token list
data['filtered_tokens'] = data['spacy_tokens'].apply(_without_stopwords)

# Preview tokens before and after filtering
data[['spacy_tokens', 'filtered_tokens']].head()


Unnamed: 0,spacy_tokens,filtered_tokens
0,"[as, yasmin, miller, drove, home, from, a, lau...","[yasmin, miller, drove, home, laundromat, chic..."
1,"[as, many, chicagoans, were, celebrating, the,...","[many, chicagoans, celebrating, fourth, july, ..."
2,"[the, nations, th, of, july, weekend, was, mar...","[nations, th, july, weekend, marred, wrong, ki..."
3,"[treasury, secretary, janet, yellen, on, tuesd...","[treasury, secretary, janet, yellen, tuesday, ..."
4,"[treasury, secretary, janet, yellen, on, tuesd...","[treasury, secretary, janet, yellen, tuesday, ..."


In [8]:
from gensim.models import Word2Vec

# Step 1: Fit Word2Vec on the stopword-filtered token lists
word2vec_model = Word2Vec(
    sentences=data['filtered_tokens'],
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
)

# Step 2: Ten nearest neighbours of "gun", or a message if the word is not in the vocabulary
similar_words = (
    word2vec_model.wv.most_similar("gun", topn=10)
    if "gun" in word2vec_model.wv
    else "Word 'gun' not found in vocabulary."
)

similar_words


[('control', 0.7836002111434937),
 ('background', 0.7492357492446899),
 ('birth', 0.7369154095649719),
 ('insure', 0.7249948382377625),
 ('violence', 0.7224950194358826),
 ('reauthorization', 0.7164033651351929),
 ('advocates', 0.7134428024291992),
 ('bicameral', 0.699687123298645),
 ('antirioting', 0.6910364627838135),
 ('passions', 0.6902425289154053)]

In [15]:
import numpy as np
from tensorflow.keras import Input
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences

# --- Step 1: Prepare the Embedding Matrix ---
# NOTE(review): `tokenizer` and `MAX_VOCAB_SIZE` are not defined in any cell
# visible here; they must be created by an earlier cell for this one to run
# after "Restart & Run All".

# Take the dimensionality from the trained model so the matrix cannot drift
# out of sync with the `vector_size` used when training Word2Vec.
embedding_dim = word2vec_model.wv.vector_size
vocab_size = min(MAX_VOCAB_SIZE, len(tokenizer.word_index) + 1)

# Seeded generator so the out-of-vocabulary rows are identical on every run.
OOV_SEED = 42
oov_rng = np.random.default_rng(OOV_SEED)

# The embedding layer is frozen, so OOV rows are never corrected by training.
# Draw them with the same spread as the trained vectors instead of unit variance.
oov_scale = float(word2vec_model.wv.vectors.std())

# Rows start at zero; any index the loop below never assigns stays all-zero.
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in tokenizer.word_index.items():
    if i >= vocab_size:
        continue  # index falls outside the capped vocabulary
    if word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]  # Use Word2Vec vector
    else:
        # Reproducible random vector for words Word2Vec did not keep
        embedding_matrix[i] = oov_rng.normal(scale=oov_scale, size=embedding_dim)

# --- Step 2: Define the Neural Network ---
# NOTE(review): `MAX_SEQUENCE_LENGTH` is not defined in any cell visible here;
# it must come from an earlier cell.
#
# The pretrained matrix is supplied through `embeddings_initializer` and the
# sequence length through an explicit `Input` layer. The legacy Embedding
# arguments `weights=[...]` and `input_length=...` are deprecated or rejected
# depending on the Keras version, whereas this form is accepted by both the
# tf.keras 2.x and Keras 3 APIs.
model = Sequential([
    Input(shape=(MAX_SEQUENCE_LENGTH,), dtype="int32"),
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        embeddings_initializer=Constant(embedding_matrix),
        trainable=False  # Freeze the embeddings
    ),
    SpatialDropout1D(0.2),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(3, activation='softmax')  # 3 classes: left, center, right
])

# Compile the model; categorical cross-entropy pairs with the 3-way softmax output
# (presumably the labels are one-hot encoded — confirm where y_train is built)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# --- Step 3: Train the Model ---
# NOTE(review): X_train / y_train / X_test / y_test are not defined in any cell
# visible here. Also, the validation metrics reported during training are
# computed on the same test split that is evaluated in Step 4.
EPOCHS = 5
BATCH_SIZE = 32

history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
)

# --- Step 4: Evaluate the Model ---
test_metrics = model.evaluate(X_test, y_test)
loss, accuracy = test_metrics
print(f"Test Accuracy: {accuracy:.4f}")


Epoch 1/5
[1m433/544[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m9s[0m 89ms/step - accuracy: 0.4696 - loss: 1.0498

KeyboardInterrupt: 