In [14]:
import pandas as pd

# Load the uploaded CSV file
file_path = '/Users/miriam/Documents/GitHub/Qbias/allsides_balanced_news_headlines-texts.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
data.head()


Unnamed: 0.1,Unnamed: 0,title,tags,heading,source,text,bias_rating
0,0,Gun Violence Over Fourth of July Weekend,"['Protests', 'Fourth Of July', 'Gun Control An...",Chicago Gun Violence Spikes and Increasingly F...,New York Times (News),As Yasmin Miller drove home from a laundromat ...,left
1,1,Gun Violence Over Fourth of July Weekend,"['Protests', 'Fourth Of July', 'Gun Control An...",‘Bullets just came from nowhere’: Fourth of Ju...,Chicago Tribune,As many Chicagoans were celebrating the Fourth...,center
2,2,Gun Violence Over Fourth of July Weekend,"['Protests', 'Fourth Of July', 'Gun Control An...",Dozens of shootings across US mark bloody July...,New York Post (News),The nation’s 4th of July weekend was marred by...,right
3,3,Yellen Warns Congress of 'Economic Recession' ...,"['Janet Yellen', 'Debt Ceiling', 'Economic Pol...",Federal Government Will Run Out of Cash on Oct...,The Epoch Times,Treasury Secretary Janet Yellen on Tuesday war...,right
4,4,Yellen Warns Congress of 'Economic Recession' ...,"['Janet Yellen', 'Debt Ceiling', 'Economic Pol...",Yellen tells Congress that U.S. will run out o...,Washington Post,Treasury Secretary Janet Yellen on Tuesday tol...,left


In [13]:
# Convert non-string values to empty string for the 'text' column
data['text'] = data['text'].apply(lambda x: str(x) if isinstance(x, str) else '')

# Clean the text column again
data['text'] = data['text'].apply(clean_text)

# Display the cleaned data summary
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21754 entries, 0 to 21753
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        21754 non-null  object
 1   tags         21754 non-null  object
 2   heading      21754 non-null  object
 3   source       21746 non-null  object
 4   text         21754 non-null  object
 5   bias_rating  21754 non-null  object
 6   clean_text   21754 non-null  object
dtypes: object(7)
memory usage: 1.2+ MB


In [15]:
import re

# Step 1: Clean the text column
def clean_text(text):
    if isinstance(text, str):
        text = text.lower()  # Lowercase text
        text = re.sub(r'[^a-z\s]', '', text)  # Remove special characters and digits
        text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
        return text
    return ""

# Apply cleaning to the 'text' column
data['clean_text'] = data['text'].apply(clean_text)

# Display a sample of the cleaned text
data[['text', 'clean_text']].head()


Unnamed: 0,text,clean_text
0,As Yasmin Miller drove home from a laundromat ...,as yasmin miller drove home from a laundromat ...
1,As many Chicagoans were celebrating the Fourth...,as many chicagoans were celebrating the fourth...
2,The nation’s 4th of July weekend was marred by...,the nations th of july weekend was marred by t...
3,Treasury Secretary Janet Yellen on Tuesday war...,treasury secretary janet yellen on tuesday war...
4,Treasury Secretary Janet Yellen on Tuesday tol...,treasury secretary janet yellen on tuesday tol...


In [16]:
# Install spaCy and download the English model
!pip install spacy
!python -m spacy download en_core_web_sm

# Load spaCy's English tokenizer
import spacy
nlp = spacy.load("en_core_web_sm")

# Tokenize using spaCy
def spacy_tokenizer(text):
    doc = nlp(text)
    return [token.text for token in doc if not token.is_punct and not token.is_space]

# Apply spaCy tokenizer to clean text
data['spacy_tokens'] = data['clean_text'].apply(spacy_tokenizer)

# Display sample data
data[['clean_text', 'spacy_tokens']].head()


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


Unnamed: 0,clean_text,spacy_tokens
0,as yasmin miller drove home from a laundromat ...,"[as, yasmin, miller, drove, home, from, a, lau..."
1,as many chicagoans were celebrating the fourth...,"[as, many, chicagoans, were, celebrating, the,..."
2,the nations th of july weekend was marred by t...,"[the, nations, th, of, july, weekend, was, mar..."
3,treasury secretary janet yellen on tuesday war...,"[treasury, secretary, janet, yellen, on, tuesd..."
4,treasury secretary janet yellen on tuesday tol...,"[treasury, secretary, janet, yellen, on, tuesd..."


In [19]:
# Define a minimal list of English stopwords
manual_stopwords = set(["the", "a", "an", "is", "it", "to", "and", "in", "on", "for", "with", "of", "at", "by", "from", "as", "were", "was", "this"])

# Remove stopwords from tokens
data['filtered_tokens'] = data['spacy_tokens'].apply(lambda x: [word for word in x if word not in manual_stopwords])

# Display sample data
data[['spacy_tokens', 'filtered_tokens']].head()


Unnamed: 0,spacy_tokens,filtered_tokens
0,"[as, yasmin, miller, drove, home, from, a, lau...","[yasmin, miller, drove, home, laundromat, chic..."
1,"[as, many, chicagoans, were, celebrating, the,...","[many, chicagoans, celebrating, fourth, july, ..."
2,"[the, nations, th, of, july, weekend, was, mar...","[nations, th, july, weekend, marred, wrong, ki..."
3,"[treasury, secretary, janet, yellen, on, tuesd...","[treasury, secretary, janet, yellen, tuesday, ..."
4,"[treasury, secretary, janet, yellen, on, tuesd...","[treasury, secretary, janet, yellen, tuesday, ..."


In [20]:
from gensim.models import Word2Vec

# Step 1: Train Word2Vec embeddings on the filtered tokens
word2vec_model = Word2Vec(sentences=data['filtered_tokens'], vector_size=100, window=5, min_count=2, workers=4)

# Step 2: Check the nearest neighbors for the word "gun"
if "gun" in word2vec_model.wv:
    similar_words = word2vec_model.wv.most_similar("gun", topn=10)
else:
    similar_words = "Word 'gun' not found in vocabulary."

similar_words


[('background', 0.7790801525115967),
 ('control', 0.7700991034507751),
 ('violence', 0.7597283720970154),
 ('stiffen', 0.7140454649925232),
 ('purchasers', 0.7134820222854614),
 ('birth', 0.7109858989715576),
 ('advocates', 0.6902703642845154),
 ('checks', 0.6894843578338623),
 ('parental', 0.6825336813926697),
 ('spiraled', 0.679111123085022)]

In [24]:
from sklearn.model_selection import train_test_split
!pip install tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.utils import to_categorical

# Step 1: Prepare the data for training
# Define constants
MAX_VOCAB_SIZE = 10000
MAX_SEQUENCE_LENGTH = 100
EMBEDDING_DIM = 100

# Tokenize the text
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(data['clean_text'])
sequences = tokenizer.texts_to_sequences(data['clean_text'])

# Pad sequences
X = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')

# Encode the labels
label_mapping = {'left': 0, 'center': 1, 'right': 2}
y = data['bias_rating'].map(label_mapping).values
y = to_categorical(y, num_classes=3)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Step 2: Build the Neural Network
model = Sequential([
    Embedding(input_dim=MAX_VOCAB_SIZE, output_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH),
    SpatialDropout1D(0.2),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(3, activation='softmax')
])

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Display model summary
model.summary()


Collecting tensorflow
  Using cached tensorflow-2.16.2-cp39-cp39-macosx_10_15_x86_64.whl.metadata (4.1 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Using cached absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Using cached astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=23.5.26 (from tensorflow)
  Using cached flatbuffers-24.3.25-py2.py3-none-any.whl.metadata (850 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Using cached gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Using cached google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting h5py>=3.10.0 (from tensorflow)
  Using cached h5py-3.12.1-cp39-cp39-macosx_10_9_x86_64.whl.metadata (2.5 kB)
Collecting libclang>=13.0.0 (from tensorflow)
  Using cached libclang-18.1.1-py2.py3-none-macosx_10_9_x86_64.whl.metadata (5.2 kB)
Collecting ml-dtypes~=0.3.1 (from tensorfl

: 

In [None]:
# Tokenize the text
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(data['clean_text'])
sequences = tokenizer.texts_to_sequences(data['clean_text'])

# Pad sequences
X = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')

# Encode the labels
label_mapping = {'left': 0, 'center': 1, 'right': 2}
y = data['bias_rating'].map(label_mapping).values
y = to_categorical(y, num_classes=3)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Step 2: Build the Neural Network
model = Sequential([
    Embedding(input_dim=MAX_VOCAB_SIZE, output_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH),
    SpatialDropout1D(0.2),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(3, activation='softmax')
])

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Display model summary
model.summary()