In [22]:
import os
import random
import re
from collections import Counter
from typing import List, Dict, Tuple
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

In [23]:
# Read the training set, preview the first rows, and pull out the two columns
# (raw text and class label) that the later cells work with.
df = pd.read_csv(r"train.csv", sep=",", encoding="utf-8")
print(df.head())
texts, labels = (df[column].tolist() for column in ("text", "label"))
len(df)

                                                text  label
0  BBC set for major shake-up, claims newspaper L...      2
1  Marsh averts cash crunch Embattled insurance b...      2
2  Jeter, Yankees Look to Take Control (AP) AP - ...      1
3  Flying the Sun to Safety When the Genesis caps...      3
4  Stocks Seen Flat as Nortel and Oil Weigh  NEW ...      2


800

In [24]:
# NLTK's English stopword list, stored as a set; tokenize() uses it for
# membership checks when discarding tokens.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be
# downloaded beforehand — confirm for fresh environments.
stop_words = set(stopwords.words("english"))
def tokenize(text: str) -> List[str]:
    """Split ``text`` into lowercase alphabetic tokens.

    The text is lowercased and every run of the letters a-z delimited by word
    boundaries is taken as a candidate token. Candidates found in the
    module-level ``stop_words`` set, and candidates of two characters or
    fewer, are dropped.

    Args:
        text: Raw input string.

    Returns:
        The surviving tokens, in order of appearance (duplicates kept).
    """
    tokens: List[str] = []
    for candidate in re.findall(r"\b[a-z]+\b", text.lower()):
        if candidate in stop_words or len(candidate) <= 2:
            continue
        tokens.append(candidate)
    return tokens
# Flatten the tokens of every row's text into one corpus-wide list
# (duplicates kept, row order preserved).
all_words = [
    token
    for row in df.itertuples(index=False)
    for token in tokenize(row.text)
]

In [25]:
# Shared settings for the vocabulary-candidate selection cells below.
config = {
    # Lower cut-off: minimum corpus-wide count in the frequency and
    # class-specific cells, and passed as min_df to the TF-IDF vectorizer.
    "min_word_freq": 2,
    # Upper cut-off ratio: multiplied by the total token count to get the
    # maximum allowed count in the frequency cell, and passed as max_df to
    # the TF-IDF vectorizer.
    "max_word_freq_ratio": 0.5,
    # Number of words sampled into the final combined candidate list; the
    # TF-IDF cell keeps twice this many features.
    "vcand_size": 100
}

In [26]:
# Corpus-wide token statistics.
word_counts = Counter(all_words)
total_words = len(all_words)

# Accepted count window: not too rare, not too common.
min_freq = config["min_word_freq"]
max_freq = int(total_words * config["max_word_freq_ratio"])

# most_common() yields (word, count) pairs by descending count with ties in
# first-seen order, so filtering it directly gives the final ranking.
filtered_words = [
    pair for pair in word_counts.most_common()
    if not (pair[1] < min_freq or pair[1] > max_freq)
]
top_words_frequency = [pair[0] for pair in filtered_words]
len(top_words_frequency)

3089

In [27]:
# Rank words by their mean TF-IDF weight over all documents.
#
# Note on the shared config keys: here min_df is a document count and max_df
# is a proportion of documents, whereas the frequency cell applies the same
# two settings to raw token counts.
vectorizer = TfidfVectorizer(
    max_features=config["vcand_size"] * 2,  # vocabulary capped at 2x the final size
    tokenizer=tokenize,
    lowercase=True,
    min_df=config["min_word_freq"],
    max_df=config["max_word_freq_ratio"]
)
# fit_transform learns the vocabulary and builds the matrix in one pass;
# calling fit() and then transform() on the same texts tokenizes everything
# twice for the same result.
tfidf_matrix = vectorizer.fit_transform(texts)
feature_names = vectorizer.get_feature_names_out()
# Column-wise mean -> one score per feature; .A1 flattens the 1 x n_features
# matrix returned by mean() into a 1-D array.
avg_tfidf = tfidf_matrix.mean(axis=0).A1
word_scores = list(zip(feature_names, avg_tfidf))
# Stable sort: equal scores keep the order of feature_names.
word_scores.sort(key=lambda x: x[1], reverse=True)
top_words_tfidf = [word for word, _ in word_scores]
len(top_words_tfidf)

200

In [28]:
# Rank words by how strongly their occurrences concentrate in a single class.
#
# For each word: take its count in every class, compare the largest per-class
# count with the mean of the remaining classes, and normalise by the word's
# total count. The score is 1.0 when the word appears in one class only and
# 0.0 when it is spread evenly.
labels_order = [0, 1, 2, 3]

# Tokens of every document, grouped by the document's label.
class_words = {label: [] for label in labels_order}
for text, label in zip(texts, labels):
    class_words[label].extend(tokenize(text))
class_counts = {label: Counter(words) for label, words in class_words.items()}

# Union of the per-class vocabularies. Kept under its own name: rebinding the
# global `all_words` (the corpus token list used by the frequency cell) to a
# set would make that cell count every word once if it were re-run.
class_vocab = set()
for counts in class_counts.values():
    class_vocab |= set(counts.keys())

word_scores = []
for word in class_vocab:
    freqs = [class_counts[label].get(word, 0) for label in labels_order]
    total_freq = sum(freqs)
    if total_freq < config["min_word_freq"]:
        continue
    # Named distinctly so the frequency cell's `max_freq` threshold is not
    # overwritten by this loop.
    top_class_freq = max(freqs)
    avg_other = (total_freq - top_class_freq) / (len(freqs) - 1)
    diff_score = (top_class_freq - avg_other) / total_freq
    word_scores.append((word, diff_score, total_freq))

# Highest score first, then highest total count; the word itself is the last
# tie-break so the ranking does not depend on set iteration order (which
# varies between interpreter sessions).
word_scores.sort(key=lambda x: (-x[1], -x[2], x[0]))
top_words_class_specific = [word for word, _, _ in word_scores]
len(top_words_class_specific)

3089

In [29]:
# Final candidate list: words selected by all three rankings, then a
# reproducible random sample of at most config["vcand_size"] of them.
set_freq = set(top_words_frequency)
set_tfidf = set(top_words_tfidf)
set_class = set(top_words_class_specific)
# Sorted so the sampling population has a fixed order; a bare list(set) order
# changes between interpreter sessions and would defeat the seed below.
combined_vocab = sorted(set_freq & set_tfidf & set_class)
print(len(combined_vocab))
# random.sample raises ValueError when asked for more items than exist, so
# cap the request at the size of the intersection.
vcand_sample_size = min(config["vcand_size"], len(combined_vocab))
# Dedicated seeded generator: same sample on every Restart & Run All, without
# touching the global random state. An optional config["seed"] overrides 42.
top_words_combined = random.Random(config.get("seed", 42)).sample(
    combined_vocab, vcand_sample_size
)
print(len(top_words_combined))

200
100


In [30]:
# Persist each candidate vocabulary to its own text file, one word per line.
for out_name, vocab in (
    ("vcand_frequency.txt", top_words_frequency),
    ("vcand_tfidf.txt", top_words_tfidf),
    ("vcand_class_specific.txt", top_words_class_specific),
    ("vcand.txt", top_words_combined),
):
    with open(out_name, "w", encoding="utf-8") as f:
        f.writelines(word + "\n" for word in vocab)