In [1]:
import pandas as pd

# Load the combined news dataset and preview its first rows.
DATASET_PATH = "data/processed/combined_news_type_dataset.csv"
df = pd.read_csv(DATASET_PATH)
df.head()


Unnamed: 0,text,subject,source
0,IRAN MAKES MAJOR Announcement About How They P...,politics,kaggle
1,Britain seeks new ways to detect explosives in...,worldnews,kaggle
2,Fox News Host Calls GOP Out On Voter ID Laws ...,News,kaggle
3,AUSTRIAN JUSTICE SYSTEM Gives Teen With Homema...,left-news,kaggle
4,What Katy Perry Did With This Gift John Mayer ...,entertainment,gossipcop


In [2]:
# Show how many rows carry each raw subject label.
subject_counts = df["subject"].value_counts()
print(subject_counts)



subject
politicsNews       11272
worldnews          10145
entertainment       9879
News                9050
politics            6841
left-news           4459
Government News     1570
US_News              783
Middle-east          778
Name: count, dtype: int64


In [3]:
# Define the mapping dictionary: raw subject label -> coarser category.
subject_map = {
    "politicsNews": "politics",
    "politics": "politics",
    "Government News": "politics",
    "left-news": "politics",
    "US_News": "national",
    "Middle-east": "world",
    "worldnews": "world",
    "News": "general",
    "entertainment": "entertainment"
}

# Apply the mapping. Series.map yields NaN for any label that has no entry
# in subject_map, so unmapped labels would otherwise go unnoticed.
df["subject_mapped"] = df["subject"].map(subject_map)

# Fail fast when a non-missing raw label is not covered by the mapping,
# instead of letting NaN targets flow silently into the label encoding step.
unmapped_subjects = df.loc[
    df["subject"].notna() & df["subject_mapped"].isna(), "subject"
].unique()
if len(unmapped_subjects) > 0:
    raise ValueError(
        f"Subjects missing from subject_map: {sorted(map(str, unmapped_subjects))}"
    )

# Check new class distribution (dropna=False so rows without a label are
# counted too rather than being hidden).
print(df["subject_mapped"].value_counts(dropna=False))


subject_mapped
politics         24142
world            10923
entertainment     9879
general           9050
national           783
Name: count, dtype: int64


In [15]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd


In [17]:
# Load the small English pipeline. Only lemmas and the lexical flags
# (is_stop / is_punct / is_space) are read from its output in this notebook,
# so the dependency parser and the entity recognizer are switched off to
# avoid running them on every article.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

def spacy_tokenizer(text):
    """Return ``text`` lower-cased and reduced to its space-joined lemmas.

    The lower-cased text is run through the module-level ``nlp`` pipeline.
    Stop words, punctuation tokens and whitespace tokens are left out; the
    lemmas of the remaining tokens are joined with single spaces.
    """
    lemmas = []
    for tok in nlp(text.lower()):
        if tok.is_stop or tok.is_punct or tok.is_space:
            continue
        lemmas.append(tok.lemma_)
    return " ".join(lemmas)

# Clean every article with spacy_tokenizer. Missing texts are replaced by an
# empty string first: astype(str) alone would turn NaN into the literal
# string "nan", which would then be kept as if it were a real word.
df["text_clean"] = df["text"].fillna("").astype(str).apply(spacy_tokenizer)


In [18]:
# Build a TF-IDF matrix over the cleaned text, with the vocabulary capped at
# 5000 terms.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split is made, so the vocabulary and IDF weights also reflect the rows that
# later end up in the test set — confirm this is acceptable for evaluation.
clean_corpus = df["text_clean"]
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(clean_corpus)


In [19]:
# Encode the mapped subject names as integer class ids.
le = LabelEncoder()
le.fit(df["subject_mapped"])
y = le.transform(df["subject_mapped"])


In [20]:
# Hold out 30% of the rows for testing; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split
# repeatable.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts


In [21]:
import joblib
from pathlib import Path

# Create the output folders up front so the writes below do not fail with a
# "no such file or directory" error when a folder does not exist yet.
for out_dir in ("models", "data/processed"):
    Path(out_dir).mkdir(parents=True, exist_ok=True)

# Persist the fitted vectorizer and label encoder so the same transformation
# can be reloaded later.
joblib.dump(tfidf, "models/spacy_tfidf_vectorizer.pkl")
joblib.dump(le, "models/label_encoder.pkl")

# Write the train/test feature matrices and labels as CSV files.
# NOTE(review): toarray() expands the matrices to dense form before writing,
# which can need a lot of memory and disk space; the CSV format is kept here
# so that existing readers of these files keep working.
pd.DataFrame(X_train.toarray()).to_csv("data/processed/X_train.csv", index=False)
pd.DataFrame(X_test.toarray()).to_csv("data/processed/X_test.csv", index=False)
pd.DataFrame({"label": y_train}).to_csv("data/processed/y_train.csv", index=False)
pd.DataFrame({"label": y_test}).to_csv("data/processed/y_test.csv", index=False)

