Project: 3 News Topic Classification

Problem Statement:
Categorize news articles into politics, sports, entertainment, etc.

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [10]:
os.chdir(os.getcwd().replace('notebooks', 'data'))

df = pd.read_csv('data.csv')


In [11]:
df.sample(10)


Unnamed: 0,Class Index,Title,Description
42474,1,Federal finance minister expecting 'significan...,Canadian Press - CALGARY (CP) - The federal go...
81012,1,Zimbabwe deports South African trade union fac...,HARARE : Zimbabwean immigration officials orde...
104366,2,The match that always bears a grudge,"In sport few teams attract universal love, and..."
102558,2,Power romps over rival,So much for the intense match between long-tim...
30888,3,Britain #39;s trade gap widens as oil imports ...,Britain #39;s international trade gap widened ...
522,3,Director Leaves Hollinger Inc. Board,"Hollinger Inc., th #39;e Toronto-based holding..."
104141,2,"Hewitt routs Gaudio, eases into Masters Cup semis","Houston, TX (Sports Network) - Third-seeded Ll..."
91119,1,"Two Palestinians killed in Gaza Strip, teen ki...",JERUSALEM : Two Palestinians were killed near ...
55967,2,France #39;s Trezeguet may need shoulder opera...,Juventus striker David Trezeguet may have to u...
11737,1,S.Korea's Ruling Party Head Resigns on Father'...,SEOUL (Reuters) - The chairman of South Korea...


In [12]:
df.columns = ['label', 'title', 'description']


In [13]:
df.sample(10)


Unnamed: 0,label,title,description
80000,4,Cassini set to pierce moon #39;s haze,The Cassini spacecraft is set to reveal the cl...
125030,1,Varig Future in Doubt Amid Takeover Talks,"The future of Varig, Brazil's flagship airline..."
21948,1,Bangladesh at standstill in fourth opposition ...,DHAKA : Bangladesh was brought to a standstill...
38565,4,New Online Music: Easy Listening from Stelios,The service is the latest in a portfolio of bu...
80096,4,Intel invests in McCaw #39;s Clearwire,"The Santa Clara, Calif. semiconductor giant ye..."
122535,4,Juniper poaches Cisco execs,Juniper Networks lands two top Cisco Systems e...
124842,2,Players: Season in Peril,Veterans Chis Pronger and Jim McKenzie don't b...
103150,4,IBM tops server speed test,IBM #39;s new top-end Power5-based Unix server...
98809,3,"When love comes to MarketWatch, I #39;m gonna ...",Although the combined companies wont hurt each...
55332,1,Leaders Launch Drive to Curb Polio in Africa,"KANO, Nigeria (Reuters) - Political leaders a..."


In [14]:
df.sample(10)


Unnamed: 0,label,title,description
122539,4,Photo: Weather data to your desktop,"This WeatherBug Backyard weather station, dubb..."
16850,4,DoubleClick Signs on MSN to Use Its Rich Media...,Reuters - A top executive for Web marketing\co...
120223,1,Turkish Leader Calls for EU Membership Talks,Turkish Prime Minister Recep Tayyip Erdogan is...
28630,1,Israeli Armored Vehicles Mass in Gaza: Witnesses,Israeli armored vehicles backed by helicopters...
32161,2,"No. 18 Auburn 43, Mississippi St. 14",Carnell Williams ran for 122 yards and two tou...
68671,4,More Flaws With Microsoft,"Again, yesterday Microsoft (Nasdaq: MSFT) warn..."
34282,3,Krispy Kreme quarterly report delayed,Shares of Krispy Kreme Doughnuts Inc. fell yes...
115002,3,U.S. Mortgage Rates Rise in Latest Week (Reuters),Reuters - Interest rates on U.S. 30-year and\1...
19717,2,Check Fridays and Mondays for weekly team upda...,"quot; I admit it, most of my notes for this g..."
34363,4,IBM Covets Half of China Business Computer Mar...,Reuters - International Business Machines Corp...


In [15]:
df['news_text'] = df['title'] + ' ' + df['description']


In [16]:
df.head()


Unnamed: 0,label,title,description,news_text
0,3,Fears for T N pension after talks,Unions representing workers at Turner Newall...,Fears for T N pension after talks Unions repre...
1,4,The Race is On: Second Private Team Sets Launc...,"SPACE.com - TORONTO, Canada -- A second\team o...",The Race is On: Second Private Team Sets Launc...
2,4,Ky. Company Wins Grant to Study Peptides (AP),AP - A company founded by a chemistry research...,Ky. Company Wins Grant to Study Peptides (AP) ...
3,4,Prediction Unit Helps Forecast Wildfires (AP),AP - It's barely dawn when Mike Fitzpatrick st...,Prediction Unit Helps Forecast Wildfires (AP) ...
4,4,Calif. Aims to Limit Farm-Related Smog (AP),AP - Southern California's smog-fighting agenc...,Calif. Aims to Limit Farm-Related Smog (AP) AP...


In [18]:
df = df[['news_text', 'label']]



In [19]:
df.isna().sum()


news_text    0
label        0
dtype: int64

In [20]:
df.dropna(inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)


In [21]:
df.shape


(127600, 2)

In [22]:
df.duplicated().sum()


0

In [23]:
X, y = df['news_text'], df['label']


In [24]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [26]:

import re
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()


def convert_text(text_series):
    return text_series.apply(lambda text: ' '.join(
        stemmer.stem(word) for word in re.findall(r'\b\w+\b', text.lower())
    ))


In [27]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier


In [28]:
model = Pipeline([
    ('preprocess', FunctionTransformer(convert_text, validate=False)),
    ('tfidf', TfidfVectorizer(stop_words='english', max_features=8000, ngram_range=(1, 2))),
    ('clf', MultinomialNB())
])


In [29]:
model.fit(X_train, y_train)


In [30]:
model.score(X_test, y_test)


0.9043887147335423

In [None]:
models = {
    'Logistic Regression' : LogisticRegression(),
    'Random Forest' : RandomForestClassifier(),
    'Support Vector Machine' : SVC(),
    'Decision Tree' : DecisionTreeClassifier(),
    'ExtraTreeClassifier' : ExtraTreeClassifier(),
    'SVC' : SVC(),
    'Multinomial Naive Bayes' : MultinomialNB(),
    'Bernoulli Naive Bayes' : BernoulliNB(),
    'AdaBoost' : AdaBoostClassifier(),
    'Gradient Boosting' : GradientBoostingClassifier(),
    
}


for name, current_model in models.items():
    print(f"Checking accuracy for {name}")
    model = Pipeline([
        ('preprocess', FunctionTransformer(convert_text, validate=False)),
        ('tfidf', TfidfVectorizer(stop_words='english', max_features=8000, ngram_range=(1, 2))),
        ('clf', current_model)
    ])

    model.fit(X_train, y_train)
    print(f"Accuracy for {name} is {model.score(X_test, y_test)}")
    print("*"*50)
    print("\n")



Checking accuracy for Multinomial Naive Bayes
Accuracy for Multinomial Naive Bayes is 0.9528535980148883
**************************************************


Checking accuracy for Bernoulli Naive Bayes
Accuracy for Bernoulli Naive Bayes is 0.9586435070306039
**************************************************


Checking accuracy for AdaBoost




Accuracy for AdaBoost is 0.9387923904052936
**************************************************


Checking accuracy for Gradient Boosting
Accuracy for Gradient Boosting is 0.9346567411083541
**************************************************




In [None]:
from sklearn.model_selection import GridSearchCV

pipeline = Pipeline([
    ('preprocess', FunctionTransformer(convert_text, validate=False)),
    ('tfidf', TfidfVectorizer()),
    ('clf', BernoulliNB())
])

# Define parameter grid
param_grid = {
    'tfidf__max_df': [0.9, 1.0],
    'tfidf__min_df': [1, 3, 5],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'tfidf__max_features': [5000, 8000, 10000],
    'clf__alpha': [0.1, 0.5, 1.0],  # Laplace smoothing
    'clf__binarize': [0.0, 0.5, 1.0],
    'clf__fit_prior': [True, False]
}

# Run GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)

grid_search.fit(X_train, y_train)

# Best score and params
print("Best accuracy:", grid_search.best_score_)
print("Best parameters:", grid_search.best_params_)
