In [13]:
import numpy as np
import pandas as pd
from collections import defaultdict


In [14]:
import pickle

file = 'pklFiles/df.pkl'
fileobj = open(file, 'rb')

df = pickle.load(fileobj)

fileobj.close()

print(type(df))


df.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,category,title,ENCODED_CATEGORY
0,tech,Computer grid to help the world\n\nYour comput...,4
1,tech,Gadget growth fuels eco concerns\n\nTechnology...,4
2,tech,Sony wares win innovation award\n\nSony has ta...,4
3,tech,New Year's texting breaks record\n\nA mobile p...,4
4,tech,Players sought for $1m prize\n\nUK gamers are ...,4


In [15]:
#News Headlines

X  = df['title']

#Encoded News Categories

y = df['ENCODED_CATEGORY']

In [16]:
np.random.seed(42)
shuffled_df = df.sample(frac=1).reset_index(drop=True)

split_ratio = 0.8
split_index = int(len(shuffled_df) * split_ratio)

X_train = shuffled_df['title'][:split_index]
y_train = shuffled_df['ENCODED_CATEGORY'][:split_index]

X_test = shuffled_df['title'][split_index:]
y_test = shuffled_df['ENCODED_CATEGORY'][split_index:]

In [17]:
print("Shape of X: ", X.shape)
print("Shape of y: ", y.shape)


print("\n")
print("Shape of X_train: " +  str(X_train.shape))
print("Shape of y_train: " +  str(y_train.shape))
print("\n")
print("Shape of X_test: " +  str(X_test.shape))
print("Shape of y_test: " +  str(y_test.shape))

Shape of X:  (2435,)
Shape of y:  (2435,)


Shape of X_train: (1948,)
Shape of y_train: (1948,)


Shape of X_test: (487,)
Shape of y_test: (487,)


Feature Selection: TF-IDF Approach

In [18]:
class TfidfVectorizer:
    def __init__(self, max_df=0.95, min_df=2):
        self.vocabulary = {}
        self.document_count = None
        self.max_df = max_df
        self.min_df = min_df
        self.idf = None
        
    def _build_vocabulary(self, documents):
        doc_freq = defaultdict(int)
        term_freq = defaultdict(lambda: defaultdict(int))
        
        for doc_idx, doc in enumerate(documents):
            if pd.isna(doc):
                continue
                
            words = str(doc).lower().split()
            seen_words = set()
            
            for word in words:
                term_freq[doc_idx][word] += 1
                if word not in seen_words:
                    doc_freq[word] += 1
                    seen_words.add(word)
        
        n_docs = len(documents)
        
        valid_terms = {
            term: freq for term, freq in doc_freq.items()
            if self.min_df <= freq <= n_docs * self.max_df
        }
        
        self.vocabulary = {term: idx for idx, term in enumerate(valid_terms)}
        self.document_count = n_docs
        
        self.idf = {
            term: np.log(n_docs / (freq + 1)) + 1
            for term, freq in doc_freq.items()
            if term in self.vocabulary
        }
        
        return term_freq
    
    def fit_transform(self, documents):
        term_freq = self._build_vocabulary(documents)
        return self._create_tfidf_matrix(documents, term_freq)
    
    def _create_tfidf_matrix(self, documents, term_freq):
        n_docs = len(documents)
        n_terms = len(self.vocabulary)
        matrix = np.zeros((n_docs, n_terms))
        
        for doc_idx, doc in enumerate(documents):
            if pd.isna(doc):
                continue
                
            doc_terms = term_freq[doc_idx]
            max_freq = max(doc_terms.values()) if doc_terms else 1
            
            for term, freq in doc_terms.items():
                if term in self.vocabulary:
                    term_idx = self.vocabulary[term]
                    tf = 0.5 + 0.5 * (freq / max_freq)  # normalized tf
                    matrix[doc_idx, term_idx] = tf * self.idf[term]
        
        return matrix
    
    def transform(self, documents):
        term_freq = defaultdict(lambda: defaultdict(int))
        
        for doc_idx, doc in enumerate(documents):
            if pd.isna(doc):
                continue
                
            words = str(doc).lower().split()
            for word in words:
                term_freq[doc_idx][word] += 1
                
        return self._create_tfidf_matrix(documents, term_freq)

In [19]:
tfidf = TfidfVectorizer(max_df=0.7)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print("Training set shape:", X_train_tfidf.shape)
print("Test set shape:", X_test_tfidf.shape)

Training set shape: (1948, 4193)
Test set shape: (487, 4193)


In [20]:
class SVM:
    def __init__(self, learning_rate=0.01, lambda_param=0.01, n_iters=100):
        self.lr = learning_rate
        self.lambda_param = lambda_param
        self.n_iters = n_iters
        self.w = None
        self.b = None
        
    def fit(self, X, y):
        n_samples, n_features = X.shape
        
        self.w = np.zeros(n_features)
        self.b = 0
        
        y_ = np.where(y <= 0, -1, 1)
        
        batch_size = 32
        for _ in range(self.n_iters):
            indices = np.random.permutation(n_samples)
            
            for i in range(0, n_samples, batch_size):
                batch_indices = indices[i:min(i + batch_size, n_samples)]
                X_batch = X[batch_indices]
                y_batch = y_[batch_indices]
                
                margin = y_batch * (np.dot(X_batch, self.w) - self.b)
                mask = margin < 1
                
                dw = self.lambda_param * self.w
                dw -= np.sum(X_batch[mask] * y_batch[mask, np.newaxis], axis=0)
                self.w -= self.lr * dw / batch_size
                
                db = -np.sum(y_batch[mask])
                self.b -= self.lr * db / batch_size
                
    def predict(self, X):
        return np.sign(np.dot(X, self.w) - self.b)

In [21]:
from sklearn import metrics
n_classes = len(np.unique(y_train))
classifiers = []

params = {
    'learning_rate': 0.01,
    'lambda_param': 0.01,
    'n_iters': 100
}

for i in range(n_classes):
    print(f"Training classifier for class {i}")
    binary_y = np.where(y_train == i, 1, -1)
    
    svm = SVM(**params)
    svm.fit(X_train_tfidf, binary_y)
    classifiers.append(svm)

def predict_multiclass(X):
    predictions = np.zeros((X.shape[0], n_classes))
    for i, clf in enumerate(classifiers):
        predictions[:, i] = clf.predict(X)
    return np.argmax(predictions, axis=1)

pred = predict_multiclass(X_test_tfidf)

print("\nSupport Vector Machine (TF-IDF Approach)")
print(f"Accuracy Score: {metrics.accuracy_score(y_test, pred):.2%}")
print("\nConfusion Matrix:")
print(metrics.confusion_matrix(y_test, pred))

Training classifier for class 0
Training classifier for class 1
Training classifier for class 2
Training classifier for class 3
Training classifier for class 4
Training classifier for class 5

Support Vector Machine (TF-IDF Approach)
Accuracy Score: 81.11%

Confusion Matrix:
[[108   0   0   1   5   0]
 [ 18  66   0   2   0   0]
 [ 15   5  67   0   1   0]
 [  8   3   0  86   0   0]
 [ 25   5   3   0  68   0]
 [  1   0   0   0   0   0]]


Feature Selection : Bag of Words(BOW) Approach

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
import pickle

X_train_clean = X_train.fillna('')
X_test_clean = X_test.fillna('')

count_vectorizer = CountVectorizer(max_features=5000)
count_X_train = count_vectorizer.fit_transform(X_train_clean)
count_X_test = count_vectorizer.transform(X_test_clean)

with open('pklFiles/count_vectorizer.pkl', 'wb') as file:
    pickle.dump(count_vectorizer, file)

svm_classifier = SVC(C=1.0, kernel='linear', gamma='auto')
svm_classifier.fit(count_X_train, y_train)

with open('pklFiles/svm_classifier.pkl', 'wb') as file:
    pickle.dump(svm_classifier, file)

pred = svm_classifier.predict(count_X_test)

print("Support Vector Machine (BOW Approach)\n")
print(f"Accuracy Score: {accuracy_score(y_test, pred):.2%}\n")
print("Confusion Matrix:")
print(confusion_matrix(y_test, pred))

Support Vector Machine (BOW Approach)

Accuracy Score: 87.68%

Confusion Matrix:
[[92  1  4  4 13  0]
 [ 2 73  3  4  4  0]
 [ 0  0 85  3  0  0]
 [ 0  0  1 95  1  0]
 [ 7  4  3  5 82  0]
 [ 0  0  0  1  0  0]]


In [23]:
import pickle
count_vectorizer = pickle.load(open('pklFiles/count_vectorizer.pkl', 'rb'))
svm_classifier = pickle.load(open('pklFiles/svm_classifier.pkl', 'rb'))

In [24]:
encoded_categories = {
    0: 'business',
    1: 'entertainment',
    2: 'politics',
    3: 'sport',
    4: 'tech'
}

user_headline = [input("Enter News Headline: ")]
headline_count = count_vectorizer.transform(user_headline)

pred = svm_classifier.predict(headline_count)
print("Predicted Category: ", encoded_categories[pred[0]])

Predicted Category:  sport
