<a href="https://colab.research.google.com/github/Kenwoll/data-eng-project/blob/main/model/categorizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

In [None]:
# Read the two BBC News CSV files from the working directory; the first one
# carries the Category column that the models below are trained on.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('BBC News Train.csv', 'BBC News Test.csv')
)
df_train.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [None]:
import string

punctuation = string.punctuation
# Translation table that deletes every ASCII punctuation character in a single pass.
_PUNCT_TABLE = str.maketrans('', '', punctuation)


def strip_punctuation_and_split(text):
    """Delete ASCII punctuation from ``text`` and split the result on whitespace.

    Punctuation characters are removed, not replaced by spaces, so hyphenated
    words are fused (e.g. 'ex-boss' becomes 'exboss').

    Args:
        text: The raw article text, or a token list produced by an earlier run
            of this cell.

    Returns:
        A list of whitespace-separated tokens with punctuation removed.
    """
    if isinstance(text, list):
        # Already tokenised: the cell was re-run. Return it unchanged instead of
        # joining the tokens into one giant word.
        return text
    return text.translate(_PUNCT_TABLE).split()


df_train['Text'] = df_train['Text'].apply(strip_punctuation_and_split)
df_test['Text'] = df_test['Text'].apply(strip_punctuation_and_split)
df_train.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,"[worldcom, exboss, launches, defence, lawyers,...",business
1,154,"[german, business, confidence, slides, german,...",business
2,1101,"[bbc, poll, indicates, economic, gloom, citize...",business
3,1976,"[lifestyle, governs, mobile, choice, faster, b...",tech
4,917,"[enron, bosses, in, 168m, payout, eighteen, fo...",business


In [None]:
# ArticleId is only an identifier and is not read by any later cell shown here.
# Reassigning the result (rather than inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution no longer raises KeyError
# because the column is already gone.
df_train = df_train.drop(columns='ArticleId', errors='ignore')
df_test = df_test.drop(columns='ArticleId', errors='ignore')

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK data packages used by this notebook. The download log below
# shows that both calls are no-ops when the packages are already up to date.
# 'stopwords' provides the English stop-word list used by the filtering cell.
nltk.download('stopwords')
# NOTE(review): 'punkt' is presumably fetched for word_tokenize, which is imported
# above but not called in any cell shown here - confirm it is still needed.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
stop_words = stopwords.words('english')
# Set copy of the list: constant-time membership tests instead of a linear scan
# of the whole stop-word list for every single token.
stop_word_set = set(stop_words)


def remove_stop_words(tokens):
    """Drop English stop words from a token list and re-join with single spaces.

    Args:
        tokens: A list of word tokens, or a string already produced by an
            earlier run of this cell.

    Returns:
        One space-separated string containing the tokens that are not stop words.
    """
    if isinstance(tokens, str):
        # Already filtered and joined: the cell was re-run. Iterating a string
        # would yield single characters and wreck the text.
        return tokens
    # NOTE(review): punctuation was stripped earlier, so contractions reach this
    # point without their apostrophe (e.g. "dont") and presumably no longer match
    # the NLTK entries that contain one - confirm whether that matters.
    return ' '.join(token for token in tokens if token not in stop_word_set)


df_train['Text'] = df_train['Text'].apply(remove_stop_words)
df_test['Text'] = df_test['Text'].apply(remove_stop_words)
df_train.head()

Unnamed: 0,Text,Category
0,worldcom exboss launches defence lawyers defen...,business
1,german business confidence slides german busin...,business
2,bbc poll indicates economic gloom citizens maj...,business
3,lifestyle governs mobile choice faster better ...,tech
4,enron bosses 168m payout eighteen former enron...,business


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled articles for evaluation; the fixed random_state
# makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_train['Text'],
    df_train['Category'],
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

# Bag-of-words counts followed by TF-IDF re-weighting.
ft_pipe = Pipeline(steps=[
    ('count_vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
])
# The pipeline is fitted on the training texts only; the hold-out texts are
# transformed with the already-fitted pipeline.
X_train_tf = ft_pipe.fit_transform(X_train)
X_test_tf = ft_pipe.transform(X_test)
# Shape of the raw training Series (number of training documents).
X_train.shape

(1192,)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the category -> integer mapping on the training labels, then apply the
# same mapping to both label sets.
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

### Multinomial Naive Bayes (MultinomialNB) model

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Train multinomial Naive Bayes on the TF-IDF features (fit returns the
# estimator itself), then predict the hold-out split.
mnb = MultinomialNB().fit(X_train_tf, y_train_encoded)
pred = mnb.predict(X_test_tf)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Both metrics take (y_true, y_pred) in that order. Accuracy is symmetric, so the
# printed number is unchanged, but the call now matches the documented signature
# and the classification_report call below.
print("nb: ", accuracy_score(y_test_encoded, pred))
print(classification_report(y_test_encoded, pred))

nb:  0.9630872483221476
              precision    recall  f1-score   support

           0       0.96      0.97      0.97        75
           1       1.00      0.98      0.99        46
           2       0.91      0.95      0.93        56
           3       0.97      1.00      0.98        63
           4       0.98      0.91      0.95        58

    accuracy                           0.96       298
   macro avg       0.97      0.96      0.96       298
weighted avg       0.96      0.96      0.96       298



### Logistic Regression model

In [None]:
from sklearn.linear_model import LogisticRegression

# Train logistic regression with default settings on the TF-IDF features
# (fit returns the estimator itself), then predict the hold-out split.
lr = LogisticRegression().fit(X_train_tf, y_train_encoded)
pred = lr.predict(X_test_tf)

In [None]:
# Both metrics take (y_true, y_pred) in that order. Accuracy is symmetric, so the
# printed number is unchanged, but the call now matches the documented signature
# and the classification_report call below.
print("lr: ", accuracy_score(y_test_encoded, pred))
print(classification_report(y_test_encoded, pred))

lr:  0.9630872483221476
              precision    recall  f1-score   support

           0       0.92      0.97      0.95        75
           1       1.00      0.98      0.99        46
           2       0.96      0.95      0.95        56
           3       0.97      1.00      0.98        63
           4       0.98      0.91      0.95        58

    accuracy                           0.96       298
   macro avg       0.97      0.96      0.96       298
weighted avg       0.96      0.96      0.96       298



### Random Forest model with word embedding

In [None]:
from gensim.models import Word2Vec
import numpy as np
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Word2Vec expects each document as a list of tokens, so split the cleaned,
# space-joined texts back into words.
X_train_word = list(map(str.split, X_train))
X_test_word = list(map(str.split, X_test))

In [None]:
# Train the embeddings on the training documents only.
# A fixed seed plus a single worker thread makes training repeatable: with
# several workers, thread scheduling changes the result from run to run, so the
# downstream Random Forest score could not be reproduced. (gensim additionally
# requires a fixed PYTHONHASHSEED for bit-for-bit identical runs.)
word2vec_model = Word2Vec(
    X_train_word,
    vector_size=100,
    window=5,
    min_count=1,  # keep every token seen in the training documents
    workers=1,
    seed=42,
)
# Keyed vectors: supports `token in word_vectors` and `word_vectors[token]`.
word_vectors = word2vec_model.wv

In [None]:
def get_doc_vector(tokens, model):
    """Embed a document as the mean of its in-vocabulary token vectors.

    Args:
        tokens: Iterable of word tokens making up the document.
        model: Keyed-vector lookup supporting ``token in model`` and
            ``model[token]``, and exposing ``vector_size`` (the embedding
            dimension).

    Returns:
        A 1-D array of length ``model.vector_size``. When none of the tokens is
        in the vocabulary (or ``tokens`` is empty) a zero vector is returned.
    """
    vectors = [model[token] for token in tokens if token in model]
    if not vectors:
        # np.mean([]) would emit a RuntimeWarning and return a scalar NaN, which
        # breaks the rectangular feature matrix the classifier expects.
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)

In [None]:
# One averaged embedding per document, for the training and hold-out corpora.
X_train_word2vec, X_test_word2vec = (
    [get_doc_vector(doc, word_vectors) for doc in corpus]
    for corpus in (X_train_word, X_test_word)
)

In [None]:
# Train a 20-tree random forest with a fixed seed on the averaged embeddings
# (fit returns the estimator itself), then predict the hold-out split.
rf = RandomForestClassifier(n_estimators=20, random_state=42).fit(
    X_train_word2vec, y_train_encoded
)
pred = rf.predict(X_test_word2vec)

In [None]:
# Both metrics take (y_true, y_pred) in that order. Accuracy is symmetric, so the
# printed number is unchanged, but the call now matches the documented signature
# and the classification_report call below.
print("rf: ", accuracy_score(y_test_encoded, pred))
print(classification_report(y_test_encoded, pred))


rf:  0.7818791946308725
              precision    recall  f1-score   support

           0       0.72      0.71      0.71        75
           1       0.78      0.70      0.74        46
           2       0.77      0.89      0.83        56
           3       0.83      0.83      0.83        63
           4       0.84      0.79      0.81        58

    accuracy                           0.78       298
   macro avg       0.79      0.78      0.78       298
weighted avg       0.78      0.78      0.78       298

