In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import spacy

from gensim import corpora, models

from sklearn.feature_extraction.text import TfidfVectorizer

from gensim.models import Word2Vec
from tabulate import tabulate

In [3]:
from sklearn.svm import SVC

from sklearn.naive_bayes import GaussianNB

from sklearn import tree

from sklearn.ensemble import RandomForestClassifier

from sklearn.ensemble import AdaBoostClassifier

from sklearn.neural_network import MLPClassifier

In [4]:
from spacy.lang.en import stop_words
from string import punctuation

In [5]:
import numpy as np
import pandas as pd

In [5]:
# Newsgroups requested from the loader; only these four categories are fetched.
categories = [
    "comp.graphics",
    "rec.autos",
    "sci.med",
    "talk.politics.mideast",
]

# Both dataset subsets are requested ("all"), the loader is asked to remove
# headers, footers and quoted text, and shuffling is switched off.
news_groups = fetch_20newsgroups(
    categories=categories,
    subset="all",
    shuffle=False,
    remove=("headers", "footers", "quotes"),
)

In [7]:
import re

# Collapse every run of spaces, tabs and newlines into one space.  A single
# character class treats the three characters uniformly, so mixed runs such as
# "\n  " also shrink to one space (matching newline/tab runs and space runs as
# separate alternatives would replace each part on its own and leave a doubled
# space behind).
_WHITESPACE_RUN = re.compile(r"[ \n\t]+")

cleaned_collection = [_WHITESPACE_RUN.sub(" ", text) for text in news_groups.data]

In [8]:
# Load the spaCy pipeline named "en_core_web_sm"; the lemmatisation step later
# in the notebook reads each token's lemma and part-of-speech tag from it.
nlp = spacy.load("en_core_web_sm")

In [9]:
# Import the stop-word set by name instead of reading it off the ``stop_words``
# module object: the assignment below rebinds ``stop_words`` to the set, so an
# attribute lookup on it would raise AttributeError the second time this cell
# is executed.
from spacy.lang.en.stop_words import STOP_WORDS

stop_words = STOP_WORDS
punctuations = list(punctuation)

token_collection = []
vector = []


def _content_lemmas(doc):
    """Return ``(lowercased lemma, POS tag)`` pairs for the kept tokens of ``doc``.

    A token is dropped when its lowercased lemma is in ``stop_words`` or in
    ``punctuations``, or when its POS tag is ``"PUNCT"`` or ``"SPACE"``.
    """
    pairs = []
    for token in doc:
        lemma = token.lemma_.lower()  # computed once, reused by both lemma filters
        if lemma in stop_words or lemma in punctuations:
            continue
        if token.pos_ in ("PUNCT", "SPACE"):
            continue
        pairs.append((lemma, token.pos_))
    return pairs


# One list of (lemma, POS) pairs per cleaned document, in document order.
# ``nlp.pipe`` runs the texts through the pipeline in batches and yields the
# parsed documents in the same order as the input.
lemmatized_collection = [_content_lemmas(doc) for doc in nlp.pipe(cleaned_collection)]

In [10]:
def _rejoin_and_split(lemmas):
    """Join the lemmas with single spaces, then split the text on whitespace."""
    return " ".join(lemmas).split()


# Token lists built from every kept lemma of each document.
lemm_texts = [
    _rejoin_and_split(lemma for lemma, _pos in pairs)
    for pairs in lemmatized_collection
]

# Token lists restricted to lemmas tagged as nouns or adjectives.
lemm_texts_nouns_adj = [
    _rejoin_and_split(lemma for lemma, pos in pairs if pos in ("NOUN", "ADJ"))
    for pairs in lemmatized_collection
]

In [19]:
# 6
# First group of classifiers, each constructed without any arguments
# (i.e. with the library's default settings).
svc = SVC()
gnb = GaussianNB()
dec_tree = tree.DecisionTreeClassifier()

In [11]:
# 7
# Second group of classifiers; only the arguments shown are set explicitly.
# NOTE(review): no random_state is passed to any of them, so scores can differ
# between runs.
rfc = RandomForestClassifier(max_depth=3)
abc = AdaBoostClassifier(n_estimators=100)
mlpc = MLPClassifier(hidden_layer_sizes=(100,))

In [12]:
def tfidf_vectorize(data):
    """Convert tokenised documents into a dense TF-IDF matrix.

    Args:
        data: iterable of documents, each an iterable of string tokens.

    Returns:
        The output of a default ``TfidfVectorizer`` fitted on the documents
        (tokens re-joined with single spaces), converted with ``toarray()``.
    """
    documents = list(map(" ".join, data))
    weighted = TfidfVectorizer().fit_transform(documents)
    return weighted.toarray()

In [13]:
def word2vec_vectorize(data):
    """Represent each tokenised document by the mean of its word vectors.

    A ``Word2Vec`` model (100 dimensions, ``min_count=1``, 4 workers) is
    trained on ``data`` itself.

    Args:
        data: list of documents, each a list of string tokens.

    Returns:
        Array with one row per document: the element-wise mean of the vectors
        of the document's tokens that the trained model knows, or a zero
        vector when no token has a vector.
    """
    w2v = Word2Vec(sentences=data, vector_size=100, min_count=1, workers=4)

    rows = []
    for tokens in data:
        known = [w2v.wv[word] for word in tokens if word in w2v.wv]
        if known:
            rows.append(np.mean(known, axis=0))
        else:
            # Nothing to average: fall back to a zero vector.
            rows.append(np.zeros(w2v.vector_size))
    return np.array(rows)

In [14]:
def LDA_vectorize(data, num_topics=8, passes=15, random_state=None):
    """Represent each tokenised document by its LDA topic distribution.

    A dictionary and bag-of-words corpus are built from ``data``, an LDA model
    is trained on that corpus, and every document is mapped to a fixed-length
    vector whose column ``k`` holds the probability of topic ``k``.

    Args:
        data: list of documents, each a list of string tokens.
        num_topics: number of LDA topics, i.e. the width of the output.
        passes: number of training passes over the corpus.
        random_state: forwarded to ``LdaModel``; pass a value to make the
            topic model repeatable.

    Returns:
        ``numpy`` array of shape ``(len(data), num_topics)``.
    """
    dictionary = corpora.Dictionary(data)
    bow_corpus = [dictionary.doc2bow(doc) for doc in data]

    model = models.LdaModel(
        bow_corpus,
        id2word=dictionary,
        num_topics=num_topics,
        passes=passes,
        random_state=random_state,
    )

    # ``get_document_topics`` returns (topic_id, probability) pairs and leaves
    # out topics below its probability threshold.  Collecting only the returned
    # probabilities therefore gives rows of different lengths with no fixed
    # topic-to-column mapping.  Writing each probability into the column of its
    # topic id keeps every row ``num_topics`` wide; omitted topics stay 0.
    text_vectors = np.zeros((len(bow_corpus), num_topics))
    for row, doc_bow in enumerate(bow_corpus):
        document_topics = model.get_document_topics(doc_bow, minimum_probability=0.0)
        for topic_id, topic_prob in document_topics:
            text_vectors[row, topic_id] = topic_prob

    return text_vectors

In [15]:
def classify(model, train_data, test_data, train_target, test_target):
    """Fit ``model`` on the training split and score it on the test split.

    Args:
        model: estimator exposing ``fit`` and ``predict``.
        train_data, train_target: features and labels used for fitting.
        test_data, test_target: features and labels used for scoring.

    Returns:
        A string of the form ``"F1 Score: 0.1234"`` holding the weighted F1
        score of the predictions on the test split, to four decimals.
    """
    model.fit(train_data, train_target)
    weighted_f1 = f1_score(test_target, model.predict(test_data), average="weighted")
    return f"F1 Score: {weighted_f1:.4f}"

In [16]:
def main(dataset, tokinizers, classifiers, lemm_texts, lemm_texts_nouns_adj,
         random_state=None):
    """Score every classifier on every vectorisation of both token sets.

    Args:
        dataset: object whose ``target`` attribute holds one label per document.
        tokinizers: mapping of row label -> function turning a list of token
            lists into a feature matrix.
        classifiers: mapping of column label -> estimator passed to ``classify``.
        lemm_texts: token lists built from all kept lemmas.
        lemm_texts_nouns_adj: token lists restricted to nouns and adjectives.
        random_state: forwarded to ``train_test_split``; pass a value to make
            the 80/20 split repeatable.

    Returns:
        DataFrame indexed by vectoriser label with one column per classifier
        label.  Each cell is a string with one ``"<variant>: F1 Score: ..."``
        line per token-set variant ("ALL", then "NOUNS and ADJ").
    """
    df = pd.DataFrame("", index=list(tokinizers), columns=list(classifiers))

    list_of_data = {"ALL": lemm_texts, "NOUNS and ADJ": lemm_texts_nouns_adj}

    for name, data in list_of_data.items():
        for tokinizer, vectorize in tokinizers.items():
            # Vectorising does not depend on the classifier, so it is done once
            # per (variant, vectoriser) pair instead of once per classifier.
            vectors = vectorize(data)
            # One split shared by all classifiers: the scores within a row are
            # then measured on the same held-out documents.
            train_data, test_data, train_target, test_target = train_test_split(
                vectors, dataset.target, test_size=0.2, random_state=random_state
            )
            for classifier, model in classifiers.items():
                score = classify(
                    model, train_data, test_data, train_target, test_target
                )
                df.loc[tokinizer, classifier] = (
                    df.loc[tokinizer, classifier] + name + ": " + score + "\n"
                )

    return df

In [17]:
# Vectorisation functions to compare, keyed by the label used for the rows of
# the results table.
tokinizers = {
    "tfidf": tfidf_vectorize,
    "LDA": LDA_vectorize,
    "word2vec": word2vec_vectorize
}

# Classifier instances to compare, keyed by the label used for the columns of
# the results table.
classifiers = {
    "SVM": svc,
    "Naive Bayes": gnb,
    "Decision Trees": dec_tree,
    "Random Forest": rfc,
    "Ada Boost classifier": abc,
    "MLP": mlpc
}


In [None]:
# Run the full comparison: every vectoriser/classifier pair on both token sets.
df = main(news_groups, tokinizers, classifiers, lemm_texts, lemm_texts_nouns_adj)

In [25]:
# Each cell of df is a multi-line string (one score line per token-set
# variant); print the frame as a text grid with the column labels as headers.
print(tabulate(df, headers="keys", tablefmt="grid"))

+----------+---------------------------------+---------------------------------+---------------------------------+---------------------------------+---------------------------------+---------------------------------+
|          | SVM                             | Naive Bayes                     | Decision Trees                  | Random Forest                   | Ada Boost classifier            | MLP                             |
| tfidf    | ALL: F1 Score: 0.9066           | ALL: F1 Score: 0.8593           | ALL: F1 Score: 0.7674           | ALL: F1 Score: 0.7669           | ALL: F1 Score: 0.7756           | ALL: F1 Score: 0.9095           |
|          | NOUNS and ADJ: F1 Score: 0.8686 | NOUNS and ADJ: F1 Score: 0.7945 | NOUNS and ADJ: F1 Score: 0.7456 | NOUNS and ADJ: F1 Score: 0.7535 | NOUNS and ADJ: F1 Score: 0.7270 | NOUNS and ADJ: F1 Score: 0.8598 |
+----------+---------------------------------+---------------------------------+---------------------------------+------------------