In [None]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from gensim.models import Word2Vec
from sklearn.preprocessing import StandardScaler
import gensim

In [None]:
# Load every split of the 20 Newsgroups corpus, stripping headers, footers
# and quoted replies from each post before it is returned.
newsgroups = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)

# Hold out 30% of the documents for evaluation; the fixed random_state keeps
# the split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    newsgroups.data,
    newsgroups.target,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Build TF-IDF features over the 5000 highest-frequency terms. The vocabulary
# and IDF weights are learned from the training documents only and then
# reused, unchanged, for the test documents.
#
# The matrices are kept in the sparse format the vectorizer returns: almost
# every entry is zero, and converting to dense arrays would allocate
# n_documents x 5000 floats for each split without changing what the
# downstream classifier computes.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
# Tokenize each document with gensim's simple_preprocess so both splits go
# through the same text normalization.
X_train_tokens = [gensim.utils.simple_preprocess(doc) for doc in X_train]
X_test_tokens = [gensim.utils.simple_preprocess(doc) for doc in X_test]

# Passing `sentences` to the constructor builds the vocabulary AND trains the
# model, so the epoch count is configured here. A separate train() call
# afterwards would train a second time on top of the constructor's pass
# (restarting the learning-rate schedule) instead of setting the epoch count.
# Only training documents are used, so the embeddings never see test text.
word2vec_model = Word2Vec(
    sentences=X_train_tokens,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    epochs=10,
)


In [None]:
def get_average_word_vectors(tokens_list, model, vector_size):
    """Represent each tokenized document by the mean of its word vectors.

    Args:
        tokens_list: Iterable of documents, each an iterable of string tokens.
        model: Object exposing ``model.wv.key_to_index`` (membership test for
            known tokens) and ``model.wv[token]`` (vector lookup).
        vector_size: Length of each word vector; must match the vectors
            returned by ``model.wv``.

    Returns:
        A float64 array of shape ``(n_documents, vector_size)``. Tokens that
        are not in ``model.wv.key_to_index`` are ignored; a document with no
        known tokens is represented by the all-zeros vector. An empty
        ``tokens_list`` yields an array of shape ``(0, vector_size)`` so the
        result is always two-dimensional.
    """
    # Hoist the attribute lookups out of the per-token loop.
    keyed_vectors = model.wv
    vocabulary = keyed_vectors.key_to_index

    feature_vectors = []
    for tokens in tokens_list:
        vec = np.zeros(vector_size)
        count = 0
        for token in tokens:
            if token in vocabulary:
                vec += keyed_vectors[token]
                count += 1
        # Guard against division by zero for documents with no known tokens.
        if count > 0:
            vec /= count
        feature_vectors.append(vec)

    # np.array([]) would be 1-D with shape (0,); the explicit reshape keeps the
    # (n_documents, vector_size) contract for empty input and is a no-op
    # otherwise.
    return np.array(feature_vectors, dtype=float).reshape(
        len(feature_vectors), vector_size
    )

In [None]:
# Average the word vectors of each document. The embedding dimension is read
# from the trained model rather than repeated as a literal, so it cannot drift
# out of sync with the value the model was configured with.
X_train_w2v = get_average_word_vectors(
    X_train_tokens, word2vec_model, word2vec_model.wv.vector_size
)
X_test_w2v = get_average_word_vectors(
    X_test_tokens, word2vec_model, word2vec_model.wv.vector_size
)

In [None]:
# Standardize the averaged embeddings. The scaler's statistics are learned
# from the training split only and then applied to both splits, so no
# information from the test split reaches the fitted scaler.
scaler = StandardScaler()
scaler.fit(X_train_w2v)
X_train_w2v = scaler.transform(X_train_w2v)
X_test_w2v = scaler.transform(X_test_w2v)

In [None]:
# Train one logistic-regression classifier per feature representation, using
# the same iteration cap for both so the comparison stays like-for-like.
# `fit` returns the estimator itself, so construction and fitting are chained.

# TF-IDF features
model_tfidf = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
y_pred_tfidf = model_tfidf.predict(X_test_tfidf)

# Averaged Word2Vec features
model_w2v = LogisticRegression(max_iter=1000).fit(X_train_w2v, y_train)
y_pred_w2v = model_w2v.predict(X_test_w2v)

In [None]:
# Score both sets of predictions against the same held-out labels.
accuracy_tfidf, accuracy_w2v = (
    accuracy_score(y_test, predictions)
    for predictions in (y_pred_tfidf, y_pred_w2v)
)

# Report each accuracy as a percentage with two decimal places.
print(f"TF-IDF Model Accuracy: {accuracy_tfidf * 100:.2f}%")
print(f"Word2Vec Model Accuracy: {accuracy_w2v * 100:.2f}%")