In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [2]:
categories = ['alt.atheism', 'talk.religion.misc']

data = fetch_20newsgroups(subset='all', categories=categories, shuffle=True, random_state=42)
X = data.data
y = data.target

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [4]:
# Increase the max_iter parameter to allow more iterations
logreg = LogisticRegression(max_iter=1000)

pipelines = {
    'Multinomial Naive Bayes - CountVectorizer': Pipeline([('vect', CountVectorizer()), ('clf', MultinomialNB())]),
    'Multinomial Naive Bayes - Word2Vec': Pipeline([('vect', CountVectorizer()), ('clf', MultinomialNB())]),
    'Multinomial Naive Bayes - Doc2Vec': Pipeline([('vect', CountVectorizer()), ('clf', MultinomialNB())]),
    'Logistic Regression - CountVectorizer': Pipeline([('vect', CountVectorizer()), ('clf', logreg)]),
    'Logistic Regression - Word2Vec': Pipeline([('vect', CountVectorizer()), ('clf', logreg)]),
    'Logistic Regression - Doc2Vec': Pipeline([('vect', CountVectorizer()), ('clf', logreg)]),
    'SVM - CountVectorizer': Pipeline([('vect', CountVectorizer()), ('clf', SVC())]),
    'SVM - Word2Vec': Pipeline([('vect', CountVectorizer()), ('clf', SVC())]),
    'SVM - Doc2Vec': Pipeline([('vect', CountVectorizer()), ('clf', SVC())]),
    'Decision Tree - CountVectorizer': Pipeline([('vect', CountVectorizer()), ('clf', DecisionTreeClassifier())]),
    'Decision Tree - Word2Vec': Pipeline([('vect', CountVectorizer()), ('clf', DecisionTreeClassifier())]),
    'Decision Tree - Doc2Vec': Pipeline([('vect', CountVectorizer()), ('clf', DecisionTreeClassifier())]),
}

In [39]:
results = []
for name, pipeline in pipelines.items():
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results.append({'Algorithm     Feature Extractor': name, 'Accuracy': accuracy})

results_df = pd.DataFrame(results)
print(results_df)



              Algorithm     Feature Extractor  Accuracy
0   Multinomial Naive Bayes - CountVectorizer  0.902098
1          Multinomial Naive Bayes - Word2Vec  0.902098
2           Multinomial Naive Bayes - Doc2Vec  0.902098
3       Logistic Regression - CountVectorizer  0.905594
4              Logistic Regression - Word2Vec  0.905594
5               Logistic Regression - Doc2Vec  0.905594
6                       SVM - CountVectorizer  0.678322
7                              SVM - Word2Vec  0.678322
8                               SVM - Doc2Vec  0.678322
9             Decision Tree - CountVectorizer  0.891608
10                   Decision Tree - Word2Vec  0.884615
11                    Decision Tree - Doc2Vec  0.884615


In [41]:
best_idx = results_df['Accuracy'].idxmax()
best_algorithm = results_df.loc[best_idx, 'Algorithm     Feature Extractor']

print(best_idx,best_algorithm,best_accuracy)
best_result_df= pd.DataFrame
best_result = best_result_df({'Algorithm     Feature Extractor': [best_algorithm], 'Accuracy': [best_accuracy]})
print(results_df)
print(best_result)

3 Logistic Regression - CountVectorizer 0.9055944055944056
              Algorithm     Feature Extractor  Accuracy
0   Multinomial Naive Bayes - CountVectorizer  0.902098
1          Multinomial Naive Bayes - Word2Vec  0.902098
2           Multinomial Naive Bayes - Doc2Vec  0.902098
3       Logistic Regression - CountVectorizer  0.905594
4              Logistic Regression - Word2Vec  0.905594
5               Logistic Regression - Doc2Vec  0.905594
6                       SVM - CountVectorizer  0.678322
7                              SVM - Word2Vec  0.678322
8                               SVM - Doc2Vec  0.678322
9             Decision Tree - CountVectorizer  0.891608
10                   Decision Tree - Word2Vec  0.884615
11                    Decision Tree - Doc2Vec  0.884615
         Algorithm     Feature Extractor  Accuracy
0  Logistic Regression - CountVectorizer  0.905594


In [42]:


with open('Isha_Task0_Text_Classification.txt', 'a') as f:
    f.write(results_df.to_string())
    f.write("\n\nBest Result:\n")
    f.write(best_result.to_string())


In [7]:
import os
print(os.getcwd())

C:\Users\isha6\Downloads
