In [5]:
from pprint import pprint
from time import time
import logging
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.datasets import fetch_20newsgroups
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

newsgroups = fetch_20newsgroups(subset='all')
print(newsgroups.target_names)
categories = newsgroups.target_names
data = fetch_20newsgroups(subset='train', categories=categories)
print(f"{len(data.filenames)} documents")
print(f"{len(data.target_names)} categories")
print()
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.25, random_state=42)
def tokenize_text(text):
    return word_tokenize(text.lower())
feature_extractors = [
    ('CountVectorizer', CountVectorizer()),
    ('TfidfVectorizer', TfidfVectorizer()),
    ('Word2Vec', Word2Vec(sentences=[tokenize_text(doc) for doc in X_train], vector_size=100, window=5, min_count=1, workers=4)),
    ('Doc2Vec', Doc2Vec(vector_size=100, window=5, min_count=1, workers=4, epochs=10))
]
classifiers = [
    ('Multinomial Naive Bayes', MultinomialNB()),
    ('Logistic Regression', LogisticRegression()),
    ('Support Vector Machines', SVC()),
    ('Decision Trees', DecisionTreeClassifier())
]
results_table = pd.DataFrame(columns=['Feature Extractor', 'Classifier', 'Accuracy', 'Best Params'])

for extractor_name, extractor_model in feature_extractors:
    for clf_name, clf_model in classifiers:
        pipeline = Pipeline([
            ('vect', extractor_model),
            ('clf', clf_model),
        ])
        
        # Define the parameter grid for grid search
        param_grid = {}
        
        # For Multinomial Naive Bayes, add 'alpha' to the parameter grid
        if 'Multinomial' in clf_name:
            param_grid['clf__alpha'] = [0.1, 0.5, 1.0]

        # For Logistic Regression, add 'C' to the parameter grid
        if 'Logistic' in clf_name:
            param_grid['clf__C'] = [0.1, 1, 10]

        # Add other classifier-specific parameters as needed
        
        # Create GridSearchCV object
        grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
        
        # Fit the grid search to the data
        grid_search.fit(X_train, y_train)
        
        # Get the best parameters and accuracy
        best_params = grid_search.best_params_
        best_accuracy = grid_search.best_score_

        # Fit the pipeline with the best parameters on the entire training set
        pipeline.set_params(**best_params)
        pipeline.fit(X_train, y_train)
        
        # Evaluate on the test set
        predictions = pipeline.predict(X_test)
        test_accuracy = metrics.accuracy_score(y_test, predictions)

        results_table = results_table.append({
            'Feature Extractor': extractor_name,
            'Classifier': clf_name,
            'Accuracy': test_accuracy,
            'Best Params': best_params
        }, ignore_index=True)

best_config = results_table.loc[results_table['Accuracy'].idxmax()]
results_table.to_csv('Abijith_Task1_Text_Classification_GridSearchCV.txt', index=False, sep='\t')
print("Best Configuration:")
print(best_config)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abhis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
11314 documents
20 categories


AttributeError: 'DataFrame' object has no attribute 'append'