In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import *
from nltk import word_tokenize
import itertools

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.utils._testing import ignore_warnings 
from sklearn.exceptions import FitFailedWarning, ConvergenceWarning 

In [5]:
categories = ['alt.atheism', 'sci.space', 'soc.religion.christian']
remove = ['headers', 'footers', 'quotes']
twenty_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=2, categories = categories, remove = remove)
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=2, categories = categories, remove = remove)
twenty_train = pd.DataFrame(twenty_train, columns=['data', 'target']).replace(to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=["",""], regex=True)
twenty_test = pd.DataFrame(twenty_test, columns=['data', 'target']).replace(to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=["",""], regex=True)

In [3]:
def stm(text):
    ps = PorterStemmer()
    nltk_tokens = word_tokenize(text)
    result = ''
    for word in nltk_tokens:
        result += ' ' + ps.stem(word)
    return result

In [6]:
twenty_train.insert(loc=1, column='data_stemmed', value=twenty_train['data'].apply(lambda text: stm(text)))
twenty_test.insert(loc=1, column='data_stemmed', value=twenty_test['data'].apply(lambda text: stm(text)))

In [12]:
%%time
parameters = {
    'KNeighborsClassifier': {
        'vect__max_features': (1000,5000,10000),
        'vect__stop_words': ('english', None),
        'tfidf__use_idf': (True, False),
        'clf__n_neighbors': (1, 3, 5, 10),
        'clf__p': (1, 2)
    },
    'LogisticRegression': {
        'vect__max_features': (1000,5000,10000),
        'vect__stop_words': ('english', None),
        'tfidf__use_idf': (True, False),
        'clf__solver': ('lbfgs', 'newton-cg', 'sag', 'saga'),
        'clf__penalty': ['None']
    },
    'DecisionTreeClassifier': {
        'vect__max_features': (1000,5000,10000),
        'vect__stop_words': ('english', None),
        'tfidf__use_idf': (True, False),
        'clf__criterion': ('gini', 'entropy'),
        'clf__max_depth': [*range(1,5,1), *range(5,101,20)]
    },
        {
        'vect__max_features': (1000,5000,10000),
        'vect__stop_words': ('english', None),
        'tfidf__use_idf': (True, False),
        'clf__loss': ['hinge'],
        'clf__penalty': ['l2']
    }],
}

gs = {}
for clf, param in parameters.items():
    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', eval(clf)())
    ])
    gs[clf] = GridSearchCV(text_clf, param, n_jobs=-1, error_score=0.0)
    gs[clf].fit(X = twenty_train['data'], y = twenty_train['target'])

SyntaxError: closing parenthesis ']' does not match opening parenthesis '{' on line 1 (<unknown>, line 29)

In [13]:
for clf, param in parameters.items():
    predicted = gs[clf].predict(twenty_test['data'])
    print(metrics.classification_report(twenty_test.target, predicted, target_names=categories))

                        precision    recall  f1-score   support

           alt.atheism       0.42      0.43      0.43       319
             sci.space       0.62      0.48      0.54       394
soc.religion.christian       0.51      0.62      0.56       398

              accuracy                           0.51      1111
             macro avg       0.52      0.51      0.51      1111
          weighted avg       0.52      0.51      0.51      1111

                        precision    recall  f1-score   support

           alt.atheism       0.57      0.40      0.47       319
             sci.space       0.65      0.87      0.74       394
soc.religion.christian       0.69      0.63      0.66       398

              accuracy                           0.65      1111
             macro avg       0.64      0.63      0.63      1111
          weighted avg       0.64      0.65      0.64      1111

                        precision    recall  f1-score   support

           alt.atheism       0.80

In [16]:
r = {}
def highlight_max(x, color):

    return np.where(x == np.nanmax(x.to_numpy()), f"color: {color};", None)

total_style = pd.Series("font-weight: bold;", index=[1])

for clf, param in parameters.items():
    predicted = gs[clf].predict(twenty_test['data'])
    
    pd.DataFrame(gs[clf].cv_results_).to_excel('all' + clf + '.xlsx')
    pd.DataFrame(classification_report(predicted, twenty_test.target, output_dict=True)).to_excel('best' + clf + '.xlsx')
    