In [63]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score


from sklearn.feature_extraction.text import TfidfTransformer
from nltk.stem import *
from nltk import word_tokenize
import os
import itertools

In [22]:
categories = ['alt.atheism', 'sci.space', 'soc.religion.christian']
remove = ('headers', 'footers', 'quotes')
t_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=2, categories = categories, remove = remove)
t_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=2, categories = categories, remove = remove)

twenty_train = dict()
twenty_test = dict()
for c in categories:
    twenty_train[c] = fetch_20newsgroups(subset='train', shuffle=True, random_state=2, categories = [c], remove = remove)
    twenty_test[c] = fetch_20newsgroups(subset='test', shuffle=True, random_state=2, categories = [c], remove = remove)
    twenty_train[c] = twenty_train[c].data
    twenty_test[c] = twenty_test[c].data
twenty_train['full'] = t_train.data
twenty_test['full'] = t_test.data

In [26]:
print (twenty_train['alt.atheism'][0], "\n----------------")
print (twenty_train['sci.space'][0], "\n----------------")
print (twenty_train['soc.religion.christian'][0], "\n----------------")


: Upon arriving at home, Joseph probably took advantage of Mary...had his way
: with her so to speak.  Of course, word of this couldn't get around so Mary,
: being the highly-religious follower that she was decided "Hey, I'll just say
: that GOD impregnated me...no one will ever know!"
: 
: Thus, seen as a trustworthy and honorable soul, she was believed...
:     
: And then came Jesus, the child born from violence.
: 
: 
: 

Dave,

Can you explain the purpose of your post, I can't imagine what you
must have thougt it meant.  
----------------

I don't think you're going to be able to see the differences from a sphere
unless they are greatly exaggerated.  Even the equatorial bulge is only
about 1 part in 300 -- you'd never notice a 1mm error in a 30cm globe --
and the other deviations from spherical shape are much smaller. 
----------------
My family has never been particularly religious - singing Christmas
carols is about the limit for them. Thus I've never really believed in God and

In [30]:
def stem(text):
    porter_stemmer = PorterStemmer()
    result = []
    for t in text:
        nltk_tokens = word_tokenize(t)
        line = ''
        for word in nltk_tokens:
            line += ' ' + porter_stemmer.stem(word)
        result.append(line)
    return result

In [48]:
def SortbyTF(inputStr):
    return inputStr[1]

def top_list(vect, data, count):
    x = list(zip(vect.get_feature_names(),np.ravel(data.sum(axis=0))))
    x.sort(key=SortbyTF, reverse = True)
    return x[:count]

def vectorize(train, categories):
    categories.append('full')
    mux = pd.MultiIndex.from_product([['Count','TF','TF-IDF'], ['Без стоп-слов','С стоп-cловами']])
    result = dict()
    for c in categories:
        result[c] = pd.DataFrame(columns=mux)
    
    stop_words = [None, 'english']
    idf = [False, True]
    
    indx_stop = {
        'english': 'Без стоп-слов',
        None: 'С стоп-cловами'
    }
    
    indx_tf = {
        False: 'TF',
        True: 'TF-IDF'
    }
    for c in categories:
        for stop in stop_words:  
            vect = CountVectorizer(max_features=10000, stop_words=stop)
            vect.fit(train[c])
            train_data = vect.transform(train[c])
            result[c]['Count', indx_stop[stop]] = top_list(vect, train_data, 20)
            
            for tf in idf:
                tfidf = TfidfTransformer(use_idf = tf).fit(train_data)
                train_fidf = tfidf.transform(train_data)
                result[c][indx_tf[tf], indx_stop[stop]] = top_list(vect, train_fidf, 20)
    return result

In [41]:
train_s = dict()
test_s = dict()
for с in categories:
    train_s[с] = stem(twenty_train[с])
    test_s[с] = stem(twenty_test[с])

train_s['full'] = stem(twenty_train['full'])
test_s['full'] = stem(twenty_train['full'])

In [49]:
summ_without_stem = vectorize(twenty_train, categories)
summ_with_stem = vectorize(train_s, categories)

for c in ['full'] + categories:
    summ_without_stem[c].to_excel('without_stem_' + c + '.xlsx')
    summ_with_stem[c].to_excel('with_stem_' + c + '.xlsx')



In [54]:
def prespocess(data, max_features, stop_words, use_tf, use_idf):
    tf = None
    cv = CountVectorizer(max_features=max_features, stop_words=stop_words).fit(data)
    if use_tf:
        tf = TfidfTransformer(use_idf=use_idf).fit(cv.transform(data))
    return cv, tf

def models_grid_search(data_train, data_test):
    max_features = [100,500,1000,5000,10000]
    stop_words = ['english', None]
    use_tf = [True, False]
    use_idf = [True, False]
    
    res = dict()
    for param in itertools.product(max_features, stop_words, use_tf, use_idf):
        cv, tf = prespocess(data_train.data, param[0], param[1], param[2], param[3])
        if tf:
            clf = MultinomialNB().fit(tf.transform(cv.transform(data_train.data)), data_train.target)
            prep_test = tf.transform(cv.transform(data_test.data))
        else:
            clf = MultinomialNB().fit(cv.transform(data_train.data), data_train.target)
            prep_test = cv.transform(data_test.data)
        
        name = f'max_features={param[0]}_stop_words={param[1]}_use_tf={param[2]}_use_idf={param[3]}'
        res[name] = pd.DataFrame(classification_report(clf.predict(prep_test), data_test.target, output_dict=True))  
    return res

In [67]:
categories = ['alt.atheism', 'sci.space', 'soc.religion.christian']
remove = ('headers', 'footers', 'quotes')
t_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=2, categories = categories, remove = remove)
t_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=2, categories = categories, remove = remove)
scores = models_grid_search(t_train, t_test)
if not os.path.exists('scores'):
    os.makedirs('scores')
    
for name, score in scores.items():
    score.to_excel('scores/' + name + '.xlsx')

parameters = {
    'vect__max_features': (100,500,1000,5000,10000),
    'vect__stop_words': ('english', None),
    'tfidf__use_idf': (True, False),
}

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB())
])

gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1, cv=3)
gs_clf.fit(X = t_train.data, y = t_train.target)
print(classification_report(gs_clf.predict(t_test.data), t_test.target))

              precision    recall  f1-score   support

           0       0.46      0.87      0.61       170
           1       0.87      0.90      0.88       384
           2       0.95      0.68      0.79       557

    accuracy                           0.78      1111
   macro avg       0.76      0.82      0.76      1111
weighted avg       0.85      0.78      0.80      1111



In [12]:
print(classification_report(prediction, twenty_test.target))

              precision    recall  f1-score   support

           0       0.47      0.73      0.57       205
           1       0.85      0.83      0.84       404
           2       0.86      0.69      0.76       502

    accuracy                           0.75      1111
   macro avg       0.73      0.75      0.73      1111
weighted avg       0.79      0.75      0.76      1111

