In [141]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
import numpy as np

| Вариант | Класс     |
|---------|-----------|
| 8       | 5, 16, 20 |

| № класса | Название класса          |
|----------|--------------------------|
| 5        | 'comp.sys.mac.hardware'  |
| 16       | 'soc.religion.christian' |
| 20       | 'talk.religion.misc'     |

In [130]:
categories = ['comp.sys.mac.hardware', 'soc.religion.christian', 'talk.religion.misc'] 
remove = ('headers', 'footers', 'quotes')
twenty_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42, categories=categories, remove=remove)
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42, categories=categories, remove=remove)

In [131]:
for i in range(len(categories)):
    print(f'Документ класса {categories[i]}:')
    print(twenty_train.data[np.where(twenty_train.target == i)[0][0]], 
          end='\n-------------------------------------------------\n')

Документ класса comp.sys.mac.hardware:
A SIMM is a small PCB with DRAM chips soldered on.

--maarten
-------------------------------------------------
Документ класса soc.religion.christian:

[deletia- and so on]

I seem to have been rather unclear.

What I was asking is this:

Please show me that the most effective substance-absure recovery
programs involve meetinsg peoples' spiritual needs, rather than
merely attempting to fill peoples' spiritual needs as percieved
by the people, A.A, S.R.C. regulars, or snoopy. This will probably
involve defining "spritual needs" (is it not that clear) and
showing that such things exist and how they can be filled.

Annother tack you might take is to say that "fulfilling spiritual
needs" means "acknowledging a "higher power" of some sort, then
show that systems that do require this, work better than otherwise
identical systems that do not. A correlation here would help you,
but as you point out this might just be demonstrating swapping
one crutch for

In [132]:
from nltk.stem import *
from nltk import word_tokenize
from tqdm import tqdm

porter_stemmer = PorterStemmer()
def stem_text(text):
    nltk_tokens = word_tokenize(text)
    line = ' '.join([porter_stemmer.stem(word) for word in nltk_tokens])
    return line

stem_train = list(tqdm(map(stem_text, twenty_train.data), total=len(twenty_train.data)))
stem_test = list(tqdm(map(stem_text, twenty_test.data), total=len(twenty_test.data)))

100%|██████████| 1554/1554 [00:06<00:00, 251.09it/s]
100%|██████████| 1034/1034 [00:04<00:00, 232.14it/s]


In [133]:
def print_frequency_words(train, test, class_num=None, stop_words=None):

    if class_num is not None:
        train = [twenty_train.data[i] for i in np.where(twenty_train.target == class_num)[0]]

    vect = CountVectorizer(max_features = 10000, stop_words = stop_words)
    train_data = vect.fit_transform(train)

    x = list(zip(vect.get_feature_names_out(), np.ravel(train_data.sum(axis=0))))
    x.sort(key=lambda row: row[1], reverse=True) 
    if class_num is not None:
        print(f'Первые 20 наиболее частотных слов {class_num} класса:')
    else:
        print(f'Первые 20 наиболее частотных слов всей выборки:')
    print(x[:20])
    

In [134]:
def calc_jaccard_coef(first_list, second_list):
    '''Рассчет коэффициента Жаккара'''
    first_set = set(first_list)
    second_set = set(second_list)
    total = len(first_set.intersection(second_set))
    coef = total / (len(first_set) + len(second_set) - total)
    return coef

def print_jaccard_coef(data, c0, c1, stop_words=None):
    class0 = [data[i] for i in np.where(twenty_train.target == c0)[0]]
    class1 = [data[i] for i in np.where(twenty_train.target == c1)[0]]
    vect0 = CountVectorizer(max_features = 10000, stop_words = stop_words).fit(class0)
    vect1 = CountVectorizer(max_features = 10000, stop_words = stop_words).fit(class1)
    print(f'Коэффициент Жаккара между {c0} и {c1} классами:', end='\t')
    print(round(calc_jaccard_coef(vect0.get_feature_names_out(), vect1.get_feature_names_out()), 3))

дефолт

In [135]:
print_frequency_words(twenty_train.data, twenty_test.data, class_num=None)
print_frequency_words(twenty_train.data, twenty_test.data, class_num=0)
print_frequency_words(twenty_train.data, twenty_test.data, class_num=1)
print_frequency_words(twenty_train.data, twenty_test.data, class_num=2)

print_jaccard_coef(twenty_train.data, 0, 1)
print_jaccard_coef(twenty_train.data, 0, 2)
print_jaccard_coef(twenty_train.data, 1, 2)

Первые 20 наиболее частотных слов всей выборки:
[('the', 16652), ('to', 8490), ('of', 8334), ('and', 6656), ('that', 5747), ('is', 5591), ('in', 4801), ('it', 3830), ('you', 3092), ('not', 2749), ('for', 2699), ('this', 2486), ('be', 2316), ('are', 2220), ('have', 2166), ('as', 2136), ('with', 2071), ('on', 1823), ('but', 1818), ('was', 1622)]
Первые 20 наиболее частотных слов 0 класса:
[('the', 3290), ('to', 1544), ('and', 1248), ('of', 972), ('is', 966), ('it', 873), ('in', 748), ('for', 731), ('that', 706), ('with', 622), ('have', 534), ('on', 532), ('you', 531), ('this', 482), ('be', 410), ('if', 402), ('but', 361), ('or', 358), ('can', 357), ('not', 347)]
Первые 20 наиболее частотных слов 1 класса:
[('the', 8723), ('of', 4787), ('to', 4584), ('and', 3422), ('that', 3331), ('is', 3177), ('in', 2707), ('it', 1994), ('not', 1617), ('you', 1499), ('this', 1364), ('be', 1333), ('for', 1317), ('as', 1254), ('are', 1241), ('god', 1097), ('have', 1078), ('he', 1037), ('we', 1030), ('but',

стоп-слова

In [136]:
print_frequency_words(twenty_train.data, twenty_test.data, class_num=None, stop_words='english')
print_frequency_words(twenty_train.data, twenty_test.data, class_num=0, stop_words='english')
print_frequency_words(twenty_train.data, twenty_test.data, class_num=1, stop_words='english')
print_frequency_words(twenty_train.data, twenty_test.data, class_num=2, stop_words='english')

print_jaccard_coef(twenty_train.data, 0, 1, stop_words='english')
print_jaccard_coef(twenty_train.data, 0, 2, stop_words='english')
print_jaccard_coef(twenty_train.data, 1, 2, stop_words='english')

Первые 20 наиболее частотных слов всей выборки:
[('god', 1427), ('people', 779), ('jesus', 722), ('know', 625), ('does', 624), ('just', 586), ('don', 571), ('think', 565), ('like', 559), ('say', 461), ('time', 444), ('believe', 437), ('good', 416), ('church', 414), ('bible', 411), ('christian', 396), ('way', 377), ('christ', 373), ('did', 373), ('christians', 333)]
Первые 20 наиболее частотных слов 0 класса:
[('mac', 327), ('apple', 266), ('drive', 211), ('use', 173), ('problem', 171), ('like', 163), ('know', 162), ('does', 160), ('bit', 150), ('just', 148), ('scsi', 142), ('don', 123), ('thanks', 120), ('card', 115), ('32', 113), ('memory', 112), ('new', 110), ('monitor', 106), ('disk', 105), ('ram', 103)]
Первые 20 наиболее частотных слов 1 класса:
[('god', 1097), ('jesus', 466), ('people', 452), ('church', 340), ('think', 337), ('does', 317), ('know', 314), ('believe', 298), ('don', 286), ('christ', 281), ('say', 280), ('just', 279), ('time', 266), ('like', 265), ('faith', 257), ('b

стоп-слова + стемминг

In [137]:
print_frequency_words(stem_train, stem_test, class_num=None, stop_words='english')
print_frequency_words(stem_train, stem_test, class_num=0, stop_words='english')
print_frequency_words(stem_train, stem_test, class_num=1, stop_words='english')
print_frequency_words(stem_train, stem_test, class_num=2, stop_words='english')

print_jaccard_coef(stem_train, 0, 1, stop_words='english')
print_jaccard_coef(stem_train, 0, 2, stop_words='english')
print_jaccard_coef(stem_train, 1, 2, stop_words='english')

Первые 20 наиболее частотных слов всей выборки:
[('thi', 2494), ('wa', 1669), ('god', 1453), ('hi', 1004), ('christian', 909), ('ha', 867), ('doe', 788), ('peopl', 785), ('say', 759), ('use', 731), ('know', 720), ('jesu', 716), ('think', 659), ('ani', 658), ('onli', 625), ('like', 613), ('believ', 599), ('just', 586), ('time', 532), ('did', 518)]
Первые 20 наиболее частотных слов 0 класса:
[('mac', 327), ('apple', 266), ('drive', 211), ('use', 173), ('problem', 171), ('like', 163), ('know', 162), ('does', 160), ('bit', 150), ('just', 148), ('scsi', 142), ('don', 123), ('thanks', 120), ('card', 115), ('32', 113), ('memory', 112), ('new', 110), ('monitor', 106), ('disk', 105), ('ram', 103)]
Первые 20 наиболее частотных слов 1 класса:
[('god', 1097), ('jesus', 466), ('people', 452), ('church', 340), ('think', 337), ('does', 317), ('know', 314), ('believe', 298), ('don', 286), ('christ', 281), ('say', 280), ('just', 279), ('time', 266), ('like', 265), ('faith', 257), ('bible', 250), ('chri

tf-idf

In [140]:
tfidf = TfidfVectorizer(ngram_range=(1, 1))
train_data_tfidf = tfidf.fit_transform(stem_train)
train_data_tfidf = tfidf.transform(stem_test)

pipeline

In [216]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

def pipeline(x_train, y_train, x_test, stop_words=None, max_features=1000, use_idf=True):
    text_clf = Pipeline([('vect', CountVectorizer(max_features=max_features, stop_words=stop_words)),
                        ('tfidf', TfidfTransformer(use_idf=use_idf)),
                        ('clf', MultinomialNB()),], 
                        verbose=10)   
    text_clf = text_clf.fit(x_train, y_train)
    prediction = text_clf.predict(x_test)
    return prediction

In [217]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

def print_score(prediction, y_test):

    precision = round(precision_score(prediction, y_test, average='weighted'), 3)
    print ('Precision score: ', precision)
    recall = round(recall_score(prediction, y_test, average='weighted'), 3)
    print ('Recall score: ', recall)
    f1 = round(f1_score(prediction, y_test, average='weighted'), 3)
    print ('F1-score: ', f1)
    accuracy = round(accuracy_score(prediction, y_test), 3)
    print ('Accuracy score: ', accuracy)

    return [precision, recall, f1, accuracy]

Наличие / отсутствие стемминга

In [218]:
prediction = pipeline(twenty_train.data, twenty_train.target, twenty_test.data)
results_withot_stem = print_score(prediction, twenty_test.target)

[Pipeline] .............. (step 1 of 3) Processing vect, total=   0.2s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.0s
[Pipeline] ............... (step 3 of 3) Processing clf, total=   0.0s
Precision score:  0.914
Recall score:  0.752
F1-score:  0.802
Accuracy score:  0.752


In [219]:
prediction = pipeline(stem_train, twenty_train.target, stem_test)
results_stem = print_score(prediction, twenty_test.target)

[Pipeline] .............. (step 1 of 3) Processing vect, total=   0.2s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.0s
[Pipeline] ............... (step 3 of 3) Processing clf, total=   0.0s
Precision score:  0.917
Recall score:  0.766
F1-score:  0.811
Accuracy score:  0.766


Отсечение \ не отсечение стоп-слов

In [220]:
prediction = pipeline(stem_train, twenty_train.target, stem_test, stop_words='english')
results_stop_words = print_score(prediction, twenty_test.target)

[Pipeline] .............. (step 1 of 3) Processing vect, total=   0.2s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.0s
[Pipeline] ............... (step 3 of 3) Processing clf, total=   0.0s
Precision score:  0.899
Recall score:  0.786
F1-score:  0.817
Accuracy score:  0.786


Взвешивание: Count, TF, TF-IDF

In [222]:
prediction = pipeline(stem_train, twenty_train.target, stem_test, stop_words='english', use_idf=False)
results = print_score(prediction, twenty_test.target)

[Pipeline] .............. (step 1 of 3) Processing vect, total=   0.2s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.0s
[Pipeline] ............... (step 3 of 3) Processing clf, total=   0.0s
Precision score:  0.914
Recall score:  0.779
F1-score:  0.817
Accuracy score:  0.779


Количество информативных терминов (max_features) - исследовать 5 значений в диапазоне от 100 до общего количества слов в выборке.

In [224]:
max_featurs = [100, 1000, 2000, 5000, 10000]
for i in max_featurs:
    prediction = pipeline(stem_train, twenty_train.target, stem_test, stop_words='english', max_features=i)
    results = print_score(prediction, twenty_test.target)

[Pipeline] .............. (step 1 of 3) Processing vect, total=   0.2s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.0s
[Pipeline] ............... (step 3 of 3) Processing clf, total=   0.0s
Precision score:  0.856
Recall score:  0.682
F1-score:  0.742
Accuracy score:  0.682
[Pipeline] .............. (step 1 of 3) Processing vect, total=   0.2s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.0s
[Pipeline] ............... (step 3 of 3) Processing clf, total=   0.0s
Precision score:  0.899
Recall score:  0.786
F1-score:  0.817
Accuracy score:  0.786
[Pipeline] .............. (step 1 of 3) Processing vect, total=   0.2s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.0s
[Pipeline] ............... (step 3 of 3) Processing clf, total=   0.0s
Precision score:  0.911
Recall score:  0.783
F1-score:  0.818
Accuracy score:  0.783
[Pipeline] .............. (step 1 of 3) Processing vect, total=   0.2s
[Pipeline] ............. (step 2 of