In [1]:
from sklearn.datasets import fetch_20newsgroups
# Full 20-category training split; shuffling is seeded so the order is repeatable.
twenty_train = fetch_20newsgroups(
    subset='train',
    shuffle=True,
    random_state=42,
)

In [2]:
# All 20 newsgroup names, listed alphabetically.
categories = [
    'alt.atheism',
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
    'soc.religion.christian',
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc',
]

# Category subsets of increasing size, built by slicing the full list.
cat5 = categories[0::4]     # indices 0, 4, 8, 12, 16 -> 5 groups
cat10 = categories[0::2]    # every even index -> 10 groups
cat15 = categories[5:]      # everything from index 5 onward -> 15 groups

cat5

['alt.atheism',
 'comp.sys.mac.hardware',
 'rec.motorcycles',
 'sci.electronics',
 'talk.politics.guns']

In [3]:
# Fetch the training split again for each category subset. Passing `categories`
# lets fetch_20newsgroups do the filtering, which is simpler than filtering
# twenty_train by hand at the cost of a few extra loads.
five_train, ten_train, fifteen_train = (
    fetch_20newsgroups(subset='train', categories=subset, shuffle=True, random_state=42)
    for subset in (cat5, cat10, cat15)
)

In [4]:
# Sanity check: document counts for the 5-, 10-, 15- and 20-category training sets.
print(*(len(split.data) for split in (five_train, ten_train, fifteen_train, twenty_train)))

2793 5627 8491 11314


In [5]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [6]:
# now time for tf-idf
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.pipeline import Pipeline
def make_text_pipeline(classifier):
    """Build a bag-of-words -> tf-idf -> classifier pipeline.

    Parameters
    ----------
    classifier : estimator
        Unfitted scikit-learn classifier used as the final ``'clf'`` step.

    Returns
    -------
    Pipeline
        Unfitted pipeline with steps named ``'vect'``, ``'tfidf'`` and ``'clf'``.
    """
    return Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', classifier),
    ])


# Every model shares the same text preprocessing; only the final estimator differs.
NB_clf = make_text_pipeline(MultinomialNB())

# random_state is pinned: decision trees permute the features at every split,
# so without a seed the fitted tree (and its scores) can change between runs.
Tree_clf = make_text_pipeline(DecisionTreeClassifier(criterion="entropy", random_state=42))

SVC_clf = make_text_pipeline(SVC(C=1, kernel='linear'))

KNN_clf = make_text_pipeline(KNeighborsClassifier(n_neighbors=3))

print('ran')


ran


In [7]:
# Load the test split for the full dataset and for each category subset,
# using the same seed as the training fetches.
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)
fifteen_test, ten_test, five_test = (
    fetch_20newsgroups(subset='test', categories=subset, shuffle=True, random_state=42)
    for subset in (cat15, cat10, cat5)
)

# Quick look at the smallest and largest test-set sizes.
print(len(five_test.data), len(twenty_test.data))

1859 7532


In [8]:
# Fit one independent Naive Bayes pipeline per category subset.
#
# Pipeline.fit returns the pipeline itself, so assigning the result of
# NB_clf.fit(...) to four names would leave all four pointing at the same
# object, fitted on whichever data came last. clone() gives each name its own
# unfitted copy with identical parameters; NB_clf itself is left untouched.
from sklearn.base import clone

NB5 = clone(NB_clf).fit(five_train.data, five_train.target)
NB10 = clone(NB_clf).fit(ten_train.data, ten_train.target)
NB15 = clone(NB_clf).fit(fifteen_train.data, fifteen_train.target)
NB20 = clone(NB_clf).fit(twenty_train.data, twenty_train.target)

print('done')

done


In [9]:
# test NBs
from sklearn.metrics import f1_score, recall_score, precision_score

# Weighted precision / recall / F1 of Naive Bayes, one entry per category
# subset in the order 5, 10, 15, 20.
NB_prc = []
NB_rec = []
NB_f1 = []

for data, train in zip([five_test, ten_test, fifteen_test, twenty_test],
                       [five_train, ten_train, fifteen_train, twenty_train]):
    # Refit on this subset's training data right before scoring, so the
    # result never depends on what the pipeline was last fitted on.
    clf = NB_clf.fit(train.data, train.target)

    # Predict once and reuse the labels for all three metrics.
    predicted = clf.predict(data.data)

    NB_prc.append(precision_score(data.target, predicted, average='weighted'))
    NB_rec.append(recall_score(data.target, predicted, average='weighted'))
    NB_f1.append(f1_score(data.target, predicted, average='weighted'))

print(NB_prc, NB_rec, NB_f1)

[0.9333304357510203, 0.8915244280548772, 0.8684918261824941, 0.8218781741893993] [0.9279182356105433, 0.8619861185264281, 0.8261100300725278, 0.7738980350504514] [0.9278379899131345, 0.8616827513193228, 0.8166020487217819, 0.7684457156894653]


In [10]:
import pandas as pd

# One row per metric, one column per number of categories; the axis name
# labels the table with the model it belongs to.
nb_df = pd.DataFrame(
    data=[NB_prc, NB_rec, NB_f1],
    index=['Precision', 'Recall', 'F1'],
    columns=[5, 10, 15, 20],
).rename_axis("Naive Bayes")

nb_df


Unnamed: 0_level_0,5,10,15,20
Naive Bayes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Precision,0.93333,0.891524,0.868492,0.821878
Recall,0.927918,0.861986,0.82611,0.773898
F1,0.927838,0.861683,0.816602,0.768446


In [11]:
# K-nearest neighbours: weighted precision / recall / F1, one entry per
# category subset in the order 5, 10, 15, 20.
KNN_prc = []
KNN_rec = []
KNN_f1 = []

for data, train in zip([five_test, ten_test, fifteen_test, twenty_test],
                       [five_train, ten_train, fifteen_train, twenty_train]):
    # Refit the shared pipeline on this subset's training data.
    clf = KNN_clf.fit(train.data, train.target)

    # Predict once and reuse the labels for all three metrics; the neighbour
    # search is the costly part, so repeating it per metric wastes time.
    predicted = clf.predict(data.data)

    KNN_prc.append(precision_score(data.target, predicted, average='weighted'))
    KNN_rec.append(recall_score(data.target, predicted, average='weighted'))
    KNN_f1.append(f1_score(data.target, predicted, average='weighted'))


# One row per metric, one column per number of categories.
knn_df = pd.DataFrame(data=[KNN_prc, KNN_rec, KNN_f1], index=['Precision', 'Recall', 'F1'], columns=[5,10,15,20])
knn_df = knn_df.rename_axis("K-Nearest Neighbors")

knn_df

Unnamed: 0_level_0,5,10,15,20
K-Nearest Neighbors,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Precision,0.831971,0.755546,0.740087,0.677474
Recall,0.822485,0.750667,0.737308,0.65786
F1,0.820486,0.748264,0.73439,0.660516


In [12]:
# Linear SVC: weighted precision / recall / F1, one entry per category
# subset in the order 5, 10, 15, 20.
SVC_prc = []
SVC_rec = []
SVC_f1 = []

for data, train in zip([five_test, ten_test, fifteen_test, twenty_test],
                       [five_train, ten_train, fifteen_train, twenty_train]):
    # Refit the shared pipeline on this subset's training data.
    clf = SVC_clf.fit(train.data, train.target)

    # Predict once and reuse the labels for all three metrics.
    predicted = clf.predict(data.data)

    SVC_prc.append(precision_score(data.target, predicted, average='weighted'))
    SVC_rec.append(recall_score(data.target, predicted, average='weighted'))
    SVC_f1.append(f1_score(data.target, predicted, average='weighted'))


# One row per metric, one column per number of categories.
svc_df = pd.DataFrame(data=[SVC_prc, SVC_rec, SVC_f1], index=['Precision', 'Recall', 'F1'], columns=[5,10,15,20])
svc_df = svc_df.rename_axis("SVC")
print('done')
svc_df

done


Unnamed: 0_level_0,5,10,15,20
SVC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Precision,0.948146,0.901849,0.889507,0.839604
Recall,0.946746,0.898558,0.885371,0.834705
F1,0.947135,0.897493,0.884588,0.834515


In [15]:
# Decision tree: weighted precision / recall / F1, one entry per category
# subset in the order 5, 10, 15, 20.
tree_prc = []
tree_rec = []
tree_f1 = []

for data, train in zip([five_test, ten_test, fifteen_test, twenty_test],
                       [five_train, ten_train, fifteen_train, twenty_train]):
    # Refit the shared pipeline on this subset's training data.
    clf = Tree_clf.fit(train.data, train.target)

    # Predict once and reuse the labels for all three metrics.
    predicted = clf.predict(data.data)

    tree_prc.append(precision_score(data.target, predicted, average='weighted'))
    tree_rec.append(recall_score(data.target, predicted, average='weighted'))
    tree_f1.append(f1_score(data.target, predicted, average='weighted'))


# One row per metric, one column per number of categories.
tree_df = pd.DataFrame(data=[tree_prc, tree_rec, tree_f1], index=['Precision', 'Recall', 'F1'], columns=[5,10,15,20])
tree_df = tree_df.rename_axis("Decision Tree")

tree_df

Unnamed: 0_level_0,5,10,15,20
Decision Tree,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Precision,0.722225,0.590675,0.500301,0.409969
Recall,0.719204,0.592098,0.493189,0.410648
F1,0.720523,0.591003,0.495234,0.409645


In [17]:
# Now Word2Vec and BERT

import transformers
import tensorflow
# Load the pretrained uncased BERT tokenizer and its TensorFlow model.
# NOTE: the TF model class needs TensorFlow >= 2.0 installed; without it
# transformers warns that models are unavailable and only tokenizers work.
tokenizer = transformers.BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)
nlp = transformers.TFBertModel.from_pretrained('bert-base-uncased')

Neither PyTorch nor TensorFlow >= 2.0 have been found.Models won't be available and only tokenizers, configurationand file/data utilities can be used.
