# Load Data

In [1]:
from sklearn.datasets import fetch_20newsgroups

In [2]:
categories = ['alt.atheism', 'comp.graphics', 'talk.politics.misc']

In [3]:
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'))

In [4]:
print(newsgroups_train.data[0])


I agree, I had a hard feeling not believing my grand-grand mother
who told me of elves dancing outside barns in the early mornings.
I preferred not to accept it, even if her statement provided
the truth itself. Life is hard.

Cheers,
Kent


# Pre-processing

In [5]:
! python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
[K     |████████████████████████████████| 13.9 MB 3.3 MB/s eta 0:00:01
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [6]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_sm

In [7]:
import string

punctuations = string.punctuation
stopwords = list(STOP_WORDS)
stopwords[:10]

['in',
 'everyone',
 'next',
 'hers',
 'seems',
 'although',
 'herein',
 'wherein',
 'quite',
 'two']

In [8]:
import pandas as pd

df = pd.DataFrame(newsgroups_train.data, columns=["text"])
df = df.replace('\n','', regex=True)
df

Unnamed: 0,text
0,"I agree, I had a hard feeling not believing my..."
1,
2,Has anyone got multiverse to work ?I have buil...
3,"[deletions...]First of all, infinity is a math..."
4,It's not what they did that matters. It's wha...
...,...
1524,"I recommend the book ""Adams _v_ Texas"", the st..."
1525,There are several public domain utilities avai...
1526,I did not claim that our system was objective.
1527,"I'm looking for a database called ""Micro World..."


In [9]:
# Parser
parser = spacy.load("en_core_web_sm")
parser.max_length = 7000000

def spacy_tokenizer(sentence):
    mytokens = parser(sentence)
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [ word for word in mytokens if word not in stopwords and word not in punctuations ]
    mytokens = " ".join([i for i in mytokens])
    return mytokens

In [10]:
from tqdm import tqdm
tqdm.pandas()
df["processed_text"] = df["text"].progress_apply(spacy_tokenizer)

100%|███████████████████████████████████████████████████████████████| 1529/1529 [00:51<00:00, 29.47it/s]


In [11]:
df["processed_text"]

0       agree hard feeling believe grand grand motherw...
1                                                        
2       multiverse work build 486 svr4 mips svr4s sun ...
3       deletion ... ]first infinity mathematical conc...
4       matter andwhat response matter welessen kill r...
                              ...                        
1524    recommend book adams v texas story man adams w...
1525    public domain utility available usualarchive s...
1526                               claim system objective
1527    look database micro world data bank ii databas...
1528    specifically change talk arguingthat motto int...
Name: processed_text, Length: 1529, dtype: object

# Vectorize

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

text = df['processed_text'].values
vectorizer = TfidfVectorizer(max_features=30)
X = vectorizer.fit_transform(text).toarray()

In [13]:
X.shape

(1529, 30)

In [14]:
y = newsgroups_train.target

# Organize Data

In [15]:
y_true = y.copy()
y_experiment = y_true.copy()

In [16]:
import numpy as np
rng = np.random.RandomState(42)
random_unlabeled_points = rng.rand(y_experiment.shape[0]) < 0.5
y_experiment[random_unlabeled_points] = -1

# RFoT

In [None]:
from RFoT import RFoT

model = RFoT(
        bin_scale=1,
        max_dimensions=5,
        component_purity_tol=0.99,
        min_rank=11,
        max_rank=21,
        n_estimators=50,
        bin_entry=True,
        clustering="ms",
        max_depth=2,
        n_jobs=10,
)
y_pred = model.predict(X, y_experiment)

100%|███████████████████████████████████████████████████████████████████| 50/50 [01:05<00:00,  1.31s/it]
 12%|████████▏                                                           | 6/50 [00:15<01:16,  1.75s/it]

# Look at the results

In [None]:
from sklearn.metrics import f1_score

unknown_indices = np.argwhere(y_experiment == -1).flatten()
did_predict_indices = np.argwhere(y_pred[unknown_indices] != -1).flatten()
abstaining_count = len(np.argwhere(y_pred == -1))
f1 = f1_score(
    y_true[unknown_indices][did_predict_indices],
    y_pred[unknown_indices][did_predict_indices],
    average="weighted",
)

print("------------------------")
print("Num. of Abstaining", abstaining_count)
print("Percent Abstaining", (abstaining_count / len(unknown_indices)) * 100, "%")
print("F1=", f1)

In [None]:
from sklearn.metrics import classification_report

y_true_hat = y_true[unknown_indices][did_predict_indices]
y_pred_hat = y_pred[unknown_indices][did_predict_indices]
print(classification_report(y_true_hat, y_pred_hat))