# Load Data

In [1]:
from sklearn.datasets import fetch_20newsgroups

categories = ['alt.atheism', 'comp.graphics']
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'))
print(newsgroups_train.data[0])


I'll take a wild guess and say Freedom is objectively valuable.  I base
this on the assumption that if everyone in the world were deprived utterly
of their freedom (so that their every act was contrary to their volition),
almost all would want to complain.  Therefore I take it that to assert or
believe that "Freedom is not very valuable", when almost everyone can see
that it is, is every bit as absurd as to assert "it is not raining" on
a rainy day.  I take this to be a candidate for an objective value, and it
it is a necessary condition for objective morality that objective values
such as this exist.



# Pre-processing

**Run below for the dependencies:**
- ```pip install spacy```
- ```python -m spacy download en_core_web_sm```

In [2]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_sm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import string

punctuations = string.punctuation
stopwords = list(STOP_WORDS)
stopwords[:10]

['only',
 'further',
 'where',
 'throughout',
 'anyone',
 'down',
 'being',
 'name',
 'whence',
 'first']

In [4]:
import pandas as pd

df = pd.DataFrame(newsgroups_train.data, columns=["text"])
df = df.replace('\n','', regex=True)
df

Unnamed: 0,text
0,I'll take a wild guess and say Freedom is obje...
1,Is there a precompiled version of hp2xx for DO...
2,Does anyone know of any good shareware animati...
3,It should have been made fairly clear that the...
4,
...,...
1059,Hi ... Recently I found XV for MS-DOS in a sub...
1060,"(reference line trimmed)[...]Yes.Well, for exa..."
1061,"No need to correct it, it stands as it is sa..."
1062,"For the last time, Bobby. Lack of belief in YO..."


In [5]:
# Parser
parser = spacy.load("en_core_web_sm")
parser.max_length = 7000000

def spacy_tokenizer(sentence):
    mytokens = parser(sentence)
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [ word for word in mytokens if word not in stopwords and word not in punctuations ]
    mytokens = " ".join([i for i in mytokens])
    return mytokens

In [6]:
from tqdm import tqdm
tqdm.pandas()
df["processed_text"] = df["text"].progress_apply(spacy_tokenizer)

100%|██████████| 1064/1064 [00:22<00:00, 48.20it/s]


In [7]:
df["processed_text"]

0       wild guess freedom objectively valuable baseth...
1       precompile version hp2xx dos prefereablyfor 38...
2       know good shareware animation paint software s...
3       fairly clear crimson wouldever 150 75 old styl...
4                                                        
                              ...                        
1059    hi ... recently find xv ms dos subdirectory gn...
1060    reference line trimmed)[ ... ]yes example goal...
1061    need correct stand miss point entirely thing d...
1062    time bobby lack belief god implyatheism moslem...
1063    hi i've vga mode 13h graphic library available...
Name: processed_text, Length: 1064, dtype: object

# Vectorize

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

text = df['processed_text'].values
vectorizer = TfidfVectorizer(max_features=30)
X = vectorizer.fit_transform(text).toarray()

In [9]:
X.shape

(1064, 30)

In [10]:
y = newsgroups_train.target

In [11]:
X

array([[0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.52671177, 0.        ,
        0.        ],
       [0.        , 0.31779536, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.39560222]])

# Organize Data

In [12]:
y_true = y.copy()
y_experiment = y_true.copy()

In [13]:
import numpy as np
rng = np.random.RandomState(42)
random_unlabeled_points = rng.rand(y_experiment.shape[0]) < 0.3
y_experiment[random_unlabeled_points] = -1

# RFoT

In [14]:
from RFoT import RFoT

model = RFoT(
        bin_scale=1,
        max_dimensions=8,
        component_purity_tol=1.0,
        min_rank=2,
        max_rank=10,
        n_estimators=100,
        bin_entry=True,
        clustering="ms",
        max_depth=4,
        n_jobs=50,
)
y_pred = model.predict(X, y_experiment)

100%|██████████| 100/100 [00:17<00:00,  5.88it/s]
100%|██████████| 100/100 [00:08<00:00, 11.87it/s]
100%|██████████| 100/100 [00:10<00:00,  9.36it/s]
100%|██████████| 100/100 [00:09<00:00, 10.09it/s]


# Look at the results

In [15]:
from sklearn.metrics import f1_score

unknown_indices = np.argwhere(y_experiment == -1).flatten()
did_predict_indices = np.argwhere(y_pred[unknown_indices] != -1).flatten()
abstaining_count = len(np.argwhere(y_pred == -1))
f1 = f1_score(
    y_true[unknown_indices][did_predict_indices],
    y_pred[unknown_indices][did_predict_indices],
    average="weighted",
)

print("------------------------")
print("Num. of Abstaining", abstaining_count)
print("Percent Abstaining", (abstaining_count / len(unknown_indices)) * 100, "%")
print("F1=", f1)

------------------------
Num. of Abstaining 235
Percent Abstaining 70.35928143712576 %
F1= 0.9698783541333965


In [16]:
from sklearn.metrics import classification_report

y_true_hat = y_true[unknown_indices][did_predict_indices]
y_pred_hat = y_pred[unknown_indices][did_predict_indices]
print(classification_report(y_true_hat, y_pred_hat))

              precision    recall  f1-score   support

           0       0.93      0.96      0.94        26
           1       0.99      0.97      0.98        73

    accuracy                           0.97        99
   macro avg       0.96      0.97      0.96        99
weighted avg       0.97      0.97      0.97        99

