In [37]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every expression statement in a cell, not only the last one;
# several cells below rely on this to show two results at once.
InteractiveShell.ast_node_interactivity = "all"

from sklearn.datasets import fetch_20newsgroups
from random import randint, seed
# Seeds Python's built-in `random` module only.
# NOTE(review): the scikit-learn calls below are given their own random_state,
# and nothing visible in this notebook draws from `random` -- confirm this seed is still needed.
seed(30027)

from sklearn.feature_extraction.text import CountVectorizer

## Question 1

In [38]:
categories = ['alt.atheism', 'talk.religion.misc']

# Both splits are fetched with identical settings; only `subset` differs.
fetch_options = dict(categories=categories, shuffle=True, random_state=30027)
data_train = fetch_20newsgroups(subset='train', **fetch_options)
data_test = fetch_20newsgroups(subset='test', **fetch_options)

# X_* hold the raw documents, y_* the integer class labels.
X_train, y_train = data_train.data, data_train.target
X_test, y_test = data_test.data, data_test.target

# Number of training and test documents.
len(X_train)
len(X_test)

857

570

In [39]:
# print() rather than a bare expression, so the line breaks in the post are rendered.
print(X_train[0])

From: acooper@mac.cc.macalstr.edu
Subject: Re: Where are they now?
Organization: Macalester College
Lines: 38

In article <1qi156INNf9n@senator-bedfellow.MIT.EDU>, tcbruno@athena.mit.edu (Tom Bruno) writes:
> 
> Wow.  Leave your terminal for a few months and everyone you remember goes
> away-- how depressing.  Actually, there are a few familiar faces out there,
> counting Bob and Kent, but I don't seem to recognize anyone else.  Has anyone
> heard from Graham Matthews recently, or has he gotten his degree and sailed
> for Greener Pastures (tm)?  
> 
> Which brings me to the point of my posting.  How many people out there have 
> been around alt.atheism since 1990?  I've done my damnedest to stay on top of
> the newsgroup, but when you fall behind, you REALLY fall behind (it's still not
> as bad as rec.arts.startrek used to be, but I digress).  Has anyone tried to
> keep up with the deluge?  Inquiring minds want to know!  Also-- does anyone
> keep track of where the more infamous poster

The text above is most likely from "alt.atheism", because the post mentions "alt.atheism" several times.

In [40]:
# Check the guess against the label of the first training document.
y_train[0]
# Label 0 -- presumably 'alt.atheism', the first entry of `categories`, which would confirm the guess.
# NOTE(review): verify with data_train.target_names[y_train[0]].

0

"""Question 1 (c) - can we train on X directly?"""
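No, we can't train on the text directly: each instance is a raw string, whereas the learners expect numeric feature vectors, so the documents must be vectorised first.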

## Question 2

In [41]:
"""Question 2 (b) - CountVectorizer"""

vectoriser = CountVectorizer()
# The vocabulary is fitted on the training documents only; the test documents
# are then mapped onto that same vocabulary.
X_train_vec, X_test_vec = (
    vectoriser.fit_transform(X_train),
    vectoriser.transform(X_test),
)

'Question 2 (b) - CountVectorizer'

What transformation does count vectorizer apply to the data?

it transforms each raw document to a vector of "token counts"

each column pertains to one of the words in the vocabulary of the whole corpus

each cell value is a count of the number of times that word has appeared in that document

Weakness of this kind of preprocessing for textual data?

very wide instances (sparse), which require feature selection

lose context - the order in which words occur and the proximity of words to each other, e.g. "the baby bites the dog" vs "the dog bites the baby"

In [42]:
# Shapes of the two count matrices; both were produced with the vocabulary
# fitted on the training documents.
X_train_vec.shape
X_test_vec.shape

(857, 18089)

(570, 18089)

In [22]:
# Look for test documents whose count vector is entirely zero, i.e. rows
# (documents, not columns) in which no token of the training vocabulary occurs.
# The number of rows is read from the matrix instead of being hard-coded, and
# each row is tested with the sparse count_nonzero() rather than by probing
# every (row, column) cell individually, which is very slow on a sparse matrix.
for j in range(X_test_vec.shape[0]):
    if X_test_vec[j].count_nonzero() == 0:
        print(j)

# In the recorded run nothing was printed, i.e. every test document has at least one non-zero count.
# In general an all-zero row is not impossible, e.g. when none of the words in a test document
# have been seen in the training documents (an empty document, or one in a different language).

In [43]:
"""Question 2 (a) - DictVectorizer"""
from sklearn.feature_extraction import DictVectorizer

# Three toy records with a numeric "a"/"b" and a string-valued "c";
# the first record has no "b" and the last has no "c".
example_data = [
    dict(a=2, c="hello"),
    dict(a=3, b=0, c="world"),
    dict(a=0, b=6),
]

# sparse=False returns a dense array, which is easier to read for a tiny example.
# In the output below, missing keys show up as 0 and "c" is expanded into one
# indicator column per distinct string value.
vectorizer = DictVectorizer(sparse=False)
example_transformed = vectorizer.fit_transform(example_data)
example_transformed

'Question 2 (a) - DictVectorizer'

array([[2., 0., 1., 0.],
       [3., 0., 0., 1.],
       [0., 6., 0., 0.]])

## Question 3

In [59]:
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif

# The vocabulary does not depend on the scoring function, so build the list of
# token names once instead of once per selected feature inside the loop.
# Recent scikit-learn releases only provide get_feature_names_out(), older ones
# only get_feature_names(); use whichever the installed version has.
_get_names = getattr(vectoriser, "get_feature_names_out", None) or vectoriser.get_feature_names
feature_names = _get_names()

# Compare the 10 best tokens according to chi-square and to mutual information.
for method in [chi2, mutual_info_classif]:

    selector = SelectKBest(method, k=10)

    X_train1 = selector.fit_transform(X_train_vec, y_train)

    # echoed because the notebook sets ast_node_interactivity = "all"
    X_train1.shape

    # column indices (into the vocabulary) of the K best features
    selected = selector.get_support(indices=True)

    # fetch the scores for the K best features
    scores = selector.scores_[selected]

    # fetch the names (tokens) for the K best features
    names = [feature_names[i] for i in selected]

    # sort by score (highest first), print out
    scores_names = sorted(zip(scores, names), reverse=True)
    display(scores_names)

(857, 10)

[(224.60860862640754, 'atheists'),
 (220.94682964410734, 'keith'),
 (218.65880101721638, 'atheism'),
 (160.67589129636838, 'caltech'),
 (132.57334250013005, 'christ'),
 (117.58179369988414, 'ra'),
 (111.25019141016739, 'jesus'),
 (101.74299973783235, 'islamic'),
 (99.93808580485398, 'brian'),
 (99.8538942196429, 'atheist')]

(857, 10)

[(0.09558402391043085, 'atheists'),
 (0.07532480782080347, 'keith'),
 (0.07114535053559949, 'the'),
 (0.0706314729031248, 'cco'),
 (0.06791361176738653, 'caltech'),
 (0.06568123450982752, 'schneider'),
 (0.06332024200377574, 'allan'),
 (0.06204259837971008, 'of'),
 (0.0447927508127979, 'atheism'),
 (0.04473706629144185, 'it')]

Do the tokens seem relevant to distinguishing between document types?

Yes, broadly relevant e.g. "atheists", "atheism", "jesus", "islamic"

There are some odd features, e.g. "brian", "ra", "keith" — these look like names of frequent posters rather than topical words, i.e. a peculiarity of this particular (small) dataset.

Is there evidence of the relative biases of chi^2 and PMI?
chi^2 is alleged to prefer "uncommon" words
PMI is alleged to (relative to chi^2) prefer "common" words

The bias is not particularly noticeable for the chi^2 features
The bias for PMI (computed here with `mutual_info_classif`) is noticeable: it selects more common words like "of", "it", "the"

In [48]:
# Column indices of the features kept by the fitted `selector`.
# NOTE(review): `selector` is whatever the most recently executed selection cell left behind;
# the execution counts in this notebook are out of order, so confirm which scoring function this output belongs to.
selector.get_support(indices=True)

array([ 2536,  2537,  2540,  3345,  3634,  4029,  9309,  9441,  9644,
       13470])

In [57]:
# Echo the function object to see which module it is defined in.
mutual_info_classif

<function sklearn.feature_selection.mutual_info_.mutual_info_classif>