In [1]:
from sklearn.datasets import fetch_20newsgroups
from nltk.tokenize import word_tokenize # Used to extract words from documents
from nltk.stem import WordNetLemmatizer # Used to lemmatize words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from sklearn.cluster import KMeans

import sys
from time import time

import pandas as pd
import numpy as np

In [31]:
# Selected 3 categories from the 20 newsgroups dataset

categories = [
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]

print("Loading 20 newsgroups dataset for categories:")
print(categories)

Loading 20 newsgroups dataset for categories:
['talk.religion.misc', 'comp.graphics', 'sci.space']


In [3]:
# fetch_20newsgroups() fetches the "20 Newsgroups" dataset

df = fetch_20newsgroups()
df

{'data': ["From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n",
  "From: guykuo@carson.u.washington.edu (Guy Kuo)\nSubject: SI Clock Poll - Final Call\nSummary: Final call for SI clock reports\nKeywords: SI,acceleration,clock,upgrade\nArticle-I.D.: shelley.1qvfo9INNc3s\nOrganization: University of Washingto

In [4]:
df['data']

["From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n",
 "From: guykuo@carson.u.washington.edu (Guy Kuo)\nSubject: SI Clock Poll - Final Call\nSummary: Final call for SI clock reports\nKeywords: SI,acceleration,clock,upgrade\nArticle-I.D.: shelley.1qvfo9INNc3s\nOrganization: University of Washington\nLines: 

In [5]:
print(len(df['data']))

type(df['data'])

11314


list

In [6]:
type(df)

In [7]:
len(df['data'])

11314

In [8]:
df.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [9]:
df['filenames']

array(['/root/scikit_learn_data/20news_home/20news-bydate-train/rec.autos/102994',
       '/root/scikit_learn_data/20news_home/20news-bydate-train/comp.sys.mac.hardware/51861',
       '/root/scikit_learn_data/20news_home/20news-bydate-train/comp.sys.mac.hardware/51879',
       ...,
       '/root/scikit_learn_data/20news_home/20news-bydate-train/comp.sys.ibm.pc.hardware/60695',
       '/root/scikit_learn_data/20news_home/20news-bydate-train/comp.graphics/38319',
       '/root/scikit_learn_data/20news_home/20news-bydate-train/rec.motorcycles/104440'],
      dtype='<U86')

In [10]:
df['target']

array([7, 4, 4, ..., 3, 1, 8])

In [11]:
len(df['target_names'])

20

In [13]:
# subset ---> The subset parameter specifies which portion of the dataset you want to load. It can take values like 'train', 'test', or 'all'
# default value of subset parameter is 'train' (only 1554 out of 2588 loaded)
# here we are taking setting subset value to 'all' (to load both training and testing data)

# categories ---> Defines the categories of the newsgroups to be loaded.

# shuffle ---> This parameter specifies whether to shuffle the order of the documents

# remove ---> Tells the function to remove certain parts of the text that are usually not relevant to the content itself like
# headers(from, subject)

df1 = fetch_20newsgroups(subset='all', categories=categories, shuffle=False, remove=('headers', 'footers', 'quotes'))
df1

{'data': ['\n\nI think I can. Largely as a result of efforts by people reading this group\nwriting letters and making phone calls the following has happened:\n\n1. NASA reprogrammed funds to keep NASP alive in 1991.\n2. Efforts to kill DC-X and the SSRT progam where twice twarted\n   (Feb. and June of last year).\n3. Gouldin kept his job in spite of heavy lobbying against him.\n\nThis may not be what Mark was thinking of but it shows that the\nreaders of sci.space DO have power and influence.\n\n  Allen\n',
  'In regards to fractal commpression, I have seen 2 fractal compressed "movies".\nThey were both fairly impressive.  The first one was a 64 gray scale "movie" of\nCasablanca, it was 1.3MB and had 11 minutes of 13 fps video.  It was a little\ngrainy but not bad at all.  The second one I saw was only 3 minutes but it\nhad 8 bit color with 10fps and measured in at 1.2MB.\n\nI consider the fractal movies a practical thing to explore.  But unlike many \nother formats out there, you do e

In [14]:
df2 = fetch_20newsgroups(subset='all', categories=['alt.atheism'], shuffle=False) #, remove=('headers', 'footers', 'quotes'))
df2

{'data': ["From: keith@cco.caltech.edu (Keith Allan Schneider)\nSubject: Re: <Political Atheists?\nOrganization: California Institute of Technology, Pasadena\nLines: 14\nNNTP-Posting-Host: lloyd.caltech.edu\n\nbobbe@vice.ICO.TEK.COM (Robert Beauchaine) writes:\n\n>To show that the examples I and others\n>have provided are *not* counter examples of your supposed inherent\n>moral hypothesis, you have to successfully argue that\n>domestication removes or alters this morality.\n\nI think that domestication will change behavior to a large degree.\nDomesticated animals exhibit behaviors not found in the wild.  I\ndon't think that they can be viewed as good representatives of the\nwild animal kingdom, since they have been bred for thousands of years\nto produce certain behaviors, etc.\n\nkeith\n",
  'From: I3150101@dbstu1.rz.tu-bs.de (Benedikt Rosenau)\nSubject: Re: An Anecdote about Islam\nOrganization: Technical University Braunschweig, Germany\nLines: 74\n\nIn article <114140@bu.edu>\njaeg

In [21]:
np.unique(df1['target'])  # we have selected only 3 categories, so there are only 3 targets

array([0, 1, 2])

In [22]:
labels = df1.target
print(labels)

[1 0 1 ... 0 0 1]


In [38]:
# PERFORM LEMMATIZATION
# The process of converting a word to its dictionary (base) form, or lemma, by removing inflectional endings and returning the base or
# dictionary form of a word.

import nltk
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

PERFORMING **LEMMIZATION**

In [39]:
lemmatizer = WordNetLemmatizer()

for i in range(len(df1['data'])):
  tokenized_list = word_tokenize(df1['data'][i])
  lemmatized_doc = ""
  for word in tokenized_list:
    lemmatized_doc = lemmatized_doc + " " + lemmatizer.lemmatize(word)
  df1['data'][i] = lemmatized_doc

In [40]:
print(df1.data[0])

 I think I can . Largely a a result of effort by people reading this group writing letter and making phone call the following ha happened : 1 . NASA reprogrammed fund to keep NASP alive in 1991 . 2 . Efforts to kill DC-X and the SSRT progam where twice twarted ( Feb. and June of last year ) . 3 . Gouldin kept his job in spite of heavy lobbying against him . This may not be what Mark wa thinking of but it show that the reader of sci.space DO have power and influence . Allen


In [41]:
# Converting text data into numerical features using the TF-IDF (Term Frequency-Inverse Document Frequency) vectorization technique.

# strip_accents='unicode' ---> any accented characters will be converted to their ASCII equivalents using Unicode normalization.
# ("café" would become "cafe")

# stop_words = 'english' ---> removes common English stop words(like "the", "and", "in") from the text before vectorization

# min_df ---> sets the minimum document frequency.
# min_df=2 means that any word that appears in fewer than 2 documents in the corpus(collection of documents) will be ignored(not included
# in the vocabulary of the vectorizer)
# main use is to remove the spelling mistakes

vectorizer = TfidfVectorizer(strip_accents='unicode', stop_words='english', min_df=2)

# fit_transform ---> fits the TfidfVectorizer to the text data df1.data, which means it learns the vocabulary (i.e., the set of all unique words)
# from the input documents and computes the IDF (Inverse Document Frequency) values for all the terms in the corpus.

# It then transforms the text data into a sparse matrix of TF-IDF features. Each row of this matrix represents a document, and each column
# represents a term (word) from the vocabulary.

# The value at a given position in this matrix represents the TF-IDF score of a word in a particular document, which reflects how important
# that word is in the document compared to its frequency across the entire corpus(collection of documents).

X = vectorizer.fit_transform(df1.data)

In [42]:
X.shape

(2588, 14439)

**MODEL FITTING** USING **TF-IDF**

In [45]:
# Clustering using standard k-means (groups text data into clusters based on their similarity.)

# max_iter ---> specifies the maximum number of iterations the algorithm will run for each single run. If the algorithm hasn't converged
# (i.e., the clusters haven't stabilized) after 100 iterations, it will stop ---> prevents from infinite loop)

# init ---> The init parameter determines how the initial cluster centers are selected. k-means++ is a method that chooses initial cluster
# centers in a way that speeds up convergence and often leads to better clustering results.

km = KMeans(n_clusters=len(np.unique(df1['target'])), init='k-means++', max_iter=100)
km.fit(X)

  super()._check_params_vs_input(X, default_n_init=10)


**METRICS**

In [47]:
print(km.labels_)
print(labels)

[2 2 2 ... 1 1 2]
[1 0 1 ... 0 0 1]


In [48]:
cnt = 0
correct = 0
for i in range(len(labels)):
  if km.labels_[i] == labels[i]:
    correct += 1
  cnt += 1

print(correct,cnt)

408 2588


In [50]:
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, fowlkes_mallows_score

# ADJUSTED RAND INDEX(ARI)
print(adjusted_rand_score(labels, km.labels_))

# NORMALIZED MUTUAL INFORMATION(NMI)
print(normalized_mutual_info_score(labels, km.labels_))

# FOWLKES-MALLOWS INDEX(FMI)
print(fowlkes_mallows_score(labels, km.labels_))

0.2804518505770143
0.3860563523978226
0.5819819655908288
