# Load News Dataset

https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

https://www.nltk.org/book/ch05.html

In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_20newsgroups

# Newsgroups used throughout this notebook.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]
# Training split; shuffling with a fixed seed keeps the document order reproducible.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [None]:
twenty_train.target_names  # class names; the list position is the integer class id

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [None]:
len(twenty_train.data),len(twenty_train.filenames)  # number of documents and number of file paths

(2257, 2257)

In [None]:
twenty_train.filenames[0]  # file path of the first document

'/root/scikit_learn_data/20news_home/20news-bydate-train/comp.graphics/38440'

In [None]:
twenty_train.target_names[twenty_train.target[0]] # twenty_train.target[0] is the class id of the first document; look up its class name

'comp.graphics'

In [None]:
# Show the first three lines of the first document, one per output line.
print(*twenty_train.data[0].split("\n")[:3], sep="\n")

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton


In [None]:
 twenty_train.target[:10]  # integer class ids of the first ten documents

array([1, 1, 3, 3, 3, 3, 3, 2, 2, 2])

In [None]:
# Translate the first ten class ids into their class names.
for class_id in twenty_train.target[:10]:
    print(twenty_train.target_names[class_id])

comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med


# Extracting features from text

## Tagger

In [None]:
# import re
# def processText(text,lemma=False, gram=1, rmStop=True, low=True): # default remove stop words
#     text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b|@\w+|#|<[^>]+>|\s+[a-zA-Z]\s+', '', text, flags=re.MULTILINE) #delete URL, #hashtag# , @xxx, html tag, sigle char
#     tokens = word_tokenize(text)
#     whitelist = ["n't", "not", "no"]
#     new_tokens = []
#     stoplist = stopwordEn if rmStop else []
#     for i in tokens:
#       if i not in ['.',',','!','?','<','>','/','-','_',':','(',')'] and (i not in stoplist or i in whitelist):  #i.isalpha() or i not in [list of removel special char] 
#         if low: i = i.lower()
#         if lemma: i = lemmaWord(i)
#         new_tokens.append(i)
#     del tokens
#     # tokens = [lemmaWord(i.lower()) if lemma else i.lower() for i in tokens if (i.lower() not in stoplist or i.lower() in whitelist) and i.isalpha()]
#     if gram<=1:
#         return new_tokens
#     else:
#         return [' '.join(i) for i in nltk.ngrams(new_tokens, gram)]

In [None]:
# processText('Hello, world, my name is marshal?!54.34.53. ht.t.atr.ett http://sfdsd @fdaf #gdkg #fdjk# <p>sfd</p> a fd',rmStop=False,low=False)

['Hello',
 'world',
 'my',
 'name',
 'is',
 'marshal',
 '54.34.53',
 'ht.t.atr.ett',
 'gdkg',
 'fdjk',
 'sfdfd']

In [None]:
# def preprocess_text(sen):
#     # Removing html tags
#     sentence = remove_tags(sen)
#     # Remove punctuations and numbers
#     sentence = re.sub('[^a-zA-Z]', ' ', sentence)
#     # Single character removal
#     sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)
#     # Removing multiple spaces
#     sentence = re.sub(r'\s+', ' ', sentence)
#     return sentence

# TAG_RE = re.compile(r'<[^>]+>')

# def remove_tags(text):
#     return TAG_RE.sub('', text)

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Fetch the tokenizer and tagger resources needed by the calls below.
for resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(resource)

# Tokenize every training document, then POS-tag all of them in one batch call.
train_token = [word_tokenize(doc) for doc in twenty_train.data]
# train_token = [[t.lower() for t in l] for l in train_token]
train_tags = nltk.pos_tag_sents(train_token)  # tagsets: universal, wsj, brown... default penn treebank
# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
train_tags[0][:3]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


[('From', 'IN'), (':', ':'), ('sd345', 'NN')]

In [None]:
train_tags[0][:20]  # first 20 (token, tag) pairs of the first document

[('From', 'IN'),
 (':', ':'),
 ('sd345', 'NN'),
 ('@', 'NN'),
 ('city.ac.uk', 'NN'),
 ('(', '('),
 ('Michael', 'NNP'),
 ('Collier', 'NNP'),
 (')', ')'),
 ('Subject', 'NN'),
 (':', ':'),
 ('Converting', 'NN'),
 ('images', 'NNS'),
 ('to', 'TO'),
 ('HP', 'NNP'),
 ('LaserJet', 'NNP'),
 ('III', 'NNP'),
 ('?', '.'),
 ('Nntp-Posting-Host', 'NN'),
 (':', ':')]

In [None]:
# first news: collect its POS tags in token order
tags1_ary = [pair[1] for pair in train_tags[0]]
# count how often each tag occurs, most frequent first
tagDist = nltk.FreqDist(tags1_ary)
tagDist.most_common()

[('NNP', 28),
 ('NN', 19),
 (':', 9),
 ('.', 8),
 ('DT', 8),
 ('JJ', 8),
 ('CD', 5),
 ('IN', 4),
 ('(', 4),
 (')', 4),
 ('NNS', 4),
 ('TO', 4),
 (',', 4),
 ('VB', 3),
 ('VBZ', 2),
 ('NNPS', 1),
 ('PRP', 1),
 ('MD', 1),
 ('RB', 1),
 ('VBG', 1)]

In [None]:
nltk.FreqDist(train_tags[0]).most_common() # frequency of each (token, tag) pair in the first document, most frequent first

[((':', ':'), 8),
 (('.', '.'), 6),
 (('(', '('), 4),
 ((')', ')'), 4),
 (('to', 'TO'), 4),
 ((',', ','), 4),
 (('Michael', 'NNP'), 3),
 (('The', 'DT'), 3),
 (('Collier', 'NNP'), 2),
 (('HP', 'NNP'), 2),
 (('LaserJet', 'NNP'), 2),
 (('III', 'NNP'), 2),
 (('?', '.'), 2),
 (('City', 'NNP'), 2),
 (('University', 'NNP'), 2),
 (('files', 'NNS'), 2),
 (('the', 'DT'), 2),
 (('071', 'CD'), 2),
 (('From', 'IN'), 1),
 (('sd345', 'NN'), 1),
 (('@', 'NN'), 1),
 (('city.ac.uk', 'NN'), 1),
 (('Subject', 'NN'), 1),
 (('Converting', 'NN'), 1),
 (('images', 'NNS'), 1),
 (('Nntp-Posting-Host', 'NN'), 1),
 (('hampton', 'NN'), 1),
 (('Organization', 'NN'), 1),
 (('Lines', 'NNPS'), 1),
 (('14', 'CD'), 1),
 (('Does', 'NNP'), 1),
 (('anyone', 'NN'), 1),
 (('know', 'NN'), 1),
 (('of', 'IN'), 1),
 (('a', 'DT'), 1),
 (('good', 'JJ'), 1),
 (('way', 'NN'), 1),
 (('standard', 'JJ'), 1),
 (('PC', 'NN'), 1),
 (('application/PD', 'JJ'), 1),
 (('utility', 'NN'), 1),
 (('convert', 'VB'), 1),
 (('tif/img/tga', 'JJ'), 1)

In [None]:
[token for (token, tag) in train_tags[0] if tag=='NN'] # check tokens that are tagged as "NN"
# one entry is kept per occurrence, so a repeated token would appear more than once

['sd345',
 '@',
 'city.ac.uk',
 'Subject',
 'Converting',
 'Nntp-Posting-Host',
 'hampton',
 'Organization',
 'anyone',
 'know',
 'way',
 'PC',
 'utility',
 'format',
 'plotter',
 'response',
 'group',
 'advance',
 'uk.ac.city']

In [None]:
# (token, count) for every (token, tag) pair of the first document whose tag is NN
[
    (token, freq)
    for ((token, tag), freq) in nltk.FreqDist(train_tags[0]).most_common()
    if tag == 'NN'
]

[('sd345', 1),
 ('@', 1),
 ('city.ac.uk', 1),
 ('Subject', 1),
 ('Converting', 1),
 ('Nntp-Posting-Host', 1),
 ('hampton', 1),
 ('Organization', 1),
 ('anyone', 1),
 ('know', 1),
 ('way', 1),
 ('PC', 1),
 ('utility', 1),
 ('format', 1),
 ('plotter', 1),
 ('response', 1),
 ('group', 1),
 ('advance', 1),
 ('uk.ac.city', 1)]

In [None]:
# Pool the (token, tag) pairs of the first 100 training documents.
subsets = [pair for doc_idx in range(100) for pair in train_tags[doc_idx]]

# token -> distribution over the tags it received
cfd1 = nltk.ConditionalFreqDist(subsets)
# tag -> distribution over the tokens that received it
cfd2 = nltk.ConditionalFreqDist((tag, token) for (token, tag) in subsets)

In [None]:
cfd1['cause']  # tags assigned to the token 'cause' in the pooled documents, with counts

FreqDist({'NN': 9, 'VB': 6, 'VBP': 2})

In [None]:
cfd2['VB'].most_common(10)  # ten most frequent tokens tagged VB in the pooled documents

[('be', 184),
 ('>', 78),
 ('have', 50),
 ('do', 32),
 ('know', 24),
 ('make', 17),
 ('get', 15),
 ('find', 15),
 ('help', 12),
 ('take', 12)]

In [None]:
# Show every (token, tag) pair of the first document.
# NOTE(review): the saved output lists lower-cased tokens, which the active code does not
# produce (lower-casing is commented out in the tagging cell) - re-run to refresh it.
(train_tags[0])

[('from', 'IN'),
 (':', ':'),
 ('sd345', 'NN'),
 ('@', 'NN'),
 ('city.ac.uk', 'NN'),
 ('(', '('),
 ('michael', 'FW'),
 ('collier', 'NN'),
 (')', ')'),
 ('subject', 'NN'),
 (':', ':'),
 ('converting', 'NN'),
 ('images', 'NNS'),
 ('to', 'TO'),
 ('hp', 'VB'),
 ('laserjet', 'NN'),
 ('iii', 'NN'),
 ('?', '.'),
 ('nntp-posting-host', 'NN'),
 (':', ':'),
 ('hampton', 'NN'),
 ('organization', 'NN'),
 (':', ':'),
 ('the', 'DT'),
 ('city', 'NN'),
 ('university', 'NN'),
 ('lines', 'NNS'),
 (':', ':'),
 ('14', 'CD'),
 ('does', 'VBZ'),
 ('anyone', 'NN'),
 ('know', 'VB'),
 ('of', 'IN'),
 ('a', 'DT'),
 ('good', 'JJ'),
 ('way', 'NN'),
 ('(', '('),
 ('standard', 'JJ'),
 ('pc', 'NN'),
 ('application/pd', 'JJ'),
 ('utility', 'NN'),
 (')', ')'),
 ('to', 'TO'),
 ('convert', 'VB'),
 ('tif/img/tga', 'JJ'),
 ('files', 'NNS'),
 ('into', 'IN'),
 ('laserjet', 'NN'),
 ('iii', 'JJ'),
 ('format', 'NN'),
 ('.', '.'),
 ('we', 'PRP'),
 ('would', 'MD'),
 ('also', 'RB'),
 ('like', 'VB'),
 ('to', 'TO'),
 ('do', 'VB'),
 (

## Tokenizing

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Two independent bag-of-words models: one over tokens, one over POS tags.
count_vect = CountVectorizer(analyzer='word', ngram_range=(1, 1))
count_vect_tag = CountVectorizer(analyzer='word', ngram_range=(1, 1))

# Rebuild one space-joined string per document from the tagged pairs.
X_token = [' '.join(token for (token, _) in doc) for doc in train_tags]
X_tag = [' '.join(tag for (_, tag) in doc) for doc in train_tags]

# Fit each vocabulary and materialise the counts as dense arrays.
X_train_counts = count_vect.fit_transform(X_token).toarray()
X_train_tag = count_vect_tag.fit_transform(X_tag).toarray()

X_train_counts.shape, X_train_tag.shape

((2257, 35782), (2257, 33))

In [None]:
# concat token features with tag features: each row is a document's token counts
# followed by its tag counts. Rows are paired with zip rather than a hard-coded
# document count, so the cell keeps working if the corpus size changes.
X_concat = [
    np.concatenate((token_row, tag_row), axis=0)
    for token_row, tag_row in zip(X_train_counts, X_train_tag)
]

In [None]:
X_concat[0].shape  # length of one combined feature row (token columns followed by tag columns)

(35815,)

In [None]:
# First three tag features in column order. Read from the fitted vocabulary
# (term -> column index) instead of get_feature_names(), which newer
# scikit-learn releases no longer provide.
sorted(count_vect_tag.vocabulary_, key=count_vect_tag.vocabulary_.get)[:3]

['cc', 'cd', 'dt']

In [None]:
# Tag vocabulary (tag -> column index). The tag vectorizer is the one being
# inspected in this section; count_vect holds the much larger token vocabulary.
count_vect_tag.vocabulary_

{'cc': 0,
 'cd': 1,
 'dt': 2,
 'ex': 3,
 'fw': 4,
 'in': 5,
 'jj': 6,
 'jjr': 7,
 'jjs': 8,
 'md': 9,
 'nn': 10,
 'nnp': 11,
 'nnps': 12,
 'nns': 13,
 'pdt': 14,
 'pos': 15,
 'prp': 16,
 'rb': 17,
 'rbr': 18,
 'rbs': 19,
 'rp': 20,
 'sym': 21,
 'to': 22,
 'uh': 23,
 'vb': 24,
 'vbd': 25,
 'vbg': 26,
 'vbn': 27,
 'vbp': 28,
 'vbz': 29,
 'wdt': 30,
 'wp': 31,
 'wrb': 32}

In [None]:
# X_train_tag is already a dense array (it was converted when it was built),
# so it is displayed directly; calling .toarray() on an ndarray would fail.
X_train_tag # each number is a tag occurrence value

array([[  0,   5,   8, ...,   0,   0,   0],
       [  1,   4,  28, ...,   1,   0,   1],
       [ 15,   8,  45, ...,   1,   3,   2],
       ...,
       [ 16,  12,  55, ...,   5,   2,   2],
       [ 75,  38, 252, ...,  10,   4,  21],
       [  0,   3,   9, ...,   0,   1,   0]])

In [None]:
# tag occurrences in news[0]: how many tag columns share each count value.
# Uses the (already dense) tag matrix; the token matrix has no .toarray() either.
pd.Series(X_train_tag[0]).value_counts()

0     18
1      5
4      3
8      2
28     1
19     1
5      1
3      1
2      1
dtype: int64

In [None]:
# grammar = r"""
#   NP: {<DT|JJ|NN.*>+}          # Chunk sequences of DT, JJ, NN
#   PP: {<IN><NP>}               # Chunk prepositions followed by NP
#   VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
#   CLAUSE: {<NP><VP>}           # Chunk NP, VP
#   """
# cp = nltk.RegexpParser(grammar)
# chunked = cp.parse(train_tags[100])
# # chunked.draw()  

# for subtree in chunked.subtrees():
#     print(subtree)

# Training a classifier

In [None]:
docs_new = ['God is love', 'OpenGL on the GPU is fast', 'parallel computing','compoud discovery']
def getTags(text):
  """Return one space-joined POS-tag string per document.

  Each document is tokenized with ``word_tokenize``, every token is
  lower-cased, and the tokens are tagged with NLTK's default tagger.

  Args:
    text: iterable of raw document strings.

  Returns:
    list of str, one per input document, holding its tags in token order.
  """
  lowered_tokens = [[tok.lower() for tok in word_tokenize(doc)] for doc in text]
  # Tag the whole batch with one pos_tag_sents call instead of one
  # nltk.pos_tag call per document, as done for the training set.
  tagged_docs = nltk.pos_tag_sents(lowered_tokens)
  return [' '.join(tag for (_, tag) in doc) for doc in tagged_docs]

tags_new = getTags(docs_new)
tags_new

['NN VBZ VB', 'NN IN DT NN VBZ JJ', 'NNS VBG', 'NN NN']

In [None]:
# only token
from sklearn.naive_bayes import MultinomialNB

# Held-out split, vectorized with the vocabulary fitted on the training tokens.
# NOTE(review): the training matrix was built from re-joined NLTK tokens while the
# test matrix comes from raw text - confirm both tokenizations agree.
twenty_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)
test_token = count_vect.transform(twenty_test.data)
X_new_counts = count_vect.transform(docs_new)

# Naive Bayes on the token counts alone.
clf = MultinomialNB()
clf.fit(X_train_counts, twenty_train.target)
predicted = clf.predict(test_token)  # make prediction
print("Accuracy:", (predicted == twenty_test.target).mean())  # evaluation

Accuracy: 0.9347536617842876


In [None]:
from sklearn.linear_model import SGDClassifier, LogisticRegression

# Hinge-loss linear model trained with SGD on the token counts.
clf = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=5,
    tol=None,
)
clf.fit(X_train_counts, twenty_train.target)
predicted = clf.predict(test_token)  # make prediction
print("Accuracy:", (predicted == twenty_test.target).mean())  # evaluation

Accuracy: 0.8608521970705726


In [None]:
# Logistic regression on the token counts. The saved run stopped at the default
# iteration limit (ConvergenceWarning), so the budget is raised; increase it
# further or scale the features if the warning persists.
clf = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_counts, twenty_train.target)
predicted = clf.predict(test_token) # make prediction
print("Accuracy:",np.mean(predicted == twenty_test.target)) # evaluation

Accuracy: 0.8908122503328895


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
# combined features
def getToeknTags(text):
  """Build combined token + POS-tag count features for raw documents.

  Documents are tokenized and tagged, then turned into counts with the
  already-fitted ``count_vect`` (tokens) and ``count_vect_tag`` (tags).

  Args:
    text: iterable of raw document strings.

  Returns:
    list of 1-D arrays, one per document: token counts followed by tag counts.
  """
  tagged_docs = nltk.pos_tag_sents([word_tokenize(doc) for doc in text])

  # One space-joined string per document for each of the two vectorizers.
  token_strings = [' '.join(token for (token, _) in doc) for doc in tagged_docs]
  tag_strings = [' '.join(tag for (_, tag) in doc) for doc in tagged_docs]

  token_counts = count_vect.transform(token_strings).toarray()
  tag_counts = count_vect_tag.transform(tag_strings).toarray()

  return [
      np.concatenate((token_row, tag_row), axis=0)
      for token_row, tag_row in zip(token_counts, tag_counts)
  ]
concat_features = getToeknTags(docs_new)
concat_features[0].shape

(35815,)

In [None]:
# Fit on the combined token + tag features before predicting: the classifier left
# over from the previous cells was trained on token counts only, so its feature
# count does not match concat_features.
clf = MultinomialNB().fit(X_concat, twenty_train.target)
predicted = clf.predict(concat_features)

# Show each sample sentence next to its predicted newsgroup.
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category]))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics
'parallel computing' => comp.graphics
'compoud discovery' => sci.med


In [None]:
# Naive Bayes on the combined token + tag features.
clf = MultinomialNB()
clf.fit(X_concat, twenty_train.target)

# Build the same combined features for the held-out split and score it.
twenty_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)
test_concat = getToeknTags(twenty_test.data)
predicted = clf.predict(test_concat)  # make prediction
print("Accuracy:", (predicted == twenty_test.target).mean())  # evaluation

Accuracy: 0.9254327563249002


In [None]:
from sklearn.linear_model import SGDClassifier, LogisticRegression

# Hinge-loss linear model trained with SGD on the combined token + tag features.
clf = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=5,
    tol=None,
)
clf.fit(X_concat, twenty_train.target)
predicted = clf.predict(test_concat)  # make prediction
print("Accuracy:", (predicted == twenty_test.target).mean())  # evaluation

Accuracy: 0.7776298268974701


In [None]:
# Logistic regression on the combined token + tag features. The saved run stopped
# at the default iteration limit (ConvergenceWarning), so the budget is raised;
# increase it further or scale the features if the warning persists.
clf = LogisticRegression(random_state=42, max_iter=1000).fit(X_concat, twenty_train.target)
predicted = clf.predict(test_concat) # make prediction
print("Accuracy:",np.mean(predicted == twenty_test.target)) # evaluation

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy: 0.874167776298269


# Building a pipeline
Combine the tokenization step and the model into a single pipeline.

This makes later hyperparameter tuning easier.


In [None]:
from sklearn.pipeline import Pipeline

# Two-stage pipeline: strings -> counts -> Naive Bayes.
pipeline_steps = [
    ('vect', CountVectorizer()),  # tokenization
    ('clf', MultinomialNB()),     # modelling
]
text_clf = Pipeline(pipeline_steps)

In [None]:
# The pipeline begins with CountVectorizer, so it must be fed strings. Train on
# the tag strings produced by getTags - the same representation the evaluation
# cell builds for the test split - rather than the numeric X_concat rows.
text_clf.fit(getTags(twenty_train.data), twenty_train.target) # training

# Evaluating performance

In [None]:
# Score the pipeline on the held-out split, represented as POS-tag strings.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
test_tags = getTags(twenty_test.data)
predicted = text_clf.predict(test_tags)  # make prediction
print("Accuracy:", (predicted == twenty_test.target).mean())  # evaluation

Accuracy: 0.36950732356857524


## Modelling

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier, LogisticRegression

# Tag strings for the training split, built the same way as test_tags.
# (X_tags was referenced below without being defined anywhere in the notebook.)
X_tags = getTags(twenty_train.data)

# Same pipeline shape as before, with logistic regression as the model.
text_clf_lr = Pipeline([
    ('vect', CountVectorizer()),
    # ('clf', SGDClassifier(loss='hinge', penalty='l2',
    #                       alpha=1e-3, random_state=42,
    #                       max_iter=5, tol=None)),
     ('clf', LogisticRegression(random_state=42))
])

text_clf_lr.fit(X_tags, twenty_train.target)

predicted = text_clf_lr.predict(test_tags)
print("Accuracy:",np.mean(predicted == twenty_test.target))

In [None]:
from sklearn import metrics

# Per-class precision / recall / F1 for the most recent predictions.
report = metrics.classification_report(
    twenty_test.target,
    predicted,
    target_names=twenty_test.target_names,
)
print(report)

                        precision    recall  f1-score   support

           alt.atheism       0.38      0.27      0.32       319
         comp.graphics       0.51      0.69      0.59       389
               sci.med       0.32      0.61      0.42       396
soc.religion.christian       0.69      0.02      0.04       398

              accuracy                           0.40      1502
             macro avg       0.48      0.40      0.34      1502
          weighted avg       0.48      0.40      0.34      1502



In [None]:
# row: actual class; col: predicted class
confusion = metrics.confusion_matrix(twenty_test.target, predicted)
pd.DataFrame(
    confusion,
    index=twenty_test.target_names,
    columns=twenty_test.target_names,
)

Unnamed: 0,alt.atheism,comp.graphics,sci.med,soc.religion.christian
alt.atheism,98,53,54,114
comp.graphics,13,255,78,43
sci.med,34,122,148,92
soc.religion.christian,36,32,49,281
