# Load News Dataset

https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [2]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_20newsgroups

# The four newsgroups used throughout this notebook.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Training split, shuffled with a fixed seed so the document order is reproducible.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [4]:
import pandas as pd

# Typical workflow for a text-classification task:
#   1. Import the data into a DataFrame, e.g. pd.read_csv(PATH/file.csv)
#   2. Tokenize the text, extract features and convert them into vectors
#   3. Modeling: train models on the training set (select a model, tune its parameters)
#   4. Evaluation on the dev set (metrics calculation, error analysis)
#   5. Prediction on the test set (store the results in the given format and submit them)

# One row per training post: the raw message text and its integer class id.
df = pd.DataFrame(
    data={
        'text': twenty_train.data,
        'class': twenty_train.target,
    }
)
df.head()

Unnamed: 0,text,class
0,From: sd345@city.ac.uk (Michael Collier)\nSubj...,1
1,From: ani@ms.uky.edu (Aniruddha B. Deglurkar)\...,1
2,From: djohnson@cs.ucsd.edu (Darin Johnson)\nSu...,3
3,From: s0612596@let.rug.nl (M.M. Zwart)\nSubjec...,3
4,From: stanly@grok11.columbiasc.ncr.com (stanly...,3


In [None]:
# Raw text of the first training post (row label 0).
df.loc[0, 'text']

'From: sd345@city.ac.uk (Michael Collier)\nSubject: Converting images to HP LaserJet III?\nNntp-Posting-Host: hampton\nOrganization: The City University\nLines: 14\n\nDoes anyone know of a good way (standard PC application/PD utility) to\nconvert tif/img/tga files into LaserJet III format.  We would also like to\ndo the same, converting to HPGL (HP plotter) files.\n\nPlease email any response.\n\nIs this the correct group?\n\nThanks in advance.  Michael.\n-- \nMichael Collier (Programmer)                 The Computer Unit,\nEmail: M.P.Collier@uk.ac.city                The City University,\nTel: 071 477-8000 x3769                      London,\nFax: 071 477-8565                            EC1V 0HB.\n'

In [None]:
# Integer class id of the first training post (row label 0).
df.loc[0, 'class']

1

In [None]:
# Class names of the loaded newsgroups; a class id from `twenty_train.target`
# is used as an index into this list.
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [None]:
# Number of documents and number of source files in the training split.
tuple(map(len, (twenty_train.data, twenty_train.filenames)))

(2257, 2257)

In [None]:
# Path of the file the first training document was loaded from.
twenty_train.filenames[0]

'/root/scikit_learn_data/20news_home/20news-bydate-train/comp.graphics/38440'

In [None]:
# twenty_train.target[0] is the class id of the first document;
# use it to look up the matching class name.
first_class_id = twenty_train.target[0]
twenty_train.target_names[first_class_id]

'comp.graphics'

In [None]:
# Show only the first three lines of the first post.
print(*twenty_train.data[0].split("\n")[:3], sep="\n")

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton


In [None]:
# NOTE: a stray `nltk.pos_tag()` call used to live in this cell. No cell above imports
# `nltk`, and `pos_tag` was called without any tokens, so the cell raised on
# "Restart Kernel -> Run All". Nothing below depends on it, so the call was removed.

In [None]:
 # Class ids of the first ten training documents.
 twenty_train.target[:10]

array([1, 1, 3, 3, 3, 3, 3, 2, 2, 2])

In [None]:
# Translate the first ten class ids into class names, then print one per line.
first_ten_names = [
    twenty_train.target_names[class_id]
    for class_id in twenty_train.target[:10]
]
for class_name in first_ten_names:
    print(class_name)

comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med


# Extracting features from text

## Tokenizing

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Word-level tokenization using unigrams only.
count_vect = CountVectorizer(
    analyzer='word',
    ngram_range=(1, 1),
)

# Learn the vocabulary and build the document-term count matrix in one pass.
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape

(2257, 35788)

In [None]:
# Column index assigned to each word in the learned vocabulary (None if absent).
tuple(count_vect.vocabulary_.get(word) for word in ('hello', 'world'))

(16413, 35275)

In [None]:
# Map a column index of the count matrix back to its vocabulary term.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# `get_feature_names_out()`; fall back to the old method on installs that predate it.
# The index is looked up from the vocabulary rather than hard-coded (it was 16413 for this
# corpus), so the cell still works if the vocabulary changes.
hello_idx = count_vect.vocabulary_['hello']
feature_names = (
    count_vect.get_feature_names_out()
    if hasattr(count_vect, 'get_feature_names_out')
    else count_vect.get_feature_names()
)
str(feature_names[hello_idx])

'hello'

In [None]:
# Dense view of the document-term count matrix: one row per document, one column per
# vocabulary term.
# NOTE(review): .toarray() materializes the full dense matrix; acceptable at this size,
# but memory-heavy for larger corpora.
X_train_counts.toarray() # each number is a word occurrence value

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# Word occurrences in news[0]: how many vocabulary terms appear 0, 1, 2, ... times.
# Slice the first row out of the sparse matrix *before* densifying; calling .toarray()
# on the whole matrix would allocate every row just to read one of them.
pd.Series(X_train_counts[:1].toarray()[0]).value_counts()

0    35715
1       57
2       11
4        2
3        2
5        1
dtype: int64

## From occurrences to frequencies
Term Frequency (**TF**): $TF(x) = N_x / N$, i.e. the number of occurrences of word $x$ in a document divided by the total number of words in that document.

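Inverse Document Frequency (**IDF**): $IDF(x) = \log(N_{docs} / N_{docs\ containing\ x})$, where $N_{docs}$ is the total number of documents in the corpus.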

TF-IDF = TF*IDF

Term Frequency times Inverse Document Frequency (**TF-IDF**): to downscale weights for words that occur in many documents in the corpus and are therefore less informative than those that occur only in a smaller portion of the corpus

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

# use_idf=False: keep only the (normalized) term frequencies, no IDF weighting.
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(X_train_counts)

X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(2257, 35788)

In [None]:
# TF values in news[0] (with l2 normalization).
# Slice the first row out of the sparse matrix *before* densifying; calling .toarray()
# on the whole matrix would allocate every row just to read one of them.
pd.Series(X_train_tf[:1].toarray()[0]).value_counts()

0.000000    35715
0.075378       57
0.150756       11
0.226134        2
0.301511        2
0.376889        1
dtype: int64

In [None]:
# Default settings: term frequencies re-weighted by inverse document frequency.
tfidf_transformer = TfidfTransformer()

# Learn the IDF weights and apply them to the count matrix in a single call.
X_train_tfidf = tfidf_transformer.fit_transform(
    X_train_counts,
)
X_train_tfidf.shape

(2257, 35788)

In [None]:
# TF-IDF values in news[0] (with l2 normalization).
# Slice the first row out of the sparse matrix *before* densifying; calling .toarray()
# on the whole matrix would allocate every row just to read one of them.
pd.Series(X_train_tfidf[:1].toarray()[0]).value_counts()

0.000000    35715
0.134871        5
0.246455        2
0.068661        2
0.016798        2
            ...  
0.256120        1
0.031270        1
0.031043        1
0.084135        1
0.107836        1
Length: 66, dtype: int64

# Training a classifier

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes trained on the TF-IDF features and the integer class ids.
clf = MultinomialNB()
# fit() returns the estimator itself, so rebinding keeps `clf` unchanged and the cell silent.
clf = clf.fit(X_train_tfidf, twenty_train.target)

In [None]:
# A few unseen snippets to classify.
docs_new = [
    'God is love',
    'OpenGL on the GPU is fast',
    'parallel computing',
    'compoud discovery',
]

# Reuse the already-fitted vectorizer and transformer: transform only, never refit.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

# Pair every snippet with the name of its predicted class.
predicted_names = [twenty_train.target_names[class_id] for class_id in predicted]
for doc, category_name in zip(docs_new, predicted_names):
    print('%r => %s' % (doc, category_name))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics
'parallel computing' => comp.graphics
'compoud discovery' => sci.med


# Building a pipeline
Combine the tokenization, feature-extraction and modelling steps into a single pipeline.

This makes later hyperparameter tuning easier.


In [None]:
from sklearn.pipeline import Pipeline

# vectorizer => transformer => classifier, chained into a single estimator
text_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer()),     # tokenization
        ('tfidf', TfidfTransformer()),   # feature extraction
        ('clf', MultinomialNB()),        # modelling
    ]
)

In [None]:
# Training: raw texts go in, the pipeline vectorizes them and fits the classifier.
text_clf.fit(X=twenty_train.data, y=twenty_train.target)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

# Evaluating performance

In [None]:
# Test split of the same four categories, shuffled with the same seed as the training split.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)

# Make predictions, then report the fraction that match the true class ids.
predicted = text_clf.predict(twenty_test.data)
print("Accuracy:", (predicted == twenty_test.target).mean())

Accuracy: 0.8348868175765646


## Modelling

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier, LogisticRegression

# NOTE: despite the `_svm` suffix, the active classifier is LogisticRegression.
# The alternative that was tried here and is currently disabled:
#   SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42,
#                 max_iter=5, tol=None)
text_clf_svm = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', LogisticRegression(random_state=42)),
    ]
)

text_clf_svm.fit(twenty_train.data, twenty_train.target)

# Fraction of test documents whose predicted class id matches the true one.
predicted = text_clf_svm.predict(twenty_test.data)
print("Accuracy:", (predicted == twenty_test.target).mean())

Accuracy: 0.8974700399467377


In [None]:
from sklearn import metrics

# Per-class precision / recall / F1 for the latest `predicted` labels.
print(
    metrics.classification_report(
        twenty_test.target,
        predicted,
        target_names=twenty_test.target_names,
    )
)

                        precision    recall  f1-score   support

           alt.atheism       0.95      0.80      0.87       319
         comp.graphics       0.87      0.98      0.92       389
               sci.med       0.94      0.89      0.91       396
soc.religion.christian       0.90      0.95      0.93       398

              accuracy                           0.91      1502
             macro avg       0.91      0.91      0.91      1502
          weighted avg       0.91      0.91      0.91      1502



In [None]:
# Confusion matrix labelled with class names.
# row: actual class; col: predicted class
pd.DataFrame(
    data=metrics.confusion_matrix(twenty_test.target, predicted),
    index=twenty_test.target_names,
    columns=twenty_test.target_names,
)

Unnamed: 0,alt.atheism,comp.graphics,sci.med,soc.religion.christian
alt.atheism,256,11,16,36
comp.graphics,4,380,3,2
sci.med,5,35,353,3
soc.religion.christian,5,11,4,378


# Parameter tuning using grid search 

In [None]:
from sklearn.model_selection import GridSearchCV

# Pipeline whose hyperparameters are searched below; the classifier is an
# SGD-trained model with hinge loss and an l2 penalty.
text_clf_gs = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        (
            'clf',
            SGDClassifier(
                loss='hinge',
                penalty='l2',
                random_state=42,
                max_iter=5,
                tol=None,
            ),
        ),
    ]
)

# Search space, keyed as '<step name>__<parameter name>'.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],   # unigrams only vs. unigrams + bigrams
    'tfidf__use_idf': (True, False),         # with / without IDF weighting
    'clf__alpha': (1e-2, 1e-3),              # alpha values tried for the SGD classifier
}

In [None]:
# 5-fold cross-validated grid search; n_jobs=-1 auto-detects and uses all CPU cores.
gs_clf = GridSearchCV(
    estimator=text_clf_gs,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
)

In [None]:
# Only train on the first 400 rows.
first_400 = slice(None, 400)
gs_clf = gs_clf.fit(twenty_train.data[first_400], twenty_train.target[first_400])

In [None]:
# predict() returns one class id per input document; unpack the single result.
(prediction,) = gs_clf.predict(['God is love'])
twenty_train.target_names[prediction]

'soc.religion.christian'

In [None]:
# Mean cross-validated score of the best parameter combination, i.e. scored on the
# held-out CV folds of the data passed to fit(), not an accuracy measured on the
# training set itself.
gs_clf.best_score_

In [None]:
# Print the winning value of every searched parameter, in alphabetical order.
best_params = gs_clf.best_params_
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, best_params[param_name]))

clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 1)


In [None]:
# Full cross-validation results; transposed so that each column is one parameter combination.
gs_res = pd.DataFrame(data=gs_clf.cv_results_)
gs_res.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7
mean_fit_time,0.129526,0.41807,0.126337,0.39256,0.127089,0.413828,0.124502,0.389259
std_fit_time,0.0103683,0.0219688,0.0085369,0.0185551,0.00813722,0.0179995,0.00646239,0.0179886
mean_score_time,0.0329668,0.0698018,0.0317683,0.0650624,0.0327766,0.0687783,0.0307102,0.0643095
std_score_time,0.00310792,0.00622383,0.00334893,0.00605791,0.003757,0.00614362,0.00288478,0.00564394
param_clf__alpha,0.01,0.01,0.01,0.01,0.001,0.001,0.001,0.001
param_tfidf__use_idf,True,True,False,False,True,True,False,False
param_vect__ngram_range,"(1, 1)","(1, 2)","(1, 1)","(1, 2)","(1, 1)","(1, 2)","(1, 1)","(1, 2)"
params,"{'clf__alpha': 0.01, 'tfidf__use_idf': True, '...","{'clf__alpha': 0.01, 'tfidf__use_idf': True, '...","{'clf__alpha': 0.01, 'tfidf__use_idf': False, ...","{'clf__alpha': 0.01, 'tfidf__use_idf': False, ...","{'clf__alpha': 0.001, 'tfidf__use_idf': True, ...","{'clf__alpha': 0.001, 'tfidf__use_idf': True, ...","{'clf__alpha': 0.001, 'tfidf__use_idf': False,...","{'clf__alpha': 0.001, 'tfidf__use_idf': False,..."
split0_test_score,0.9,0.8875,0.7,0.6875,0.9,0.9125,0.825,0.8
split1_test_score,0.8625,0.85,0.7125,0.7,0.9,0.8875,0.7625,0.8


In [None]:
# Pipeline configured with the best parameter combination found by the grid search.
# NOTE(review): with GridSearchCV's default refit=True this is presumably refit on all
# the data passed to fit() - confirm against the scikit-learn docs.
gs_clf.best_estimator_

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                ('clf',
                 SGDClassifier(alpha=0.001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                      