In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

import logging
from numpy import random
#import gensim
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup

%matplotlib inline

In [2]:
# Load the pre-processed corpus and split it into train / hold-out sets.
data = pd.read_csv("preProcessedData.csv")
# Reproducible random draw of 75% of the rows for training ...
train = data.sample(frac=0.75, random_state=99)
# ... and every row whose index label was not drawn becomes the test set.
test = data.drop(index=train.index)

In [3]:
# Bag-of-words features: learn the vocabulary from the training texts and
# build the document-term count matrix in a single pass.
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
# Cast the column to unicode so every cell reaches the vectorizer as a string.
X_train_counts = count_vect.fit_transform(train.Text.values.astype(str))
# Shape of the resulting count matrix.
X_train_counts.shape

(4995, 9743)

In [4]:
# Re-weight the raw term counts with TF-IDF.
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
# Fit the transformer on the training counts, then apply it to the same matrix.
tfidf_transformer.fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_train_tfidf.shape

(4995, 9743)

In [6]:
# Machine Learning
# Fit a standalone Multinomial Naive Bayes (NB) model on the TF-IDF matrix.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train_tfidf, train.Label)
# The same vectorise -> re-weight -> classify chain can be written as one
# Pipeline, which is less code than the manual steps above.
# The step names 'vect', 'tfidf' and 'clf' are arbitrary, but they are reused
# later as prefixes when tuning hyper-parameters.
# 'text_clf' is the model used from here on.
from sklearn.pipeline import Pipeline

text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

text_clf.fit(train.Text.values.astype(str), train.Label)
# Hold-out accuracy of the NB pipeline: share of test rows predicted correctly.
predicted = text_clf.predict(test.Text.values.astype(str))
(predicted == test.Label).mean()

0.7387387387387387

In [8]:
# Grid Search
# Hyper-parameter grid for the NB pipeline. Every key has the form
# '<pipeline step name>__<parameter>', using the (arbitrary) step names chosen
# when the pipeline was built.
# E.g. 'vect__ngram_range' compares unigrams only against unigrams + bigrams
# and keeps whichever scores best under cross-validation.

from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False), 'clf__alpha': (1e-2, 1e-3)}
# Build the grid search from the pipeline and the grid; n_jobs=-1 runs the
# candidate fits on all available CPU cores.

gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(train['Text'].values.astype('U'), train['Label'].values.astype('U'))

# A notebook cell only echoes its LAST expression, so the best mean
# cross-validated score and the winning parameters are printed explicitly
# (as bare expressions in the middle of the cell they were silently discarded).
print('best CV score: %s' % gs_clf.best_score_)
print('best params: %s' % gs_clf.best_params_)

# Hold-out accuracy of the tuned model.
predicted = gs_clf.predict(test['Text'].values.astype('U'))
np.mean(predicted == test.Label)

0.8486486486486486

In [9]:
%%time

print('accuracy %s' % accuracy_score(predicted, test.Label))
print(classification_report(test.Label, predicted,target_names=tags))

accuracy 0.8486486486486486
               precision    recall  f1-score   support

   bangladesh       0.85      0.79      0.82       185
      opinion       0.87      0.85      0.86       206
      economy       0.92      0.88      0.90       175
       sports       0.86      0.92      0.89       174
entertainment       0.85      0.78      0.81       179
   technology       0.88      0.77      0.82       194
international       0.63      0.87      0.73       183
   life-style       0.97      0.95      0.96       180
    education       0.89      0.84      0.86       189

    micro avg       0.85      0.85      0.85      1665
    macro avg       0.86      0.85      0.85      1665
 weighted avg       0.86      0.85      0.85      1665

Wall time: 99.6 ms


In [11]:
# Support Vector Machine (linear SVM trained with SGD on the hinge loss)
# and its accuracy on the hold-out set.

from sklearn.linear_model import SGDClassifier
text_clf_svm = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                              max_iter=15, random_state=42)),
])

# Texts and labels are both cast to unicode before fitting.
text_clf_svm.fit(train.Text.values.astype(str), train.Label.values.astype(str))
predicted_svm = text_clf_svm.predict(test.Text.values.astype(str))
# Fraction of test rows whose predicted label equals the true label.
(predicted_svm == test.Label.values.astype(str)).mean()

0.8174174174174175

In [13]:
# Grid search for the SGD-based SVM pipeline, mirroring the NB search.
# Keys are '<pipeline step name>__<parameter>'; the classifier step of this
# pipeline is named 'clf-svm'.
from sklearn.model_selection import GridSearchCV
parameters_svm = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False),'clf-svm__alpha': (1e-2, 1e-3)}

# n_jobs=-1 runs the candidate fits on all available CPU cores.
gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(train['Text'].values.astype('U'), train['Label'].values.astype('U'))

# A notebook cell only echoes its LAST expression, so print the best mean
# cross-validated score and the winning parameters explicitly instead of
# leaving them as bare expressions that are silently discarded.
print('best CV score: %s' % gs_clf_svm.best_score_)
print('best params: %s' % gs_clf_svm.best_params_)

# Hold-out accuracy of the tuned model.
predicted_svm = gs_clf_svm.predict(test['Text'].values.astype('U'))
np.mean(predicted_svm == test['Label'].values.astype('U'))

0.8306306306306306

In [14]:
%%time
from sklearn.metrics import classification_report

print('accuracy %s' % accuracy_score(predicted_svm, test.Label))
print(classification_report(test.Label, predicted_svm,target_names=tags))

accuracy 0.8306306306306306
               precision    recall  f1-score   support

   bangladesh       0.80      0.76      0.78       185
      opinion       0.88      0.84      0.86       206
      economy       0.81      0.87      0.84       175
       sports       0.82      0.94      0.88       174
entertainment       0.85      0.74      0.79       179
   technology       0.85      0.80      0.83       194
international       0.76      0.70      0.73       183
   life-style       0.86      0.97      0.91       180
    education       0.85      0.86      0.85       189

    micro avg       0.83      0.83      0.83      1665
    macro avg       0.83      0.83      0.83      1665
 weighted avg       0.83      0.83      0.83      1665

Wall time: 190 ms
Parser   : 224 ms


In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
# LinearSVC pipeline: count features -> TF-IDF weighting -> linear SVM.
text_clf_lsvc = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf-lsvc', LinearSVC())])
# Fit the pipeline once with its default settings.
text_clf_lsvc = text_clf_lsvc.fit(train['Text'].values.astype('U'), train['Label'].values.astype('U'))

# Only the feature-extraction steps are tuned here; keys are
# '<pipeline step name>__<parameter>'.
parameters_lsvc = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False)}

# n_jobs=-1 runs the candidate fits on all available CPU cores.
gs_clf_lsvc= GridSearchCV(text_clf_lsvc, parameters_lsvc, n_jobs=-1)
gs_clf_lsvc = gs_clf_lsvc.fit(train['Text'].values.astype('U'), train['Label'].values.astype('U'))

# A notebook cell only echoes its LAST expression, so print the best mean
# cross-validated score and the winning parameters explicitly instead of
# leaving them as bare expressions that are silently discarded.
print('best CV score: %s' % gs_clf_lsvc.best_score_)
print('best params: %s' % gs_clf_lsvc.best_params_)

# Hold-out accuracy of the tuned model.
predicted_lsvc = gs_clf_lsvc.predict(test['Text'].values.astype('U'))
np.mean(predicted_lsvc == test['Label'].values.astype('U'))

0.8786786786786787

In [11]:
%%time
tags = data.Label.unique()
from sklearn.metrics import classification_report

print('accuracy %s' % accuracy_score(predicted_lsvc, test.Label))
print(classification_report(test.Label, predicted_lsvc,target_names=tags))

accuracy 0.8786786786786787
               precision    recall  f1-score   support

   bangladesh       0.83      0.83      0.83       185
      opinion       0.92      0.88      0.90       206
      economy       0.91      0.90      0.90       175
       sports       0.88      0.94      0.91       174
entertainment       0.86      0.83      0.84       179
   technology       0.88      0.86      0.87       194
international       0.81      0.83      0.82       183
   life-style       0.96      0.97      0.96       180
    education       0.85      0.87      0.86       189

    micro avg       0.88      0.88      0.88      1665
    macro avg       0.88      0.88      0.88      1665
 weighted avg       0.88      0.88      0.88      1665

Wall time: 251 ms
