In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Pull in the helper used to install a global warnings filter.
from warnings import simplefilter
# Ignore every FutureWarning raised from this point on (this runs before the
# scikit-learn imports below); other warning categories are left untouched.
simplefilter(action='ignore', category=FutureWarning)

import logging
from numpy import random
#import gensim
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup

%matplotlib inline

In [2]:
# Load the pre-processed corpus from disk.
data = pd.read_csv("preProcessedData.csv")

# Training set: a random 75% sample of the rows; the fixed seed keeps the
# split identical from run to run.
train = data.sample(random_state=99, frac=0.75)

# Test set: every row whose index label was NOT drawn into the training sample.
in_train = data.index.isin(train.index)
test = data.loc[~in_train, :]

In [3]:
# Build TF-IDF features from the training text (manual, step-by-step version).
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Cast the text column to unicode strings before vectorizing
# (presumably so non-string cells such as NaN do not break tokenization).
train_text = train['Text'].values.astype('U')

# Step 1: learn the vocabulary on the training text and count term occurrences.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_text)

# Step 2: re-weight the raw counts with TF-IDF.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Displayed as the cell output: shape of the resulting feature matrix.
X_train_tfidf.shape

(4995, 9743)

In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# End-to-end classifier: raw text -> token counts -> TF-IDF weights -> linear SVM.
# LinearSVC's solver uses a random number generator while fitting, so the seed
# is pinned (same value as the train/test split) to make the score repeatable.
text_clf_lsvc = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf-lsvc', LinearSVC(random_state=99))])
# Text and labels are both cast to unicode strings before fitting.
text_clf_lsvc = text_clf_lsvc.fit(train['Text'].values.astype('U'), train['Label'].values.astype('U'))

# Predict the response for the test dataset
predicted_lsvc = text_clf_lsvc.predict(test['Text'].values.astype('U'))
# Cell output: fraction of test rows whose predicted label equals the true label.
np.mean(predicted_lsvc == test['Label'].values.astype('U'))

0.8492492492492493

In [5]:
%%time
tags = data.Label.unique()
from sklearn.metrics import classification_report

print('accuracy %s' % accuracy_score(predicted_lsvc, test.Label))
print(classification_report(test.Label, predicted_lsvc,target_names=tags))

accuracy 0.8492492492492493
               precision    recall  f1-score   support

   bangladesh       0.81      0.82      0.81       185
      opinion       0.87      0.82      0.84       206
      economy       0.85      0.87      0.86       175
       sports       0.87      0.88      0.87       174
entertainment       0.78      0.82      0.80       179
   technology       0.88      0.85      0.87       194
international       0.81      0.80      0.81       183
   life-style       0.93      0.96      0.95       180
    education       0.86      0.83      0.84       189

     accuracy                           0.85      1665
    macro avg       0.85      0.85      0.85      1665
 weighted avg       0.85      0.85      0.85      1665

Wall time: 117 ms


In [7]:
from sklearn.model_selection import GridSearchCV

# Search space: unigrams vs. unigrams+bigrams for the vectorizer step, and
# TF-IDF with or without the inverse-document-frequency re-weighting.
parameters_lsvc = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False)}

# Cross-validated grid search over the pipeline, using all available cores.
gs_clf_lsvc= GridSearchCV(text_clf_lsvc, parameters_lsvc, n_jobs=-1)
gs_clf_lsvc = gs_clf_lsvc.fit(train['Text'].values.astype('U'), train['Label'].values.astype('U'))

# A notebook cell only displays its final expression, so the search results
# must be printed explicitly or they are silently discarded.
print('best CV score:', gs_clf_lsvc.best_score_)
print('best params:', gs_clf_lsvc.best_params_)

# NOTE: this rebinds predicted_lsvc, so the cells below evaluate the tuned
# model rather than the baseline pipeline.
predicted_lsvc = gs_clf_lsvc.predict(test['Text'].values.astype('U'))
# Cell output: fraction of test rows whose predicted label equals the true label.
np.mean(predicted_lsvc == test['Label'].values.astype('U'))

0.8786786786786787

In [8]:
%%time
tags = data.Label.unique()
from sklearn.metrics import classification_report

print('accuracy %s' % accuracy_score(predicted_lsvc, test.Label))
print(classification_report(test.Label, predicted_lsvc,target_names=tags))

accuracy 0.8786786786786787
               precision    recall  f1-score   support

   bangladesh       0.83      0.83      0.83       185
      opinion       0.92      0.88      0.90       206
      economy       0.91      0.90      0.90       175
       sports       0.88      0.94      0.91       174
entertainment       0.86      0.83      0.84       179
   technology       0.88      0.86      0.87       194
international       0.81      0.83      0.82       183
   life-style       0.96      0.97      0.96       180
    education       0.85      0.87      0.86       189

     accuracy                           0.88      1665
    macro avg       0.88      0.88      0.88      1665
 weighted avg       0.88      0.88      0.88      1665

Wall time: 145 ms


In [21]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the current predictions against the true test labels.
# Per the scikit-learn convention, rows are true classes and columns are
# predicted classes; with no explicit `labels`, classes are in sorted order.
conf_mat = confusion_matrix(y_true=test['Label'], y_pred=predicted_lsvc)
conf_mat

array([[154,   1,   4,   3,   5,   5,   9,   2,   2],
       [  7, 182,   2,   1,   5,   1,   4,   0,   4],
       [  1,   2, 157,   1,   0,   4,   1,   0,   9],
       [  3,   0,   0, 164,   5,   1,   0,   0,   1],
       [  3,   2,   0,   4, 149,   3,  12,   1,   5],
       [  2,   1,   2,   6,   1, 166,  10,   1,   5],
       [ 12,   4,   4,   1,   3,   3, 152,   2,   2],
       [  1,   0,   0,   3,   2,   0,   0, 174,   0],
       [  2,   6,   3,   3,   4,   5,   0,   1, 165]], dtype=int64)