In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Bring in the warnings filter helper from the standard library.
from warnings import simplefilter
# Ignore every FutureWarning so that such messages do not appear in the cell outputs.
simplefilter(action='ignore', category=FutureWarning)

import logging
from numpy import random
#import gensim
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup

%matplotlib inline

In [2]:
# Load the preprocessed dataset; later cells read its 'Text' and 'Label' columns.
data = pd.read_csv(filepath_or_buffer="preProcessedData.csv")

In [5]:
# Hold-out split: draw a reproducible random sample of 75% of the rows for
# training (fixed random_state), then keep every row whose index was not
# drawn as the test set.
train = data.sample(frac=0.75, random_state=99)
in_train = data.index.isin(train.index)
test = data.loc[~in_train, :]

In [6]:
# Bag-of-words features: fit the vocabulary on the training text and turn each
# training document into a row of token counts.
from sklearn.feature_extraction.text import CountVectorizer
# Cast to unicode strings before vectorizing
# (presumably guards against non-string entries such as NaN -- TODO confirm).
train_docs = train['Text'].values.astype('U')
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_docs)
# Display (n_documents, vocabulary_size).
X_train_counts.shape

(4995, 9743)

In [7]:
# Re-weight the raw token counts with TF-IDF; the IDF weights are learned from
# the training counts only.
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit(X_train_counts).transform(X_train_counts)
# Same shape as the count matrix: one row per document, one column per term.
X_train_tfidf.shape

(4995, 9743)

In [8]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF features of the
# training rows, using the 'Label' column as the target.
from sklearn.naive_bayes import MultinomialNB
nb_model = MultinomialNB()
clf = nb_model.fit(X_train_tfidf, train['Label'])

In [9]:
# The vectorize -> TF-IDF -> classify steps above can be chained in a single
# Pipeline object, so one fit/predict call runs all three stages.
# The step names 'vect', 'tfidf' and 'clf' are arbitrary labels.
# 'text_clf' is the Naive Bayes model used from here on.
from sklearn.pipeline import Pipeline

nb_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]

# fit() returns the fitted pipeline itself, so it can be bound directly.
text_clf = Pipeline(nb_steps).fit(train['Text'].values.astype('U'), train.Label)

In [10]:
# Accuracy of the Naive Bayes pipeline on the held-out rows: the fraction of
# test documents whose predicted label equals the true label.
predicted = text_clf.predict(test['Text'].values.astype('U'))
(predicted == test.Label).mean()

0.7387387387387387

In [12]:
%%time
tags = data.Label.unique()
from sklearn.metrics import classification_report

print('accuracy %s' % accuracy_score(predicted, test.Label))
print(classification_report(test.Label, predicted,target_names=tags))

accuracy 0.7387387387387387
               precision    recall  f1-score   support

   bangladesh       0.82      0.71      0.76       185
      opinion       0.89      0.66      0.76       206
      economy       0.88      0.61      0.72       175
       sports       0.83      0.87      0.85       174
entertainment       0.84      0.60      0.70       179
   technology       0.88      0.66      0.76       194
international       0.39      0.92      0.55       183
   life-style       0.94      0.91      0.92       180
    education       0.84      0.71      0.77       189

    micro avg       0.74      0.74      0.74      1665
    macro avg       0.81      0.74      0.75      1665
 weighted avg       0.81      0.74      0.75      1665

Wall time: 16.9 ms


In [13]:
# Linear SVM: the same vectorizer and TF-IDF stages, followed by an
# SGDClassifier with hinge loss and an L2 penalty. random_state pins the
# shuffling so the run is repeatable.
from sklearn.linear_model import SGDClassifier

svm_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=15, random_state=42)),
]

# Train on the training rows, then score on the held-out rows; text and
# labels are both cast to unicode strings before use.
text_clf_svm = Pipeline(svm_steps).fit(train['Text'].values.astype('U'), train['Label'].values.astype('U'))
predicted_svm = text_clf_svm.predict(test['Text'].values.astype('U'))
# Fraction of test documents classified correctly.
(predicted_svm == test['Label'].values.astype('U')).mean()

0.8174174174174175

In [14]:
%%time
print('accuracy %s' % accuracy_score(predicted_svm, test.Label))
print(classification_report(test.Label, predicted_svm,target_names=tags))

accuracy 0.8174174174174175
               precision    recall  f1-score   support

   bangladesh       0.80      0.76      0.78       185
      opinion       0.88      0.83      0.85       206
      economy       0.81      0.84      0.82       175
       sports       0.83      0.91      0.87       174
entertainment       0.78      0.73      0.76       179
   technology       0.82      0.82      0.82       194
international       0.76      0.69      0.72       183
   life-style       0.86      0.94      0.90       180
    education       0.81      0.84      0.82       189

    micro avg       0.82      0.82      0.82      1665
    macro avg       0.82      0.82      0.82      1665
 weighted avg       0.82      0.82      0.82      1665

Wall time: 14.4 ms


In [15]:
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier

# Decision tree on the same vectorizer + TF-IDF features, limited to depth 20.
# random_state is pinned because the tree's split search involves randomness
# (e.g. breaking ties between equally good splits); without a seed the fitted
# tree, and therefore the reported score, can change from run to run.
text_clf_dt = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                         ('clf-dt', DecisionTreeClassifier(criterion='gini',splitter='best',
                                                           max_depth=20, random_state=42))])

# Train the decision tree pipeline on the training rows
# (text and labels cast to unicode strings).
text_clf_dt = text_clf_dt.fit(train['Text'].values.astype('U'), train['Label'].values.astype('U'))

# Predict labels for the held-out rows and show the fraction that match.
predicted_dt = text_clf_dt.predict(test['Text'].values.astype('U'))
np.mean(predicted_dt == test['Label'].values.astype('U'))

0.5021021021021022

In [16]:
%%time
from sklearn.metrics import classification_report

print('accuracy %s' % accuracy_score(predicted_dt, test.Label))
print(classification_report(test.Label, predicted_dt,target_names=tags))

accuracy 0.5021021021021022
               precision    recall  f1-score   support

   bangladesh       0.51      0.44      0.47       185
      opinion       0.51      0.44      0.48       206
      economy       0.61      0.55      0.58       175
       sports       0.32      0.70      0.44       174
entertainment       0.43      0.40      0.42       179
   technology       0.63      0.38      0.48       194
international       0.43      0.36      0.39       183
   life-style       0.70      0.74      0.72       180
    education       0.60      0.53      0.56       189

    micro avg       0.50      0.50      0.50      1665
    macro avg       0.53      0.51      0.50      1665
 weighted avg       0.53      0.50      0.50      1665

Wall time: 13.9 ms


In [17]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the same vectorizer + TF-IDF features.
# C is the inverse regularization strength, so C=1e5 leaves the model
# almost unregularized.
lr_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-lr', LogisticRegression(n_jobs=1, C=1e5)),
]
text_clf_lr = Pipeline(lr_steps).fit(train['Text'].values.astype('U'), train['Label'].values.astype('U'))

# Predict labels for the held-out rows and show the fraction that match.
predicted_lr = text_clf_lr.predict(test['Text'].values.astype('U'))
(predicted_lr == test['Label'].values.astype('U')).mean()

0.8252252252252252

In [18]:
%%time
from sklearn.metrics import classification_report

print('accuracy %s' % accuracy_score(predicted_lr, test.Label))
print(classification_report(test.Label, predicted_lr,target_names=tags))

accuracy 0.8252252252252252
               precision    recall  f1-score   support

   bangladesh       0.74      0.79      0.76       185
      opinion       0.84      0.79      0.81       206
      economy       0.81      0.85      0.83       175
       sports       0.86      0.86      0.86       174
entertainment       0.76      0.80      0.78       179
   technology       0.89      0.81      0.85       194
international       0.77      0.78      0.77       183
   life-style       0.93      0.95      0.94       180
    education       0.84      0.81      0.83       189

    micro avg       0.83      0.83      0.83      1665
    macro avg       0.83      0.83      0.83      1665
 weighted avg       0.83      0.83      0.83      1665

Wall time: 16.3 ms


In [33]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours (k = 6) on the same vectorizer + TF-IDF features.
knn_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-knn', KNeighborsClassifier(n_neighbors=6)),
]
text_clf_knn = Pipeline(knn_steps).fit(train['Text'].values.astype('U'), train['Label'].values.astype('U'))

# Predict labels for the held-out rows and show the fraction that match.
predicted_knn = text_clf_knn.predict(test['Text'].values.astype('U'))
(predicted_knn == test['Label'].values.astype('U')).mean()

0.7273273273273273

In [34]:
%%time
from sklearn.metrics import classification_report

print('accuracy %s' % accuracy_score(predicted_knn, test.Label))
print(classification_report(test.Label, predicted_knn,target_names=tags))

accuracy 0.7273273273273273
               precision    recall  f1-score   support

   bangladesh       0.65      0.70      0.67       185
      opinion       0.73      0.75      0.74       206
      economy       0.74      0.80      0.77       175
       sports       0.76      0.80      0.78       174
entertainment       0.70      0.59      0.64       179
   technology       0.76      0.77      0.77       194
international       0.52      0.61      0.56       183
   life-style       0.86      0.82      0.84       180
    education       0.88      0.71      0.79       189

    micro avg       0.73      0.73      0.73      1665
    macro avg       0.73      0.73      0.73      1665
 weighted avg       0.73      0.73      0.73      1665

Wall time: 14.4 ms
