In [1]:
import numpy as np 
import pandas as pd
import csv
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.metrics import *
from helper import *

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
import pickle

import cufflinks

cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='pearl')

import warnings
warnings.filterwarnings("ignore")

Using TensorFlow backend.


In [2]:
data = pd.read_csv("Data/PUSHSHIFT-Preprocessed.csv")
data.head(5)

Unnamed: 0,Post ID,Title,URL,Body,Comments,Score,Comments Count,Flair
0,f4j4os,loudspeaker temple loud,www.reddit.com,complain loudspeaker nuisance early morning ne...,call police work post anonymously twitter dele...,0.000289,0.011955,AskIndia
1,f4jmon,life calling,www.reddit.com,life calling dear friend first post reddit spi...,tldr loosing losing everything else ok post r ...,0.000289,0.002391,AskIndia
2,f4jpyp,another wall go trump time india,www.reuters.com,,deewaar wahin banayenge,0.000289,0.000531,Politics
3,f4jqqi,rickshaw puller invite modi daughter marriage ...,www.theweek.in,,pr kya hai ji understand people invite famous ...,0.000289,0.001328,Non-Political
4,f4jtws,maharashtra set roll npr may 1 uddhav thackera...,news.abplive.com,,uddhav chya aaichi gand changed side form gove...,0.000289,0.001594,Politics


In [3]:
count = data.shape[0]

In [4]:
data.isna().sum()

Post ID               0
Title                73
URL                   0
Body              16102
Comments          12190
Score                 0
Comments Count        0
Flair                 0
dtype: int64

In [5]:
data["Title"].fillna(" ", inplace = True)
data["Body"].fillna(" ", inplace = True) 
data["Comments"].fillna(" ", inplace = True)

In [6]:
flairs = ["AskIndia", "Non-Political", "Scheduled", "Photography", "Science/Technology", "Politics", 
          "Business/Finance", "Policy/Economy", "Sports", "Food", "Coronavirus"]

Helper functions

In [7]:
def singleFeature(feature):

    X = data[feature].values
    Y = data.Flair
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state = 77, test_size = 0.2)

    return X_train, X_test, Y_train, Y_test

##

def dualFeature(feature1, feature2):

    X1 = data[feature1].values
    X2 = data[feature2].values
    Y = data.Flair

    X = []

    for ind in range(count):

        X.append(X1[ind] + ' ' + X2[ind])

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state = 77, test_size = 0.2)

    return X_train, X_test, Y_train, Y_test

##

def triFeature(feature1, feature2, feature3):

    X1 = data[feature1].values
    X2 = data[feature2].values
    X3 = data[feature3].values
    Y = data.Flair

    X = []

    for ind in range(count):

        X.append(X1[ind] + ' ' + X2[ind] + ' ' + X3[ind])

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state = 77, test_size = 0.2)

    return X_train, X_test, Y_train, Y_test

### Title

In [8]:
X_train, X_test, Y_train, Y_test = singleFeature("Title")

Naive Bayes

In [9]:
MNB = Pipeline([('vect', CountVectorizer(ngram_range = (1, 1))),
               ('tfidf', TfidfTransformer()),
               ('MNB', MultinomialNB(alpha = 1, fit_prior = False)),
              ])
MNB.fit(X_train, Y_train)

from sklearn.metrics import classification_report
Y_pred = MNB.predict(X_test)

print('Train accuracy '+ str(MNB.score(X_train, Y_train)))
print('Test accuracy %s' % accuracy_score(Y_pred, Y_test))
print(classification_report(Y_test, Y_pred, target_names = flairs))

Train accuracy 0.7111146013297733
Test accuracy 0.5360134003350083
                    precision    recall  f1-score   support

          AskIndia       0.45      0.44      0.44       701
     Non-Political       0.61      0.34      0.43       152
         Scheduled       0.57      0.82      0.67      1530
       Photography       0.67      0.27      0.38        45
Science/Technology       0.47      0.36      0.40      1101
          Politics       0.56      0.32      0.41       111
  Business/Finance       0.36      0.14      0.20       146
    Policy/Economy       0.59      0.56      0.58       750
            Sports       0.65      0.63      0.64        35
              Food       0.62      0.17      0.27       162
       Coronavirus       0.56      0.33      0.41        43

          accuracy                           0.54      4776
         macro avg       0.55      0.40      0.44      4776
      weighted avg       0.53      0.54      0.51      4776



Logistic Regression

In [10]:
LR = Pipeline([('vect', CountVectorizer(ngram_range = (1, 1))),
                ('tfidf', TfidfTransformer()),
                ('LR', LogisticRegression(solver = "liblinear", penalty = "l2")),
               ])
LR.fit(X_train, Y_train)

Y_pred = LR.predict(X_test)

print('Train accuracy '+ str(LR.score(X_train, Y_train)))
print('Test accuracy %s' % accuracy_score(Y_pred, Y_test))
print(classification_report(Y_test, Y_pred, target_names = flairs))

Train accuracy 0.6806973456887074
Test accuracy 0.535175879396985
                    precision    recall  f1-score   support

          AskIndia       0.49      0.41      0.45       701
     Non-Political       0.67      0.22      0.33       152
         Scheduled       0.59      0.79      0.67      1530
       Photography       0.80      0.09      0.16        45
Science/Technology       0.41      0.49      0.44      1101
          Politics       1.00      0.17      0.29       111
  Business/Finance       0.46      0.08      0.14       146
    Policy/Economy       0.61      0.55      0.58       750
            Sports       0.88      0.63      0.73        35
              Food       0.85      0.10      0.19       162
       Coronavirus       1.00      0.19      0.31        43

          accuracy                           0.54      4776
         macro avg       0.71      0.34      0.39      4776
      weighted avg       0.56      0.54      0.51      4776



Random Forest

In [11]:
RF = Pipeline([
    ('vect', CountVectorizer(ngram_range = (1, 1))),
    ('tfidf', TfidfTransformer()),
    ('RF', RandomForestClassifier(max_depth = 3000, min_samples_split = 200))
])

RF.fit(X_train, Y_train)

Y_pred = RF.predict(X_test)

print('Train accuracy '+ str(RF.score(X_train, Y_train)))
print('Test accuracy %s' % accuracy_score(Y_pred, Y_test))
print(classification_report(Y_test, Y_pred, target_names = flairs))

Train accuracy 0.772786765090833
Test accuracy 0.5339195979899497
                    precision    recall  f1-score   support

          AskIndia       0.47      0.36      0.41       701
     Non-Political       0.61      0.32      0.42       152
         Scheduled       0.59      0.80      0.68      1530
       Photography       0.67      0.22      0.33        45
Science/Technology       0.40      0.45      0.43      1101
          Politics       0.65      0.28      0.39       111
  Business/Finance       0.38      0.09      0.14       146
    Policy/Economy       0.60      0.55      0.57       750
            Sports       0.85      0.63      0.72        35
              Food       0.71      0.15      0.25       162
       Coronavirus       0.95      0.42      0.58        43

          accuracy                           0.53      4776
         macro avg       0.63      0.39      0.45      4776
      weighted avg       0.54      0.53      0.52      4776



### Comments

In [12]:
X_train, X_test, Y_train, Y_test = singleFeature("Comments")

Naive Bayes

In [13]:
MNB = Pipeline([('vect', CountVectorizer(ngram_range = (1, 1))),
               ('tfidf', TfidfTransformer()),
               ('MNB', MultinomialNB(alpha = 1, fit_prior = False)),
              ])
MNB.fit(X_train, Y_train)

from sklearn.metrics import classification_report
Y_pred = MNB.predict(X_test)

print('Train accuracy '+ str(MNB.score(X_train, Y_train)))
print('Test accuracy %s' % accuracy_score(Y_pred, Y_test))
print(classification_report(Y_test, Y_pred, target_names = flairs))

Train accuracy 0.3622323438563426
Test accuracy 0.2830820770519263
                    precision    recall  f1-score   support

          AskIndia       0.17      0.66      0.27       701
     Non-Political       0.00      0.00      0.00       152
         Scheduled       0.40      0.44      0.42      1530
       Photography       1.00      0.02      0.04        45
Science/Technology       0.41      0.06      0.10      1101
          Politics       0.00      0.00      0.00       111
  Business/Finance       0.00      0.00      0.00       146
    Policy/Economy       0.62      0.20      0.31       750
            Sports       0.00      0.00      0.00        35
              Food       0.00      0.00      0.00       162
       Coronavirus       0.00      0.00      0.00        43

          accuracy                           0.28      4776
         macro avg       0.24      0.13      0.10      4776
      weighted avg       0.35      0.28      0.25      4776



Logistic Regression

In [14]:
LR = Pipeline([('vect', CountVectorizer(ngram_range = (1, 1))),
                ('tfidf', TfidfTransformer()),
                ('LR', LogisticRegression(solver = "liblinear", penalty = "l2")),
               ])
LR.fit(X_train, Y_train)

Y_pred = LR.predict(X_test)

print('Train accuracy '+ str(LR.score(X_train, Y_train)))
print('Test accuracy %s' % accuracy_score(Y_pred, Y_test))
print(classification_report(Y_test, Y_pred, target_names = flairs))

Train accuracy 0.4991361708811057
Test accuracy 0.4020100502512563
                    precision    recall  f1-score   support

          AskIndia       0.50      0.24      0.32       701
     Non-Political       1.00      0.01      0.01       152
         Scheduled       0.38      0.89      0.53      1530
       Photography       0.00      0.00      0.00        45
Science/Technology       0.36      0.15      0.21      1101
          Politics       0.00      0.00      0.00       111
  Business/Finance       0.86      0.04      0.08       146
    Policy/Economy       0.56      0.29      0.38       750
            Sports       0.83      0.14      0.24        35
              Food       0.00      0.00      0.00       162
       Coronavirus       0.00      0.00      0.00        43

          accuracy                           0.40      4776
         macro avg       0.41      0.16      0.16      4776
      weighted avg       0.43      0.40      0.33      4776



Random Forest

In [15]:
RF = Pipeline([
    ('vect', CountVectorizer(ngram_range = (1, 1))),
    ('tfidf', TfidfTransformer()),
    ('RF', RandomForestClassifier(max_depth = 3000, min_samples_split = 200))
])

RF.fit(X_train, Y_train)

Y_pred = RF.predict(X_test)

print('Train accuracy '+ str(RF.score(X_train, Y_train)))
print('Test accuracy %s' % accuracy_score(Y_pred, Y_test))
print(classification_report(Y_test, Y_pred, target_names = flairs))

Train accuracy 0.5374587717920528
Test accuracy 0.3890284757118928
                    precision    recall  f1-score   support

          AskIndia       0.54      0.19      0.28       701
     Non-Political       0.00      0.00      0.00       152
         Scheduled       0.36      0.93      0.52      1530
       Photography       1.00      0.04      0.09        45
Science/Technology       0.37      0.09      0.14      1101
          Politics       0.00      0.00      0.00       111
  Business/Finance       0.00      0.00      0.00       146
    Policy/Economy       0.58      0.26      0.36       750
            Sports       1.00      0.09      0.16        35
              Food       0.00      0.00      0.00       162
       Coronavirus       0.00      0.00      0.00        43

          accuracy                           0.39      4776
         macro avg       0.35      0.15      0.14      4776
      weighted avg       0.39      0.39      0.30      4776



### Title - Comments

In [16]:
X_train, X_test, Y_train, Y_test = dualFeature("Title", "Comments")

Naive Bayes

In [17]:
MNB = Pipeline([('vect', CountVectorizer(ngram_range = (1, 1))),
               ('tfidf', TfidfTransformer()),
               ('sampling', SMOTE()),
               ('MNB', MultinomialNB(alpha = 1, fit_prior = False)),
              ])
MNB.fit(X_train, Y_train)

from sklearn.metrics import classification_report
Y_pred = MNB.predict(X_test)

print('Train accuracy '+ str(MNB.score(X_train, Y_train)))
print('Test accuracy %s' % accuracy_score(Y_pred, Y_test))
print(classification_report(Y_test, Y_pred, target_names = flairs))

Train accuracy 0.6828438301659598
Test accuracy 0.49413735343383586
                    precision    recall  f1-score   support

          AskIndia       0.47      0.41      0.44       701
     Non-Political       0.40      0.43      0.42       152
         Scheduled       0.64      0.70      0.67      1530
       Photography       0.27      0.62      0.38        45
Science/Technology       0.57      0.17      0.26      1101
          Politics       0.36      0.66      0.46       111
  Business/Finance       0.26      0.45      0.33       146
    Policy/Economy       0.60      0.62      0.61       750
            Sports       0.06      0.69      0.12        35
              Food       0.35      0.44      0.39       162
       Coronavirus       0.29      0.58      0.38        43

          accuracy                           0.49      4776
         macro avg       0.39      0.53      0.41      4776
      weighted avg       0.55      0.49      0.49      4776



Logistic Regression

In [18]:
LR = Pipeline([('vect', CountVectorizer(ngram_range = (1, 1))),
                ('tfidf', TfidfTransformer()),
                ('sampling', SMOTE()),
                ('LR', LogisticRegression(solver = "liblinear", penalty = "l2")),
               ])
LR.fit(X_train, Y_train)

Y_pred = LR.predict(X_test)

print('Train accuracy '+ str(LR.score(X_train, Y_train)))
print('Test accuracy %s' % accuracy_score(Y_pred, Y_test))
print(classification_report(Y_test, Y_pred, target_names = flairs))

Train accuracy 0.7346212240196849
Test accuracy 0.5121440536013401
                    precision    recall  f1-score   support

          AskIndia       0.43      0.43      0.43       701
     Non-Political       0.32      0.43      0.37       152
         Scheduled       0.70      0.63      0.66      1530
       Photography       0.35      0.60      0.44        45
Science/Technology       0.51      0.30      0.38      1101
          Politics       0.36      0.71      0.48       111
  Business/Finance       0.22      0.44      0.29       146
    Policy/Economy       0.56      0.64      0.60       750
            Sports       0.41      0.63      0.49        35
              Food       0.26      0.44      0.33       162
       Coronavirus       0.62      0.70      0.66        43

          accuracy                           0.51      4776
         macro avg       0.43      0.54      0.47      4776
      weighted avg       0.54      0.51      0.51      4776



Random Forest

In [19]:
RF = Pipeline([
    ('vect', CountVectorizer(ngram_range = (1, 1))),
    ('tfidf', TfidfTransformer()),
    ('sampling', SMOTE()),
    ('RF', RandomForestClassifier(max_depth = 3000, min_samples_split = 200))
])

RF.fit(X_train, Y_train)

Y_pred = RF.predict(X_test)

print('Train accuracy '+ str(RF.score(X_train, Y_train)))
print('Test accuracy %s' % accuracy_score(Y_pred, Y_test))
print(classification_report(Y_test, Y_pred, target_names = flairs))

Train accuracy 0.815192921836553
Test accuracy 0.4897403685092127
                    precision    recall  f1-score   support

          AskIndia       0.41      0.39      0.40       701
     Non-Political       0.33      0.47      0.38       152
         Scheduled       0.65      0.67      0.66      1530
       Photography       0.30      0.56      0.39        45
Science/Technology       0.42      0.25      0.31      1101
          Politics       0.32      0.59      0.41       111
  Business/Finance       0.22      0.27      0.24       146
    Policy/Economy       0.53      0.63      0.57       750
            Sports       0.46      0.63      0.53        35
              Food       0.29      0.35      0.32       162
       Coronavirus       0.46      0.58      0.52        43

          accuracy                           0.49      4776
         macro avg       0.40      0.49      0.43      4776
      weighted avg       0.49      0.49      0.48      4776



### Title - Comments - Body

In [20]:
X_train, X_test, Y_train, Y_test = triFeature("Title", "Body", "Comments")

Naive Bayes

In [21]:
MNB = Pipeline([('vect', CountVectorizer(ngram_range = (1, 1))),
               ('tfidf', TfidfTransformer()),
               ('sampling', SMOTE()),
               ('MNB', MultinomialNB(alpha = 1, fit_prior = False)),
              ])
MNB.fit(X_train, Y_train)

from sklearn.metrics import classification_report
Y_pred = MNB.predict(X_test)

print('Train accuracy '+ str(MNB.score(X_train, Y_train)))
print('Test accuracy %s' % accuracy_score(Y_pred, Y_test))
print(classification_report(Y_test, Y_pred, target_names = flairs))

Train accuracy 0.6799120464897126
Test accuracy 0.49246231155778897
                    precision    recall  f1-score   support

          AskIndia       0.45      0.47      0.46       701
     Non-Political       0.41      0.43      0.42       152
         Scheduled       0.68      0.65      0.66      1530
       Photography       0.27      0.58      0.37        45
Science/Technology       0.59      0.15      0.24      1101
          Politics       0.40      0.65      0.49       111
  Business/Finance       0.23      0.50      0.31       146
    Policy/Economy       0.54      0.67      0.60       750
            Sports       0.08      0.69      0.14        35
              Food       0.33      0.44      0.38       162
       Coronavirus       0.32      0.58      0.41        43

          accuracy                           0.49      4776
         macro avg       0.39      0.53      0.41      4776
      weighted avg       0.55      0.49      0.49      4776



Logistic Regression

In [22]:
LR = Pipeline([('vect', CountVectorizer(ngram_range = (1, 1))),
                ('tfidf', TfidfTransformer()),
                ('sampling', SMOTE()),
                ('LR', LogisticRegression(solver = "liblinear", penalty = "l2")),
               ])
LR.fit(X_train, Y_train)

Y_pred = LR.predict(X_test)

print('Train accuracy '+ str(LR.score(X_train, Y_train)))
print('Test accuracy %s' % accuracy_score(Y_pred, Y_test))
print(classification_report(Y_test, Y_pred, target_names = flairs))

Train accuracy 0.7476048374430658
Test accuracy 0.5213567839195979
                    precision    recall  f1-score   support

          AskIndia       0.43      0.46      0.44       701
     Non-Political       0.37      0.45      0.40       152
         Scheduled       0.70      0.62      0.66      1530
       Photography       0.35      0.64      0.46        45
Science/Technology       0.49      0.34      0.40      1101
          Politics       0.39      0.73      0.51       111
  Business/Finance       0.24      0.40      0.30       146
    Policy/Economy       0.59      0.65      0.62       750
            Sports       0.50      0.63      0.56        35
              Food       0.27      0.43      0.33       162
       Coronavirus       0.55      0.60      0.58        43

          accuracy                           0.52      4776
         macro avg       0.44      0.54      0.48      4776
      weighted avg       0.54      0.52      0.52      4776



In [23]:
pickle.dump(LR, open('LR-PUSHSHIFT.pkl', 'wb'))

Random Forest

In [24]:
RF = Pipeline([
    ('vect', CountVectorizer(ngram_range = (1, 1))),
    ('tfidf', TfidfTransformer()),
    ('sampling', SMOTE()),
    ('RF', RandomForestClassifier(max_depth = 3000, min_samples_split = 200))
])

RF.fit(X_train, Y_train)

Y_pred = RF.predict(X_test)

print('Train accuracy '+ str(RF.score(X_train, Y_train)))
print('Test accuracy %s' % accuracy_score(Y_pred, Y_test))
print(classification_report(Y_test, Y_pred, target_names = flairs))

Train accuracy 0.8349824616512225
Test accuracy 0.5159128978224455
                    precision    recall  f1-score   support

          AskIndia       0.44      0.51      0.47       701
     Non-Political       0.40      0.44      0.42       152
         Scheduled       0.66      0.66      0.66      1530
       Photography       0.33      0.53      0.41        45
Science/Technology       0.44      0.30      0.35      1101
          Politics       0.35      0.59      0.44       111
  Business/Finance       0.28      0.32      0.30       146
    Policy/Economy       0.55      0.62      0.58       750
            Sports       0.71      0.63      0.67        35
              Food       0.35      0.37      0.36       162
       Coronavirus       0.47      0.58      0.52        43

          accuracy                           0.52      4776
         macro avg       0.45      0.50      0.47      4776
      weighted avg       0.52      0.52      0.51      4776

