In [1]:
import json
import pandas as pd
import datetime as dt
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
with open('webpage1.json') as f:
    data = json.load(f)
    
def get_date(created):
    return dt.datetime.fromtimestamp(created)

to_store = { "flair":[], "score":[], "url":[], "title":[], "created":[] }

flairs = [  "Coronavirus", "Politics", "Policy/Economy", "AskIndia", "Science/Technology",
           "Non-Political" , "Business/Finance"]

for subreddit in data.values():
    for post in subreddit:
        for flare,flare_name in post.items():
            for flair in flairs:
                if flare_name == flair:
                    to_store["flair"].append(post['link_flair_text'])
                    to_store["score"].append(post['score'])
                    to_store["url"].append(post['url'])
                    to_store["title"].append(post['title'])
                    to_store["created"].append(post['created_utc'])
                    
data_tab = pd.DataFrame(to_store)
data_tab

_timestamp = data_tab["created"].apply(get_date)
data_tab = data_tab.assign(timestamp = _timestamp)

data_tab.to_csv('Reddit_Data.csv', index=False)

data_tab = pd.read_csv('Reddit_Data.csv')

In [3]:
#Naive Byes Classifier
def nb_classifier(x_train, x_test, y_train, y_test):
  
    from sklearn.naive_bayes import MultinomialNB

    nb = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('clf', MultinomialNB())])
    nb.fit(x_train, y_train)
    y_pred = nb.predict(x_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names=flairs))

In [4]:
#Logistic Regression
def logisticreg(X_train, X_test, y_train, y_test):
    from sklearn.linear_model import LogisticRegression

    logreg = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', LogisticRegression(n_jobs=1, C=1e5))])
    
    logreg.fit(X_train, y_train)

    y_pred = logreg.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names=flairs))

In [5]:
#Linear SVM
def linear_svm(X_train, X_test, y_train, y_test): 
    from sklearn.linear_model import SGDClassifier

    sgd = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None))])
    sgd.fit(X_train, y_train)
    y_pred = sgd.predict(X_test)
    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names=flairs))

In [6]:
#Random Forest   
def randomforest(X_train, X_test, y_train, y_test):
    from sklearn.ensemble import RandomForestClassifier
  
    ranfor = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', RandomForestClassifier(n_estimators = 1000, random_state = 42))])
    ranfor.fit(X_train, y_train)

    y_pred = ranfor.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names=flairs))

In [7]:
#MLP Classifier
def mlpclassifier(X_train, X_test, y_train, y_test):
    from sklearn.neural_network import MLPClassifier
  
    mlp = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', MLPClassifier(hidden_layer_sizes=(30,30,30)))])
    mlp.fit(X_train, y_train)

    y_pred = mlp.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names=flairs))

In [8]:
#Train Test Varied Data ML Models
def train_test(X,y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)

    print("Results of Naive Bayes Classifier")
    nb_classifier(X_train, X_test, y_train, y_test)
    print("Results of Linear Support Vector Machine")
    linear_svm(X_train, X_test, y_train, y_test)
    print("Results of Logistic Regression")
    logisticreg(X_train, X_test, y_train, y_test)
    print("Results of Random Forest")
    randomforest(X_train, X_test, y_train, y_test)
    print("Results of MLP Classifier")
    mlpclassifier(X_train, X_test, y_train, y_test)

In [9]:
train_test(data_tab.title,data_tab.flair)

Results of Naive Bayes Classifier
accuracy 0.6554809843400448
                    precision    recall  f1-score   support

       Coronavirus       0.87      0.59      0.70        68
          Politics       0.00      0.00      0.00         8
    Policy/Economy       0.58      1.00      0.73       203
          AskIndia       1.00      0.48      0.65        93
Science/Technology       0.00      0.00      0.00         9
     Non-Political       1.00      0.09      0.16        57
  Business/Finance       0.00      0.00      0.00         9

          accuracy                           0.66       447
         macro avg       0.49      0.31      0.32       447
      weighted avg       0.73      0.66      0.60       447

Results of Linear Support Vector Machine
accuracy 0.8120805369127517
                    precision    recall  f1-score   support

       Coronavirus       0.67      0.90      0.77        68
          Politics       0.00      0.00      0.00         8
    Policy/Economy       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))


accuracy 0.825503355704698
                    precision    recall  f1-score   support

       Coronavirus       0.69      0.85      0.76        68
          Politics       0.00      0.00      0.00         8
    Policy/Economy       0.87      0.91      0.89       203
          AskIndia       0.82      0.86      0.84        93
Science/Technology       1.00      0.44      0.62         9
     Non-Political       0.86      0.75      0.80        57
  Business/Finance       0.00      0.00      0.00         9

          accuracy                           0.83       447
         macro avg       0.61      0.55      0.56       447
      weighted avg       0.80      0.83      0.81       447

Results of Random Forest
accuracy 0.8568232662192393
                    precision    recall  f1-score   support

       Coronavirus       0.83      0.81      0.82        68
          Politics       0.00      0.00      0.00         8
    Policy/Economy       0.83      0.97      0.89       203
          AskInd

  _warn_prf(average, modifier, msg_start, len(result))


accuracy 0.814317673378076
                    precision    recall  f1-score   support

       Coronavirus       0.81      0.85      0.83        68
          Politics       0.00      0.00      0.00         8
    Policy/Economy       0.86      0.88      0.87       203
          AskIndia       0.85      0.83      0.84        93
Science/Technology       0.23      0.56      0.32         9
     Non-Political       0.88      0.79      0.83        57
  Business/Finance       0.00      0.00      0.00         9

          accuracy                           0.81       447
         macro avg       0.52      0.56      0.53       447
      weighted avg       0.80      0.81      0.81       447



  _warn_prf(average, modifier, msg_start, len(result))
