In [106]:
import json
import pandas as pd
import datetime as dt
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

with open('webpage1.json') as f:
    data = json.load(f)
    
def get_date(created):
    return dt.datetime.fromtimestamp(created)

to_store = { "flair":[], "score":[], "url":[], "title":[], "created":[] }

flairs = [ "Policy/Economy" , "AskIndia" , "Coronavirus" , 
           "Non-Political" , "Science/Technology" , "Politics" , "Photography" ,
           "Food","Sports", "Business/Finance", "Scheduled", "Low-effort self-post."]

for subreddit in data.values():
    for post in subreddit:
        for flare,flare_name in post.items():
            for flair in flairs:
                if flare_name == flair:
                    to_store["flair"].append(post['link_flair_text'])
                    to_store["score"].append(post['score'])
                    to_store["url"].append(post['url'])
                    to_store["title"].append(post['title'])
                    to_store["created"].append(post['created_utc'])
                    
data_tab = pd.DataFrame(to_store)

_timestamp = data_tab["created"].apply(get_date)
data_tab = data_tab.assign(timestamp = _timestamp)

data_tab.to_csv('Reddit_Data.csv', index=False)

data_tab = pd.read_csv('Reddit_Data.csv')

#Naive Byes Classifier
def nb_classifier(x_train, x_test, y_train, y_test):
  
  from sklearn.naive_bayes import MultinomialNB

  nb = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('clf', MultinomialNB())])
  nb.fit(x_train, y_train)

  y_pred = nb.predict(x_test)

  print('accuracy %s' % accuracy_score(y_pred, y_test))
  print(classification_report(y_test, y_pred,target_names=flairs))

#Linear SVM
def linear_svm(X_train, X_test, y_train, y_test): 
    from sklearn.linear_model import SGDClassifier

    sgd = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None))])
    sgd.fit(X_train, y_train)
    y_pred = sgd.predict(X_test)
    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names=flairs))
    
#Logistic Regression
def logisticreg(X_train, X_test, y_train, y_test):
    from sklearn.linear_model import LogisticRegression

    logreg = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', LogisticRegression(n_jobs=1, C=1e5))])
    
    logreg.fit(X_train, y_train)

    y_pred = logreg.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names=flairs))
    
 #Random Forest   
def randomforest(X_train, X_test, y_train, y_test):
    from sklearn.ensemble import RandomForestClassifier
  
    ranfor = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', RandomForestClassifier(n_estimators = 1000, random_state = 42))])
    ranfor.fit(X_train, y_train)

    y_pred = ranfor.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names=flairs))
    
#MLP Classifier
def mlpclassifier(X_train, X_test, y_train, y_test):
    from sklearn.neural_network import MLPClassifier
  
    mlp = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', MLPClassifier(hidden_layer_sizes=(30,30,30)))])
    mlp.fit(X_train, y_train)

    y_pred = mlp.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names=flairs))
    
#Train Test Varied Data ML Models
def train_test(X,y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)

    print("Results of Naive Bayes Classifier")
    nb_classifier(X_train, X_test, y_train, y_test)
    print("Results of Linear Support Vector Machine")
    linear_svm(X_train, X_test, y_train, y_test)
    print("Results of Logistic Regression")
    logisticreg(X_train, X_test, y_train, y_test)
    print("Results of Random Forest")
    randomforest(X_train, X_test, y_train, y_test)
    print("Results of MLP Classifier")
    mlpclassifier(X_train, X_test, y_train, y_test)
    
train_test(data_tab.title,data_tab.flair)

Results of Naive Bayes Classifier
accuracy 0.6237424547283702
                       precision    recall  f1-score   support

       Policy/Economy       1.00      0.10      0.19        49
             AskIndia       0.00      0.00      0.00         2
          Coronavirus       0.62      1.00      0.76       299
        Non-Political       0.00      0.00      0.00         3
   Science/Technology       0.00      0.00      0.00         1
             Politics       1.00      0.07      0.14        68
          Photography       0.00      0.00      0.00         9
                 Food       0.00      0.00      0.00         8
               Sports       1.00      0.02      0.05        43
     Business/Finance       0.00      0.00      0.00         2
            Scheduled       0.00      0.00      0.00         8
Low-effort self-post.       0.00      0.00      0.00         5

             accuracy                           0.62       497
            macro avg       0.30      0.10      0.09  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))


accuracy 0.8631790744466801
                       precision    recall  f1-score   support

       Policy/Economy       0.73      0.88      0.80        49
             AskIndia       0.00      0.00      0.00         2
          Coronavirus       0.87      0.94      0.91       299
        Non-Political       1.00      1.00      1.00         3
   Science/Technology       0.00      0.00      0.00         1
             Politics       0.90      0.81      0.85        68
          Photography       1.00      0.78      0.88         9
                 Food       0.00      0.00      0.00         8
               Sports       0.94      0.79      0.86        43
     Business/Finance       1.00      1.00      1.00         2
            Scheduled       0.00      0.00      0.00         8
Low-effort self-post.       1.00      0.60      0.75         5

             accuracy                           0.86       497
            macro avg       0.62      0.57      0.59       497
         weighted avg    

  _warn_prf(average, modifier, msg_start, len(result))


accuracy 0.8672032193158954
                       precision    recall  f1-score   support

       Policy/Economy       0.76      0.84      0.80        49
             AskIndia       0.00      0.00      0.00         2
          Coronavirus       0.88      0.96      0.92       299
        Non-Political       0.60      1.00      0.75         3
   Science/Technology       0.00      0.00      0.00         1
             Politics       0.82      0.82      0.82        68
          Photography       1.00      0.78      0.88         9
                 Food       0.00      0.00      0.00         8
               Sports       0.97      0.74      0.84        43
     Business/Finance       1.00      1.00      1.00         2
            Scheduled       0.00      0.00      0.00         8
Low-effort self-post.       1.00      0.60      0.75         5

             accuracy                           0.87       497
            macro avg       0.59      0.56      0.56       497
         weighted avg    

  _warn_prf(average, modifier, msg_start, len(result))
