In [1]:
#importing libraries required for natural language processing
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix,classification_report

In [2]:
#Reading data from csv file
df = pd.read_csv(r'C:\Users\admin\Anaconda3\Scripts\Reddit flare prediction\data.csv')
df.dropna(inplace = True)
df.reset_index(inplace  = True)
df['S.no'] = df['S.no'].astype("int32")
del df['index']
df.tail(10)

Unnamed: 0,S.no,Title,Author,Likes,Web-domain,Flare
4991,4992,‘Why Fight Over Ayodhya?’ Kids Explore Growing...,ashuhitman1,0.0,youtube.com,Politics
4992,4993,"What is more important - the Babri mosque, the...",Rauf_Will_Speak,4.0,self.india,AskIndia
4993,4994,Can you fill a 2 minute survey about GST for m...,king9karan,1.0,self.india,Policy/Economy
4994,4995,This is a very pressing issue that we must dis...,Throwaway_Mattress,2.0,self.india,Non-Political
4995,4996,Will Diwali fireworks damage house structures ...,riverfellon,3.0,self.india,AskIndia
4996,4997,Diwali has been dull this time: Priyanka Gandh...,[deleted],2.0,timesofindia.indiatimes.com,Politics
4997,4998,Harbhajan or Bumrah? This Girl's Unique Bowlin...,Savi321,2.0,in.news.yahoo.com,Sports
4998,4999,"Huge gap in private, government hospital Ayush...",zistu,3.0,timesofindia.indiatimes.com,Policy/Economy
4999,5000,It’s a great time to be an influencer on Insta...,scribbbblr,3.0,amp.scroll.in,Business/Finance
5000,5001,"Defence Minister Rajnath Singh, in Haryana's K...",Gavthi_Batman,2.0,twitter.com,Politics


In [3]:
#Joining some topics which should have same topic
polt = ["Politics","CAA-NRC-NPR","Politics [Megathread]","CAA-NRC","Politics -- Source in comments"]
no_polt = ["Non-Political","40 Martyrs","Goal Achieved!!!","On Internet Shutdowns"]
policy = ["Policy/Economy","Demonetization","Policy & Economy","Policy/Economy -2017 Article","[Year: 2001] Policy/Economy","Policy/Economy [Megathread]"]

In [4]:
#Setting common Flare for common topics
df.loc[df["Flare"].isin(polt),"Flare"] = "Politics"
df.loc[df["Flare"].isin(no_polt),"Flare"] = "Non-Political"
df.loc[df["Flare"].isin(policy),"Flare"] = "Policy/Economy"

In [5]:
#Taking top 6 topics having most number of data points
df = df.loc[df["Flare"].isin(df.Flare.unique()[:6])]

In [6]:
#Updating stopwords 
stop_words = stopwords.words('english')
stop_words += list(string.punctuation)

In [7]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [8]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [9]:
def clean_review(words):
    output_words = []
    for w in words:
        if w.lower() not in stop_words:
            w = w.lower()
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w,pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word)
            
    return " ".join(output_words)

In [10]:
def change_flare(str):
    if str=="Coronavirus":
        return 1
    elif str == "Politics":
        return 2
    elif str=="Non-Political":
        return 3
    elif str=="AskIndia":
        return 4
    elif str == "Policy/Economy":
        return 5
    else:
        return 0

In [11]:
X = df["Title"].values
Y = df["Flare"].apply(change_flare)

In [12]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag

X = [ clean_review(word_tokenize(" ".join(i.split('-')))) for i in X]
X[:10]

['iit ia study make depressed able sleep please help',
 'please help ’ go insane lockdown end',
 'rajasthan cease use rapid test kit result invalid health minister raghu sharma say kit give mere five per cent correct valid result report forward indian council medical research icmr regard',
 'pregnant muslim woman refuse treatment india force give birth stillborn beaten',
 'isa global immigration agency scam',
 'first coronavirus patient receive plasma therapy recovers take ventilator delhi hospital',
 "pakistani student sadhguru call 'taliban name forbes list",
 'communal riot india past 5 year mukhtar abbas naqvi',
 'hello find good piano teacher online',
 "`` thank doctor kerala safer '' italian tourist covid 19 recovery"]

In [13]:
len(X),len(Y)

(4601, 4601)

In [14]:
count = 0
remove_index = []
for i in range(0,len(X)):
    if len(X[i].split())<2:
        remove_index.append(i)

for i in range(len(remove_index),0,-1):
    X.pop(i)
    Y.pop(i)

In [15]:
#Selecting data points in train,validation and test set
x_train,x_test,y_train,y_test = train_test_split(X,Y,random_state = 0,test_size = 0.3)
x_val,x_test,y_val,y_test = train_test_split(x_test,y_test,test_size = 0.5)

# Using CountVectorizer to preprocess text

In [16]:
cv = CountVectorizer(max_features = 4000,ngram_range = (1,3))

In [17]:
x_train_vec = cv.fit_transform(x_train)
x_val_vec = cv.transform(x_val)
x_test_vec = cv.transform(x_test)

In [18]:
NBClassifier = MultinomialNB(alpha = 0.1)

In [19]:
NBClassifier.fit(x_train_vec,y_train)

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [20]:
print(NBClassifier.score(x_train_vec,y_train))
print(NBClassifier.score(x_val_vec,y_val))
print(NBClassifier.score(x_test_vec,y_test))

0.8601508485229415
0.5586510263929618
0.5695461200585652


In [21]:
cv = CountVectorizer(max_features = 5000,ngram_range = (1,3))

In [22]:
x_train_vec = cv.fit_transform(x_train)
x_val_vec = cv.transform(x_val)
x_test_vec = cv.transform(x_test)

In [23]:
NBClassifier = MultinomialNB(alpha = 0.1)

In [24]:
NBClassifier.fit(x_train_vec,y_train)
NBClassifier.score(x_train_vec,y_train)

0.8796354494028913

In [25]:
NBClassifier.score(x_val_vec,y_val)

0.5645161290322581

# Using Tf-Idf Vectorizer to preprocess text

In [26]:
tfidf = TfidfVectorizer(max_features = 7000 , ngram_range=(1,2))

In [27]:
x_train_vec = tfidf.fit_transform(x_train)
x_val_vec = tfidf.transform(x_val)
x_test_vec = tfidf.transform(x_test)

## Using Naive Bayes Classifier

In [28]:
NBClassifier = MultinomialNB()

In [29]:
NBClassifier.fit(x_train_vec,y_train)
NBClassifier.score(x_train_vec,y_train)

0.7661847894406034

In [30]:
y_pred = NBClassifier.predict(x_val_vec)
NBClassifier.score(x_val_vec,y_val)

0.5586510263929618

In [31]:
print(classification_report(y_val,y_pred))
print(confusion_matrix(y_val,y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        13
           1       0.63      0.76      0.69       221
           2       0.51      0.77      0.61       189
           3       0.54      0.40      0.46       161
           4       0.29      0.03      0.05        71
           5       0.00      0.00      0.00        27

    accuracy                           0.56       682
   macro avg       0.33      0.33      0.30       682
weighted avg       0.50      0.56      0.51       682

[[  0   6   2   5   0   0]
 [  0 168  41  10   2   0]
 [  0  29 146  14   0   0]
 [  0  32  61  65   3   0]
 [  0  18  27  24   2   0]
 [  0  15  10   2   0   0]]


  'precision', 'predicted', average, warn_for)


## Using RandomForestClassifier

In [32]:
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(n_estimators = 50, random_state = 100)
classifier.fit(x_train_vec,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=100,
                       verbose=0, warm_start=False)

In [33]:
y_pred = classifier.predict(x_val_vec)
print(classifier.score(x_val_vec,y_val))
print(classification_report(y_val,y_pred))
print(confusion_matrix(y_val,y_pred))

0.5747800586510264
              precision    recall  f1-score   support

           0       0.67      0.15      0.25        13
           1       0.71      0.70      0.71       221
           2       0.61      0.63      0.62       189
           3       0.43      0.60      0.51       161
           4       0.41      0.21      0.28        71
           5       0.60      0.11      0.19        27

    accuracy                           0.57       682
   macro avg       0.57      0.40      0.43       682
weighted avg       0.58      0.57      0.56       682

[[  2   5   0   6   0   0]
 [  0 155  30  32   4   0]
 [  0  21 120  46   1   1]
 [  1  14  32  97  17   0]
 [  0  11   9  35  15   1]
 [  0  11   6   7   0   3]]


In [34]:
y_pred = classifier.predict(x_test_vec)
print(classifier.score(x_train_vec,y_train))
print(classifier.score(x_test_vec,y_test))
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

0.9952859836580766
0.6002928257686676
              precision    recall  f1-score   support

           0       0.33      0.09      0.14        22
           1       0.74      0.74      0.74       221
           2       0.62      0.67      0.64       183
           3       0.47      0.62      0.53       154
           4       0.49      0.29      0.36        76
           5       0.57      0.15      0.24        27

    accuracy                           0.60       683
   macro avg       0.54      0.43      0.44       683
weighted avg       0.60      0.60      0.59       683

[[  2   2   4  10   4   0]
 [  2 163  26  24   6   0]
 [  1  10 123  44   3   2]
 [  0  22  25  96  10   1]
 [  1  19  12  22  22   0]
 [  0   5   9   9   0   4]]


## Using SVC

In [35]:
from sklearn.svm import SVC
svc = SVC(kernel = 'linear', C = 1.5,gamma = 0.01)
svc.fit(x_train_vec,y_train)

SVC(C=1.5, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.01, kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [36]:
print(svc.score(x_train_vec,y_train))
print(svc.score(x_val_vec,y_val))

0.947517284726587
0.6041055718475073


In [37]:
y_pred = svc.predict(x_test_vec)
svc.score(x_test_vec,y_test)

0.6222547584187409

In [38]:
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.40      0.09      0.15        22
           1       0.73      0.76      0.74       221
           2       0.64      0.73      0.68       183
           3       0.52      0.58      0.55       154
           4       0.48      0.37      0.42        76
           5       0.50      0.15      0.23        27

    accuracy                           0.62       683
   macro avg       0.55      0.45      0.46       683
weighted avg       0.61      0.62      0.61       683

[[  2   3   3   9   4   1]
 [  1 167  25  23   5   0]
 [  1  15 134  27   4   2]
 [  0  19  30  90  14   1]
 [  0  17  12  19  28   0]
 [  1   7   6   6   3   4]]


## Using XGBoost

In [39]:
import xgboost as xgb

clf = xgb.XGBClassifier()
clf.fit(x_train_vec,y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method=None, validate_parameters=False, verbosity=None)

In [40]:
print(clf.score(x_train_vec,y_train))
print(clf.score(x_val_vec,y_val))

0.8485229415461973
0.5733137829912024


In [41]:
clf.score(x_test_vec,y_test)

0.5666178623718887

## Changing some parameters in count vectorizer to increase accuracy

In [42]:
cv = CountVectorizer(max_features = 7000,analyzer = 'word',stop_words = stop_words,ngram_range = (1,4),max_df = 0.8)
x_train_vec = cv.fit_transform(x_train)
x_val_vec = cv.transform(x_val)
x_test_vec = cv.transform(x_test)

In [43]:
svc = SVC(kernel = 'linear', C = 1.5,gamma = 0.01)
svc.fit(x_train_vec,y_train)
print(svc.score(x_val_vec,y_val))

0.5542521994134897


In [44]:
clf = xgb.XGBClassifier()
clf.fit(x_train_vec,y_train)
clf.score(x_val_vec,y_val)

0.5762463343108505

In [45]:
classifier = RandomForestClassifier(n_estimators = 50, random_state = 100)
classifier.fit(x_train_vec,y_train)
classifier.score(x_val_vec,y_val)

0.5762463343108505

## Changing some parameters in Tf-Idf vectorizer to increase accuracy

In [46]:
tfidf = TfidfVectorizer(max_features = 7000 , ngram_range=(1,4),analyzer = 'word',stop_words = stop_words,max_df = 0.8,lowercase = True)
x_train_vec = tfidf.fit_transform(x_train)
x_val_vec = tfidf.transform(x_val)
x_test_vec = tfidf.transform(x_test)

In [47]:
svc = SVC(kernel = 'linear', C = 1.6,gamma = 0.01)
svc.fit(x_train_vec,y_train)
print(svc.score(x_val_vec,y_val))

0.5967741935483871


In [48]:
clf = xgb.XGBClassifier()
clf.fit(x_train_vec,y_train)
clf.score(x_val_vec,y_val)

0.5718475073313783

In [49]:
classifier = RandomForestClassifier(n_estimators = 50, random_state = 100)
classifier.fit(x_train_vec,y_train)
classifier.score(x_val_vec,y_val)

0.5850439882697948

In [50]:
svc.score(x_test_vec,y_test)

0.6266471449487555

In [51]:
clf.score(x_test_vec,y_test)

0.5592972181551976

In [52]:
classifier.score(x_test_vec,y_test)

0.5885797950219619

## Using neural network 

In [53]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Flatten
from sklearn.preprocessing import OneHotEncoder

Using TensorFlow backend.


In [55]:
onehot_encoder = OneHotEncoder(sparse=False)
y_train = np.array(y_train).reshape(len(y_train), 1)
y_val = np.array(y_val).reshape(len(y_val), 1)
y_test = np.array(y_test).reshape(len(y_test), 1)

y_train_encoded = onehot_encoder.fit_transform(y_train)
y_val_encoded = onehot_encoder.fit_transform(y_val)
y_test_encoded = onehot_encoder.fit_transform(y_test)
y_train_encoded.shape

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


(3182, 6)

In [56]:
model = Sequential()
model.add(Dense(64,activation='relu', input_dim=7000))
model.add(Dropout(0.2))
model.add(Dense(6,activation='sigmoid'))
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model.fit(x_train_vec, y_train_encoded,epochs=15, validation_data=(x_val_vec, y_val_encoded))

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Train on 3182 samples, validate on 682 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.callbacks.History at 0x1c71c8efcc8>

In [57]:
score,acc = model.evaluate(x_test_vec,y_test_encoded)



In [58]:
print(score)
print(acc)

0.3578313812504401
0.8757930994033813


The above neural network gives accuracy of 87.5%. Thus it is working decently well as compared to primitive ML algorithms which were giving accuracy in range of 57-63%.

In [65]:
def fun(y_pred):
    res = []
    for i in y_pred:
        res.append(np.argmax(i))
    return res

In [66]:
#Classification report of neural network
y_pred = model.predict(x_test_vec)
y_pred = fun(y_pred)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.50      0.09      0.15        22
           1       0.71      0.69      0.70       221
           2       0.59      0.70      0.64       183
           3       0.48      0.56      0.52       154
           4       0.52      0.42      0.47        76
           5       0.44      0.15      0.22        27

    accuracy                           0.59       683
   macro avg       0.54      0.44      0.45       683
weighted avg       0.59      0.59      0.58       683

[[  2   1   4   9   4   2]
 [  1 152  34  28   5   1]
 [  0  18 129  30   5   1]
 [  0  23  29  86  15   1]
 [  0  13  14  17  32   0]
 [  1   6   7   9   0   4]]


Above neural network works really well on classes 1-4(Coronavirus,Politics,Non-Political,AskIndia), maybe due to presence of large number of data-points belonging to these classes. While classes 0 & 5 are performing not that well as there are less data points belonging to them.

In [59]:
from sklearn.externals import joblib



In [60]:
#Serializing model and vectorizer for web application
joblib.dump(model,'NN_model.pkl')
joblib.dump(tfidf,'Tf-Idf.pkl')

['Tf-Idf.pkl']