In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import hstack

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import stanfordnlp
from textblob import TextBlob
import textstat

from nltk.tokenize import RegexpTokenizer
from nltk import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB 
from sklearn.naive_bayes import MultinomialNB 


In [2]:
df = pd.read_csv('df_mc.csv')

In [6]:
df.sample(10)
print(f'Data shape is {df.shape}')

Data shape is (3782629, 9)


In [7]:
df.isna().mean()

Unnamed: 0      0.0
type            0.0
title           0.0
dc_score        0.0
sub             0.0
vs              0.0
tokens          0.0
lemma           0.0
title_length    0.0
dtype: float64

In [8]:
df['type'].value_counts(normalize=True)

reliable     0.312653
political    0.277992
bias         0.189269
fake         0.149409
clickbait    0.038647
junksci      0.019245
hate         0.012786
Name: type, dtype: float64

In [9]:
cvec = CountVectorizer(stop_words='english', ngram_range=(1, 2), max_df = .8, min_df = 3)
X = cvec.fit_transform(df['lemma'])

X_full =  hstack((X,np.array(df['sub'])[:,None]))

X_full =  hstack((X,np.array(df['vs'])[:,None]))

X_full =  hstack((X,np.array(df['dc_score'])[:,None]))

X_full =  hstack((X,np.array(df['title_length'])[:,None]))

X_full.shape

y = df['type']

X_train, X_test, y_train, y_test = train_test_split(X_full, y,
                                                    random_state=42, stratify = y)

nb = MultinomialNB()

nb = MultinomialNB()
nb.fit(X_train, y_train)
print(nb.score(X_train, y_train))
print(nb.score(X_test, y_test))

0.7481863579148325
0.7025415107787382


In [13]:
preds = nb.predict_proba(X_full)

In [14]:
preds.shape

(3782629, 7)

In [16]:
nb.classes_

array(['bias', 'clickbait', 'fake', 'hate', 'junksci', 'political',
       'reliable'], dtype='<U9')

In [18]:
prob_bias = []
prob_clickbait = []
prob_fake = []
prob_hate = []
prob_junksci = []
prob_political = []
prob_reliable = []

for row in preds:
    prob_bias.append(round(row[0], 4))
    prob_clickbait.append(round(row[1], 4))
    prob_fake.append(round(row[2], 4))
    prob_hate.append(round(row[3], 4))
    prob_junksci.append(round(row[4], 4))
    prob_political.append(round(row[5], 4))
    prob_reliable.append(round(row[6], 4))

df['prob_bias'] = prob_bias
df['prob_clickbait'] = prob_clickbait
df['prob_fake'] = prob_fake
df['prob_hate'] = prob_hate
df['prob_junksci'] = prob_junksci
df['prob_political'] = prob_political
df['prob_reliable'] = prob_reliable

df.sample(10)

     
    

Unnamed: 0.1,Unnamed: 0,type,title,dc_score,sub,vs,tokens,lemma,title_length,prob_bias,prob_clickbait,prob_fake,prob_hate,prob_junksci,prob_political,prob_reliable
375085,378941,fake,Dealing With Mentally Unbalanced Trespassers,16.52,0.2,1.0,"['Dealing', 'With', 'Mentally', 'Unbalanced', ...",Dealing With Mentally Unbalanced Trespassers,5,0.129,0.0062,0.5847,0.0007,0.0089,0.1901,0.0803
153182,154340,fake,What Smarter Minds Than Mine Think About Gold,0.4,0.0,0.7,"['What', 'Smarter', 'Minds', 'Than', 'Mine', '...",What Smarter Minds Than Mine Think About Gold,8,0.0006,0.0005,0.989,0.0,0.0,0.0063,0.0036
505046,509852,fake,‘Democrats Think They Benefit From a Governemn...,9.95,0.0,0.667,"['Democrats', 'Think', 'They', 'Benefit', 'Fro...",Democrats Think They Benefit From a Governemnt...,8,0.0215,0.0056,0.103,0.0,0.0,0.8683,0.0016
1931234,1976479,bias,Sierra Old Guard Now So Far Left They`ve Rende...,9.92,0.55,1.0,"['Sierra', 'Old', 'Guard', 'Now', 'So', 'Far',...",Sierra Old Guard Now So Far Left They ve Rende...,12,0.0642,0.0008,0.1149,0.0,0.0,0.4803,0.3398
932351,944194,reliable,"My Bad, Eh?",0.15,0.666667,0.364,"['My', 'Bad', 'Eh']",My Bad Eh,3,0.0285,0.0192,0.2961,0.0001,0.0005,0.5332,0.1223
798650,804038,reliable,Preview: Maple Leafs at Canadiens,10.2,0.0,1.0,"['Preview', 'Maple', 'Leafs', 'at', 'Canadiens']",Preview Maple Leafs at Canadiens,5,0.0,0.0,0.0,0.0,0.0,0.0,1.0
39155,39404,fake,"S&P’s Balls Drop, Firm Drops US LT Credit to A...",8.18,0.4,0.517,"['S', 'P', 's', 'Balls', 'Drop', 'Firm', 'Drop...",S P s Balls Drop Firm Drops US LT Credit to AA...,14,0.0002,0.0,0.6029,0.0,0.0,0.0005,0.3964
1940262,1985585,bias,No chances to find survivors of Riga supermark...,10.81,0.0,0.526,"['No', 'chances', 'to', 'find', 'survivors', '...",No chance to find survivor of Riga supermarket...,11,0.9976,0.0,0.0006,0.0,0.0,0.0,0.0018
1378543,1412636,reliable,"With Beckham in Final, No M.L.S. Regrets",10.75,0.6,0.515,"['With', 'Beckham', 'in', 'Final', 'No', 'M', ...",With Beckham in Final No M L S Regrets,9,0.0215,0.0005,0.0343,0.0,0.0,0.0198,0.924
1448195,1485883,reliable,How to Choreograph the End of a Call,6.01,0.0,1.0,"['How', 'to', 'Choreograph', 'the', 'End', 'of...",How to Choreograph the End of a Call,8,0.1493,0.0029,0.1366,0.0,0.0001,0.1668,0.5442
