In [1]:
import os
import json

import IPython
import IPython.display
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
from IPython.display import Markdown
from IPython.display import display

mpl.rcParams['figure.figsize'] = (8, 6)
mpl.rcParams['axes.grid'] = False
warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', None)

In [2]:
data = json.load(open("./data/dataset.json", "r"))

data = pd.DataFrame(data)
data.sample(10)

Unnamed: 0,text,author_id,label,created_at,retweet_count,reply_count,like_count,quote_count,in_reply_to_user_id
153048,This is not what this means. It means if you ...,2551394742,real,2020-05-01T14:04:33.000Z,0,1,1,0,True
229271,@stevesilberman @kurteichenwald Former FD EMT-...,1626305472,real,2020-05-01T20:49:41.000Z,0,0,1,0,False
4571,"I can discuss MANY anecdotes like this, from m...",113036499,real,2020-06-20T04:48:43.000Z,1,1,15,0,False
160303,.@WHO director announced the name 'COVID-19' #...,14175873,real,2020-02-11T15:22:20.000Z,0,0,3,1,False
43505,Updated CDC Guidance: COVID-19 Employer Inform...,442246804,real,2020-06-08T11:12:13.000Z,0,0,0,0,True
102641,Big commercial labs given priority for coronav...,1101187863939440645,real,2020-04-30T01:25:53.000Z,0,0,0,0,True
179236,@joefree215 More reason to ignore the WHO.,14420325,real,2020-04-16T10:33:32.000Z,0,0,1,0,False
29223,@JoeFrady That double punch of “Bad” and “A So...,15381663,real,2020-05-20T12:29:42.000Z,0,0,1,0,False
126996,Communication Resources for Travelers | CDC ht...,15193667,real,2020-03-08T21:46:56.000Z,0,1,0,0,False
27605,TUSONGE offers capacity building session to Co...,896659878030303232,real,2020-06-25T12:55:09.000Z,1,0,3,0,True


In [3]:
def print_tweets_per_label(df, n=5):
    real = df[df.label == "real"]["text"].sample(n)
    display(Markdown("### Real Tweets"))
    real.apply(print)
    
    fake = df[df.label == "fake"]["text"].sample(n)
    display(Markdown("### Fake Tweets"))
    fake.apply(print)

In [4]:
print_tweets_per_label(data, n=3)

### Real Tweets

Sorry, You Might Get the Flu Twice This Year — Here’s Why https://t.co/ETFOtndCJw via @healthline
8) If we wait until there are drugs (good drugs that we know work on most everybody &amp; don't harm people too much) then opening looks very different. When outbreaks happen (&amp; they will) healthcare capacity will be less likely to be overwhelmed. Still dangerous but more sane.
San Antonio leaders say Tuesday COVID-19 numbers will be 'extraordinarily high'  

Dr. Colleen Bridger, interim director for Metro Health, is expected to address the city's contact tracing efforts as the virus continues to climb.

https://t.co/k1qMCOfgdg


### Fake Tweets

Can 5G exposure alter the structure and function of hemoglobin, causing coronavirus patients to die from oxygen deprivation? – https://t.co/49n78m3cb6 https://t.co/sZ0CJ7EfHz
A top German doctor recommends whiskey to protect against COVID-19 (he's joking...but still) https://t.co/rdISRz38wf
@Greg_Kane17 @MichaelDadiego nano technology

cloud seeding


In [5]:
print_tweets_per_label(data, n=3)

### Real Tweets

@gisby_marc @Tomfurness2 I like the last one about healthcare, basically they had the PPE
Question from the audience-- How is CMS evaluating the cost of utilization of telehealth as part of the broad picture on healthcare costs? Emily says there could be overrall cost savings with #digitalhealth as these services can avoid more costly, in person visits down the line.
WATCH LIVE: Will Gov. Phil Murphy offer a reopening plan? Here are the latest updates on the coronavirus outbreak in New Jersey. https://t.co/CYvlcDZ2QF


### Fake Tweets

Vatican confirms Pope Francis and two aides test positive for Coronavirus https://t.co/tcLVoRKnRo
[Coronavirus] Contains "HIV Insertions", Stoking Fears Over Artificially Created [Bioweapon] | Zero Hedge https://t.co/5QVcAZlZrj
@davidicke @CordeiroRick Never , you are the devil


In [6]:
data.isna().sum()

text                   0
author_id              0
label                  0
created_at             0
retweet_count          0
reply_count            0
like_count             0
quote_count            0
in_reply_to_user_id    0
dtype: int64

In [7]:
print("Shape before removing duplicate", data.shape)
data.drop_duplicates(subset=["text"], inplace=True)
print("Shape after removing duplicate", data.shape)

Shape before removing duplicate (221439, 9)
Shape after removing duplicate (219707, 9)


In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data['text'], data["label"], test_size=0.2, random_state=41)

In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

In [11]:
import metrics as mmetrics
from sklearn import metrics

In [12]:
result = pd.DataFrame()

In [13]:
model = make_pipeline(
    TfidfVectorizer(max_features=1000, analyzer='word'), # ngram_range=(1,3), 
    LogisticRegression(random_state=41)
)

In [14]:
result = pd.DataFrame()

model.fit(X_train, y_train)
probs = model.predict_proba(X_test)[:, :1]
y = y_test.values.reshape((-1, 1))
y = (y == "fake").astype(int)
perfs = mmetrics.get_performance_metrics(y, probs, class_labels=["LR"])
result = pd.concat([result, perfs])
result

Unnamed: 0,TP,TN,FP,FN,Accuracy,Prevalence,Sensitivity,Specificity,PPV,NPV,AUC,F1,Threshold
,,,,,,,,,,,,,
LR,913.0,41658.0,124.0,1247.0,0.969,0.049,0.423,0.997,0.88,0.971,0.921,0.571,0.5


In [15]:
def train_pipeline_count_classifier(classifier, ngram_range=(1, 1), name="LR", max_df=1):
    model = make_pipeline(
        TfidfVectorizer(max_features=1000, analyzer='word', ngram_range=ngram_range, max_df=max_df), # ngram_range=(1,3), 
        classifier
    )

    model.fit(X_train, y_train)
    probs = model.predict_proba(X_test)[:, :1]

    y = y_test.values.reshape((-1, 1))
    y = (y =="fake").astype(int)
    perfs = mmetrics.get_performance_metrics(y, probs, class_labels=[name])
    return perfs

In [16]:
result = pd.DataFrame()

perfs = train_pipeline_count_classifier(LogisticRegression(random_state=41), ngram_range=(1, 1), name="LR")
result = pd.concat([result, perfs])

perfs = train_pipeline_count_classifier(MultinomialNB(), ngram_range=(1, 1), name="MNB")
result = pd.concat([result, perfs])

perfs = train_pipeline_count_classifier(
    XGBClassifier(random_state=41, objective='binary:logistic', eval_metric="logloss"), ngram_range=(1, 1), name="XGB")
result = pd.concat([result, perfs])

perfs = train_pipeline_count_classifier(SVC(random_state=41, probability=True), ngram_range=(1, 1), name="SVC")
result = pd.concat([result, perfs])

perfs = train_pipeline_count_classifier(BaggingClassifier(random_state=41), ngram_range=(1, 1), name="BGC")
result = pd.concat([result, perfs])

perfs = train_pipeline_count_classifier(RandomForestClassifier(random_state=41), ngram_range=(1, 1), name="RFC")
result = pd.concat([result, perfs])

In [17]:
display(Markdown("## Val Score"))
display(result)

## Val Score

Unnamed: 0,TP,TN,FP,FN,Accuracy,Prevalence,Sensitivity,Specificity,PPV,NPV,AUC,F1,Threshold
,,,,,,,,,,,,,
LR,0.0,41782.0,0.0,2160.0,0.951,0.049,0.0,1.0,,0.951,0.501,0.0,0.5
MNB,0.0,41782.0,0.0,2160.0,0.951,0.049,0.0,1.0,,0.951,0.501,0.0,0.5
XGB,0.0,41782.0,0.0,2160.0,0.951,0.049,0.0,1.0,,0.951,0.5,0.0,0.5
SVC,0.0,41782.0,0.0,2160.0,0.951,0.049,0.0,1.0,,0.951,0.5,0.0,0.5
BGC,0.0,41782.0,0.0,2160.0,0.951,0.049,0.0,1.0,,0.951,0.501,0.0,0.5
RFC,0.0,41782.0,0.0,2160.0,0.951,0.049,0.0,1.0,,0.951,0.501,0.0,0.5


In [18]:
perfs = train_pipeline_count_classifier(LogisticRegression(random_state=41, class_weight="balanced"), ngram_range=(1, 1), name="LR")

perfs

Unnamed: 0,TP,TN,FP,FN,Accuracy,Prevalence,Sensitivity,Specificity,PPV,NPV,AUC,F1,Threshold
,,,,,,,,,,,,,
LR,0.0,41782.0,0.0,2160.0,0.951,0.049,0.0,1.0,,0.951,0.501,0.0,0.5


In [37]:
def create_sub_sample(X, y, n_sample=10):
    n_fake = (y=="fake").sum()
    X_fake = X[y == "fake"]
    datasets = []
    for i in range(n_sample):
        X_real = X[y == "real"].sample(n=n_fake)
        d1 = pd.DataFrame(X_fake.values, columns=["text"])
        d1["label"] = "fake"
        d2 = pd.DataFrame(X_real.values, columns=["text"])
        d2["label"] = "real"
        datasets.append(pd.concat([d1, d2], ignore_index=True))

    return datasets

In [30]:
def train_pipeline_count_classifier(classifier, X_train, y_train, ngram_range=(1, 1), name="LR", max_df=1):
    model = make_pipeline(
        TfidfVectorizer(max_features=1000, analyzer='word', ngram_range=ngram_range, max_df=max_df), # ngram_range=(1,3), 
        classifier
    )

    model.fit(X_train, y_train)
    probs = model.predict_proba(X_test)[:, :1]

    y = y_test.values.reshape((-1, 1))
    y = (y =="fake").astype(int)
    perfs = mmetrics.get_performance_metrics(y, probs, class_labels=[name])
    return perfs

In [31]:
result = pd.DataFrame()
datasets = create_sub_sample(X_train, y_train, 10)
for i, data in enumerate(datasets):
    perfs = train_pipeline_count_classifier(LogisticRegression(random_state=41), data["text"], data["label"], name="LR"+str(i))
    result = pd.concat([result, perfs])

In [32]:
result

Unnamed: 0,TP,TN,FP,FN,Accuracy,Prevalence,Sensitivity,Specificity,PPV,NPV,AUC,F1,Threshold
,,,,,,,,,,,,,
LR0,0.0,41782.0,0.0,2160.0,0.951,0.049,0.0,1.0,,0.951,0.506,0.0,0.5
LR1,0.0,41782.0,0.0,2160.0,0.951,0.049,0.0,1.0,,0.951,0.506,0.0,0.5
LR2,0.0,41782.0,0.0,2160.0,0.951,0.049,0.0,1.0,,0.951,0.506,0.0,0.5
LR3,0.0,41782.0,0.0,2160.0,0.951,0.049,0.0,1.0,,0.951,0.506,0.0,0.5
LR4,0.0,41782.0,0.0,2160.0,0.951,0.049,0.0,1.0,,0.951,0.507,0.0,0.5
LR5,0.0,41782.0,0.0,2160.0,0.951,0.049,0.0,1.0,,0.951,0.507,0.0,0.5
LR6,0.0,41782.0,0.0,2160.0,0.951,0.049,0.0,1.0,,0.951,0.507,0.0,0.5
LR7,0.0,41782.0,0.0,2160.0,0.951,0.049,0.0,1.0,,0.951,0.506,0.0,0.5
LR8,0.0,41782.0,0.0,2160.0,0.951,0.049,0.0,1.0,,0.951,0.507,0.0,0.5


In [28]:
datasets[0].label.value_counts()

fake    8439
real    8439
Name: label, dtype: int64

In [38]:
result = pd.DataFrame()
datasets = create_sub_sample(X_train, y_train, 10)
for i, data in enumerate(datasets):
    perfs = train_pipeline_count_classifier(
        XGBClassifier(random_state=41, objective='binary:logistic', eval_metric="logloss"), 
        data["text"], data["label"], name="XGB"+str(i))
    result = pd.concat([result, perfs])

In [39]:
result

Unnamed: 0,TP,TN,FP,FN,Accuracy,Prevalence,Sensitivity,Specificity,PPV,NPV,AUC,F1,Threshold
,,,,,,,,,,,,,
XGB0,0.0,41782.0,0.0,2160.0,0.951,0.049,0.0,1.0,,0.951,0.5,0.0,0.5
XGB1,0.0,41782.0,0.0,2160.0,0.951,0.049,0.0,1.0,,0.951,0.5,0.0,0.5
XGB2,0.0,41782.0,0.0,2160.0,0.951,0.049,0.0,1.0,,0.951,0.5,0.0,0.5
XGB3,0.0,41782.0,0.0,2160.0,0.951,0.049,0.0,1.0,,0.951,0.5,0.0,0.5
XGB4,0.0,41782.0,0.0,2160.0,0.951,0.049,0.0,1.0,,0.951,0.5,0.0,0.5
XGB5,0.0,41782.0,0.0,2160.0,0.951,0.049,0.0,1.0,,0.951,0.5,0.0,0.5
XGB6,0.0,41782.0,0.0,2160.0,0.951,0.049,0.0,1.0,,0.951,0.5,0.0,0.5
XGB7,0.0,41782.0,0.0,2160.0,0.951,0.049,0.0,1.0,,0.951,0.5,0.0,0.5
XGB8,0.0,41782.0,0.0,2160.0,0.951,0.049,0.0,1.0,,0.951,0.5,0.0,0.5


In [36]:
datasets[0].label.value_counts()

real    16878
fake     8439
Name: label, dtype: int64