In [1]:
import os
import datetime

import IPython
import IPython.display
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
from IPython.display import Markdown
from IPython.display import display

# Notebook-wide plotting defaults: default figure size (8, 6), grid switched off.
mpl.rcParams.update({
    'figure.figsize': (8, 6),
    'axes.grid': False,
})
# Suppress every warning for the rest of the session.
warnings.filterwarnings(action="ignore")
# pd.set_option('display.max_columns', None)

In [2]:
# Training/validation tweets; the preview below shows the columns
# tweet, label, tweet_cleaned and tweet_len.
data = pd.read_csv(filepath_or_buffer="./data/TrainVal.csv")
# Display ten randomly chosen rows as a sanity check.
data.sample(n=10)

Unnamed: 0,tweet,label,tweet_cleaned,tweet_len
7439,354 new cases of #COVID19Nigeria; FCT-78 Lagos...,real,354 new case covid 19nigeria fct 78 lago 76 ka...,30
1936,Bill Gates will make $200 billion through vacc...,fake,bill gate make billion vaccin,8
921,A number of areas have been added to the coron...,real,number area ad coronaviru watchlist scientist ...,35
6415,A tiger tested positive for COVID-19 please st...,fake,tiger test posit covid 19 pleas stay away pet ...,14
6490,6 of the 7 new cases are already linked to the...,real,6 7 new case alreadi link exist cluster auckla...,24
7316,Let us give one example of a difficult call. Y...,real,let us give one exampl difficult call yesterda...,40
5224,"""We???ve still had more deaths to the flu this...",fake,still death flu year we'v covid 19,14
5529,The first case of new coronavirus in Venezuela...,fake,first case new coronaviru venezuela regist com...,17
7661,Closing up the shop for the night. We're showi...,real,close shop night we'r show 19066 test 2182 pos...,28
4586,593 new cases of #COVID19Nigeria; Plateau-186 ...,real,593 new case covid 19nigeria plateau 186 lago ...,29


In [3]:
def print_tweets_per_label(df, n=5, random_state=None):
    """Print a random sample of cleaned tweets for each label.

    For the "real" label and then the "fake" label, a Markdown heading is
    displayed and up to ``n`` randomly selected ``tweet_cleaned`` values are
    printed, one per line.

    Args:
        df: DataFrame with a ``label`` column (compared against "real" and
            "fake") and a ``tweet_cleaned`` column.
        n: Maximum number of tweets to print per label. When a label has
            fewer than ``n`` rows, all of its rows are printed.
        random_state: Forwarded to ``Series.sample``; pass an int to get the
            same selection on every run. ``None`` keeps the selection random.

    Returns:
        None. The function only displays and prints.
    """
    sections = (("real", "### Real Tweets"), ("fake", "### Fake Tweets"))
    for label, heading in sections:
        tweets = df.loc[df["label"] == label, "tweet_cleaned"]
        # Clamp the sample size: Series.sample raises ValueError when asked
        # for more rows than exist (without replacement).
        count = min(n, len(tweets))
        picked = tweets.sample(count, random_state=random_state) if count else tweets.iloc[:0]
        display(Markdown(heading))
        # Plain loop instead of Series.apply(print): apply would build and
        # discard a Series of None values just to get the side effect.
        for tweet in picked:
            print(tweet)

In [4]:
# Show six random cleaned tweets for each label.
print_tweets_per_label(df=data, n=6)

### Real Tweets

septemb 7 nation forecast suggest 3300 8000 new covid 19 death report week end octob 3 forecast predict 205000 217000 total covid 19 death us octob 3 learn
high level covid 19 recoveri result 100 increas number recov case past 30 day
correct missouri recent chang way report case follow chang mistakenli use older method today initi made seem like state case regret error
last day juli seen stagger resurg covid 19 america worst month ever case averag number hospit peopl unsurprisingli declin sinc april death rose month
11:25 pm 22nd april breakdown case state lago 504 fct 119 kano 73 ogun 24 katsina 21 osun 20 oyo 17 edo 17 kwara 10 kaduna 9 akwa ibom 9 borno 9 bauchi 8 delta 6 gomb 5 ekiti 4 ondo 3 river 3 jigawa 2 enugu 2 niger 2 abia 2 benu 1 anambra 1 sokoto 1 adamawa 1
five previous report case consid recov bring number activ case covid 19 manag isol facil new zealand 22 ​ total number confirm case remain 1217 number report


### Fake Tweets

governor gretchen whitmer spent past weekend vacat cottag birch lake violat execut order �
death blame coronaviru actual due flu
six month sinc first confirm case covid 19 unit state presid trump still effect plan contain spread unjustifi failur leadership cost live everi day
bbc reliabl news sourc imag doctor make look like british outlet report ghislain maxwel intens care covid 19
season influenza vaccin mandatori georgia fall covid 19 mortal rate significantli lower fatal caus virus
also joke covid 19 still receiv rt pcr report ..


In [5]:
print("Shape before removing duplicate", data.shape)
# Drop rows containing any missing value, then keep one row per distinct
# cleaned tweet. The result is rebound to `data` instead of mutating the
# frame with inplace=True, so no other reference to the loaded frame is
# silently changed.
data = data.dropna().drop_duplicates(subset=["tweet_cleaned"])
print("Shape after removing duplicate", data.shape)

Shape before removing duplicate (8560, 4)
Shape after removing duplicate (8413, 4)


In [6]:
from xgboost import XGBClassifier
from sentence_transformers import SentenceTransformer

In [7]:
class Model:
    """Text classifier: sentence embeddings fed into an XGBoost classifier.

    Texts are embedded with the ``all-MiniLM-L6-v2`` SentenceTransformer and
    the embeddings are classified by an ``XGBClassifier``. Both components
    are pinned to a GPU (``device="cuda"``, ``tree_method="gpu_hist"``,
    ``gpu_id=0``).

    Attributes set by ``fit``:
        classes_: Sorted array of the distinct training labels. Column ``i``
            of ``predict_proba`` is intended to correspond to ``classes_[i]``.
    """

    def __init__(self, params):
        """Create the embedder and the unfitted classifier.

        Args:
            params: Mapping providing ``n_estimators``, ``max_depth``,
                ``subsample``, ``colsample_bytree``, ``learning_rate``,
                ``lambda`` and ``alpha``. Any other keys are ignored.

        Raises:
            KeyError: If one of the keys listed above is missing.
        """
        # NOTE(review): newer XGBoost releases reportedly deprecate
        # gpu_hist / gpu_id in favour of device="cuda" - confirm against the
        # installed version before upgrading.
        xgb_params = {
            "objective": "binary:logistic",
            "n_estimators": params["n_estimators"],
            "max_depth": params["max_depth"],
            "n_jobs": 7,
            "seed": 1000,
            "verbosity": 0,
            'tree_method': "gpu_hist",
            "gpu_id": 0,
            "subsample": params["subsample"],  # 0.7
            "colsample_bytree": params["colsample_bytree"],
            "learning_rate": params["learning_rate"],
            "lambda": params["lambda"],
            "alpha": params["alpha"],
            "eval_metric": 'logloss'
        }

        self.emb = SentenceTransformer("all-MiniLM-L6-v2", device="cuda")
        self.xgb = XGBClassifier(**xgb_params)
        self.classes_ = None

    def _embed(self, texts):
        """Return the sentence embeddings for ``texts``."""
        return self.emb.encode(texts)

    def fit(self, X_train, y_train):
        """Embed the training texts and fit the classifier.

        Args:
            X_train: Sequence of raw text strings.
            y_train: Labels aligned with ``X_train``; may be strings such as
                "real"/"fake" or integers.

        Returns:
            ``self``, to allow call chaining.
        """
        X_emb = self._embed(X_train)
        # Encode the labels as 0..k-1 ourselves instead of relying on the
        # classifier to accept arbitrary (e.g. string) labels. np.unique
        # sorts the labels, so index i maps to classes_[i].
        self.classes_, y_encoded = np.unique(
            np.asarray(y_train).ravel(), return_inverse=True
        )
        self.xgb.fit(X_emb, y_encoded)
        return self

    def predict(self, X_test):
        """Predict a label for each text.

        Args:
            X_test: Sequence of raw text strings.

        Returns:
            Array of predicted labels, expressed in the same label values
            that were passed to ``fit``.
        """
        preds = self.xgb.predict(self._embed(X_test))
        if self.classes_ is None:
            # Classifier was fitted outside of Model.fit; no mapping known.
            return preds
        # Translate the integer class indices back to the original labels.
        return self.classes_[np.asarray(preds).astype(int)]

    def predict_proba(self, X_test):
        """Predict class probabilities for each text.

        Args:
            X_test: Sequence of raw text strings.

        Returns:
            The classifier's ``predict_proba`` output for the embedded texts
            (one row per text, one column per class).
        """
        return self.xgb.predict_proba(self._embed(X_test))

In [8]:
# Hyper-parameters handed to Model.
params = {
    # Present in the dict but not looked up by Model.__init__.
    'ngram_range': 2,
    'max_df': 0.6648896815644944,
    'max_features': 1928,
    # XGBoost settings that Model.__init__ reads by key.
    'n_estimators': 605,
    'max_depth': 10,
    'subsample': 0.7159105584464553,
    'colsample_bytree': 0.7851446292315494,
    'learning_rate': 0.13139478930452317,
    'lambda': 1.2081963210286712,
    'alpha': 1.0087203749646416,
}

model = Model(params)

In [9]:
# Train on the raw `tweet` text (not `tweet_cleaned`) against the `label` column.
model.fit(X_train=data["tweet"].values, y_train=data["label"].values)

<__main__.Model at 0x7f5516e88820>

In [10]:
# Held-out labelled test set; the first CSV column becomes the index.
test_data = pd.read_csv(filepath_or_buffer="./data/TestLabel.csv", index_col=0)
# Display ten randomly chosen rows as a sanity check.
test_data.sample(n=10)

Unnamed: 0_level_0,tweet,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1347,CDC &amp; partners are working together to coo...,real
1736,Air Canada promises 100 bonus Aeroplan Miles t...,fake
1626,The world's poorest and most marginalised peop...,real
605,Olive Garden employees are allowed to wear Bla...,fake
531,A video (already removed from YouTube) in whic...,fake
877,Here are the states reporting over 400 cases t...,real
92,We fact-checked a range of statements from the...,fake
1443,NEWS! Government finalising plans to comprehen...,fake
664,_Israeli scientists at the MIGAL Research Inst...,fake
297,Italy registered 4200 new recovering cases fro...,fake


In [12]:
# Class probabilities for every raw test tweet.
probs = model.predict_proba(X_test=test_data["tweet"].values)

In [13]:
# Preview the first five rows of the probability matrix (one column per class).
probs[:5]

array([[9.6559525e-06, 9.9999034e-01],
       [9.9967712e-01, 3.2289056e-04],
       [9.9994826e-01, 5.1749685e-05],
       [2.0033121e-04, 9.9979967e-01],
       [2.0560622e-03, 9.9794394e-01]], dtype=float32)

In [14]:
# Boolean ground truth: True where the test label equals "real".
y = test_data["label"].eq("real")
y.head()

id
1     True
2    False
3    False
4     True
5     True
Name: label, dtype: bool

In [15]:
import metrics as mmetrics

In [16]:
# Ground truth as an (n, 1) column vector. np.asarray accepts both the Series
# built earlier and the ndarray left behind by a previous run of this cell,
# so the cell can be re-executed (`y.values` fails once `y` is an ndarray).
y = np.asarray(y).reshape((-1, 1))

# probs[:, 1:] keeps only the second probability column, as an (n, 1) slice.
# Presumably that column is P(label == "real") - verify the class order.
perfs = mmetrics.get_performance_metrics(y, probs[:, 1:], ["xgboost"])
perfs

Unnamed: 0,TP,TN,FP,FN,Accuracy,Prevalence,Sensitivity,Specificity,PPV,NPV,AUC,F1,Threshold
,,,,,,,,,,,,,
xgboost,1053.0,950.0,70.0,67.0,0.936,0.523,0.94,0.931,0.938,0.934,0.983,0.939,0.5
