In [1]:
import os
import datetime

import IPython
import IPython.display
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
from IPython.display import Markdown
from IPython.display import display

# Notebook-wide defaults: 8x6 inch figures with no background grid.
mpl.rcParams.update({
    'figure.figsize': (8, 6),
    'axes.grid': False,
})
# NOTE(review): this silences every warning category for the whole session,
# including deprecation and convergence warnings from the libraries used below.
warnings.filterwarnings(action="ignore")
# pd.set_option('display.max_columns', None)

In [2]:
# Load the train/validation tweets and preview ten randomly chosen rows.
# The preview is unseeded, so it shows different rows on every run.
data = pd.read_csv(filepath_or_buffer="./data/TrainVal.csv")
data.sample(n=10)

Unnamed: 0,tweet,label,tweet_cleaned,tweet_len
5022,Cheb Khaled the king of Algerian Rai dies in a...,fake,cheb khale king algerian rai die pari hospit g...,18
4842,Shadow Health Secretary Jonathan Ashworth says...,real,shadow health secretari jonathan ashworth say ...,38
4042,University of Campinas' field hospital located...,fake,univers campina field hospit locat state são p...,20
5516,President of the Philippines Rodrigo Duterte o...,fake,presid philippin rodrigo dutert order mandator...,17
4601,"The Netflix Korean drama ""My Secret Terrius"" p...",fake,netflix korean drama secret terriu predict cor...,13
6623,Yesterday our laboratories completed ​1043 tes...,real,yesterday laboratori complet ​ 1043 test bring...,29
6665,Image of food distributed in isolation centers...,fake,imag food distribut isol center state kerala i...,13
6678,📢#CoronaVirusUpdates: ✅India's #COVID19 recove...,real,📢 coronavirusupd ✅ india' covid 19 recoveri ra...,32
1810,If you get sick remember: #Antibiotics do NOT ...,real,get sick rememb antibiot work virus like caus ...,39
6598,Coronavirus: Social distancing rules relaxed f...,real,coronaviru social distanc rule relax coupl est...,11


In [3]:
def print_tweets_per_label(df, n=5, random_state=None):
    """Print a random sample of cleaned tweets for each class.

    For the labels "real" and then "fake", a Markdown heading is rendered and
    up to ``n`` randomly chosen values of the ``tweet_cleaned`` column are
    printed, one per line.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``label`` column (compared against "real" / "fake") and
        a ``tweet_cleaned`` column.
    n : int, default 5
        Maximum number of tweets to print per label. If a label has fewer
        than ``n`` rows, all of its rows are printed.
    random_state : optional
        Forwarded to ``Series.sample``; pass a fixed value to get the same
        tweets on every run. The default ``None`` draws a fresh sample.

    Returns
    -------
    None
    """
    sections = (
        ("real", "### Real Tweets"),
        ("fake", "### Fake Tweets"),
    )
    for label, heading in sections:
        # Single .loc selection instead of chained indexing.
        tweets = df.loc[df["label"] == label, "tweet_cleaned"]
        # Series.sample raises ValueError when asked for more rows than
        # exist (without replacement), so cap the request at what is there.
        sampled = tweets.sample(min(n, len(tweets)), random_state=random_state)
        display(Markdown(heading))
        # A plain loop rather than Series.apply(print): only the side effect
        # is wanted, not a Series of None values.
        for tweet in sampled:
            print(tweet)

In [4]:
# Eyeball six random cleaned tweets from each class.
print_tweets_per_label(df=data, n=6)

### Real Tweets

thank also individu essenti volunt go get data us everi day holiday without labor mani project organ would lack data need understand covid affect commun
total number confirm case covid 19 1464 number report world health organ ​ ​ yesterday laboratori process 5417 test bring total number test complet date 910853
📢 ncdcinthenew reopen societi mean viru longer us mean learn continu live safe requir sacrific area ” dg 📰
daili 4 pm updat across 50 state dc we'v track 16502 total test 1953 posit 13419 neg 1130 pend tri interpret data pleas read note state report data differ
nation taskforc covid 19 constitut review use hydroxychloroquin high-risk popul prophylaxi covid _19 base emerg evid safeti efficaci
five new case covid 19 report nigeria 3 fct 2 oyo state 08:00 pm 27th march 70 case confirm covid 19 report nigeria 3 discharg 1 death


### Fake Tweets

expert nanavati hospit give four common treatment patient
there' 3 differ coronaviru vaccin undergo clinic trial russia 500th person who' test gam-covid-vac
indian armi rajasthan prepar 1000 bed facil keep coronaviru patient facil icu ventil well
tamilnadu dmk parti leader selva kumar kick ladi doctor duti forward video group punish
go coronaviru test wait get cure within 2-3 hour without medicin use remedi
liquid squeez tea brew siam weed chromolaena odorata cure novel coronaviru accord nigerian prophetess dupe oluwaniyi


In [5]:
# Report the size first so the effect of the cleaning is visible in the output.
print("Shape before removing duplicate", data.shape)
# Drop rows containing missing values, then rows whose cleaned text repeats an
# earlier row. Both steps contribute to the difference between the two shapes.
data = data.dropna().drop_duplicates(subset=["tweet_cleaned"])
print("Shape after removing duplicate", data.shape)

Shape before removing duplicate (8560, 4)
Shape after removing duplicate (8413, 4)


In [6]:
from sklearn.svm import SVC
from sentence_transformers import SentenceTransformer

In [7]:
class Model:
    """Text classifier: sentence embeddings fed into a scikit-learn style estimator.

    Raw texts are encoded with a ``SentenceTransformer`` and the resulting
    vectors are passed to the wrapped estimator for fitting and prediction.

    Parameters
    ----------
    model : estimator, optional
        Object exposing ``fit``, ``predict`` and ``predict_proba``. Defaults
        to ``SVC(probability=True)``.
    device : str, optional
        Device passed to ``SentenceTransformer`` (e.g. "cuda", "cpu"). The
        default ``None`` leaves the choice to the library, so the class can
        also be constructed on machines without a GPU.
    embedding_name : str, default "all-MiniLM-L6-v2"
        Name of the sentence-transformers model used for encoding.

    Attributes
    ----------
    emb : SentenceTransformer
        The encoder.
    model : estimator
        The wrapped classifier.
    """

    def __init__(self, model=None, device=None, embedding_name="all-MiniLM-L6-v2"):
        self.emb = SentenceTransformer(embedding_name, device=device)
        if model is None:
            model = SVC(probability=True)
        self.model = model

    def _embed(self, texts):
        """Encode one text or an iterable of texts into embedding vectors."""
        if isinstance(texts, str):
            # A bare string would otherwise be encoded as a single 1-D vector,
            # which the downstream estimator cannot consume.
            texts = [texts]
        else:
            # Materialise arrays / Series / generators as a plain list so the
            # encoder always sees the same input type.
            texts = list(texts)
        return self.emb.encode(texts)

    @property
    def classes_(self):
        """Class labels of the wrapped estimator, in ``predict_proba`` column order."""
        return self.model.classes_

    def fit(self, X_train, y_train):
        """Embed ``X_train`` and fit the wrapped estimator; returns ``self``."""
        X_emb = self._embed(X_train)
        self.model.fit(X_emb, y_train)
        return self

    def predict(self, X_test):
        """Return the wrapped estimator's predicted labels for ``X_test``."""
        return self.model.predict(self._embed(X_test))

    def predict_proba(self, X_test):
        """Return the wrapped estimator's class probabilities for ``X_test``.

        Columns follow the order of ``classes_``.
        """
        return self.model.predict_proba(self._embed(X_test))

In [8]:
# Sentence-embedding encoder followed by an SVC that can output probabilities.
model = Model(model=SVC(probability=True))

In [9]:
# Train on the raw tweet text (not `tweet_cleaned`) against the string labels.
model.fit(X_train=data["tweet"].values, y_train=data["label"].values)

<__main__.Model at 0x7f6a8e41b520>

In [10]:
# Load the labelled test set, using the first CSV column as the index, and
# preview ten randomly chosen rows (unseeded, so the preview varies per run).
test_data = pd.read_csv(filepath_or_buffer="./data/TestLabel.csv", index_col=0)
test_data.sample(n=10)

Unnamed: 0_level_0,tweet,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1393,COVID-19 Means 'Certificate of Identification ...,fake
790,Also (obvious typo) correction on the first tw...,real
2016,"Due to COVID-19, Would-Be Robbers Have Trouble...",fake
339,If you get coronavirus from Chinese food the s...,fake
351,Today there are 10 people in hospital who have...,real
819,Sadly three people are in hospital with COVID-...,real
1094,Corona patients have started appearing in publ...,fake
1012,The South set a new record for deaths across t...,real
67,One hundred days after the 1st #COVID19 case w...,real
411,@asinine_net_nz Hi Derek all businesses and se...,real


In [11]:
# Class probabilities for every test tweet, one row per tweet.
probs = model.predict_proba(X_test=test_data["tweet"].values)

In [12]:
# Peek at the first five probability rows.
probs[0:5]

array([[2.89689338e-07, 9.99999710e-01],
       [9.99161078e-01, 8.38921608e-04],
       [9.99966769e-01, 3.32314337e-05],
       [2.28768682e-06, 9.99997712e-01],
       [2.73023553e-06, 9.99997270e-01]])

In [13]:
# Boolean ground truth: True where the test tweet is labelled "real".
y = test_data["label"].eq("real")
y.head(5)

id
1     True
2    False
3    False
4     True
5     True
Name: label, dtype: bool

In [14]:
import metrics as mmetrics

In [15]:
# Ground truth as an (n_samples, 1) column. np.asarray accepts both the pandas
# Series and an already-reshaped ndarray, so re-running this cell no longer
# fails the way `y.values` does once `y` has been rebound to an array.
y = np.asarray(y).reshape((-1, 1))

# `y` is True for "real" tweets, so score against the probability column the
# fitted classifier assigns to "real" instead of assuming it is column 1.
real_col = list(model.model.classes_).index("real")

perfs = mmetrics.get_performance_metrics(y, probs[:, real_col:real_col + 1], ["svc"])
perfs

Unnamed: 0,TP,TN,FP,FN,Accuracy,Prevalence,Sensitivity,Specificity,PPV,NPV,AUC,F1,Threshold
,,,,,,,,,,,,,
svc,1055.0,957.0,63.0,65.0,0.94,0.523,0.942,0.938,0.944,0.936,0.987,0.943,0.5
