## Fake News Analysis

##### Importing Necessary Packages

In [18]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import nltk
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

##### Loading the Dataset

In [19]:
data = pd.read_csv('data/train.csv')

##### Basic EDA

In [20]:
data.head()

Unnamed: 0,label,text
0,0,democrat hillary clinton saturday receive firs...
1,1,former white house chief strategist stephen k....
2,1,dr. abdul el-sayed liberal democrat would like...
3,0,manila across asia countries pull beijing orbi...
4,1,clinton lynch talk grandkids top secret transc...


In [21]:
data[['text', 'label']].head()

Unnamed: 0,text,label
0,democrat hillary clinton saturday receive firs...,0
1,former white house chief strategist stephen k....,1
2,dr. abdul el-sayed liberal democrat would like...,1
3,manila across asia countries pull beijing orbi...,0
4,clinton lynch talk grandkids top secret transc...,1


In [22]:
data.shape

(10612, 2)

In [23]:
data.describe()

Unnamed: 0,label
count,10612.0
mean,0.534301
std,0.498846
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [24]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10612 entries, 0 to 10611
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   10612 non-null  int64 
 1   text    10612 non-null  object
dtypes: int64(1), object(1)
memory usage: 165.9+ KB


In [25]:
data['label'].value_counts()

label
1    5670
0    4942
Name: count, dtype: int64

In [26]:
data.isnull().sum()

label    0
text     0
dtype: int64

##### Natural Language Toolkit

In [27]:
news_data = data['text']

In [28]:
news_data

0        democrat hillary clinton saturday receive firs...
1        former white house chief strategist stephen k....
2        dr. abdul el-sayed liberal democrat would like...
3        manila across asia countries pull beijing orbi...
4        clinton lynch talk grandkids top secret transc...
                               ...                        
10607    rio de janeiro hundreds brazilian soldier pour...
10608    proof positive delusional leader get ego size ...
10609    anyone follow election easily see donald trump...
10610    barack obama bother court always law already p...
10611    case could far-reaching consequences outcome f...
Name: text, Length: 10612, dtype: object

In [29]:
tokens = []
for news in news_data:
    tokens.append(word_tokenize(news))

In [30]:
len(tokens)

10612

In [31]:
sia = SentimentIntensityAnalyzer()

In [35]:
res = {}
for i, row in tqdm(data.iterrows(), total = len(data)):
    text = row['text']
    res[i] = sia.polarity_scores(text)

  0%|          | 0/10612 [00:00<?, ?it/s]

In [36]:
res

{0: {'neg': 0.083, 'neu': 0.772, 'pos': 0.144, 'compound': 0.6249},
 1: {'neg': 0.139, 'neu': 0.749, 'pos': 0.112, 'compound': -0.5267},
 2: {'neg': 0.062, 'neu': 0.778, 'pos': 0.161, 'compound': 0.9823},
 3: {'neg': 0.113, 'neu': 0.77, 'pos': 0.117, 'compound': -0.5719},
 4: {'neg': 0.086, 'neu': 0.804, 'pos': 0.11, 'compound': 0.7351},
 5: {'neg': 0.199, 'neu': 0.723, 'pos': 0.079, 'compound': -0.991},
 6: {'neg': 0.166, 'neu': 0.757, 'pos': 0.077, 'compound': -0.975},
 7: {'neg': 0.0, 'neu': 0.963, 'pos': 0.037, 'compound': 0.2732},
 8: {'neg': 0.032, 'neu': 0.827, 'pos': 0.141, 'compound': 0.93},
 9: {'neg': 0.097, 'neu': 0.751, 'pos': 0.152, 'compound': 0.9409},
 10: {'neg': 0.098, 'neu': 0.682, 'pos': 0.22, 'compound': 0.9652},
 11: {'neg': 0.115, 'neu': 0.797, 'pos': 0.088, 'compound': -0.6369},
 12: {'neg': 0.081, 'neu': 0.747, 'pos': 0.172, 'compound': 0.9747},
 13: {'neg': 0.173, 'neu': 0.7, 'pos': 0.127, 'compound': -0.1235},
 14: {'neg': 0.058, 'neu': 0.881, 'pos': 0.061, '

In [47]:
vaders = pd.DataFrame(res).T
vaders = vaders.reset_index().rename(columns={'index': 'id'})

In [46]:
data

Unnamed: 0,label,text
0,0,democrat hillary clinton saturday receive firs...
1,1,former white house chief strategist stephen k....
2,1,dr. abdul el-sayed liberal democrat would like...
3,0,manila across asia countries pull beijing orbi...
4,1,clinton lynch talk grandkids top secret transc...
...,...,...
10607,0,rio de janeiro hundreds brazilian soldier pour...
10608,1,proof positive delusional leader get ego size ...
10609,1,anyone follow election easily see donald trump...
10610,1,barack obama bother court always law already p...


In [48]:
vaders

Unnamed: 0,id,neg,neu,pos,compound
0,0,0.083,0.772,0.144,0.6249
1,1,0.139,0.749,0.112,-0.5267
2,2,0.062,0.778,0.161,0.9823
3,3,0.113,0.770,0.117,-0.5719
4,4,0.086,0.804,0.110,0.7351
...,...,...,...,...,...
10607,10607,0.199,0.739,0.062,-0.9899
10608,10608,0.000,0.594,0.406,0.8360
10609,10609,0.163,0.685,0.151,-0.8714
10610,10610,0.164,0.619,0.217,0.9169


In [51]:
data = data.reset_index().rename(columns={'index': 'id'})

In [52]:
data

Unnamed: 0,id,label,text
0,0,0,democrat hillary clinton saturday receive firs...
1,1,1,former white house chief strategist stephen k....
2,2,1,dr. abdul el-sayed liberal democrat would like...
3,3,0,manila across asia countries pull beijing orbi...
4,4,1,clinton lynch talk grandkids top secret transc...
...,...,...,...
10607,10607,0,rio de janeiro hundreds brazilian soldier pour...
10608,10608,1,proof positive delusional leader get ego size ...
10609,10609,1,anyone follow election easily see donald trump...
10610,10610,1,barack obama bother court always law already p...


In [53]:
vaders = vaders.merge(data, how='left')

In [56]:
vaders.drop(columns= 'id')

Unnamed: 0,neg,neu,pos,compound,label,text
0,0.083,0.772,0.144,0.6249,0,democrat hillary clinton saturday receive firs...
1,0.139,0.749,0.112,-0.5267,1,former white house chief strategist stephen k....
2,0.062,0.778,0.161,0.9823,1,dr. abdul el-sayed liberal democrat would like...
3,0.113,0.770,0.117,-0.5719,0,manila across asia countries pull beijing orbi...
4,0.086,0.804,0.110,0.7351,1,clinton lynch talk grandkids top secret transc...
...,...,...,...,...,...,...
10607,0.199,0.739,0.062,-0.9899,0,rio de janeiro hundreds brazilian soldier pour...
10608,0.000,0.594,0.406,0.8360,1,proof positive delusional leader get ego size ...
10609,0.163,0.685,0.151,-0.8714,1,anyone follow election easily see donald trump...
10610,0.164,0.619,0.217,0.9169,1,barack obama bother court always law already p...
