In [85]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [86]:
# Natural Language Toolkit (NLTK)
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords') # very commonly used used that can be ignored 

# printing the stopwords in English
print(stopwords.words('english'))


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### <CENTER> **IMPORTING DATASET**

In [87]:
df = pd.read_csv("news_articles.csv")
df.head()

Unnamed: 0,author,published,title,text,language,site_url,main_img_url,type,label,title_without_stopwords,text_without_stopwords,hasImage
0,Barracuda Brigade,2016-10-26T21:41:00.000+03:00,muslims busted they stole millions in govt ben...,print they should pay all the back all the mon...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,muslims busted stole millions govt benefits,print pay back money plus interest entire fami...,1.0
1,reasoning with facts,2016-10-29T08:47:11.259+03:00,re why did attorney general loretta lynch plea...,why did attorney general loretta lynch plead t...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,attorney general loretta lynch plead fifth,attorney general loretta lynch plead fifth bar...,1.0
2,Barracuda Brigade,2016-10-31T01:41:49.479+02:00,breaking weiner cooperating with fbi on hillar...,red state \nfox news sunday reported this mor...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,breaking weiner cooperating fbi hillary email ...,red state fox news sunday reported morning ant...,1.0
3,Fed Up,2016-11-01T05:22:00.000+02:00,pin drop speech by father of daughter kidnappe...,email kayla mueller was a prisoner and torture...,english,100percentfedup.com,http://100percentfedup.com/wp-content/uploads/...,bias,Real,pin drop speech father daughter kidnapped kill...,email kayla mueller prisoner tortured isis cha...,1.0
4,Fed Up,2016-11-01T21:56:00.000+02:00,fantastic trumps point plan to reform healthc...,email healthcare reform to make america great ...,english,100percentfedup.com,http://100percentfedup.com/wp-content/uploads/...,bias,Real,fantastic trumps point plan reform healthcare ...,email healthcare reform make america great sin...,1.0


In [88]:
df.shape

(2096, 12)

In [89]:
from sklearn.utils import shuffle
df = shuffle(df)
df = df.reset_index(drop=True)

### <center> **DATA PREPROCESSING**

In [90]:
df.isna().sum() # sum of all unavailable data in a column

author                      0
published                   0
title                       0
text                       46
language                    1
site_url                    1
main_img_url                1
type                        1
label                       1
title_without_stopwords     2
text_without_stopwords     50
hasImage                    1
dtype: int64

In [91]:
df = df.dropna()

In [92]:
df.isna().sum()

author                     0
published                  0
title                      0
text                       0
language                   0
site_url                   0
main_img_url               0
type                       0
label                      0
title_without_stopwords    0
text_without_stopwords     0
hasImage                   0
dtype: int64

In [93]:
print("Number of unique Authors ")
len(pd.unique(df['author']))

Number of unique Authors 


485

In [94]:
pd.unique(df['language'])

array(['english', 'german', 'ignore', 'spanish', 'french'], dtype=object)

In [95]:
df.drop(df[df.language!="english"].index, inplace = True)
df.head()

Unnamed: 0,author,published,title,text,language,site_url,main_img_url,type,label,title_without_stopwords,text_without_stopwords,hasImage
0,tokyowashi (noreply@blogger.com),2016-10-31T02:40:07.541+02:00,is it coming into clearer focus for americans ...,source zero hedge \n\noctober \n\nconspiracy ...,english,abeldanger.net,No Image URL,bs,Fake,coming clearer focus americans,source zero hedge october conspiracy theories ...,0.0
1,Daniel Greenfield,2016-10-29T04:17:00.000+03:00,hillarys secretary of state after send a chec...,obamas doj hasnt approved warrant for fbi of n...,english,frontpagemag.com,http://www.frontpagemag.com/sites/default/file...,hate,Real,obamas un ambassador cuba right human rights,obamas attorney general warned fbi director in...,1.0
2,Tim King,2016-11-16T02:00:00.000+02:00,will america survive the next years,a plea from california to not replicate in can...,english,ahtribune.com,http://ahtribune.com/images/media/Anti_Trump_P...,bs,Fake,america survive next years,plea california replicate canada attacks acade...,1.0
3,Dr. Patrick Slattery,2016-11-16T00:55:00.000+02:00,why our survival depends on the defeat of jewi...,share \ndr duke and pastor dankof quote jews b...,english,davidduke.com,http://davidduke.com/wp-content/uploads/2016/1...,hate,Real,survival depends defeat jewish power,share dr duke pastor dankof quote jews boastin...,1.0
4,No Author,2016-11-17T04:56:00.000+02:00,bernie sanders rises in the senate prepares to...,adobochron comments \nsan francisco californi...,english,addictinginfo.org,No Image URL,bias,Real,bernie sanders rises senate prepares become tr...,adobochron comments san francisco california a...,0.0


In [96]:
pd.unique(df['language'])

array(['english'], dtype=object)

In [97]:
df['title_without_stopwords'] = df['author']+' '+df['title_without_stopwords']

In [98]:
df.shape

(1967, 12)

In [99]:
DF= df

### **Separating the data required for prediction and the prediction.**
(prediction: i.e if the news is real or fake)

In [100]:
x = df.drop(['published','author', 'language','site_url','label','main_img_url','type','hasImage','title','text','title_without_stopwords'], axis='columns')
x.head(3)

Unnamed: 0,text_without_stopwords
0,source zero hedge october conspiracy theories ...
1,obamas attorney general warned fbi director in...
2,plea california replicate canada attacks acade...


In [101]:
pd.unique(DF['label'])

array(['Fake', 'Real'], dtype=object)

In [102]:
y = DF.label
y.head()

0    Fake
1    Real
2    Fake
3    Real
4    Real
Name: label, dtype: object

In [103]:
# since 'label' is not in an integer form, we can not fit out model just yet.
# Converting it into a quantitative variable:
dummies= pd.get_dummies(y)
dummies

Unnamed: 0,Fake,Real
0,1,0
1,0,1
2,1,0
3,0,1
4,0,1
...,...,...
2090,1,0
2091,1,0
2092,1,0
2094,0,1


In [104]:
y = pd.concat([y, dummies], axis='columns')
y

Unnamed: 0,label,Fake,Real
0,Fake,1,0
1,Real,0,1
2,Fake,1,0
3,Real,0,1
4,Real,0,1
...,...,...,...
2090,Fake,1,0
2091,Fake,1,0
2092,Fake,1,0
2094,Real,0,1


In [105]:
y=y.drop(['label', 'Fake'], axis='columns')

In [106]:
y.head()

Unnamed: 0,Real
0,0
1,1
2,0
3,1
4,1


### <center> ***STEMMING METHOD***
Stemming is the process of reducing inflection in words to their root forms. <br>
(Many variations of words carry the same meaning)

In [107]:
from nltk.stem.porter import PorterStemmer

In [108]:
port_stem = PorterStemmer()

In [109]:
# x=x['text_without_stopwords'].apply(str)

In [110]:
import string

def punctuation_removal(text):
    all_list = [char for char in text if char not in string.punctuation]
    clean_str = ''.join(all_list)
    return clean_str

df['text_without_stopwords'] = df['text_without_stopwords'].apply(punctuation_removal)

In [71]:
# def stemming(content):
#   stemmedContent = re.sub('[^a-zA-Z]',' ',content) # regular expression
#   stemmedContent = stemmedContent.lower()
#   stemmedContent = stemmedContent.split()
#   stemmedContent = [port_stem.stem(word) for word in stemmedContent if not word in stopwords.words('english')]
#   stemmedContent = ' '.join(stemmedContent)
#   return stemmedContent

# # NOTE: our dataset already contains the text column without stopwords. 

In [72]:
# X['text_without_stopwords'] = X['text_without_stopwords'].apply(stemming)

IndexError: ignored

In [111]:
print(x)

                                 text_without_stopwords
0     source zero hedge october conspiracy theories ...
1     obamas attorney general warned fbi director in...
2     plea california replicate canada attacks acade...
3     share dr duke pastor dankof quote jews boastin...
4     adobochron comments san francisco california a...
...                                                 ...
2090  home month popular reasons diet sucks reasons ...
2091  posted october tim brown among many wikileaks ...
2092  national mood focus group reflects angry divid...
2094  obama lies time doesnt bother even check facts...
2095  campaigning hillary clinton florida alicia mac...

[1967 rows x 1 columns]


### **Converting texual data into numeric data for calculations**

In [116]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [114]:
# X = df['text_without_stopwords'].values

In [117]:
print(X)

['source zero hedge october conspiracy theories swirled recent days fbi director james comey reopened hillarys email investigation closing back july concluding although hillary demonstrated gross negligence establishment private email server reasonable prosecutor would bring case democrats lavishing comey praise months concluding investigation impartial way since lashed seeking influence election cycle hillary describing recent actions deeply troubling republicans hand praised comeys recent efforts attempt correct corrupt investigation seemingly ignored critical evidence granting numerous immunity agreements clinton staffers according daily mail source close james comey decision least part came could longer resist mounting pressure mutinous agents fbi felt betrayed brought disgrace bureau letting hillary slap wrist james comeys decision revive investigation hillary clintons email server handling classified material came could longer resist mounting pressure mutinous agents fbi includin

In [118]:
vectorizer = TfidfVectorizer()
vectorizer.fit(X)
X = vectorizer.transform(X)

In [119]:
print(X)

  (0, 42274)	0.08978739522144587
  (0, 41915)	0.0369087945469165
  (0, 41899)	0.1246183643312962
  (0, 41838)	0.041029131467729796
  (0, 41836)	0.01946962422476859
  (0, 41705)	0.05036854857080138
  (0, 41465)	0.12115821471430427
  (0, 41098)	0.061541860454971765
  (0, 41000)	0.023930975733933133
  (0, 40932)	0.031867622581030555
  (0, 39986)	0.05289958231457869
  (0, 39135)	0.023910732142029795
  (0, 38949)	0.06686120732275666
  (0, 38883)	0.05220474244005154
  (0, 38483)	0.04768328235520761
  (0, 38364)	0.031647237778786204
  (0, 38296)	0.02743377168040608
  (0, 38142)	0.050944201449783684
  (0, 37975)	0.03177888911203715
  (0, 37829)	0.05532836966155911
  (0, 37259)	0.03832739561055719
  (0, 37043)	0.07947801316198316
  (0, 36835)	0.03330196397126319
  (0, 36177)	0.044112594394405485
  (0, 35900)	0.023730347912472013
  :	:
  (1966, 9389)	0.08045463230292564
  (1966, 8847)	0.10096969494262187
  (1966, 7659)	0.08599060846098151
  (1966, 7135)	0.051887335773239444
  (1966, 6685)	0.0859

### **FINALIZING DATASETS FOR TRAINING AND TESTOMG**

In [120]:
y.head()


Unnamed: 0,Real
0,0
1,1
2,0
3,1
4,1


In [137]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = 0.1,random_state=0)

In [138]:
X_train=X_train.toarray()
X_test=X_test.toarray()

In [139]:
from sklearn.naive_bayes import GaussianNB
model = LogisticRegression()
model.fit(X_train,Y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression()

In [140]:
model.predict(X_test)

array([1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0],
      dtype=uint8)

In [141]:
model.score(X_test, Y_test)

0.7411167512690355