In [94]:
import numpy as np
import pandas as pd
import re  # regular expression lib
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [95]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [96]:
# printing the stopwords
print(stopwords.words ('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

**Data Pre-Processing**

Encoding is used to specify how text data is represented as bytes in a file. It determines the mapping between the characters in a text file and their binary representations (sequences of 0s and 1s).

In [97]:
# load the data to a pandas dataframe
news_data = pd.read_csv('/content/FA-KES-Dataset.csv', encoding='ISO-8859-1')

In [98]:
news_data.head()

Unnamed: 0,unit_id,article_title,article_content,source,date,location,labels
0,1914947530,Syria attack symptoms consistent with nerve ag...,Wed 05 Apr 2017 Syria attack symptoms consiste...,nna,4/5/2017,idlib,0
1,1914947532,Homs governor says U.S. attack caused deaths b...,Fri 07 Apr 2017 at 0914 Homs governor says U.S...,nna,4/7/2017,homs,0
2,1914947533,Death toll from Aleppo bomb attack at least 112,Sun 16 Apr 2017 Death toll from Aleppo bomb at...,nna,4/16/2017,aleppo,0
3,1914947534,Aleppo bomb blast kills six Syrian state TV,Wed 19 Apr 2017 Aleppo bomb blast kills six Sy...,nna,4/19/2017,aleppo,0
4,1914947535,29 Syria Rebels Dead in Fighting for Key Alepp...,Sun 10 Jul 2016 29 Syria Rebels Dead in Fighti...,nna,7/10/2016,aleppo,0


In [99]:
#O --> Real News
#1 - > Fake News

news_data.tail()

Unnamed: 0,unit_id,article_title,article_content,source,date,location,labels
799,1965511221,Turkish Bombardment Kills 20 Civilians in Syria,28-08-2016 Turkish Bombardment Kills 20 Civili...,manar,8/28/2016,aleppo,1
800,1965511222,Martyrs as Terrorists Shell Aleppos Salah Eddin,17-08-2016 Martyrs as Terrorists Shell Aleppos...,manar,8/1/2016,aleppo,1
801,1965511224,Chemical Attack Kills Five Syrians in Aleppo SANA,03-08-2016 Chemical Attack Kills Five Syrians ...,manar,8/3/2016,aleppo,0
802,1965511226,5 Killed as Russian Military Chopper Shot down...,01-08-2016 5 Killed as Russian Military Choppe...,manar,8/1/2016,idlib,1
803,1965511231,Syrian Army Kills 48 ISIL Terrorists in Deir E...,April 6 2017 Syrian Army Kills 48 ISIL Terrori...,manar,4/4/2017,deir ezzor,1


In [100]:
news_data.shape

(804, 7)

In [101]:
# checking for missing values
news_data.isnull().sum()

unit_id            0
article_title      0
article_content    0
source             0
date               0
location           0
labels             0
dtype: int64

In [102]:
# replacing the missing values with null string
# news_data = news_data.fillna(''); incase there are missing values in text datasets

In [103]:
# merging the article_title & article_content
news_data['content'] = news_data['article_title']+' '+news_data['article_content']

In [104]:
news_data.head()

Unnamed: 0,unit_id,article_title,article_content,source,date,location,labels,content
0,1914947530,Syria attack symptoms consistent with nerve ag...,Wed 05 Apr 2017 Syria attack symptoms consiste...,nna,4/5/2017,idlib,0,Syria attack symptoms consistent with nerve ag...
1,1914947532,Homs governor says U.S. attack caused deaths b...,Fri 07 Apr 2017 at 0914 Homs governor says U.S...,nna,4/7/2017,homs,0,Homs governor says U.S. attack caused deaths b...
2,1914947533,Death toll from Aleppo bomb attack at least 112,Sun 16 Apr 2017 Death toll from Aleppo bomb at...,nna,4/16/2017,aleppo,0,Death toll from Aleppo bomb attack at least 11...
3,1914947534,Aleppo bomb blast kills six Syrian state TV,Wed 19 Apr 2017 Aleppo bomb blast kills six Sy...,nna,4/19/2017,aleppo,0,Aleppo bomb blast kills six Syrian state TV We...
4,1914947535,29 Syria Rebels Dead in Fighting for Key Alepp...,Sun 10 Jul 2016 29 Syria Rebels Dead in Fighti...,nna,7/10/2016,aleppo,0,29 Syria Rebels Dead in Fighting for Key Alepp...


In [105]:
# separating feature and target
X = news_data.drop (columns='labels', axis =1)
Y = news_data[ 'labels']

In [106]:
print(X)

        unit_id                                      article_title  \
0    1914947530  Syria attack symptoms consistent with nerve ag...   
1    1914947532  Homs governor says U.S. attack caused deaths b...   
2    1914947533    Death toll from Aleppo bomb attack at least 112   
3    1914947534        Aleppo bomb blast kills six Syrian state TV   
4    1914947535  29 Syria Rebels Dead in Fighting for Key Alepp...   
..          ...                                                ...   
799  1965511221    Turkish Bombardment Kills 20 Civilians in Syria   
800  1965511222    Martyrs as Terrorists Shell Aleppos Salah Eddin   
801  1965511224  Chemical Attack Kills Five Syrians in Aleppo SANA   
802  1965511226  5 Killed as Russian Military Chopper Shot down...   
803  1965511231  Syrian Army Kills 48 ISIL Terrorists in Deir E...   

                                       article_content source       date  \
0    Wed 05 Apr 2017 Syria attack symptoms consiste...    nna   4/5/2017   
1    Fr

In [107]:
print(Y)

0      0
1      0
2      0
3      0
4      0
      ..
799    1
800    1
801    0
802    1
803    1
Name: labels, Length: 804, dtype: int64


**Stemming:**   
Stemming is the process of reducing a word to its root word

In [108]:
port_stem = PorterStemmer ()

In [109]:
def stemming(content):
  stemmed_content = re.sub('[^a-zA-Z]',' ',content) # represents all words withut symbols etc..
  stemmed_content = stemmed_content.lower () # converts to lowercase
  stemmed_content = stemmed_content.split() # for splitting
  stemmed_content = ' '.join(stemmed_content) # removes commas

 # captures words not present in "stopwords" & applying if needed
  stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words ('english')]
  return stemmed_content

In [110]:
news_data['content'] = news_data['content'].apply(stemming)

In [111]:
print(news_data['content'])

0      [r,  , c, k,  , p,  , c, n, e, n,  , w, h,  , ...
1      [h,  , g, v, e, r, n, r,  ,  , u,  ,  , c, k, ...
2      [e, h,  , l, l,  , f, r,  , l, e, p, p,  , b, ...
3      [l, e, p, p,  , b, b,  , b, l,  , k, l, l,  , ...
4      [r,  , r, e, b, e, l,  , e,  , n,  , f, g, h, ...
                             ...                        
799    [u, r, k, h,  , b, b, r, e, n,  , k, l, l,  , ...
800    [r, r,  ,  , e, r, r, r,  , h, e, l, l,  , l, ...
801    [c, h, e, c, l,  , c, k,  , k, l, l,  , f, v, ...
802    [k, l, l, e,  ,  , r, u, n,  , l, r,  , c, h, ...
803    [r, n,  , r,  , k, l, l,  , l,  , e, r, r, r, ...
Name: content, Length: 804, dtype: object


In [112]:
X = news_data ['content'].values
Y = news_data ['labels'].values

In [113]:
print(X)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [114]:
print(Y)

[0 0 0 0 0 0 0 0 1 0 1 1 1 1 0 0 1 1 0 0 0 0 1 1 1 1 1 0 0 1 0 0 1 0 0 1 1
 1 1 1 1 1 1 0 1 1 1 1 1 1 0 0 1 1 0 0 1 0 1 1 1 1 0 1 1 1 0 0 0 1 0 1 0 0
 1 0 1 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 1 0 1 1 1 1 1 0 1 0 0 1 1 1 0 0 1 1
 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 0 0 1 0 1 0 1 0 1 1 1 1 1 0 1 1 1 0 1 0 0
 1 1 0 0 0 1 1 1 1 0 1 1 1 1 1 0 0 1 1 0 1 0 1 0 0 0 0 1 1 0 0 1 0 1 1 0 0
 1 0 1 0 0 0 0 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 1 1 1 1 1 0 0 0 1 1 0 1 0 1
 0 0 0 0 0 1 0 0 0 0 1 1 0 1 1 1 0 1 1 0 0 0 1 1 0 1 1 0 0 1 1 0 1 1 0 1 1
 0 1 1 0 0 1 1 1 1 0 0 0 0 1 1 0 1 0 0 0 1 0 0 1 0 0 1 1 1 1 1 1 1 1 1 1 0
 1 0 0 0 1 1 1 0 1 1 1 1 1 0 0 1 0 1 1 0 1 0 0 0 1 0 1 1 1 1 1 0 0 0 0 1 1
 0 1 1 1 0 0 1 0 0 1 1 1 1 0 1 1 0 0 0 1 0 0 0 0 0 1 0 1 0 1 0 0 1 0 0 0 1
 1 0 1 0 1 1 0 0 0 0 1 0 1 0 1 1 1 1 1 0 1 1 1 1 0 1 0 0 1 0 0 0 0 1 1 0 1
 1 1 0 1 1 0 0 1 1 0 0 0 1 1 1 0 0 0 1 1 1 0 0 0 0 1 1 1 1 1 0 0 1 1 0 1 0
 0 0 0 1 1 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 1 1 1 1 0 0 1 0 0 1 1 1 1 1 0
 0 1 0 0 1 1 0 0 0 1 0 1 

In [115]:
Y.shape

(804,)

In [116]:
# converting the textual data to feature vectors
vectorizer = TfidfVectorizer ()
vectorizer.fit (X)
X = vectorizer.transform(X)

AttributeError: ignored

In [None]:
print(X)