# Exporing the Data

In [43]:
import numpy as np
import pandas as pd

In [44]:
fake = pd.read_csv("Fake.csv")

In [45]:
true = pd.read_csv("True.csv")

In [46]:
fake.columns

Index(['title', 'text', 'subject', 'date'], dtype='object')

In [47]:
true.columns

Index(['title', 'text', 'subject', 'date'], dtype='object')

In [48]:
list = set()
for i in fake['date']:
    list.add(i[-4:])
    
print(list)

{'.jpg', 'ier/', 'deo]', 'eft/', '2015', '2017', '2016', 'b-18', 'pie/'}


In [49]:
fake['r-date'] = fake['date'].apply(lambda x:1 if str(x)[-4:]=="2017" else 0)

In [50]:
fake['r-date'].value_counts()

r-date
0    14278
1     9203
Name: count, dtype: int64

In [51]:
list = set()
for i in true['date']:
    list.add(i[-5:])
    
print(list)

{'2016 ', '2017 '}


In [52]:
true['r-date'] = true['date'].apply(lambda x:1 if str(x)[-5:-1]=="2017" else 0)

In [53]:
true['r-date'].value_counts()

r-date
1    16701
0     4716
Name: count, dtype: int64

In [54]:
fake.value_counts('subject')

subject
News               9050
politics           6841
left-news          4459
Government News    1570
US_News             783
Middle-east         778
Name: count, dtype: int64

In [55]:
true.value_counts('subject')

subject
politicsNews    11272
worldnews       10145
Name: count, dtype: int64

In [56]:
fake['category']=0
true['category']=1

In [57]:
df = pd.concat([fake, true]).reset_index(drop=True)

In [58]:
df.head()

Unnamed: 0,title,text,subject,date,r-date,category
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1,0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1,0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1,0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1,0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1,0


# Data Cleaning

In [59]:
df.isna().sum()*100/len(df)

title       0.0
text        0.0
subject     0.0
date        0.0
r-date      0.0
category    0.0
dtype: float64

In [60]:
df = df[['text', 'category']]

In [61]:
blank = []

for i,t in df["text"].items(): #.tems() give out index
    if t.isspace():
        blank.append(i)

len(blank)

631

In [62]:
blank

[10923,
 11041,
 11190,
 11225,
 11236,
 11241,
 11247,
 11249,
 11267,
 11268,
 11283,
 11284,
 11285,
 11289,
 11290,
 11292,
 11295,
 11296,
 11301,
 11303,
 11304,
 11305,
 11309,
 11314,
 11317,
 11319,
 11322,
 11330,
 11334,
 11335,
 11340,
 11343,
 11348,
 11351,
 11352,
 11357,
 11371,
 11373,
 11374,
 11382,
 11397,
 11402,
 11403,
 11409,
 11410,
 11412,
 11415,
 11419,
 11421,
 11427,
 11431,
 11432,
 11440,
 11448,
 11450,
 11453,
 11462,
 11464,
 11465,
 11472,
 11473,
 11475,
 11478,
 11489,
 11491,
 11493,
 11494,
 11501,
 11505,
 11507,
 11520,
 11531,
 11532,
 11533,
 11538,
 11542,
 11547,
 11549,
 11551,
 11559,
 11563,
 11581,
 11585,
 11589,
 11590,
 11614,
 11624,
 11625,
 11627,
 11631,
 11636,
 11637,
 11643,
 11650,
 11658,
 11661,
 11672,
 11679,
 11681,
 11684,
 11686,
 11688,
 11692,
 11708,
 11718,
 11729,
 11739,
 11753,
 11765,
 11768,
 11777,
 11782,
 11786,
 11788,
 11792,
 11793,
 11803,
 11806,
 11813,
 11821,
 11831,
 11832,
 11841,
 11844,
 11848,


In [63]:
df.drop(blank, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(blank, inplace=True)


In [64]:
df.shape

(44267, 2)

In [65]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
import re
nlp = spacy.load("en_core_web_sm")

In [66]:
lemma = WordNetLemmatizer()

In [67]:
# spaCy stopwords
list1 = nlp.Defaults.stop_words
print(len(list1))

# NLTK stopwords
list2 = stopwords.words('english')
print(len(list2))

# union of both sets
Stopwords = set((set(list1) | set(list2)))
print(len(Stopwords))

326
198
401


In [68]:
def clean_text(text):
    string = ""

    text = text.lower()
    text=re.sub(r"i'm","i am",text)
    text=re.sub(r"he's","he is",text)
    text=re.sub(r"she's","she is",text)
    text=re.sub(r"that's","that is",text)
    text=re.sub(r"what's","what is",text)
    text=re.sub(r"where's","where is",text)
    text=re.sub(r"\'ll"," will",text)
    text=re.sub(r"\'ve"," have",text)
    text=re.sub(r"\'re"," are",text)
    text=re.sub(r"\'d"," would",text)
    text=re.sub(r"won't","will not",text)
    text=re.sub(r"can't","cannot",text)
    
    #removing any special character
    text=re.sub(r"[-()\"#!@$%^&*{}?.,:]"," ",text)
    text=re.sub(r"\s+"," ",text)
    text=re.sub('[^A-Za-z0-9]+',' ', text)
    
    for word in text.split():
        if word not in Stopwords:
            string+=lemma.lemmatize(word)+" "
    
    return string

In [71]:
df['text'] = df['text'].apply(clean_text)

# Feature-Extraction & Model building

In [74]:
from sklearn.model_selection import train_test_split


X=df["text"] #feature 
y=df["category"] # traget

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [75]:
#importing libraries to build a pipline
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [76]:
#this pipe line will take the text and vectorise it , and then TF-IDF, then fitting the model

text_clf=Pipeline([("tfidf",TfidfVectorizer()),("clf",LinearSVC())])
text_clf.fit(X_train,y_train)

In [77]:
#making prediction using the model
predictions=text_clf.predict(X_test)

In [78]:
from sklearn import metrics
print(metrics.classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      7562
           1       0.99      0.99      0.99      7047

    accuracy                           0.99     14609
   macro avg       0.99      0.99      0.99     14609
weighted avg       0.99      0.99      0.99     14609



In [79]:

#overall acuracy
print(metrics.accuracy_score(y_test,predictions))

0.9930864535560271


In [80]:
#confusion matrix
print(metrics.confusion_matrix(y_test,predictions))

[[7512   50]
 [  51 6996]]


In [81]:
df.to_csv("Cleaned_news_data", index = False)

# downloading model

In [82]:
import joblib

# Save model
joblib.dump(text_clf, "fake_news_model.pkl")


['fake_news_model.pkl']