## Import Dependencies

In [1]:
import pandas as pd
import re
import nltk
import joblib
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
## import models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

## Load Dataset

In [2]:
# Read True.csv (path relative to the notebook's working directory) and preview the first rows.
true_df = pd.read_csv("True.csv")
true_df.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [3]:
# Read Fake.csv (path relative to the notebook's working directory) and preview the first rows.
fake_df = pd.read_csv("Fake.csv")
fake_df.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


## Label True data as 1 and Fake as 0

In [4]:
# Add a constant "label" column: every row loaded from True.csv gets the value 1.
true_df["label"] = 1
true_df.head()

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [5]:
# Add a constant "label" column: every row loaded from Fake.csv gets the value 0.
fake_df["label"] = 0
fake_df.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


## Combine true and fake data

In [6]:
# Stack the two labelled frames row-wise. Each frame keeps its own row index here,
# so index values are not unique in the combined frame.
df = pd.concat([true_df,fake_df],axis = 0)

In [7]:
# (rows, columns) of the combined frame.
df.shape

(44898, 5)

## Remove non-important columns
Only the title is used in this model, so the remaining columns are removed.
The text column could also be used, but it would make model training very slow.

In [8]:
# Keep only the headline and its label. The result is reassigned instead of
# dropping in place, and errors="ignore" skips columns that are already gone,
# so re-running this cell does not raise a KeyError.
df = df.drop(columns=["text", "subject", "date"], errors="ignore")
df

Unnamed: 0,title,label
0,"As U.S. budget fight looms, Republicans flip t...",1
1,U.S. military to accept transgender recruits o...,1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,1
3,FBI Russia probe helped by Australian diplomat...,1
4,Trump wants Postal Service to charge 'much mor...,1
...,...,...
23476,McPain: John McCain Furious That Iran Treated ...,0
23477,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,0
23478,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,0
23479,How to Blow $700 Million: Al Jazeera America F...,0


In [9]:
# Count missing values per column.
df.isnull().sum()

title    0
label    0
dtype: int64

## Shuffle the data

In [10]:
# Shuffle every row (frac=1 samples the whole frame without replacement).
# A fixed random_state gives the same row order on every Restart & Run All,
# which keeps the downstream split and scores reproducible.
df = df.sample(frac=1, random_state=1)
df

Unnamed: 0,title,label
16820,TEACHER’S UNION PROTEST Gets Ugly As Protester...,0
7375,Fractured Republicans come together in face of...,1
2533,Hypocrite Republicans Refuse To Investigate F...,0
2918,WATCH: Ashley Judd Takes Trump Down At Women’...,0
1726,Factbox: Trump on Twitter (September 14) - DAC...,1
...,...,...
17272,FLAG DAY 2015: 10 FACTS TO KNOW ABOUT THE STAR...,0
9044,Trump announces new hires to expand campaign o...,1
17723,One of Liberia's main parties calls for halt t...,1
8358,Lewis Black SLAMS Cruz: He’s So Creepy That N...,0


In [11]:
# Replace the shuffled index with a fresh 0..n-1 range; the previous index
# values are moved into a new "index" column.
df = df.reset_index()
df

Unnamed: 0,index,title,label
0,16820,TEACHER’S UNION PROTEST Gets Ugly As Protester...,0
1,7375,Fractured Republicans come together in face of...,1
2,2533,Hypocrite Republicans Refuse To Investigate F...,0
3,2918,WATCH: Ashley Judd Takes Trump Down At Women’...,0
4,1726,Factbox: Trump on Twitter (September 14) - DAC...,1
...,...,...,...
44893,17272,FLAG DAY 2015: 10 FACTS TO KNOW ABOUT THE STAR...,0
44894,9044,Trump announces new hires to expand campaign o...,1
44895,17723,One of Liberia's main parties calls for halt t...,1
44896,8358,Lewis Black SLAMS Cruz: He’s So Creepy That N...,0


In [12]:
# Discard the "index" column that holds the pre-shuffle row numbers.
df = df.drop(columns=["index"])
df

Unnamed: 0,title,label
0,TEACHER’S UNION PROTEST Gets Ugly As Protester...,0
1,Fractured Republicans come together in face of...,1
2,Hypocrite Republicans Refuse To Investigate F...,0
3,WATCH: Ashley Judd Takes Trump Down At Women’...,0
4,Factbox: Trump on Twitter (September 14) - DAC...,1
...,...,...
44893,FLAG DAY 2015: 10 FACTS TO KNOW ABOUT THE STAR...,0
44894,Trump announces new hires to expand campaign o...,1
44895,One of Liberia's main parties calls for halt t...,1
44896,Lewis Black SLAMS Cruz: He’s So Creepy That N...,0


## Download stopwords to remove them from the data

In [13]:
# Download the NLTK "stopwords" corpus so stopwords.words() can be used below.
nltk.download("stopwords")
# Print the English stopword list.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Observeaiw14\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Use stemming to reduce words to their base form

In [14]:
# Shared Porter stemmer instance used by the stemming() helper.
port_stem = PorterStemmer()

In [15]:
# Build the stopword lookup once. Previously stopwords.words('english') was
# evaluated for every single word and searched as a list; a frozenset built
# a single time gives the same membership answers with a constant-time lookup.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def stemming(text):
    """Normalise a piece of text into a string of stemmed, stopword-free tokens.

    Steps: replace every non-ASCII-letter character with a space, lowercase,
    split on whitespace, drop English stopwords, stem each remaining word with
    the module-level ``port_stem`` and join the stems with single spaces.

    Parameters
    ----------
    text : str
        Raw text (here: a news headline).

    Returns
    -------
    str
        Space-separated stems; an empty string when no token survives.
    """
    # Keep letters only; digits, punctuation and non-ASCII characters become spaces.
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    tokens = letters_only.lower().split()
    stems = [port_stem.stem(token) for token in tokens if token not in ENGLISH_STOPWORDS]
    return " ".join(stems)

In [16]:
# Overwrite each headline with its cleaned, stemmed form.
df["title"] = df["title"].map(stemming)
df

Unnamed: 0,title,label
0,teacher union protest get ugli protest cop duk...,0
1,fractur republican come togeth face trump triumph,1
2,hypocrit republican refus investig flynn scandal,0
3,watch ashley judd take trump women march epic ...,0
4,factbox trump twitter septemb daca florida mex...,1
...,...,...
44893,flag day fact know star stripe,0
44894,trump announc new hire expand campaign oper,1
44895,one liberia main parti call halt elect result,1
44896,lewi black slam cruz creepi one notic bulli video,0


## Create X and Y 

In [17]:
# Features are the stemmed headlines; targets are the 0/1 labels.
X, Y = df["title"], df["label"]
print(X.shape, Y.shape)

(44898,) (44898,)


## Implement Train Test Split

In [18]:
# Hold out 20% of the rows for testing; stratifying on Y keeps the class
# proportions the same in both parts, and the seed fixes the split.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
    stratify=Y,
)

In [19]:
# Show the sizes of the four split parts on one line.
print(*(part.shape for part in (X_train, X_test, Y_train, Y_test)))

(35918,) (8980,) (35918,) (8980,)


## Apply TF-IDF vectorisation to convert words into numerical values

In [20]:
# TF-IDF vectorizer with default settings; it is reused later to transform new headlines.
Vectorizer = TfidfVectorizer()
# fit_transform learns the vocabulary and IDF weights from the training data and transforms it.
XV_train = Vectorizer.fit_transform(X_train)
# transform applies the same learned mapping to the test data without altering the vocabulary or IDF values.
XV_test = Vectorizer.transform(X_test)

## Train the models and check their scores
Models compared: LogisticRegression, DecisionTreeClassifier, RandomForestClassifier and GradientBoostingClassifier

In [21]:
# Logistic regression with default hyperparameters, fitted on the TF-IDF training features.
lr = LogisticRegression()
lr.fit(XV_train,Y_train)

In [22]:
# Predicted labels for the held-out test features.
predict_lr = lr.predict(XV_test)
predict_lr

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

In [23]:
# Mean accuracy of the logistic regression model on the test set.
lr.score(XV_test,Y_test)

0.9419821826280623

In [24]:
# Decision tree with default hyperparameters. random_state is fixed (as for the
# random forest and gradient boosting models) so the fitted tree and its score
# are reproducible across runs.
dt = DecisionTreeClassifier(random_state=0)
dt.fit(XV_train, Y_train)

In [25]:
# Predicted labels for the held-out test features.
predict_dt = dt.predict(XV_test)

In [26]:
# Mean accuracy of the decision tree on the test set.
dt.score(XV_test,Y_test)

0.9033407572383073

In [27]:
# Random forest with a fixed seed; this is the model used by predict_news_type below.
rec = RandomForestClassifier(random_state = 0)
rec.fit(XV_train,Y_train)

In [28]:
# Predicted labels for the held-out test features.
predict_rec = rec.predict(XV_test)

In [29]:
# Mean accuracy of the random forest on the test set.
rec.score(XV_test,Y_test)

0.9393095768374164

In [30]:
# Gradient boosting with a fixed seed, fitted on the TF-IDF training features.
gb =  GradientBoostingClassifier(random_state =0)
gb.fit(XV_train,Y_train)

In [31]:
# Predicted labels for the held-out test features.
predict_gb = gb.predict(XV_test)

In [32]:
# Mean accuracy of the gradient boosting model on the test set.
gb.score(XV_test,Y_test)

0.8510022271714922

## Observations : 
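In the recorded run, logistic regression achieves the highest test accuracy (0.942), closely followed by the random forest classifier (0.939); the decision tree (0.903) and gradient boosting (0.851) score lower. The random forest classifier is used as the final model in the prediction function below.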


## Make a function that takes a text and returns the prediction

In [33]:
def predict_news_type(text):
    """Classify a single headline as real or fake news.

    The headline is cleaned with ``stemming``, converted to TF-IDF features
    with the already-fitted ``Vectorizer`` and scored by the random forest
    model ``rec``. The raw prediction array is printed before the verdict
    is returned.

    Parameters
    ----------
    text : str
        Headline to classify.

    Returns
    -------
    str
        'The news is Real' when the predicted label is 1, otherwise
        'The news is Fake'.
    """
    # Wrap the headline in a one-element Series so it goes through the same
    # cleaning path as the training titles.
    cleaned = pd.Series([text], name="title").apply(stemming)
    features = Vectorizer.transform(cleaned)
    prediction = rec.predict(features)
    print(prediction)
    return 'The news is Real' if prediction[0] == 1 else 'The news is Fake'


In [43]:
# Try the classifier on a headline taken from the preview of Fake.csv.
print(predict_news_type("Pope Francis Just Called Out Donald Trump "))

[0]
The news is Fake


In [44]:
# Try the classifier on a headline that does not appear in the previews above.
print(predict_news_type("Gaza receives largest aid shipment since Israel-Hamas war began"))

[1]
The news is Real
