In [11]:
import numpy as np
import pandas as pd
# re - regular expressions (pattern matching and substitution on text)
import re
# stopwords - lists of very common words (articles, pronouns, etc.) to filter out
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# convert text into feature vector
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

In [12]:

# Load both CSV files; the paths are resolved relative to the working directory.
df = pd.read_csv('fakenews_train.csv')
df_test = pd.read_csv('fakenews_test.csv')

# Overview of the training frame, printed in this order:
# first five rows, (rows, columns), and the null count of every column.
for overview in (df.head(), df.shape, df.isnull().sum()):
    print(overview)

   id                                              title              author  \
0   0  House Dem Aide: We Didn’t Even See Comey’s Let...       Darrell Lucus   
1   1  FLYNN: Hillary Clinton, Big Woman on Campus - ...     Daniel J. Flynn   
2   2                  Why the Truth Might Get You Fired  Consortiumnews.com   
3   3  15 Civilians Killed In Single US Airstrike Hav...     Jessica Purkiss   
4   4  Iranian woman jailed for fictional unpublished...      Howard Portnoy   

                                                text  label  
0  House Dem Aide: We Didn’t Even See Comey’s Let...      1  
1  Ever get the feeling your life circles the rou...      0  
2  Why the Truth Might Get You Fired October 29, ...      1  
3  Videos 15 Civilians Killed In Single US Airstr...      1  
4  Print \nAn Iranian woman has been sentenced to...      1  
(20800, 5)
id           0
title      558
author    1957
text        39
label        0
dtype: int64


In [13]:
# Replace missing values with empty strings (in place) so that the string
# concatenation below never meets NaN, then build one "content" string per
# row as "<author> <title>".
for frame in (df, df_test):
    frame.fillna("", inplace=True)
    frame["content"] = frame["author"] + " " + frame["title"]

# n: the label column; m: the training frame without the label column.
n = df["label"]
m = df.drop("label", axis=1)
# Plain assignment: m_test is the same object as df_test, not a copy, so
# later writes through m_test are visible through df_test as well.
m_test = df_test

In [14]:
# Stemming - reducing a word to its root form. The Porter algorithm works by
# stripping suffixes (e.g. "running" -> "run"); it does not remove prefixes.
port_stem = PorterStemmer()

# English stop-word list. NLTK raises LookupError when the "stopwords" corpus
# has not been downloaded on this machine, which would break a fresh
# "Restart & Run All"; in that case fetch the corpus once and retry.
try:
    words = stopwords.words('english')
except LookupError:
    import nltk
    nltk.download('stopwords')
    words = stopwords.words('english')

def stemming(content, stemmer=None, stop_words=None):
    """Normalise a text string into space-joined stems of its non-stop-words.

    Every character outside ``a-z`` / ``A-Z`` (digits, punctuation, and also
    non-ASCII letters) is replaced by a space. The text is then lower-cased
    and split on whitespace, tokens found in ``stop_words`` are dropped, and
    each remaining token is passed through ``stemmer.stem``.

    Parameters
    ----------
    content : str
        Raw text to normalise.
    stemmer : object with a ``stem(word)`` method, optional
        Defaults to the module-level ``port_stem``.
    stop_words : container of str, optional
        Tokens to discard (compared after lower-casing). Defaults to the
        module-level ``words``.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces; an empty string when no
        token survives.
    """
    # Resolve the defaults at call time so the one-argument call form keeps
    # using the notebook-level stemmer and stop-word list.
    if stemmer is None:
        stemmer = port_stem
    if stop_words is None:
        stop_words = words

    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    tokens = letters_only.lower().split()
    return ' '.join(stemmer.stem(token) for token in tokens if token not in stop_words)

# Clean the "content" column of both frames with stemming(). m_test was
# bound to df_test by plain assignment earlier, so this also rewrites
# df_test["content"].
for frame in (m, m_test):
    frame["content"] = frame["content"].apply(stemming)

# From here on only the cleaned text is kept, as plain arrays.
m, m_test = m["content"].values, m_test["content"].values
y = df["label"].values

# Vectorizing - turning text into numeric features.
# TF = term frequency (how often a term occurs in a document),
# IDF = inverse document frequency (down-weights terms common to many documents).
# NOTE(review): the vectorizer is fitted on every row of `m`, i.e. before the
# train/test split performed in a later cell, so rows that end up in the
# evaluation split also contribute to the vocabulary and IDF weights.
vectorizer = TfidfVectorizer()
m = vectorizer.fit(m).transform(m)
m_test = vectorizer.transform(m_test)

In [15]:
# The unpacking below rebinds `m_test`, which until now held the vectorised
# features of fakenews_test.csv. Keep them reachable under their own name so
# they are not silently discarded.
# NOTE: re-running this cell on its own would alias the evaluation split
# instead; re-run the vectorising cell first.
m_test_file = m_test

# 80/20 split of the labelled data, stratified on the label, with a fixed seed.
m_train,m_test,n_train,n_test = train_test_split(m,n,train_size=0.8,stratify=n,random_state=0)

In [16]:
# Logistic regression with default settings, fitted on the training split
# (fit() returns the estimator itself, so the call can be chained).
log_reg = LogisticRegression().fit(m_train,n_train)

# Predicted labels for the training split and for the held-out split.
n_train_pred, n_test_pred = map(log_reg.predict, (m_train, m_test))

In [17]:
# Fraction of correctly predicted labels on each split.
acc_train = accuracy_score(n_train,n_train_pred)
acc_test = accuracy_score(n_test,n_test_pred)

for caption, score in (("Training Accuracy:", acc_train), ("Testing Accuracy:", acc_test)):
    print(caption, score)

Training Accuracy: 0.9866586538461538
Testing Accuracy: 0.9774038461538461


In [18]:
# Classify a single example: the first row of m_test.
ip = m_test[0]
pred = log_reg.predict(ip)

# Label 0 is reported as "Breaking News!!", any other label as "FAKE NEWS!!".
print("Breaking News!!" if pred == 0 else "FAKE NEWS!!")

FAKE NEWS!!
