In [1]:
##PAC classifier code from Palvi's latest PAC model

from newspaper import Article
import time
import pandas as pd
import numpy as np
import itertools
import csv

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import re
import nltk
import string
pd.options.mode.chained_assignment = None

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
#stopwords = set(stopwords.words("english"))

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/leeannewalker/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/leeannewalker/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## PRE-PROCESSING START POINT

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv("test.csv")
train.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [3]:
test.head()

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


### Standardisation, Punctuation Removal, Remove Stopwords

In [4]:
#data preprocessing
df = train[["text"]]
df["text"] = df["text"].astype(str)
df["text_lower"] = df["text"].str.lower()

PUNCT_TO_REMOVE = string.punctuation

def remove_punctuation(text):
    """custom function to remove the punctuation"""
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

df["text_clean"] = df["text_lower"].apply(lambda text: remove_punctuation(text))
del df["text_lower"]
del df['text']

In [5]:
# Do the same for the test dataset
df_test = test[["text"]]
df_test["text"] = df_test["text"].astype(str)
df_test["text_lower"] = df_test["text"].str.lower()

PUNCT_TO_REMOVE2 = string.punctuation

def remove_punctuation_test(text):
    """custom function to remove the punctuation"""
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE2))


df_test["text_test_clean"] = df_test["text_lower"].apply(lambda text: remove_punctuation_test(text))
del df_test["text_lower"]
del df_test['text']

In [6]:
df.shape

(20800, 1)

In [8]:
# SKIP THIS IF YOU HAVE MODEL IN DIRECTORY
#Porter Stemmer takes a long time
ps = PorterStemmer()
corpus_train = []
for i in range(0, len(df)):
    review_train = re.sub('[^a-zA-Z]', ' ', df['text_clean'][i])
    review_train = review_train.lower()
    review_train = review_train.split()
    review_train = [ps.stem(word) for word in review_train if not word in stopwords.words('english')]
    review_train = ' '.join(review_train)
    corpus_train.append(review_train)

In [7]:
#Store this when running for first time so you don't have to keep running PS. "-r" recovers saved thing
%store -r review_train

In [10]:
# SKIP THIS IF YOU HAVE MODEL IN DIRECTORY
#Porter Stemmer takes a long time
ps = PorterStemmer()
corpus_test = []
for i in range(0, len(df_test)):
    review_test = re.sub('[^a-zA-Z]', ' ', df_test['text_test_clean'][i])
    review_test = review_test.lower()
    review_test = review_test.split()
    review_test = [ps.stem(word) for word in review_test if not word in stopwords.words('english')]
    review_test = ' '.join(review_test)
    corpus_test.append(review_test)

In [8]:
#Store this when running for first time so you don't have to keep running PS. "-r" recovers saved thing
%store -r review_test

In [9]:
test=test.fillna(' ')
train=train.fillna(' ')
test['total'] = test['title'] + ' ' + test['author'] + df_test['text_test_clean']
train['total'] = train['title'] + ' ' + train['author'] + df['text_clean']

#tfidf
tfidf_vec = TfidfVectorizer(stop_words = 'english', max_df = 0.7)
#counts = count_vectorizer.fit_transform(train['total'].values)
tfidf_train = tfidf_vec.fit_transform(train['total'].values) 
#tfidf = tfidf_vec.fit_transform(tfidf_train)

In [10]:
targets = train['label'].values
#test_tfidf = tfidf_vec.transform(test['total'].values)

#split in samples
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(tfidf_train, targets, random_state=0)

## PASSIVE AGGRESSIVE CLASSIFIER START

In [11]:
%%time
import pickle
#Initialise a PassiveAggressiveClassifier
pac = PassiveAggressiveClassifier(max_iter=100)
pac.fit(X_train,y_train)
# save the model to disk
filename = 'PACmodelA.sav'
pickle.dump(pac, open(filename, 'wb'))

#loaded_model = pickle.load(open(filename, 'rb'))


CPU times: user 667 ms, sys: 27.7 ms, total: 695 ms
Wall time: 216 ms


In [13]:
#Predict on the test set and calculate accuracy
pacLoaded = pickle.load(open(filename, 'rb'))
y_pred = pacLoaded.predict(X_test)
score = accuracy_score(y_test,y_pred)
print(f'The Accuracy for the Passive Aggressive Classifier is: {round(score*100,2)}%')

The Accuracy for the Passive Aggressive Classifier is: 97.17%


In [14]:
targets = train['label'].values
pac = PassiveAggressiveClassifier(max_iter=100)
pac.fit(tfidf_train, targets)

example_tfidf = tfidf_vec.transform(test['total'].values)
predictions = pac.predict(example_tfidf)
pred=pd.DataFrame(predictions,columns=['label'])
pred['id']=test['id']
pred.groupby('label').count()

Unnamed: 0_level_0,id
label,Unnamed: 1_level_1
0,2606
1,2594


## DATA SCRAPING - INSERT YOUR OWN URL FROM THE WEB

In [63]:
#get user URL
url=input("Please enter your URL ")

Please enter your URL https://www.9news.com.au/national/fresh-leads-into-melissa-hunt-cold-case-murder-prompt-search-of-nsw-dam-hunter-region/33124c76-b80b-4bfa-9442-a6ec36798e83


In [64]:
#check the URL is correct
url

'https://www.9news.com.au/national/fresh-leads-into-melissa-hunt-cold-case-murder-prompt-search-of-nsw-dam-hunter-region/33124c76-b80b-4bfa-9442-a6ec36798e83'

In [65]:
#LOAD ARTICLE TO PD DATAFRAME

#To get just id, title, author, text in a pandas dataframe
articles_info = []
article_dict = {}
art = Article(url)
art.download()
art.parse()
#will need to change this if possibility of inputting mutiple URLS (separated by comma for example)
article_dict["id"] = "0"
article_dict["title"] = art.title

#create temporary author holder for manipulation (in the case of more than one author scraped)
temp_author = art.authors

#make author/s that are scraped into one string
a = []
new_a = ''
for items in temp_author:
    if len(temp_author[0][0]) > 1:
        for word in items:
            a.append(word)
    else:
        a.append(items)

#join all of the instances in variable a with a space inbetween
new_a = " ".join(a)      
article_dict["author"] = new_a

#IDEA: else get website address if NAN?

article_dict["text"] = art.text
articles_info.append(article_dict)
news_df = pd.DataFrame(data=articles_info)

In [66]:
#Check news dataframe output
news_df

Unnamed: 0,id,title,author,text
0,0,NSW News: Fresh leads into Melissa Hunt death ...,Apr,Police will conduct a secondary search of a da...


### Pre-processing the news data

In [67]:
#data preprocessing
# Do the same for the test dataset
news_test = news_df[["text"]]
news_test["text"] = news_test["text"].astype(str)
news_test["text_lower"] = news_test["text"].str.lower()

PUNCT_TO_REMOVE2 = string.punctuation

def remove_punctuation_test(text):
    """custom function to remove the punctuation"""
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE2))


news_test["text_test_clean"] = news_test["text_lower"].apply(lambda text: remove_punctuation_test(text))
del news_test["text_lower"]
del news_test['text']

In [68]:
ps = PorterStemmer()
corpus_test_news = []
for i in range(0, len(news_df)):
    review_news = re.sub('[^a-zA-Z]', ' ', news_test['text_test_clean'][i])
    review_news = review_news.lower()
    review_news = review_news.split()
    review_news = [ps.stem(word) for word in review_news if not word in stopwords.words('english')]
    review_news = ' '.join(review_news)
    corpus_test_news.append(review_news)

#turn this into a string so that it can be concatenated below    
corpus_test_news = "".join(corpus_test_news)

In [69]:
news_df=news_df.fillna(' ')

#create the total column with all information in it
news_df['total'] = news_df['title'] + ' '+ news_df['author']+ news_test['text_test_clean']

In [70]:
#perform tfidf on the news article data
news_test_tfidf = tfidf_vec.transform(news_df['total'].values)

In [71]:
news_df.total

0    NSW News: Fresh leads into Melissa Hunt death ...
Name: total, dtype: object

In [72]:
news_test_tfidf

<1x213331 sparse matrix of type '<class 'numpy.float64'>'
	with 144 stored elements in Compressed Sparse Row format>

In [73]:
#Predict on the news set
news_predictions = pac.predict(news_test_tfidf)

In [74]:
#assign and output the predictions
news_pred=pd.DataFrame(news_predictions,columns=['label'])
news_pred['id']=news_df['id']
news_pred.groupby('label').count()

Unnamed: 0_level_0,id
label,Unnamed: 1_level_1
0,1


In [75]:
news_pred

Unnamed: 0,label,id
0,0,0


In [76]:
#convert value of prediction into integer to use below
final_prediction = int(news_pred.label)

In [77]:
#If you forget which is fake or real, execute this cell

if (final_prediction == 1):
    print('This is fake')
else:
    print('This is true..')

This is true..
