In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
import spacy
import nltk
from nltk.tokenize import word_tokenize
import string
import pickle
from sklearn import model_selection
from sklearn.metrics import confusion_matrix

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/marcuscorreia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# load data
news = pd.read_csv("Data/final_news.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
news.dropna(inplace=True)

In [5]:
news["corpus"] = news["Article_title"] +" " +news["Article_body"]

In [6]:
news.head(2)

Unnamed: 0.1,Unnamed: 0,Article_title,Article_body,Title_length,Body_length,Target,Number_of_tweets,corpus,corpus_length_original
0,4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,9.0,317.0,0.0,0.0,The Battle of New York: Why This Primary Matte...,205.0
1,13,"Trump takes on Cruz, but lightly","Killing Obama administration rules, dismantlin...",6.0,17.0,0.0,0.0,"Trump takes on Cruz, but lightly Killing Obama...",16.0


In [7]:
x = news["corpus"]
y = news["Target"]

In [8]:
# split data into train/test
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state=42)

In [9]:
#Tfidif vectorizer
vectorizor = TfidfVectorizer(stop_words = "english")

In [10]:
# fit and transform to dataset
vec_x_train = vectorizor.fit_transform(x_train)

In [11]:
vectorizor.transform(x_test)

<6107x86996 sparse matrix of type '<class 'numpy.float64'>'
	with 937229 stored elements in Compressed Sparse Row format>

In [12]:
vectorizor.get_feature_names_out()

array(['00', '000', '0000', ..., 'zzn3bqnfsk', 'zzucqevt3m',
       'zzzzzzzzzzzzz'], dtype=object)

In [13]:
vec_x_test = vectorizor.transform(x_test)

In [14]:
vec_x_test = vectorizor.transform(x_test)

In [15]:
# passagressive classiifer
PassAgg = PassiveAggressiveClassifier()

In [16]:
PassAgg.fit(vec_x_train,y_train)

PassiveAggressiveClassifier()

In [17]:
pickle.dump(vectorizor, open("Model_files/Vectorisors/pa_vec.pkl", 'wb'))

In [18]:
pickle.dump(PassAgg, open("Model_files/Models/pa_model.pkl", 'wb'))

In [19]:
# remove non ascii characters
def remove_non_ascii(word):
    new_word = word.encode("ascii","ignore").decode()
    return new_word

In [20]:
def clean_input(input):
    # remove punctuation and stop
    # load language model
    sp = spacy.load('en_core_web_sm')
    # import stop words
    all_stopwords = sp.Defaults.stop_words
    # import puncuation
    punc  = string.punctuation

    # replace any weird characters
    text = input
    remove_non_ascii(text)

    # remove puncuation
    new_text = text.translate(str.maketrans('', '', punc))

    # tokenize and make everything lower case
    text_tokens = word_tokenize(new_text.lower())

    # remove stop words
    tokens_without_sw= [word for word in text_tokens if not word in all_stopwords]

    # make list a string again
    tokens_without_sw = " ".join(tokens_without_sw)
    tokens_without_sw
    
    return tokens_without_sw

In [21]:
# all are from abc news
# # this predicts 0
# test_string = """WA health authorities urge anyone with COVID symptoms to get tested and isolate, as state records seven new cases"""

# # this predicts 0
# test_string = """Passenger killed in fiery crash in Adelaide's north, police investigate shooting between cars"""

# # this predicts 0
# test_string = """Environmentalists vow to fight latest Kimberley fracking proposal to unearth Australia's largest oil resource"""

## this predicts 1
# test_string = """Hulk Sad! Tatiana Maslany Says She Is Not Playing She-Hulk""" 

## this predicts 1
test_string = """How Facebook Fact-checking Can Backfire""" 



In [22]:
test_string= clean_input(test_string)

In [23]:
# test on the test data (this is testing on multiple article titles at once)
pa_vec_m = vectorizor.transform(x_test.tolist())

# test on random article title
pa_vec = vectorizor.transform([test_string])

In [24]:
# predicted values of each test article
predictions_m = PassAgg.predict(pa_vec_m)

# predicted values of random test article
predictions = PassAgg.predict(pa_vec)[0]

In [25]:
# print out result
print(predictions_m)
# print out result
predictions

[1. 1. 0. ... 0. 0. 0.]


1.0

In [26]:
z = pd.DataFrame({"Predictions":predictions,"Actual":y_test})
z

Unnamed: 0,Predictions,Actual
2752685,1.0,1.0
1837898,1.0,1.0
6120,1.0,0.0
1347,1.0,0.0
5853,1.0,0.0
...,...,...
1101,1.0,1.0
11436,1.0,0.0
6946,1.0,0.0
4819,1.0,0.0


In [27]:
print(f"Training Data Score: {PassAgg.score(vec_x_train, y_train)}")
print(f"Testing Data Score: {PassAgg.score(vec_x_test, y_test)}")

Training Data Score: 0.9999454088874331
Testing Data Score: 0.9800229245128541


In [28]:
# show how many are not correct
z[z.Predictions != z.Actual].shape

(2827, 2)

In [29]:
#load vectoriser
PA_vec = pickle.load( open("Model_files/Vectorisors/pa_vec.pkl","rb"))
# load model
PA_model = pickle.load(open("Model_files/Models/pa_model.pkl", 'rb'))

In [30]:
# using accuracy
kfold = model_selection.KFold(n_splits=10, random_state=7, shuffle=True)
scoring = 'accuracy'
results = model_selection.cross_val_score(PA_model, vec_x_train, y_train, cv=kfold, scoring=scoring)
print("Accuracy: %.3f (%.3f)" % (results.mean(), results.std()))

Accuracy: 0.978 (0.002)


In [31]:
# using roc
kfold = model_selection.KFold(n_splits=10, random_state=7, shuffle=True)
scoring = 'roc_auc'
results = model_selection.cross_val_score(PA_model, vec_x_train, y_train, cv=kfold, scoring=scoring)
print("Accuracy: %.3f (%.3f)" % (results.mean(), results.std()))

Accuracy: 0.997 (0.001)


In [32]:
# using confusion matrix
predicted2 = PA_model.predict(vec_x_test)
matrix = confusion_matrix(y_test, predicted2)
print(matrix)

[[2763   64]
 [  58 3222]]


In [33]:
#Top 20 real
sorted(zip(PassAgg.coef_[0], vectorizor.get_feature_names_out()), reverse=True)[:20]

[(7.459185163939489, 'image'),
 (7.290170214188319, 'read'),
 (7.065871520935106, 'featured'),
 (5.392496212935144, 'video'),
 (4.945229997967413, 'breaking'),
 (4.855498986971652, 'getty'),
 (4.833984447677141, 'just'),
 (4.373593448718592, 'hillary'),
 (3.8856697737923125, 'images'),
 (3.5254410632040485, 'watch'),
 (3.492633504070863, 'mr'),
 (3.4286860403475035, 'wire'),
 (3.2431339847620255, '21wire'),
 (3.172695271654086, '21st'),
 (3.119365646609558, 'https'),
 (3.115285504805976, 'com'),
 (3.111569573216369, 'ipsos'),
 (3.0271870753516343, 'flickr'),
 (2.847383434776823, 'breitbart'),
 (2.7359047431131667, 'pic')]

In [34]:
#Top 20 fake
sorted(zip(PassAgg.coef_[0], vectorizor.get_feature_names_out()))[:20]

[(-16.73163659576913, 'reuters'),
 (-8.673239702306262, 'said'),
 (-4.4951194945299955, 'thursday'),
 (-3.5080416559632783, 'monday'),
 (-3.1740235708324365, 'tuesday'),
 (-2.798598706079005, 'cnn'),
 (-2.6622009312835697, 'killed'),
 (-2.5654111971960702, 'friday'),
 (-2.5229590528674577, 'wednesday'),
 (-2.5184263762415657, 'showed'),
 (-2.4965484133823783, 'bucket'),
 (-2.4127161498755796, 'washington'),
 (-2.404960984997805, 'saying'),
 (-2.343541850884852, 'citing'),
 (-2.2603181475215046, 'london'),
 (-2.1756301453249596, 'nov'),
 (-2.157356517031424, 'factbox'),
 (-2.0969234287729828, 'general'),
 (-2.0942197237511038, 'expressed'),
 (-2.0754041492949336, 'trade')]