In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
import nltk
from nltk.tokenize import word_tokenize
import string
import pickle
from sklearn import model_selection
from sklearn.metrics import confusion_matrix

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/marcuscorreia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# load data
news = pd.read_csv("Data/final_news.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
news.dropna(inplace=True)

In [5]:
news["corpus"] = news["Article_title"] +" " +news["Article_body"]

In [6]:
news.head(2)

Unnamed: 0.1,Unnamed: 0,Article_title,Article_body,Title_length,Body_length,Target,Number_of_tweets,corpus,corpus_length_original
0,4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,9.0,317.0,0.0,0.0,The Battle of New York: Why This Primary Matte...,205.0
1,13,"Trump takes on Cruz, but lightly","Killing Obama administration rules, dismantlin...",6.0,17.0,0.0,0.0,"Trump takes on Cruz, but lightly Killing Obama...",16.0


In [7]:
x = news["corpus"]
y = news["Target"]

In [8]:
# split data into train/test
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.4,random_state=42)

In [9]:
#Tfidif vectorizer
vectorizor = TfidfVectorizer(stop_words = "english")

In [10]:
# fit and transform to dataset
vec_x_train = vectorizor.fit_transform(x_train)

In [11]:
vectorizor.transform(x_test)

<9770x77705 sparse matrix of type '<class 'numpy.float64'>'
	with 1495883 stored elements in Compressed Sparse Row format>

In [12]:
vectorizor.get_feature_names_out()

array(['00', '000', '0000', ..., 'zzn3bqnfsk', 'zzucqevt3m',
       'zzzzzzzzzzzzz'], dtype=object)

In [13]:
vec_x_test = vectorizor.transform(x_test)

In [14]:
vec_x_test = vectorizor.transform(x_test)

In [15]:
# create a logistic regression classifier
log_reg_Classifier = LogisticRegression()

In [16]:
# fit the data
log_reg_Classifier.fit(vec_x_train,y_train)

LogisticRegression()

In [34]:
pickle.dump(vectorizor, open("Model_files/Vectorisors/log_reg_vec.pkl", 'wb'))

In [35]:
pickle.dump(log_reg_Classifier, open("Model_files/Models/log_reg_model.pkl", 'wb'))

In [36]:
# remove non ascii characters
def remove_non_ascii(word):
    new_word = word.encode("ascii","ignore").decode()
    return new_word

In [37]:
def clean_input(input):
    # remove punctuation and stop
    # load language model
    sp = spacy.load('en_core_web_sm')
    # import stop words
    all_stopwords = sp.Defaults.stop_words
    # import puncuation
    punc  = string.punctuation

    # replace any weird characters
    text = input
    remove_non_ascii(text)

    # remove puncuation
    new_text = text.translate(str.maketrans('', '', punc))

    # tokenize and make everything lower case
    text_tokens = word_tokenize(new_text.lower())

    # remove stop words
    tokens_without_sw= [word for word in text_tokens if not word in all_stopwords]

    # make list a string again
    tokens_without_sw = " ".join(tokens_without_sw)
    tokens_without_sw

    return str(tokens_without_sw)

In [38]:
# all are from abc news
# # this predicts 0
# test_string = """WA health authorities urge anyone with COVID symptoms to get tested and isolate, as state records seven new cases"""

# # this predicts 0
# test_string = """Passenger killed in fiery crash in Adelaide's north, police investigate shooting between cars"""

# # this predicts 0
# test_string = """Environmentalists vow to fight latest Kimberley fracking proposal to unearth Australia's largest oil resource"""

## this predicts 1
# test_string = """How Facebook Fact-checking Can Backfire""" 

## this predicts 1
test_string = """Hulk Sad! Tatiana Maslany Says She Is Not Playing She-Hulk""" 

In [27]:
test_string = clean_input(test_string)

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [39]:
# test on the test data (this is testing on multiple article titles at once)
log_reg_vec_m = vectorizor.transform(x_test.tolist())

# test on random article title
log_reg_vec = vectorizor.transform([test_string])

In [40]:
# predicted values of each test article
predictions_m = log_reg_Classifier.predict(log_reg_vec_m)

# predicted values of random test article
predictions = log_reg_Classifier.predict(log_reg_vec)[0]

In [41]:
# print out result
print(predictions_m)

# print out result
predictions

[1. 1. 0. ... 0. 0. 1.]


1.0

In [42]:
z = pd.DataFrame({"Predictions":predictions_m,"Actual":y_test})
z

Unnamed: 0,Predictions,Actual
2752685,1.0,1.0
1837898,1.0,1.0
6120,0.0,0.0
1347,0.0,0.0
5853,0.0,0.0
...,...,...
473,1.0,0.0
12246,1.0,1.0
7021,0.0,0.0
1846,0.0,0.0


In [43]:
print(f"Training Data Score: {log_reg_Classifier.score(vec_x_train, y_train)}")
print(f"Testing Data Score: {log_reg_Classifier.score(vec_x_test, y_test)}")

Training Data Score: 0.9804162401910611
Testing Data Score: 0.9711361310133061


In [44]:
# show how many are not correct
z[z.Predictions != z.Actual].shape

(282, 2)

In [45]:
#load vectoriser
LR_vec = pickle.load( open("Model_files/Vectorisors/log_reg_vec.pkl","rb"))
# load model
LR_model = pickle.load(open("Model_files/Models/log_reg_model.pkl", 'rb'))

In [46]:
# using accuracy
kfold = model_selection.KFold(n_splits=10, random_state=7, shuffle=True)
scoring = 'accuracy'
results = model_selection.cross_val_score(LR_model, vec_x_train, y_train, cv=kfold, scoring=scoring)
print("Accuracy: %.3f (%.3f)" % (results.mean(), results.std()))

Accuracy: 0.969 (0.005)


In [47]:
# using roc
kfold = model_selection.KFold(n_splits=10, random_state=7, shuffle=True)
scoring = 'roc_auc'
results = model_selection.cross_val_score(LR_model, vec_x_train, y_train, cv=kfold, scoring=scoring)
print("Accuracy: %.3f (%.3f)" % (results.mean(), results.std()))

Accuracy: 0.994 (0.002)


In [48]:
# using confusion matrix
predicted2 = LR_model.predict(vec_x_test)
matrix = confusion_matrix(y_test, predicted2)
print(matrix)

[[4374  156]
 [ 126 5114]]


In [49]:
#Top 20 fake
top20_fake = sorted(zip(log_reg_Classifier.coef_[0], vectorizor.get_feature_names_out()), reverse=True)[:20]
top20_fake= pd.DataFrame(top20_fake, columns=["Contribution","Word"])
top20_fake.to_csv("Data/contribution_top20_fake.csv")

In [34]:
top20_fake

Unnamed: 0,Contribution,Word
0,6.269003,video
1,5.827048,image
2,5.605978,just
3,5.410496,featured
4,4.977097,trump
5,4.442915,hillary
6,4.103136,read
7,3.740843,like
8,3.659456,getty
9,3.545001,watch


In [35]:
#Top 20 real
top20_real = sorted(zip(log_reg_Classifier.coef_[0], vectorizor.get_feature_names_out()))[:20]
top20_real= pd.DataFrame(top20_real, columns=["Contribution","Word"])
top20_real.to_csv("Data/contribution_top20_words_real.csv")

In [36]:
top20_real

Unnamed: 0,Contribution,Word
0,-15.066333,reuters
1,-14.301275,said
2,-4.329948,tuesday
3,-4.327835,thursday
4,-3.930982,minister
5,-3.785586,wednesday
6,-3.761954,friday
7,-3.606345,monday
8,-3.091905,washington
9,-2.816135,government
