In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
import nltk
from nltk.tokenize import word_tokenize
import string

In [2]:
# Read the prepared news spreadsheet; its first column is used as the row index.
news = pd.read_excel(
    "Data/final_news.xlsx",
    index_col=0,
)

In [3]:
# Discard every row that has at least one missing value (mutates `news`).
news.dropna(axis=0, how="any", inplace=True)

In [4]:
# One text field per article: the title, a single space, then the body.
news["corpus"] = news["Article_title"] + " " + news["Article_body"]

In [5]:
# Preview the first two rows, including the new `corpus` column.
news.head(2)

Unnamed: 0,Article_title,Article_body,Title_length,Body_length,Target,Number_of_tweets,corpus
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,9,317,0,0,The Battle of New York: Why This Primary Matte...
13,"Trump takes on Cruz, but lightly","Killing Obama administration rules, dismantlin...",6,17,0,0,"Trump takes on Cruz, but lightly Killing Obama..."


In [6]:
# Features are the combined title + body text; labels come from the Target column.
x, y = news["corpus"], news["Target"]

In [7]:
# Hold out part of the data for evaluation. No test_size is given, so
# scikit-learn's default proportion applies; random_state=42 makes the
# shuffle repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
)

In [8]:
# TF-IDF vectorizer configured with scikit-learn's built-in English stop-word list
vectorizor = TfidfVectorizer(stop_words="english")

In [9]:
# Fit the vectorizer on the training text only, and encode that same text
# as a TF-IDF matrix in one step.
vec_x_train = vectorizor.fit_transform(x_train)

In [10]:
# Encode the test text with the fitted vectorizer. The result is not assigned,
# so this cell only displays the matrix summary.
vectorizor.transform(x_test)

<6107x87028 sparse matrix of type '<class 'numpy.float64'>'
	with 935101 stored elements in Compressed Sparse Row format>

In [11]:
# Inspect the learned vocabulary.
# NOTE: unexpected tokens still show up here even though non-English characters
# were removed in data_cleaning.ipynb (towards the end); the cause is not yet known.
vectorizor.get_feature_names_out()

array(['00', '000', '0000', ..., 'zzsg90pbf6', 'zzucqevt3m',
       'zzzzzzzzzzzzz'], dtype=object)

In [12]:
# Encode the held-out text with the already-fitted vectorizer (transform only,
# no refitting).
vec_x_test = vectorizor.transform(x_test)

In [13]:
# Same statement as the preceding cell: rebinds vec_x_test to a freshly
# computed matrix from the same inputs.
vec_x_test = vectorizor.transform(x_test)

In [14]:
# Create a logistic regression classifier with scikit-learn's default settings.
log_reg_Classifier = LogisticRegression()

In [15]:
# Train the classifier on the TF-IDF matrix of the training split.
log_reg_Classifier.fit(vec_x_train,y_train)

LogisticRegression()

In [16]:
# Evaluate on the held-out split (classifier .score() reports mean accuracy).
score = log_reg_Classifier.score(vec_x_test,y_test)

In [17]:
# Display the held-out score computed above.
score

0.9726543310954642

In [18]:
# Fetch the Punkt tokenizer data that word_tokenize relies on.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead; confirm
# against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/marcuscorreia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [19]:
# Module-level handle on the ASCII punctuation characters. clean_input reads
# string.punctuation itself rather than this variable.
punc = string.punctuation

In [20]:
def remove_non_ascii(word):
    """Return ``word`` with every character outside the ASCII range dropped."""
    ascii_bytes = word.encode("ascii", errors="ignore")
    return ascii_bytes.decode()

In [21]:
def clean_input(input):
    """Normalise free text into a lower-case, space-joined string of content words.

    Steps, in order: drop non-ASCII characters, strip every character in
    ``string.punctuation``, lower-case and tokenize with NLTK's
    ``word_tokenize``, remove spaCy's English stop words, then join the
    remaining tokens with single spaces.

    Parameters
    ----------
    input : str
        Raw text, e.g. an article title. The name shadows the built-in
        ``input`` but is kept so existing keyword callers keep working.

    Returns
    -------
    str
        The cleaned text; an empty string when no tokens survive.
    """
    # The language model is loaded only for its stop-word set. Loading is
    # slow, so avoid calling this function in a tight loop.
    sp = spacy.load('en_core_web_sm')
    all_stopwords = sp.Defaults.stop_words
    punc = string.punctuation

    # Keep the return value: strings are immutable, so calling
    # remove_non_ascii() without assigning its result changes nothing.
    text = remove_non_ascii(input)

    # Remove punctuation.
    new_text = text.translate(str.maketrans('', '', punc))

    # Tokenize and make everything lower case.
    text_tokens = word_tokenize(new_text.lower())

    # Remove stop words.
    tokens_without_sw = [word for word in text_tokens if word not in all_stopwords]

    # Return the joined string. A bare expression as the last line of a
    # function is evaluated and discarded, which made the function return None.
    return " ".join(tokens_without_sw)

In [22]:
# Sample headline used to try out the text-cleaning helper.
text = """Britain uses Australian speech to warn Vladimir Putin he is on the brink of a 'massive strategic mistake'"""

In [23]:
# Run the cleaning helper on the sample headline.
clean_input(text)

In [24]:
# Encode all held-out rows in one batch. x_test holds the combined
# title + body text (the `corpus` column), not the titles alone.
log_reg_vec = vectorizor.transform(x_test.tolist())

# Alternative kept for reference: encode a single ad-hoc string instead.
# vect = vectorizor.transform([text])

In [25]:
# Predicted class label for each encoded test row.
predictions = log_reg_Classifier.predict(log_reg_vec)

In [26]:
# Display the array of predicted labels.
predictions

array([0, 1, 1, ..., 1, 0, 1])

In [27]:
# Peek at the first few titles next to their labels.
news.loc[:, ["Article_title", "Target"]].head()

Unnamed: 0,Article_title,Target
4,The Battle of New York: Why This Primary Matters,0
13,"Trump takes on Cruz, but lightly",0
24,Anti-Trump forces seek last-ditch delegate revolt,0
29,GOP insiders: Carly crushed it,0
36,Donald Groped Hillary in 2005! Trump and Weine...,1


In [28]:
# Side-by-side table of predicted and true labels; rows carry y_test's index.
z = pd.DataFrame(
    {
        "Predictions": predictions,
        "Actual": y_test,
    }
)
z

Unnamed: 0,Predictions,Actual
14505,0,0
37239,1,1
36529,1,1
19293,0,0
32526,1,1
...,...,...
20828,0,0
28828,1,1
5256,1,1
21782,0,0


In [29]:
# Report the score on the data the model was fitted on next to the score on
# the held-out split, so the two can be compared.
print(f"Training Data Score: {log_reg_Classifier.score(vec_x_train, y_train)}")
print(f"Testing Data Score: {log_reg_Classifier.score(vec_x_test, y_test)}")

Training Data Score: 0.9817695540636429
Testing Data Score: 0.9726543310954642


In [30]:
# Shape of the subset where prediction and truth disagree:
# (number of misclassified rows, number of columns).
z.loc[z["Predictions"] != z["Actual"]].shape

(167, 2)

In [31]:
import pickle

In [32]:
# Persist the fitted TF-IDF vectorizer so new text can be encoded later with
# the same vocabulary and weights. The previous call pickled `log_reg_vec`,
# which is the already-transformed test matrix, not the vectorizer itself.
# The `with` block also guarantees the file handle is flushed and closed.
with open("Model_files/Vectorisors/log_reg_vec.pkl", 'wb') as vectorizer_file:
    pickle.dump(vectorizor, vectorizer_file)

In [33]:
# Persist the trained classifier. Using `with` closes the file
# deterministically instead of leaving an open handle behind.
with open("Model_files/Models/log_reg_model.pkl", 'wb') as model_file:
    pickle.dump(log_reg_Classifier, model_file)