In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
import spacy
import nltk
from nltk.tokenize import word_tokenize
import string

In [2]:
# load data
news = pd.read_excel("Data/final_news.xlsx",index_col=0)

In [3]:
news.dropna(inplace=True)

In [4]:
news["corpus"] = news["Article_title"] +" " +news["Article_body"]

In [5]:
news.head(2)

Unnamed: 0,Article_title,Article_body,Title_length,Body_length,Target,Number_of_tweets,corpus
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,9,317,0,0,The Battle of New York: Why This Primary Matte...
13,"Trump takes on Cruz, but lightly","Killing Obama administration rules, dismantlin...",6,17,0,0,"Trump takes on Cruz, but lightly Killing Obama..."


In [6]:
x = news["corpus"]
y = news["Target"]

In [7]:
# split data into train/test
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state=42)

In [8]:
#Tfidif vectorizer
vectorizor = TfidfVectorizer(stop_words = "english")

In [9]:
# fit and transform to dataset
vec_x_train = vectorizor.fit_transform(x_train)

In [10]:
vectorizor.transform(x_test)

<6107x87028 sparse matrix of type '<class 'numpy.float64'>'
	with 935101 stored elements in Compressed Sparse Row format>

In [11]:
# not sure why there are non english characters, they were removed in data_cleaning.ipybn (towards the end)
vectorizor.get_feature_names_out()

array(['00', '000', '0000', ..., 'zzsg90pbf6', 'zzucqevt3m',
       'zzzzzzzzzzzzz'], dtype=object)

In [12]:
vec_x_test = vectorizor.transform(x_test)

In [13]:
vec_x_test = vectorizor.transform(x_test)

In [14]:
# passagressive classiifer
PassAgg = PassiveAggressiveClassifier()

In [15]:
PassAgg.fit(vec_x_train,y_train)

PassiveAggressiveClassifier()

In [16]:
# remove non ascii characters
def remove_non_ascii(word):
    new_word = word.encode("ascii","ignore").decode()
    return new_word

In [17]:
def clean_input(input):
    # remove punctuation and stop
    # load language model
    sp = spacy.load('en_core_web_sm')
    # import stop words
    all_stopwords = sp.Defaults.stop_words
    # import puncuation
    punc  = string.punctuation

    # replace any weird characters
    text = input
    remove_non_ascii(text)

    # remove puncuation
    new_text = text.translate(str.maketrans('', '', punc))

    # tokenize and make everything lower case
    text_tokens = word_tokenize(new_text.lower())

    # remove stop words
    tokens_without_sw= [word for word in text_tokens if not word in all_stopwords]

    # make list a string again
    tokens_without_sw = " ".join(tokens_without_sw)
    tokens_without_sw

In [18]:
text = """Iconic rockstar Meat Loaf dies aged 74"""

In [19]:
clean_input(text)

In [20]:
# test on article titles (this is testing on multiple article titles at once)
pa_vec = vectorizor.transform(x_test.tolist())

# test on random article title
# pa_vec = vectorizor.transform([text])

In [21]:
# predicted values of each test row
predictions = PassAgg.predict(pa_vec)

In [22]:
# print out result
predictions

array([0, 1, 1, ..., 1, 0, 1])

In [23]:
z = pd.DataFrame({"Predictions":predictions,"Actual":y_test})
z

Unnamed: 0,Predictions,Actual
14505,0,0
37239,1,1
36529,1,1
19293,0,0
32526,1,1
...,...,...
20828,0,0
28828,1,1
5256,1,1
21782,0,0


In [24]:
print(f"Training Data Score: {PassAgg.score(vec_x_train, y_train)}")
print(f"Testing Data Score: {PassAgg.score(vec_x_test, y_test)}")

Training Data Score: 0.9999454178265379
Testing Data Score: 0.9798591779924677


In [25]:
# show how many are not correct
z[z.Predictions != z.Actual].shape

(123, 2)

In [26]:
import pickle

In [27]:
pickle.dump(pa_vec, open("Model_files/Vectorisors/pa_vec.pkl", 'wb'))

In [28]:
pickle.dump(pa_vec, open("Model_files/Models/pa_model.pkl", 'wb'))