In [1]:
import pandas as pd
import pycaret.classification as classification


In [2]:
true_df = pd.read_csv("resources/true.csv")
fake_df = pd.read_csv("resources/fake.csv")


In [31]:
true_df["class"] = 0
fake_df["class"] = 1
all_news_df = pd.concat([true_df, fake_df], ignore_index=True)
all_news_df.columns


Index(['title', 'text', 'subject', 'date', 'class'], dtype='object')

In [32]:
all_news_df["date"] = pd.to_datetime(
    all_news_df["date"].apply(lambda x: x.strip()), format="mixed", errors="coerce"
)
all_news_df = all_news_df[all_news_df.date.notnull()].reset_index(drop=True)
all_news_df


Unnamed: 0,title,text,subject,date,class
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,2017-12-31,0
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,2017-12-29,0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,2017-12-31,0
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,2017-12-30,0
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,2017-12-29,0
...,...,...,...,...,...
44883,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,2016-01-16,1
44884,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,2016-01-16,1
44885,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,2016-01-15,1
44886,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,2016-01-14,1


In [3]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import string


In [6]:
stopwords_set = set(stopwords.words("english"))
def preprocess(s: str):
    ps = PorterStemmer()
    s = s.lower()
    s = s.translate(str.maketrans("", "", string.punctuation))
    s = re.sub(r"[^A-Za-z]", " ", s)
    s = re.sub(r"\s+", " ", s)
    s = word_tokenize(s)
    s = [word for word in s if not word in stopwords_set]
    s = [ps.stem(w) for w in s]
    s = " ".join(s).strip()
    return s

joined_texts = all_news_df.apply(lambda x: " ".join([x.title, x.text]), axis=1)
joined_texts


0        As U.S. budget fight looms, Republicans flip t...
1        U.S. military to accept transgender recruits o...
2        Senior U.S. Republican senator: 'Let Mr. Muell...
3        FBI Russia probe helped by Australian diplomat...
4        Trump wants Postal Service to charge 'much mor...
                               ...                        
44883    McPain: John McCain Furious That Iran Treated ...
44884    JUSTICE? Yahoo Settles E-mail Privacy Class-ac...
44885    Sunnistan: US and Allied ‘Safe Zone’ Plan to T...
44886    How to Blow $700 Million: Al Jazeera America F...
44887    10 U.S. Navy Sailors Held by Iranian Military ...
Length: 44888, dtype: object

In [7]:
joined_texts.to_frame("texts").to_parquet("resources/news.parquet")


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import pickle


In [9]:
vectorizer: TfidfVectorizer = None
if os.path.isfile("resources/text_vectorizer.pkl"):
    with open("resources/text_vectorizer.pkl", "rb") as f:
        vectorizer = pickle.load(f)
else:
    vectorizer = TfidfVectorizer(preprocessor=preprocess)
    vectorizer.fit(joined_texts)
    with open("resources/text_vectorizer.pkl", "wb") as f:
        pickle.dump(vectorizer, f)



In [10]:
joined_tfidf_df = pd.DataFrame.sparse.from_spmatrix(vectorizer.transform(joined_texts), columns=vectorizer.vocabulary_)


In [33]:
joined_tfidf_df["class_label"] = pd.array(all_news_df["class"], dtype='Sparse[int]')
joined_tfidf_df


Unnamed: 0,us,budget,fight,loom,republican,flip,fiscal,script,washington,reuter,...,courtroomr,karg,releasedu,blmv,treehouseher,grandmotherlast,emanuelcurr,mondoweiss,overdriveread,class_label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44883,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
44884,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
44885,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
44886,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [34]:
joined_tfidf_df.dtypes


us                 Sparse[float64, 0]
budget             Sparse[float64, 0]
fight              Sparse[float64, 0]
loom               Sparse[float64, 0]
republican         Sparse[float64, 0]
                          ...        
grandmotherlast    Sparse[float64, 0]
emanuelcurr        Sparse[float64, 0]
mondoweiss         Sparse[float64, 0]
overdriveread      Sparse[float64, 0]
class_label          Sparse[int64, 0]
Length: 185782, dtype: object

In [4]:
from scipy import sparse


In [5]:
columns = None
with open("resources/columns.pkl", "rb") as f:
  columns = pickle.load(f)
columns

sparse_matrix = sparse.load_npz("resources/news_tfidf.npz")
joined_tfidf_df = pd.DataFrame.sparse.from_spmatrix(sparse_matrix, columns=columns)


In [35]:
joined_coo = joined_tfidf_df.sparse.to_coo()

sparse.save_npz('resources/news_tfidf.npz', joined_coo)
with open("columns.pkl", "wb") as f:
    pickle.dump(joined_tfidf_df.columns, f)


In [36]:
joined_tfidf_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44888 entries, 0 to 44887
Columns: 185782 entries, us to class_label
dtypes: Sparse[float64, 0](185781), Sparse[int64, 0](1)
memory usage: 79.5 MB


In [None]:
s = classification.setup(joined_tfidf_df, target='class_label', preprocess=False)


In [6]:
best = classification.compare_models()


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,1.0,0.0,1.0,1.0,1.0,1.0,1.0,2.302
knn,K Neighbors Classifier,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.083
nb,Naive Bayes,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.635
svm,SVM - Linear Kernel,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.642
ridge,Ridge Classifier,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.778
et,Extra Trees Classifier,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.886
xgboost,Extreme Gradient Boosting,0.6325,0.0,0.2973,1.0,0.4583,0.2876,0.4098,0.894
rf,Random Forest Classifier,0.5381,0.0,1.0,0.531,0.6937,0.0332,0.1294,0.934
ada,Ada Boost Classifier,0.5381,0.0,1.0,0.531,0.6937,0.033,0.1291,0.764
gbc,Gradient Boosting Classifier,0.5381,0.0,1.0,0.531,0.6937,0.0332,0.1294,1.031


Processing:   0%|          | 0/69 [00:00<?, ?it/s]

KeyboardInterrupt: 