In [None]:
# prompt: please install all the below using pip

!pip install numpy pandas matplotlib seaborn scikit-learn tensorflow keras clean_text pandarallel


In [None]:
import nltk
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import gensim
import logging
from gensim.models.doc2vec import TaggedDocument

import seaborn as sns
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.lm import Vocabulary
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.decomposition import PCA
import joblib
from cleantext import clean
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from pandarallel import pandarallel
import ast
import math
from imblearn.under_sampling import RandomUnderSampler
# Initialise pandarallel (provides Series.parallel_apply, used by the cleaning helpers)
# and show a progress bar while it works.
pandarallel.initialize(progress_bar=True)
import time
# Log at INFO level with a timestamp on every message.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Download the NLTK resources used below (tokenizer data and stop-word lists).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

import os
# The COLAB_RELEASE_TAG environment variable is used to detect Google Colab;
# later cells switch the data path and mount Drive based on this flag.
IN_COLAB = False
if os.getenv("COLAB_RELEASE_TAG"):
   IN_COLAB = True

# Placeholders tested by the "avoid reprocessing" cells further down.
news_processed = None
# NOTE(review): spelled "proccessed" here, while later cells assign `bbc_processed`;
# this name is never reassigned in this notebook and therefore stays None.
bbc_proccessed = None
# Default (local) data directory; reassigned for Colab in a later cell.
dataPath = "../data/"

In [None]:
# On Colab, mount Google Drive so the data files under /content/drive are reachable.
if IN_COLAB:
  from google.colab import drive
  drive.mount('/content/drive')

# Part 1 (Data Processing)

### Task 1
Retrieve the data sample

In [None]:
# Data directory: local by default, Google Drive when running on Colab.
dataPath = "../data/"
if IN_COLAB:
  dataPath = "/content/drive/MyDrive/"

# Read the sample CSV into a DataFrame.
nsdf = pd.read_csv(dataPath + "news_sample.csv")
nsdf = nsdf.reset_index(drop=True)  # Replace the index with a fresh 0..n-1 range

# Keep an untouched deep copy of the raw data for the before/after statistics below.
nsdf_raw = nsdf.copy(deep=True)

nsdf.info()   # Check column types and missing values

In [None]:
# Distinct values of the 'type' (label) column, including NaN if present.
unique_values = nsdf['type'].unique()
print(unique_values)

In [None]:
# Rows with a missing or 'unknown' label are dropped: they carry no usable
# target for training a classifier.
nsdf = nsdf.dropna(subset=['type'])
nsdf = nsdf.loc[nsdf['type']!='unknown']
# Show the labels that remain after filtering.
newunique_values = nsdf['type'].unique()
print(newunique_values)


#### Cleaning and Preprocessing functions

These standalone functions are used to test each preprocessing step and to measure the resulting reduction in vocabulary size.

In [None]:
def cleanText(data, column):
    """Clean every entry of ``data[column]`` with ``clean_text_help``.

    The column is overwritten on the frame that was passed in, and that same
    frame is returned.
    """
    cleaned_column = data[column].parallel_apply(clean_text_help)
    data[column] = cleaned_column
    return data

def clean_text_help(text):
    """Normalise one raw document string.

    Collapses whitespace, replaces numeric (dd-mm-yy / dd-mm-yyyy) and
    "dd Mon yyyy" style dates with ``<DATE>``, then delegates to
    ``cleantext.clean`` for lower-casing and for replacing URLs, e-mail
    addresses and numbers and removing punctuation.

    Args:
        text: the document as a string.

    Returns:
        The cleaned string.

    Raises:
        TypeError: if ``text`` is not a string.
    """
    if isinstance(text, str):
        # Remove excess whitespace
        text = re.sub(r"\s+", " ", text).strip()
        # Replace dates. The 4-digit year alternative must come first: regex
        # alternation is ordered, so with the 2-digit alternative first a date
        # such as 01/01/2020 matched only "01/01/20" and left "20" behind.
        text = re.sub(r"(0[1-9]|[1-2][0-9]|3[0-1])[-/.]?(0[1-9]|1[0-2])[-/.]?([0-9]{4}|[0-9]{2})", "<DATE>", text)  # Replace date type 1
        text = re.sub(r"(0[1-9]|[1-2][0-9]|3[0-1])\s([A-Za-z]{3})\s([0-9]{4}|[0-9]{2})", "<DATE>", text)  # Replace date type 2
        return clean(text, lower=True, no_line_breaks=True, no_numbers=True, no_emails=True, no_urls=True, no_punct=True, replace_with_url=r"__URL__", replace_with_email=r"__EMAIL__", replace_with_number=r"__NUM__", replace_with_digit=r"__NUM__")
    raise TypeError("Clean_text passed non-string")

In [None]:
# Tokenization step
def tokenizeText(data, column):
    """Tokenize every string in ``data[column]`` with NLTK's ``word_tokenize``.

    Entries that are not strings are left as they are. The column is
    overwritten on the frame that was passed in, and that frame is returned.
    """
    def tokenize_text_help(text):
        # Non-string values pass through untouched.
        if not isinstance(text, str):
            return text
        return word_tokenize(text)

    tokenized = data[column].parallel_apply(tokenize_text_help)
    data[column] = tokenized
    return data

In [None]:
# Stop-word removal
def remove_stopwords_help(text):
    """Return the tokens of one document with English stop words removed.

    Args:
        text: an iterable of tokens (it is wrapped in a pandas Series).

    Returns:
        A list with the tokens that are not in NLTK's English stop-word list.
    """
    text = pd.Series(text)
    stop_words = set(stopwords.words('english'))  # Load stopwords
    return text[~text.isin(stop_words)].to_list()  # Remove stopwords

def remove_stopwords(data, column):
    """Remove English stop words from every entry of ``data[column]``.

    The ``column`` argument is now honoured; previously the function always
    operated on ``'content'`` regardless of what the caller passed. The column
    is overwritten on the frame that was passed in, and that frame is returned.
    """
    data[column] = data[column].parallel_apply(remove_stopwords_help)  # Apply function
    return data

In [None]:
# Number of distinct whitespace-separated words in a column of strings.
def getFreq(data, column):
    """Return how many distinct words occur in ``data[column]``.

    Each string is split on whitespace; missing values are not counted.
    The result is an integer, not a frequency table.
    """
    words = data[column].str.split().explode()
    return words.nunique()

def getFreq_tokinized(data, column):
    """Return how many distinct tokens occur in a column of token lists.

    Missing values (including the NaN that an empty list explodes to) are
    not counted.
    """
    tokens = data[column].explode()
    return tokens.nunique()

def Vocab_size_tokinized(data, column):
    """Return the total number of rows after exploding a column of token lists.

    This is the total token count; an empty list still contributes one
    (NaN) row.
    """
    exploded = data[column].explode()
    return exploded.shape[0]

def Vocab_size(data, column):
    """Return the total number of whitespace-separated words in a string column.

    A missing value still contributes one (NaN) row to the count.
    """
    words = data[column].str.split().explode()
    return words.shape[0]

In [None]:

# Stemming step
def dataStemming(data, column):
    """Stem every token in ``data[column]`` with NLTK's Porter stemmer.

    Each entry is expected to be a list of tokens; a bare string is treated as
    a single token. The column is overwritten on the frame that was passed in,
    and that frame is returned.
    """
    ps = PorterStemmer()
    def dataStemming_help(text):
        # pd.Series wraps a bare string into a one-element Series, so a single
        # code path covers both strings and token lists. The former
        # isinstance(text, str) check ran after this conversion and could
        # never be true.
        return pd.Series(text).apply(ps.stem).to_list()
    data[column] = data[column].parallel_apply(dataStemming_help)  # Apply function
    return data


In [None]:
def calc_reduction(start_value, end_value):
    """Return the percentage reduction from ``start_value`` to ``end_value``.

    Args:
        start_value: the baseline count.
        end_value: the count after a processing step.

    Returns:
        The reduction in percent, rounded to two decimals. NaN is returned when
        ``start_value`` is 0, because a relative reduction from zero is
        undefined (this previously raised ZeroDivisionError).
    """
    if start_value == 0:
        return float("nan")
    return round(((start_value - end_value)/start_value)*100, 2)


# Measure how each preprocessing step changes the number of distinct words
# ("freq_*") and the total number of words ("data_size_*") on a scratch copy
# of the raw sample.
printing_copy = nsdf_raw.copy(deep=True)
freq_raw = getFreq(printing_copy, 'content')
data_size_raw = Vocab_size(printing_copy, 'content')

# Same label filtering as applied to nsdf above.
printing_copy = printing_copy.dropna(subset=['type'])
printing_copy = printing_copy.loc[printing_copy['type']!='unknown']

# Order matters: every helper overwrites the 'content' column of the frame it
# is given, so each measurement sees the output of the step before it.
freq_pre = getFreq(printing_copy, 'content')
data_size_pre = Vocab_size(printing_copy, 'content')
freq_clean = getFreq(cleanText(printing_copy, 'content'), 'content')
data_size_clean = Vocab_size(printing_copy, 'content')
freq_token = getFreq_tokinized(tokenizeText(printing_copy, 'content'), 'content')
data_size_token = Vocab_size_tokinized(printing_copy, 'content')
freq_stopwords = getFreq_tokinized(remove_stopwords(printing_copy, 'content'), 'content')
data_size_stopwords = Vocab_size_tokinized(printing_copy, 'content')
freq_stemmed = getFreq_tokinized(dataStemming(printing_copy, 'content'), 'content')
data_size_stemmed = Vocab_size_tokinized(printing_copy, 'content')


# All reductions are reported relative to the raw (unfiltered) data.
print("--- Unique words ---")
print("Raw: ", freq_raw)
print("Preprocessing: ", freq_pre, "reduction: ", calc_reduction(freq_raw, freq_pre))
print("After cleaning: ", freq_clean, "reduction: ", calc_reduction(freq_raw, freq_clean))
print("After tokenizing: ", freq_token, "reduction: ", calc_reduction(freq_raw, freq_token))
print("After removing stopwords: ", freq_stopwords, "reduction: ", calc_reduction(freq_raw, freq_stopwords))
print("After stemming: ", freq_stemmed, "reduction: ", calc_reduction(freq_raw, freq_stemmed))

print(" ")

print("--- Word counts ---")
print("Raw: ", data_size_raw)
print("Preprosseing: ", data_size_pre, "reduction: ", calc_reduction(data_size_raw, data_size_pre))
print("After cleaning: ", data_size_clean, "reduction: ", calc_reduction(data_size_raw, data_size_clean))
print("After tokenizing: ", data_size_token, "reduction: ", calc_reduction(data_size_raw, data_size_token))
print("After removing stopwords: ", data_size_stopwords, "reduction: ", calc_reduction(data_size_raw, data_size_stopwords))
print("After stemming: ", data_size_stemmed, "reduction: ", calc_reduction(data_size_raw, data_size_stemmed))


# Free the scratch copy and the intermediate statistics
# (freq_raw and data_size_raw are not deleted here).
del printing_copy
del freq_pre, data_size_pre, freq_clean, data_size_clean, freq_token, data_size_token, freq_stopwords, data_size_stopwords, freq_stemmed, data_size_stemmed



#### Processing and cleaning function

In [None]:
# One big function to process data: uses functional programming approach to simplify changes
def processData(data, column, stemming=True):
    """Clean, tokenize, stop-word-filter and optionally stem one text column.

    The steps are composed into a single function and applied once per row
    with ``parallel_apply``. The column is overwritten on the frame that was
    passed in, and that same frame is returned.

    Args:
        data: DataFrame holding the text.
        column: name of the column to process; every entry must be a string.
        stemming: when True (default) tokens are Porter-stemmed.

    Returns:
        ``data``, with ``data[column]`` replaced by lists of tokens.

    Raises:
        TypeError: if an entry of the column is not a string.
    """
    def apply_sequential_helper(functions):
        # Compose the given functions left to right into one callable.
        def inner(text):
            for f in functions:
                text = f(text)
            return text
        return inner

    def clean_text_help(text):
        if not isinstance(text, str):
            raise TypeError("Clean_text passed non-string")
        # Remove excess whitespace
        text = re.sub(r"\s+", " ", text).strip()

        # Replace dates. The 4-digit year alternative comes first: regex
        # alternation is ordered, so with the 2-digit alternative first a date
        # such as 01/01/2020 matched only "01/01/20" and left "20" behind.
        text = re.sub(r"(0[1-9]|[1-2][0-9]|3[0-1])[-/.]?(0[1-9]|1[0-2])[-/.]?([0-9]{4}|[0-9]{2})", " __DATE__ ", text)  # Replace date type 1
        text = re.sub(r"(0[1-9]|[1-2][0-9]|3[0-1])\s([A-Za-z]{3})\s([0-9]{4}|[0-9]{2})", " __DATE__ ", text)  # Replace date type 2

        # clean() must see the text with its punctuation intact, otherwise
        # URLs, e-mail addresses and currency symbols are no longer recognisable.
        text = clean(text, lower=True, no_line_breaks=True, no_currency_symbols=True, no_numbers=True,
                     no_emails=True, no_urls=True, no_punct=True, replace_with_url=r" __URL__ ", replace_with_email=r" __EMAIL__ ",
                     replace_with_number=r" __NUM__ ", replace_with_digit=r" __NUM__ ", replace_with_currency_symbol=r" __CUR__ ")

        # Finally drop every remaining character that is not a word character,
        # a space or an underscore. This used to run first, which removed
        # '@', '.', '/', ':' and currency symbols before clean() looked for them.
        return re.sub(r'[^\w _]+', '', text)

    def tokenize_text_help(text):
        # str -> Series[str]; anything else is passed through unchanged.
        if isinstance(text, str):
            return pd.Series(word_tokenize(text))
        return text  # Return unchanged if not a string

    # Build the stop-word set once instead of once per document.
    stop_words = set(stopwords.words('english'))
    def remove_stopwords_help(text):
        # Series[str] -> Series[str]
        return text[~text.isin(stop_words)]

    ps = PorterStemmer()
    def dataStemming_help(text):
        # Series[str] -> Series[str]; a bare string becomes a one-element Series.
        if(isinstance(text, str)):
            return pd.Series(ps.stem(text))
        return text.apply(ps.stem)

    def type_cleaner(text):
        # Series[str] -> list[str], so each cell holds a plain Python list.
        if isinstance(text, str):
            return pd.Series(text).to_list()
        return text.to_list()

    data[column] = data[column].parallel_apply(apply_sequential_helper(
        [clean_text_help, # str -> str
        tokenize_text_help, # str -> Series[str]
        remove_stopwords_help, # Series[str] -> Series[str]
        dataStemming_help if stemming else lambda x:x, # Series[str] -> Series[str]
        type_cleaner # Series[str] -> list[str]
    ]))
    return data

In [None]:

# processData overwrites the column on the frame it receives and returns that
# same frame, so nsdf_processed and nsdf refer to the same object.
nsdf_processed = processData(nsdf, 'content')
nsdf_processed.dropna(subset=['content'], inplace=True)  # Drop rows with no content
nsdf_processed.reset_index(drop=True, inplace=True)  # Reset index
# Show the processed content of the first row as a sanity check.
print(nsdf_processed.at[0, 'content'])

### Task 2

In [None]:
# Load the full corpus.
fakeNewsCorpus = pd.read_csv(dataPath + "995,000_rows.csv")
# Open question from the authors: what is the "Unnamed" column?
# NOTE(review): presumably an index column written into the CSV - confirm.
print(fakeNewsCorpus.head())
#fakeNewsCorpus['content'].duplicated()
# Drop duplicated articles (by content), then rows without content, and renumber the rows.
news_noDup = fakeNewsCorpus.drop_duplicates(subset=['content']).dropna(subset=['content']).reset_index(drop=True)

In [None]:
# Run the full preprocessing pipeline on the de-duplicated corpus and cache the
# result as JSON lines so later sessions can skip this step.
news_processed = processData(news_noDup, 'content')
news_processed.to_json(dataPath + "news_processed.json", orient='records', lines=True)

### Task 3

In [None]:
# Avoid reprocessing: when the processing cell was not run in this session,
# read the cached JSON-lines file in chunks and concatenate them.
if news_processed is None:
    json_reader = pd.read_json(dataPath + "news_processed.json", orient='records', lines=True, chunksize=1500)
    news_processed = pd.concat(json_reader, ignore_index=True)
# Timed: 12 min on an M1 MacBook with chunk=1000
print(news_processed.info())   # Check column types and missing values
#fndf = fakeNewsCorpus.reset_index(drop=True)  # Reset index
fndf = news_processed


## Observations about dataset

In [None]:
# Show the raw labels, then keep only the rows whose label can be used.
fndf = news_processed
unique_values = fndf['type'].unique()
print(unique_values)
# Missing and 'unknown' labels are hard to classify, so they are removed for now,
# together with 'unreliable', 'rumor' and a stray timestamp found in the column.
excluded_types = ['unknown', 'unreliable', '2018-02-10 13:43:39.521661', 'rumor']
has_label = fndf['type'].notna()
is_excluded = fndf['type'].isin(excluded_types)
fndf = fndf.loc[has_label & ~is_excluded]
# The index still needs to be reset.

newunique_values = fndf['type'].unique()
print(newunique_values)

In [None]:
# reliable, clickbait and political are all, by their contents, factually correct
# (albeit possibly politically motivated); we deem those to be "real" news ('0').
# Every other remaining label is treated as "fake" news ('1').
print("adding binary labels")
# Work on an independent copy: fndf is a filtered selection of news_processed,
# and assigning into such a selection triggers pandas' SettingWithCopyWarning.
fndf = fndf.copy()
fndf['type'] = fndf['type'].replace(r'^(reliable|clickbait|political)$', '0', regex=True)
# Replace every label that is not exactly '0' with '1'.
fndf['type'] = fndf['type'].replace(r'^(?!(0)$).+', '1', regex=True)
# Safety net only: rows with a missing label were already dropped above.
fndf['type'] = fndf['type'].fillna('0')

newunique_values = fndf['type'].unique()
print(newunique_values)
# Labels stay strings here; the cells below select on '0' and '1'.
#fndf['type'] = fndf['type'].astype(int)  # Convert to integer

print(("real vs fake:"))
print(fndf['type'].value_counts())

In [None]:
# Split by binary label: '0' = real/reliable, '1' = fake (labels are strings here).
relib_news = fndf.loc[fndf['type'] == '0']
fake_news = fndf.loc[fndf['type'] == '1']

In [None]:
# Average len() of the 'content' entries per class.
# NOTE(review): content is presumably a token list at this point, so this is
# the average number of tokens per article - confirm.
avg_len_real = relib_news["content"].apply(len).mean()
print(relib_news)
avg_len_fake = fake_news["content"].apply(len).mean()
print("Average length of real news: ", avg_len_real)
print("Average length of fake news: ", avg_len_fake)
plt.bar(["Real", "Fake"], [avg_len_real, avg_len_fake])
plt.title("Average length of news")
plt.ylabel("Average length")
plt.show()

In [None]:
# Token frequency distributions for fake articles, real articles and all articles.
fake_dist = FreqDist(fake_news['content'].explode())
real_dist = FreqDist(relib_news['content'].explode())
all_dist = FreqDist(fndf['content'].explode())
# Add-one discounting: each total is the sum of (count + 1) over the 10000
# most common tokens of that distribution.
fake_total = int(pd.Series([b+1 for (a, b) in fake_dist.most_common(10000)]).sum())
real_total = int(pd.Series([b+1 for (a, b) in real_dist.most_common(10000)]).sum())
all_total = int(pd.Series([b+1 for (a, b) in all_dist.most_common(10000)]).sum())
print(f"real total: {real_total}, fake total: {fake_total}")
print(real_total-fake_total)


In [None]:

# Relative frequency of every token within each class: count divided by the
# class total computed in the previous cell.
real_pd_prob = pd.Series(dict(real_dist)).apply(lambda x: x/real_total)
fake_pd_prob = pd.Series(dict(fake_dist)).apply(lambda x: x/fake_total)



In [None]:
def entropy_calculation(row):
    """Add per-class shares of one word to a row and return the row.

    Reads ``row["word"]`` and writes three fields:
    ``real_prob`` (count in the real distribution divided by the count in the
    overall distribution), ``fake_prob`` (the same for the fake distribution)
    and ``prob_diff`` (their absolute difference). A word missing from a class
    distribution counts as 1 (add-one discounting); a word missing from the
    overall distribution uses ``1/all_total`` as its denominator.
    Despite the name, no entropy is computed.
    """
    token = row["word"]
    # Look up the class counts, falling back to 1 for unseen words.
    real_count = real_dist.get(token, 1)
    fake_count = fake_dist.get(token, 1)
    row["real_prob"] = real_count / all_dist.get(token, 1/all_total)
    row["fake_prob"] = fake_count / all_dist.get(token, 1/all_total)
    row["prob_diff"] = abs(row["real_prob"] - row["fake_prob"])
    return row
    

In [None]:
# Per-word class shares for the 1000 most common words overall,
# sorted so that the words with the highest fake share come first.
word_entropy = pd.DataFrame(all_dist.most_common(1000), columns=["word", "count"]).apply(entropy_calculation, axis=1)
word_entropy = word_entropy.sort_values("fake_prob", ascending=False)
print(word_entropy.head(20))

In [None]:
plt.figure(figsize=(14, 9))

# Limit the chart to the first 100 rows of the sorted table.
word_entropy_lim = word_entropy.head(100)
width = 0.2
# Earlier stacked-bar attempt, kept for reference:
#plt.bar(word_entropy_lim["word"], word_entropy_lim["real_prob"] + word_entropy_lim["fake_prob"], label="Real", color="blue")
#plt.bar(word_entropy_lim["word"], word_entropy_lim["fake_prob"], label="Fake", color="red")
# The string literal below is disabled plotting code; it is evaluated as a
# bare string and has no effect.
"""
plt.bar(word_entropy_lim["word"])

# Labels and title
plt.xlabel("Words")
plt.ylabel("Entropy")
plt.title("Stacked Bar Chart of Real vs. Fake Entropy per Word")
plt.legend()

# Rotate x-axis labels for readability
plt.xticks(rotation=90)

# Show plot
plt.show()
"""

# Grouped (non-stacked) bars of the real and fake share for each word.
word_entropy_lim[["word", "real_prob", "fake_prob"]].plot(kind='bar', x='word', stacked=False, figsize=(14, 9))

In [None]:
# Shared vocabulary: the 10000 most common tokens over all articles.
vocab = {val for (val, _) in all_dist.most_common(10000)}
print(vocab)
# The identity analyzer makes the vectorizer use each document's entries as-is.
# NOTE(review): this assumes 'content' holds token lists - confirm.
tfidf_real_vec = TfidfVectorizer(analyzer=lambda x: x, vocabulary=vocab)
tfidf_fake_vec = TfidfVectorizer(analyzer=lambda x: x, vocabulary=vocab)
# Separate TF-IDF matrices for the real and the fake articles.
tfidf_real = tfidf_real_vec.fit_transform(relib_news["content"])
tfidf_fake = tfidf_fake_vec.fit_transform(fake_news["content"])



In [None]:
from sklearn.decomposition import PCA
# TF-IDF over all articles (at most 10000 features), scaled without centering,
# then projected onto two principal components for plotting.
tfidf_model = TfidfVectorizer(analyzer=lambda x: x, max_features=10000)
tfidf_vec = tfidf_model.fit_transform(fndf["content"])
tfidf_vec_scale = StandardScaler(with_mean=False).fit_transform(tfidf_vec)
PCA_model = PCA(n_components=2)
PCA_data = PCA_model.fit_transform(tfidf_vec_scale)
#PCA_data["type"] = fndf["type"]

In [None]:

# Scatter plot of the two principal components, coloured by the binary label.
sns.scatterplot(x = PCA_data[:, 0], y = PCA_data[:, 1], hue=fndf["type"], alpha=0.7)

In [None]:
# Density contours of the same projection, one set of contours per label.
sns.kdeplot(x = PCA_data[:, 0], y = PCA_data[:, 1], hue=fndf["type"], fill=False, alpha=0.5)

# observationer
- det virker ikke til, at de er separable kun ud fra bag-of-words med tf-idf
- der er nogle regions hvor vi kan, og andre hvor vi ikke umiddelbart kan
- måske er f.eks. embeddings bedre?

In [None]:
# Raw token counts per class.
real_pd = pd.Series(dict(real_dist))
fake_pd = pd.Series(dict(fake_dist))

# Tokens that occur in exactly one of the two classes.
real_set = set(real_pd.index)
fake_set = set(fake_pd.index)
unique_words = real_set.union(fake_set) - real_set.intersection(fake_set)
print(f"Amount of words only present in one set than another: {len(unique_words)}")

# Count difference (real - fake) for every token of the fake vocabulary whose
# counts differ, largest absolute difference first.
large_diff = []
for word in fake_pd.index:
    diff = real_pd.get(word, 0) - fake_pd.get(word, 0)
    if abs(diff) > 0.00001:
        large_diff.append((word, diff))

large_diff.sort(key=lambda x: abs(x[1]), reverse=True)
# Show only the head of the list; printing all of it floods the output.
print(large_diff[:20])

print("Unique tokens:")
for token in ["url", "email", "num", "date"]:
    # .get avoids a KeyError when a placeholder token never occurs in a class.
    print(f"TOKEN: {token} - Real: {real_pd.get(token, 0)}, Fake: {fake_pd.get(token, 0)}")

# Plot only the most frequent tokens; one bar per vocabulary entry is
# unreadable and very slow to draw.
top_real = real_pd.sort_values(ascending=False).head(50)
plt.bar(top_real.index.astype(str), top_real.values)
plt.title("50 most frequent tokens in real news")
plt.ylabel("Count")
plt.xticks(rotation=90)
plt.show()

## Task 4

In [None]:
# Split into train (80%), test (10%) and validation (10%).
# The labels are cast to int first: they are stored as the strings '0'/'1', and
# scikit-learn's binary metrics (default pos_label=1) reject string labels.
labels = fndf['type'].astype(int)
X_train_full, X_valtest_full, y_train, y_valtest = train_test_split(fndf, labels, test_size=0.2, random_state=42)
X_test_full, X_val_full, y_test, y_val = train_test_split(X_valtest_full, y_valtest, test_size=0.5, random_state=42)
# Only the processed text is used as model input.
X_train = X_train_full['content']
X_test = X_test_full['content']
X_val = X_val_full['content']
print("train size:", y_train.shape)
print("val size:", y_val.shape)
print("test size:", y_test.shape)


# Part 2

## Task 0, splitting labels into reliable and unreliable.

In [None]:
# done above as we used it for EDA

## Task 1 - Simple logistic regression model.

Get the top 10000 words, and how often they occur in each article

In [None]:
#Standarize fndf?

# Take the 10000 most frequent tokens of the TRAINING data as vocabulary, so
# nothing leaks from the validation or test set.
print("Finding vocabulary:")
vocab = X_train.explode().value_counts()[:10000].keys()

# The identity analyzer makes the vectorizer count each document's entries as-is.
vectorizer = CountVectorizer(analyzer=lambda x: x, vocabulary=vocab)
print("vectorizing X_train")
rowsFreq = vectorizer.fit_transform(X_train)
# Held-out splits are only transformed with the vectorizer fitted on the training data.
# val_rowsFreq was previously built from X_test, so every validation score below
# compared test-set predictions with validation labels.
print("vectorizing X_val")
val_rowsFreq = vectorizer.transform(X_val)

print("vectorizing X_test")
test_rowsFreq = vectorizer.transform(X_test)

print(rowsFreq)


Creating the logistic regression model

In [None]:
#scaler = StandardScaler(with_mean=False)
# Each row (document) is scaled to unit norm.
scaler = Normalizer()
undersampler = RandomUnderSampler(random_state=42)

X_train_scale = scaler.fit_transform(rowsFreq)
X_test_scale = scaler.transform(test_rowsFreq)
X_val_scale = scaler.transform(val_rowsFreq)
# Undersampled copy of the training data for the third model.
x_undersampled, y_undersampled = undersampler.fit_resample(X_train_scale, y_train)
print("Starting model training:")
# Three L1-penalised logistic regression models (the "linReg" names notwithstanding):
# plain, class-weighted, and trained on the undersampled data.
linReg = LogisticRegression(max_iter=1000, penalty="l1", solver='liblinear', random_state=42)#, class_weight="balanced")
linReg_weighed = LogisticRegression(max_iter=1000, penalty="l1", solver='liblinear', random_state=42, class_weight="balanced")
linReg_undersampled = LogisticRegression(max_iter=1000, penalty="l1", solver='liblinear', random_state=42)
print("fitting standard model")
linReg.fit(X_train_scale, y_train)
print("fitting weighted model")
linReg_weighed.fit(X_train_scale, y_train)
print("fitting undersampled model")
linReg_undersampled.fit(x_undersampled, y_undersampled)

# Compare the three models by F1 on the validation features.
y_pred = linReg.predict(X_val_scale)
y_pred_weighted = linReg_weighed.predict(X_val_scale)
y_pred_undersampled = linReg_undersampled.predict(X_val_scale)
f1 = f1_score(y_val, y_pred)
f1_weighted = f1_score(y_val, y_pred_weighted)
f1_undersampled = f1_score(y_val, y_pred_undersampled)
# Print results
print("Standard : Weighted : Undersampled")
print(f"F1 Score: {f1:.4f} : {f1_weighted:.4f} : {f1_undersampled:.4f}")
# NOTE(review): the line below says "binary bag-of-words" and omits penalty="l1";
# the vectorizer is a CountVectorizer without binary=True - confirm the wording.
print(f"Hyperparameters: max_iter=1000, solver='liblinear', binary bag-of-words")


In [None]:
# Save the three fitted models together as one tuple.
joblib.dump((linReg, linReg_weighed, linReg_undersampled), dataPath + "linReg.pkl")

In [None]:
# Visualize the class-weighted model on the validation set.
val_rowsFreq = vectorizer.transform(X_val)
# Compare integer labels with integer predictions.
y_val_true = y_val.astype(int)
y_val_pred = linReg_weighed.predict(scaler.transform(val_rowsFreq)).astype(int)
# Only validation figures are reported here; the old "Test" value was the
# validation score printed a second time.
print(f"F1 Score (validation): {f1_score(y_val_true, y_val_pred):.4f}")
print(f"Accuracy (eval set): {accuracy_score(y_val_true, y_val_pred):.4f}")

cm = confusion_matrix(y_val_true, y_val_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=linReg_weighed.classes_)
disp.plot()
plt.show()

In [None]:
# Test-set evaluation of the standard model (created for the report).
y_pred = linReg.predict(X_test_scale)
y_train_pred = linReg.predict(X_train_scale)

# Use the model's own second class as the positive label, so the scores work
# whether the labels are ints or strings.
pos_label = linReg.classes_[1]
print(f"F1 Score: Train: {f1_score(y_train, y_train_pred, pos_label=pos_label):.4f}, Test: {f1_score(y_test, y_pred, pos_label=pos_label):.4f}")
# Accuracy and confusion matrix use the test predictions made above; they
# previously used y_pred_weighted, which holds validation-set predictions.
print(f"Accuracy (test): {accuracy_score(y_test, y_pred):.4f}")

cm = confusion_matrix(y_test, y_pred, normalize='true')
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=linReg.classes_)
disp.plot()
plt.show()

### Task 3

In [None]:
# Load the BBC articles and run them through the same preprocessing pipeline.
bbc_raw = pd.read_csv(dataPath + "bbc_articles.csv")

bbc_processed = processData(bbc_raw, 'text')
# BBC articles are treated as reliable news. In this notebook's label
# convention 0 = real/reliable and 1 = fake, so they get label 0
# (they were previously labelled 1, i.e. as fake).
bbc_processed['type'] = 0

# Cache the processed articles as JSON lines.
bbc_processed.to_json(dataPath + "bbc_processed.json", orient='records', lines=True)


In [None]:
# Load the processed BBC articles from the cache unless an earlier cell
# already built them in this session. The guard looks up `bbc_processed`;
# it previously tested the misspelled placeholder `bbc_proccessed`, which is
# always None, so the cache was reloaded every time.
if globals().get("bbc_processed") is None:
    json_reader = pd.read_json(dataPath + "bbc_processed.json", orient='records', lines=True, chunksize=1500)
    bbc_processed = pd.concat(json_reader, ignore_index=True)

bbcdf = bbc_processed

# Sanity check: features and labels must have matching lengths.
print(X_train.shape, y_train.shape)
print(len(bbcdf['text']), len(bbcdf['type']))
#print(len(BBC_train), len(BBC_y))  # These should be the same


In [None]:
# Load a previously saved BBC-augmented model, if there is one. The file is
# only written by a later cell, so it does not exist on a first run.
# Only load pickles you created yourself: unpickling executes arbitrary code.
bbc_model_path = dataPath + "bbcReg.pkl"
if os.path.exists(bbc_model_path):
    bbcReg = joblib.load(bbc_model_path)
else:
    print(f"No saved model at {bbc_model_path}; run the training cell below to create it.")

In [None]:
# Training documents: the corpus training split plus the BBC articles.
bbc_train_docs = pd.concat([X_train, bbcdf['text']]).reset_index(drop=True)
# Cast to int so the corpus labels and the BBC labels share one dtype.
BBC_y = pd.concat([y_train, bbcdf["type"]]).astype(int)

# Vocabulary: the 10000 most frequent tokens of the combined training documents.
BBC_vocab = bbc_train_docs.explode().value_counts()[:10000].keys()
vectorizer = CountVectorizer(analyzer=lambda x: x, vocabulary=BBC_vocab)
bbcFreq = vectorizer.fit_transform(bbc_train_docs)
# The test split is only transformed, never fitted on.
bbc_testFreq = vectorizer.transform(X_test)

scaler = Normalizer()
BBC_train = scaler.fit_transform(bbcFreq)
BBC_test = scaler.transform(bbc_testFreq)
bbcReg = LogisticRegression(max_iter=1000, penalty="l1", solver='liblinear', random_state=42)
bbcReg.fit(BBC_train, BBC_y)
# Predict on the normalized test features, matching what the model was trained
# on (the raw counts in bbc_testFreq were used before).
BBC_pred = bbcReg.predict(BBC_test)


In [None]:
# Test-set accuracy and F1 of the model trained with the additional BBC articles.
print(f"Accuracy (test): {accuracy_score(y_test, BBC_pred):.4f}")
print(f"F1: {f1_score(y_test, BBC_pred): .4f}")

In [None]:
# Save the BBC-augmented model and the train/test count matrices it was built from.
joblib.dump(bbcReg, dataPath + "bbcReg.pkl")
joblib.dump((bbcFreq, bbc_testFreq), dataPath + "bbcFreqs.pkl")

# advanced model

In [None]:
# Show the training documents with their original index as a regular column.
print(X_train.to_frame().reset_index(drop=False))

In [None]:
# Log at INFO level with timestamps (same settings as the setup cell).
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
print("tagging documents")
# Doc2Vec needs every training document tagged; the tag is the document's
# position in X_train.
tagged_docs = []
for position, doc in enumerate(X_train.values):
    tagged_docs.append(TaggedDocument(words=doc, tags=[position]))


In [None]:
# Train Doc2Vec on the tagged training documents: 100-dimensional vectors,
# min_count=2, four worker threads.
vector_size = 100
embed_model = gensim.models.doc2vec.Doc2Vec(tagged_docs, vector_size=vector_size, min_count=2, workers=4)

In [None]:
# Save the trained Doc2Vec model so it does not have to be retrained.
embed_model.save(dataPath + "doc2vec.model")

In [None]:
# Load the saved Doc2Vec model.
embed_model = gensim.models.doc2vec.Doc2Vec.load(dataPath + "doc2vec.model")

In [None]:
# Infer one document vector per article for each split.
# Each result is a pandas Series whose elements are vectors, not a 2-D array.
X_train_embedded = pd.Series([embed_model.infer_vector(doc) for doc in X_train])
X_val_embed = pd.Series([embed_model.infer_vector(doc) for doc in X_val])
X_test_embed = pd.Series([embed_model.infer_vector(doc) for doc in X_test])

In [None]:
# Save the train / validation / test embeddings together as one tuple.
joblib.dump((X_train_embedded, X_val_embed, X_test_embed), dataPath + "X_test_embed.pkl")

In [None]:
# Load the saved embeddings, in the same (train, validation, test) order they were saved in.
(X_train_embedded, X_val_embed, X_test_embed) = joblib.load(dataPath + "X_test_embed.pkl")

In [None]:
import seaborn as sns
from sklearn.decomposition import PCA
# Fit a 2-component PCA on the document vectors learned during Doc2Vec
# training, then project the validation embeddings with it.
PCA_embedded = PCA(n_components=2)
trained_embedded = pd.DataFrame(embed_model.dv.vectors)
PCA_embedded.fit(trained_embedded.to_numpy())
val_embedded = PCA_embedded.transform(X_val_embed.to_list())
val_embedded = pd.DataFrame(val_embedded, columns=["PCA1", "PCA2"])
# Colour by the VALIDATION labels, assigned by position. y_test was used
# before, and assigning a Series aligns on its index, which does not match
# this frame's fresh 0..n-1 index.
val_embedded["type"] = y_val.to_numpy()

sns.kdeplot(data=val_embedded, x="PCA1", y="PCA2", hue="type", alpha=0.9)



In [None]:
# Combination of various models attempted on doc embeddings.
# The string literal below is a disabled LinearSVC experiment; it is evaluated
# as a bare string and has no effect.
"""
svc = LinearSVC(random_state=42, verbose=1, class_weight="balanced")
svc.fit(X_train_embedded, y_train)
y_pred_svc = svc.predict(X_test_embed.to_list())
print(f"F1 Score: {f1_score(y_test, y_pred_svc):.4f}")
print(f"Accuracy: {accuracy_score(y_test, y_pred_svc):.4f}")
"""
# Active experiment: an MLP on row-normalized TF-IDF features (at most 10000
# features), fitted on the training split and scored on the test split.
scaler = Normalizer()
tf_idf_model = TfidfVectorizer(max_features=10000, analyzer=lambda x: x)
tfidf_train = tf_idf_model.fit_transform(X_train)
tfidf_train = scaler.fit_transform(tfidf_train)
# Authors' note: alpha = 0.001 gave an F1 of 0.88
mlp = MLPClassifier(max_iter = 2000, random_state=42, verbose=1, early_stopping=True, alpha=0.0015)
mlp.fit(tfidf_train, y_train)
tfidf_test = tf_idf_model.transform(X_test)
tfidf_test = scaler.transform(tfidf_test)
y_pred_mlp = mlp.predict(tfidf_test)
print(f"F1 Score: {f1_score(y_test, y_pred_mlp):.4f}")
print(f"Accuracy: {accuracy_score(y_test, y_pred_mlp):.4f}")

# Disabled experiment: an MLP on the document embeddings only.
# Authors' note: 0.777 for both accuracy and F1
"""
mlp = MLPClassifier(max_iter = 2000, random_state=42, verbose=1)
mlp.fit(X_train_embedded, y_train)
y_pred_mlp = mlp.predict(X_test_embed.to_list())
print(f"F1 Score: {f1_score(y_test, y_pred_mlp):.4f}")
print(f"Accuracy: {accuracy_score(y_test, y_pred_mlp):.4f}")
# f1 =. .8048

"""

In [None]:
# TF-IDF features for the voting classifier: the vectorizer is fitted on the
# training split only, then applied to the test and validation splits.
# Every matrix is row-normalized afterwards.
scaler = Normalizer()
tf_idf_model = TfidfVectorizer(max_features=10000, analyzer=lambda x: x)
tfidf_train = tf_idf_model.fit_transform(X_train)
tfidf_train = scaler.fit_transform(tfidf_train)

tfidf_test = tf_idf_model.transform(X_test)
tfidf_test = scaler.transform(tfidf_test)

tfidf_val = tf_idf_model.transform(X_val)
tfidf_val = scaler.transform(tfidf_val)

In [None]:
import scipy.sparse as sp

# The embeddings are stored as a Series of vectors; scikit-learn needs a 2-D
# array, so stack them into (n_documents, vector_size) matrices first.
train_embed_matrix = np.array(list(X_train_embedded))
test_embed_matrix = np.array(list(X_test_embed))

# scale doc embeddings
scaler = Normalizer()
X_train_embedded_scaled = scaler.fit_transform(train_embed_matrix)
X_test_embed_scaled = scaler.transform(test_embed_matrix)

# double feature set: TF-IDF columns first, embedding columns after them
print(tfidf_train.shape)
print(train_embed_matrix.shape)
print(test_embed_matrix.shape)
print(type(tfidf_train))
print(type(X_train_embedded))
print(type(X_test_embed))
# Despite the "dense_" names these are sparse CSR copies of the embeddings,
# needed so they can be stacked next to the sparse TF-IDF matrices.
dense_train_embed = sp.csr_matrix(X_train_embedded_scaled)
dense_test_embed = sp.csr_matrix(X_test_embed_scaled)
# CSR output supports the column slicing done by the voting pipelines.
total_train = sp.hstack([tfidf_train, dense_train_embed], format="csr")
total_test = sp.hstack([tfidf_test, dense_test_embed], format="csr")

In [None]:
# Same feature stacking for the validation split. The embeddings are stored
# as a Series of vectors, so they are stacked into a 2-D array before scaling.
X_val_embed_scale = scaler.transform(np.array(list(X_val_embed)))
dense_val_embed = sp.csr_matrix(X_val_embed_scale)
total_val = sp.hstack([tfidf_val, dense_val_embed], format="csr")

In [None]:
# Soft-voting ensemble of three MLPs over the stacked feature matrix.
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Model 1: an MLP on all columns.
mlp_total = MLPClassifier(max_iter = 2000, random_state=42, verbose=1, alpha=0.01, early_stopping=True)

# Model 2: columns from index 10000 onwards, re-normalized, then an MLP.
# NOTE(review): this assumes the first 10000 columns are the TF-IDF features
# (max_features=10000) and the rest the embeddings - confirm.
neural_only_embed = Pipeline([
    ('only_embed', ColumnTransformer([('embeds', 'passthrough', slice(10000, None))])),
    ('scaler', Normalizer()),
    ('mlp', MLPClassifier(max_iter = 2000, random_state=42, verbose=1))
])

# Model 3: the first 10000 columns only, then an MLP.
neural_only_tfidf = Pipeline([
    ('only_tfidf', ColumnTransformer([('tfidf', 'passthrough', slice(0, 10000))])),
    ('mlp', MLPClassifier(max_iter = 2000, random_state=42, verbose=1, early_stopping=True, alpha=0.005))
])


# The full-feature model gets weight 1, the two single-feature models 0.75 each.
voting = VotingClassifier(estimators=[('only_tfidf', neural_only_tfidf), ('mlp', mlp_total), ('only_embed', neural_only_embed)], voting='soft', weights=[0.75, 1, 0.75])
voting.fit(total_train, y_train)
# Scores on the test split.
y_pred = voting.predict(total_test)
print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

In [None]:
# Save the fitted voting classifier.
joblib.dump(voting, dataPath + "voting_weighted.pkl")

In [None]:
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, confusion_matrix, ConfusionMatrixDisplay

# Load the saved ensemble (only load pickles you created yourself) and make
# sure the labels are integers.
voting = joblib.load(dataPath + "voting_weighted.pkl")
y_val = y_val.astype(int)
y_test = y_test.astype(int)

# Everything in this cell is measured on the VALIDATION split; the accuracy
# line was previously labelled "(test)".
y_pred_total = voting.predict(total_val)
print(f"Accuracy (validation): {accuracy_score(y_val, y_pred_total):.4f}")
print("F1-score: ", f1_score(y_val, y_pred_total))

print("Recall: ", recall_score(y_val, y_pred_total))
print("Precision: ", precision_score(y_val, y_pred_total))

# Row-normalized confusion matrix (rates per true class) ...
cm = confusion_matrix(y_val, y_pred_total, normalize='true')
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=voting.classes_)
disp.plot()
plt.show()

# ... and the same matrix with absolute counts.
cm = confusion_matrix(y_val, y_pred_total)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=voting.classes_)
disp.plot()
plt.show()

# Scores of the individual ensemble members; predict once per model.
for model in voting.estimators_:
    model_pred = model.predict(total_val)
    print(f"F1 Score: {f1_score(y_val, model_pred):.4f}")
    print(f"Accuracy: {accuracy_score(y_val, model_pred):.4f}")

In [None]:
# Final evaluation of the ensemble on the test split.
y_pred_total = voting.predict(total_test)
print(f"Accuracy (test): {accuracy_score(y_test, y_pred_total):.4f}")
print(f"f1_score: {f1_score(y_test, y_pred_total):.4f}")

# Row-normalized confusion matrix (rates per true class).
cm = confusion_matrix(y_test, y_pred_total, normalize='true')
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=voting.classes_)
disp.plot()
plt.show()

# Scores of the individual ensemble members on the test split.
for model in voting.estimators_:
    print(f"F1 Score: {f1_score(y_test, model.predict(total_test)):.4f}")
    print(f"Accuracy: {accuracy_score(y_test, model.predict(total_test)):.4f}")

#### Trying HistGradientBoostingClassifier

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features (at most 10000) fitted on the training split and applied to
# the test split. Note that this rebinds the name `vectorizer`.
vectorizer = TfidfVectorizer(max_features=10000, analyzer=lambda x: x)
X_training = vectorizer.fit_transform(X_train)
X_testing = vectorizer.transform(X_test)

In [None]:
### Histogram-based gradient boosting on PCA-reduced TF-IDF features
from sklearn.decomposition import PCA

# Reduce the TF-IDF features to 400 principal components.
# NOTE(review): X_training is the vectorizer's output; whether PCA accepts it
# directly depends on the installed scikit-learn version - confirm.
pca = PCA(n_components=400)

X_training_pca = pca.fit_transform(X_training)
X_testing_pca = pca.transform(X_testing)


from sklearn.ensemble import HistGradientBoostingClassifier

HGBC = HistGradientBoostingClassifier(max_iter=500, random_state=42, verbose=1)

# Fit on the training split, report F1 on the test split.
HGBC.fit(X_training_pca, y_train)
y_pred = HGBC.predict(X_testing_pca)
f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1:.4f}")

In [None]:
# k nearest neighbours on the normalized document embeddings
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import Normalizer
scaler = Normalizer()
# The embeddings are stored as a Series of vectors; stack them into 2-D
# arrays, which is what scikit-learn expects.
X_train_embed_matrix = np.array(list(X_train_embedded))
X_test_embed = np.array([np.array(x) for x in X_test_embed])
# Print the shapes rather than the whole matrix.
print(X_test_embed.shape)
print(X_train_embed_matrix.shape)

X_train_embed_scale = scaler.fit_transform(X_train_embed_matrix)
# The held-out split is only transformed.
X_test_embed_scale = scaler.transform(X_test_embed)

knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
print("fitting model")
knn.fit(X_train_embed_scale, y_train)
y_pred_knn = knn.predict(X_test_embed_scale)
print(f"F1 Score: {f1_score(y_test, y_pred_knn):.4f}")