# Define similarity measures

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import jaccard_score as jaccard_similarity


def dot_similarity(vector1, vector2):
    return np.sum(np.multiply(np.array(vector1), np.array(vector2)))

def cosine_similarity(vector1, vector2):
    return dot_similarity(vector1, vector2) / np.sqrt(np.sum(vector1)**2 * np.sum(vector2)**2)

# Data tokenization

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
# ‘english’
def get_boolean_representation(corpus, stop_words_list='english'):
    vectorizer = CountVectorizer(binary=True, stop_words=stop_words_list)
    X = vectorizer.fit_transform(corpus)
    vectorizer.get_feature_names_out()
    return X.toarray()

def get_tf_representation(corpus, stop_words_list='english'):
    vectorizer = CountVectorizer(binary=False, stop_words=stop_words_list)
    X = vectorizer.fit_transform(corpus)
    vectorizer.get_feature_names_out()
    return X.toarray()

def get_tf_idf_representation(corpus, stop_words_list='english'):
    vectorizer = TfidfVectorizer(stop_words=stop_words_list)
    X = vectorizer.fit_transform(corpus)
    vectorizer.get_feature_names_out()
    return X.toarray()

# Define functions for centroid classification

In [3]:
class centroids_model:
    centroids_0 = None
    centroids_1 = None

    def get_centroids(self):
        if self.centroids_0 is None or self.centroids_1 is None :
            print('Error : class must be fitted before')
            exit(-1)
        else:
            return self.centroids_0, self.centroids_1

    def fit(self, data, labels):
        index_label_0 = np.argwhere(np.array(labels) == 0).reshape(-1)
        index_label_1 = np.argwhere(np.array(labels) == 1).reshape(-1)

        # compute centroids
        self.centroids_0 = np.sum(data[index_label_0], axis=0) / len(index_label_0)
        self.centroids_1 = np.sum(data[index_label_1], axis=0) / len(index_label_1)


    def predict(self, data, similarity='cosine'):
        if self.centroids_0 is None or self.centroids_1 is None :
            print('Error : class must be fitted before prediction')
            exit(-1)
        if similarity == 'dot':
            return [1 if dot_similarity(entry, self.centroids_0) < dot_similarity(entry, self.centroids_1) else 0 for entry in data]
        # elif similarity == 'jaccard':
        #     return [1 if jaccard_similarity(entry, self.centroids_0) < jaccard_similarity(entry, self.centroids_1) else 0 for entry in data]
        else:
            return [1 if cosine_similarity(entry, self.centroids_0) < cosine_similarity(entry, self.centroids_1) else 0  for entry in data]

# Experiments for centroids classification

In [7]:
import pandas as pd
"""
df_train = pd.read_csv("../data/DataFrame_train_PREPROCESS.csv")
df_val = pd.read_csv("../data/DataFrame_val_PREPROCESS.csv")

index_train = df_train['index']
index_val = df_val['index']

df = pd.concat([df_train, df_val], ignore_index = True, sort = False)


representation_boolean = get_boolean_representation(df['text'].apply(lambda x : x.lower()))
representation_tf = get_tf_representation(df['text'].apply(lambda x : x.lower()))
representation_tf_idf = get_tf_idf_representation(df['text'].apply(lambda x : x.lower()))

del df

x_train_boolean = representation_boolean[:len(index_train)]
x_val_boolean = representation_boolean[len(index_train):]

x_train_tf = representation_tf[:len(index_train)]
x_val_tf = representation_tf[len(index_train):]

x_train_tf_idf = representation_tf_idf[:len(index_train)]
x_val_tf_idf = representation_tf_idf[len(index_train):]


y_train = df_train['label']
y_val = df_val['label']

del df_val
del df_train"""

In [4]:
import pandas as pd

index_train = np.load('../data/array_idx_train.npy')
index_val = np.load('../data/array_idx_val.npy')

df = pd.read_csv('../data/DataFrame_train_full_preprocessing.csv')

representation_boolean = get_boolean_representation(df['text'].apply(lambda x : x.lower()))

y_train = df.loc[index_train, 'label']
y_val = df.loc[index_val, 'label']

del pd

x_train_boolean = representation_boolean[index_train]
x_val_boolean = representation_boolean[index_val]


In [6]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

# stopwords_lst = stopwords.words("english")

"""model = centroids_model()
model.fit(x_train_boolean, y_train)
y_predict = model.predict(x_val_boolean, similarity='cosine_similarity')
print('Accuracy when using boolean representation', accuracy_score(y_val, y_predict))

model = centroids_model()
model.fit(x_train_tf_idf, y_train)
y_predict = model.predict(x_val_tf_idf, similarity='cosine_similarity')
print('Accuracy when using tf-idf representation', accuracy_score(y_val, y_predict))"""

model = centroids_model()
model.fit(x_train_boolean, y_train)
y_predict = model.predict(x_val_boolean, similarity='cosine_similarity')
print('Accuracy when using bool representation', accuracy_score(y_val, y_predict))
print('AUC-ROC when using bool representation',roc_auc_score(y_val, y_predict))



Accuracy when using bool representation 0.6625
AUC-ROC when using bool representation 0.7814192343604108
