# Word2Vector

In this notebook we show how we applied word-to-vector (word2vec) embeddings to text classification.

In [12]:
# Necessary imports
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression 
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import f1_score


from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Subject categories. The cells below compare the integer labels stored in
# Y / Y_test against the position of each name in this list.
# NOTE(review): the spellings 'Politics and gouvernment' and 'Mistery' are kept
# unchanged because these strings are used as dictionary keys and plot labels.
subs = [
    'Juvenile fiction',
    'History',
    'Poetry',
    'Politics and gouvernment',
    'Cooking',
    'Mistery',
    'Philosophy',
    'Christian',
    'Love stories',
    'Periodicals',
    'Humor',
    'Travelling',
    'Correspondence',
    'Adventure',
    'Drama',
    'Biography',
    'Historical fiction',
    'Science fiction',
    'Fantasy fiction',
    'Science',
    'Others',
]

# Load the precomputed word embeddings: training arrays first, then test arrays.
X = np.load("embeddings/X.npy")
Y = np.load("embeddings/Y.npy")
X_test = np.load("embeddings/X_test.npy")
Y_test = np.load("embeddings/Y_test.npy")

In [None]:
# Standardize the features. The scaler is fitted once, on the training data
# only, and the same fitted transformation is then applied to the test data so
# that no test-set statistics are used.
scaler = StandardScaler()
x_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

def decide(X, threshold=0.5):
    """Binarize scores: 1 where a value is strictly above ``threshold``, else 0.

    Parameters
    ----------
    X : array-like
        Scores or probabilities, of any shape (e.g. ``(n,)`` or ``(n, 1)``).
    threshold : float, default 0.5
        Decision boundary; values exactly equal to it map to 0.

    Returns
    -------
    numpy.ndarray
        A new array with the same shape and dtype as ``X`` containing only
        0 and 1. ``X`` itself is not modified.
    """
    scores = np.asarray(X)
    # The vectorized comparison handles any shape; casting back keeps the input dtype.
    return (scores > threshold).astype(scores.dtype)

# Train and evaluate one binary (one-vs-rest) classifier per subject and model.
for s,c in enumerate(subs):
    # Binary targets for this subject: 1 where the label equals the subject's
    # index in `subs`, 0 otherwise (same dtype as the original label arrays).
    Y_s = (Y == s).astype(Y.dtype)
    Y_test_s = (Y_test == s).astype(Y_test.dtype)

    M1 = LogisticRegression(max_iter=1000).fit(x_scaled, Y_s)
    M2 = LinearRegression().fit(x_scaled, Y_s)
    M3 = DecisionTreeClassifier().fit(x_scaled,Y_s)
    M4 = GaussianNB().fit(x_scaled,Y_s)

    print("---Results for classifiers of category",c,"---")
    # decide() thresholds predictions at 0.5. This is needed for the continuous
    # LinearRegression output; it is applied to every model for uniformity.
    # f1_score expects (y_true, y_pred) in that order.
    print("Logistic:",f1_score(Y_test_s, decide(M1.predict(X_test_scaled))))
    print("Linear:",f1_score(Y_test_s, decide(M2.predict(X_test_scaled))))
    print("Tree:",f1_score(Y_test_s, decide(M3.predict(X_test_scaled))))
    print("Bayes:",f1_score(Y_test_s, decide(M4.predict(X_test_scaled))))
    

# Neural network classifier

In [None]:
# Definition of the network

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

class BinaryClassifier():
    """Small fully connected Keras network for binary classification.

    Layers: input -> Dense(input_dim // 2, relu) -> Dense(hidden_units, relu)
    -> Dense(1, sigmoid). The model is compiled with the ``rmsprop`` optimizer
    and the ``binary_crossentropy`` loss.
    """

    def __init__(self,input_dim,hidden_units=25):
        """Build and compile the network.

        Parameters
        ----------
        input_dim : int
            Number of input features per sample (the width of the feature matrix).
        hidden_units : int, default 25
            Width of the second hidden layer.
        """
        # First hidden layer is half the input width, but never zero units.
        first_units = max(1, int(input_dim*0.5))
        self.model = keras.Sequential(
            [
                keras.layers.Input(shape=(input_dim,)), # input shape: number of features per sample
                keras.layers.Dense(units=first_units, activation='relu'),
                keras.layers.Dense(units=int(hidden_units), activation='relu'),
                keras.layers.Dense(units=1, activation='sigmoid')
            ]
        )
        self.model.compile(optimizer="rmsprop", loss="binary_crossentropy")

    def fit(self,x_train,y_train,epochs=200):
        """Train the network on ``x_train`` / ``y_train`` for ``epochs`` epochs.

        Returns whatever the underlying Keras ``model.fit`` call returns.
        """
        return self.model.fit(x_train,y_train,epochs=epochs)

    def predict(self,x_test):
        """Return the sigmoid outputs of the network for ``x_test``.

        The values are raw scores, not class labels; the output layer has a
        single unit, so there is one score per sample.
        """
        return self.model.predict(x_test)

In [None]:
# Standardize the features: fit the scaler once on the training data and reuse
# it for the test data.
scaler = StandardScaler()
x_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

# For each subject, train a fresh one-vs-rest network and report its F1 score.
for s,c in enumerate(subs):
    # Binary targets: 1 where the label equals the subject's index in `subs`.
    Y_s = (Y == s).astype(Y.dtype)
    Y_test_s = (Y_test == s).astype(Y_test.dtype)

    # Size the network from the data instead of a hard-coded feature count.
    NN = BinaryClassifier(x_scaled.shape[1])
    NN.fit(x_scaled,Y_s)

    # The network returns one sigmoid score per row: threshold it to 0/1 and
    # flatten to 1-D so it lines up with the label vector.
    predictions = decide(NN.predict(X_test_scaled)).ravel()
    print(c,":",f1_score(Y_test_s, predictions))

In [None]:
# Definition of the network

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# NOTE(review): a class with this name is also defined in an earlier cell of
# this notebook; when all cells are run in order, this later definition is the
# one in effect from here on.
class BinaryClassifier():
    """Small fully connected Keras network for binary classification.

    Layers: input -> Dense(input_dim // 2, relu) -> Dense(hidden_units, relu)
    -> Dense(1, sigmoid). The model is compiled with the ``rmsprop`` optimizer
    and the ``binary_crossentropy`` loss.
    """

    def __init__(self,input_dim,hidden_units=25):
        """Build and compile the network.

        Parameters
        ----------
        input_dim : int
            Number of input features per sample (the width of the feature matrix).
        hidden_units : int, default 25
            Width of the second hidden layer.
        """
        # First hidden layer is half the input width, but never zero units.
        first_units = max(1, int(input_dim*0.5))
        self.model = keras.Sequential(
            [
                keras.layers.Input(shape=(input_dim,)), # input shape: number of features per sample
                keras.layers.Dense(units=first_units, activation='relu'),
                keras.layers.Dense(units=int(hidden_units), activation='relu'),
                keras.layers.Dense(units=1, activation='sigmoid')
            ]
        )
        self.model.compile(optimizer="rmsprop", loss="binary_crossentropy")

    def fit(self,x_train,y_train,epochs=200):
        """Train the network on ``x_train`` / ``y_train`` for ``epochs`` epochs.

        Returns whatever the underlying Keras ``model.fit`` call returns.
        """
        return self.model.fit(x_train,y_train,epochs=epochs)

    def predict(self,x_test):
        """Return the sigmoid outputs of the network for ``x_test``.

        The values are raw scores, not class labels; the output layer has a
        single unit, so there is one score per sample.
        """
        return self.model.predict(x_test)

In [None]:
# NOTE(review): this cell repeats the neural-network training cell that appears
# earlier in the notebook — confirm whether both are needed.

# Standardize the features: fit the scaler once on the training data and reuse
# it for the test data.
scaler = StandardScaler()
x_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

# For each subject, train a fresh one-vs-rest network and report its F1 score.
for s,c in enumerate(subs):
    # Binary targets: 1 where the label equals the subject's index in `subs`.
    Y_s = (Y == s).astype(Y.dtype)
    Y_test_s = (Y_test == s).astype(Y_test.dtype)

    # Size the network from the data instead of a hard-coded feature count.
    NN = BinaryClassifier(x_scaled.shape[1])
    NN.fit(x_scaled,Y_s)

    # The network outputs continuous sigmoid scores, which f1_score cannot
    # compare against 0/1 labels: threshold them with decide() first, then
    # flatten to 1-D so they line up with the label vector.
    predictions = decide(NN.predict(X_test_scaled)).ravel()
    print(c,":",f1_score(Y_test_s, predictions))

# Results

The cell below reads the results from `results_w2v.txt`, which was written manually by formatting the output of this notebook.

In [None]:
import matplotlib.pyplot as plt
# Parse the manually assembled results file and plot, for every subject, the
# score obtained by each algorithm.
algo = ['Neural network',"Logistic","Linear","Tree","Bayes"]
nn = {}
logistic = {}
linear = {}
tree = {}
bayes = {}
# One {subject: score} dict per algorithm, in the same order as `algo` and as
# the value lines inside each group of the results file.
A = [nn,logistic,linear,tree,bayes]

# The file is read as consecutive groups: one header line (ignored), then one
# "<name>: <score>" line per algorithm.
GROUP_SIZE = len(A) + 1

with open("results_w2v.txt","r") as f:
    for line_no, line in enumerate(f):
        # Group index selects the subject; position inside the group selects
        # the algorithm (position 0 is the header line).
        s_i, pos = divmod(line_no, GROUP_SIZE)
        if pos == 0:
            continue
        A[pos - 1][subs[s_i]] = float(line.split(":")[1])

fig, ax1 = plt.subplots()

# (scores, colour, legend label), drawn in this order.
series = [
    (bayes, 'k', 'Bayes'),
    (linear, 'b', 'Linear'),
    (logistic, 'r', 'Logistic'),
    (tree, 'g', 'Tree'),
    (nn, 'y', 'NN'),
]
for scores, colour, label in series:
    ax1.scatter(list(scores.keys()), list(scores.values()),
                s=10, c=colour, marker="o", label=label)

# The scores in the results file come from the f1_score calls in this notebook.
ax1.set_xlabel("Subject")
ax1.set_ylabel("F1 score")
ax1.set_title("F1 score per subject and classifier")

plt.legend(prop={'size': 7})
plt.xticks(rotation = 90)
plt.ylim(0,1)
plt.grid()
plt.show()