In [2]:
from google.colab import drive
# Mount Google Drive inside the Colab VM so that files under
# /content/gdrive can be read by the cells below.
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [3]:
import pandas as pd
# Load the dataset from the mounted Drive folder; later cells read its
# 'labels' and 'text' columns.
df = pd.read_csv('/content/gdrive/My Drive/Mestrado/first_level_dataset.csv')

In [4]:
import string
# Every character of string.punctuation, one list entry per symbol.
regular_punct = list(string.punctuation)


def remove_punctuation(text, punct_list):
    """Blank out the given punctuation marks and trim the result.

    Every entry of ``punct_list`` that occurs in ``text`` is replaced by a
    single space, so the words on either side stay separated.  Leading and
    trailing whitespace is stripped afterwards; runs of spaces created in the
    middle of the text are left untouched.

    Args:
        text: The string to clean.
        punct_list: Iterable of substrings to replace with a space.

    Returns:
        The cleaned, stripped string.
    """
    cleaned = text
    for symbol in punct_list:
        if symbol not in cleaned:
            continue
        cleaned = cleaned.replace(symbol, ' ')
    return cleaned.strip()

#df['punct'] = df['to_save'].apply(lambda x : remove_punctuation(x, regular_punct)).apply(lambda x : " ".join(x.split()))



from nltk.stem.porter import PorterStemmer

# Single shared stemmer instance; stem_sentences reads it as a global.
porter_stemmer = PorterStemmer()

def stem_sentences(sentence):
    """Stem every whitespace-separated token of ``sentence``.

    The tokens are stemmed with the module-level ``porter_stemmer`` and
    re-joined with single spaces, so any original run of whitespace collapses
    to one space.

    Args:
        sentence: The text to stem.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    return ' '.join(porter_stemmer.stem(word) for word in sentence.split())

#df['porter_original'] = df['to_save'].apply(stem_sentences)
#df['porter_punct'] = df['punct'].apply(stem_sentences)


import nltk
# Fetch the NLTK resources used below: the Punkt sentence tokenizer, the
# perceptron POS tagger and the WordNet corpus.
# NOTE(review): newer NLTK releases may ship the tokenizer/tagger data under
# different resource names — confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Example input: text = 'What can I say about this place. The staff of these restaurants is nice and the eggplant is not bad'

class Splitter(object):
    """
    Break a document into sentences, then word-tokenize every sentence.
    """

    def __init__(self):
        # Sentence boundaries come from the English Punkt model; words inside
        # each sentence are split with the Treebank tokenizer.
        self.splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        self.tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def split(self, text):
        """
        Return one list of tokens per sentence found in ``text``.

        out : [['What', 'can', 'I', 'say', 'about', 'this', 'place', '.'], ...]
        """
        tokenized_sentences = []
        for sentence in self.splitter.tokenize(text):
            tokenized_sentences.append(self.tokenizer.tokenize(sentence))
        return tokenized_sentences


class LemmatizationWithPOSTagger(object):
    """POS-tag tokenized sentences and lemmatize each token using its tag."""

    def __init__(self):
        pass

    def get_wordnet_pos(self, treebank_tag):
        """
        Map a Treebank POS tag onto the WordNet POS constant (a, n, r, v)
        that the WordNet lemmatizer expects.
        """
        prefix_table = (
            ('J', wordnet.ADJ),
            ('V', wordnet.VERB),
            ('N', wordnet.NOUN),
            ('R', wordnet.ADV),
        )
        for prefix, wordnet_pos in prefix_table:
            if treebank_tag.startswith(prefix):
                return wordnet_pos
        # Noun is the lemmatizer's default POS, so it covers every other tag.
        return wordnet.NOUN

    def pos_tag(self, tokens):
        """
        Tag and lemmatize already-tokenized sentences.

        ``tokens`` is a list of sentences, each a list of word strings.  The
        result mirrors that nesting; every word becomes a triple
        (original word, lemmatized word, [POS tag]), e.g.
        ('What', 'What', ['WP']).  Lemmatization uses the module-level
        ``lemmatizer``.
        """
        lemmatized_sentences = []
        for sentence_tokens in tokens:
            triples = []
            # nltk.pos_tag yields (word, tag) pairs: ('What', 'WP'), ('can', 'MD'), ...
            for word, tag in nltk.pos_tag(sentence_tokens):
                lemma = lemmatizer.lemmatize(word, self.get_wordnet_pos(tag))
                triples.append((word, lemma, [tag]))
            lemmatized_sentences.append(triples)
        return lemmatized_sentences

# Module-level singletons shared by the helpers in this cell. Note that
# LemmatizationWithPOSTagger.pos_tag reads the global `lemmatizer` directly,
# and lemmatise() reads `splitter` and `lemmatization_using_pos_tagger`.
lemmatizer = WordNetLemmatizer()
splitter = Splitter()
lemmatization_using_pos_tagger = LemmatizationWithPOSTagger()

#step 1 split document into sentence followed by tokenization

def lemmatise(text):
  """Return ``text`` with every token replaced by its POS-aware lemma.

  The text is split into tokenized sentences by the module-level ``splitter``
  and then tagged and lemmatized by ``lemmatization_using_pos_tagger``.  The
  lemmas of all sentences are joined with single spaces; an input that yields
  no tokens gives an empty string.
  """
  tagged_sentences = lemmatization_using_pos_tagger.pos_tag(splitter.split(text))
  # Each entry is (original word, lemma, [tag]); only the lemma is kept.
  lemmas = [entry[1] for sentence in tagged_sentences for entry in sentence]
  return " ".join(lemmas)


#df['lemma_original'] = df['to_save'].apply(lemmatise)
#df['lemma_punct'] = df['punct'].apply(lemmatise)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [5]:
! pip install scikit-multilearn
! pip install iterative-stratification

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-multilearn
  Downloading scikit_multilearn-0.2.0-py3-none-any.whl (89 kB)
[K     |████████████████████████████████| 89 kB 3.3 MB/s 
[?25hInstalling collected packages: scikit-multilearn
Successfully installed scikit-multilearn-0.2.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting iterative-stratification
  Downloading iterative_stratification-0.1.7-py3-none-any.whl (8.5 kB)
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.7


In [6]:
from ast import literal_eval

# The CSV stores each row's labels as the string form of a Python literal;
# parse it back into a real object. Only strings are parsed, so re-running
# this cell on already-parsed values is a no-op instead of an error raised by
# literal_eval.
df['labels'] = df['labels'].apply(
    lambda raw: literal_eval(raw) if isinstance(raw, str) else raw
)

In [36]:
from tensorflow.keras import layers, regularizers
from tensorflow.keras import Sequential

def create_model(input_size, output):
  """Build an uncompiled two-stage 1-D convolutional classifier.

  Layout: Conv1D -> MaxPooling1D(5) -> Conv1D -> GlobalMaxPooling1D ->
  Dense(output, sigmoid).  Both convolutions use 8 filters, kernel width 5,
  ReLU, an L1+L2 kernel penalty and an L2 bias penalty (all 2e-3).

  Args:
    input_size: Shape of one input sample, passed as the first layer's
      ``input_shape``.
    output: Number of units in the sigmoid output layer.

  Returns:
    The assembled ``Sequential`` model (not compiled).
  """
  def conv_block(**extra):
    # Fresh regularizer objects per layer, exactly one set for each Conv1D.
    return layers.Conv1D(
        8, 5,
        activation='relu',
        kernel_regularizer=regularizers.l1_l2(l1=2e-3, l2=2e-3),
        bias_regularizer=regularizers.l2(2e-3),
        **extra,
    )

  return Sequential([
      conv_block(input_shape=input_size),
      layers.MaxPooling1D(5),
      conv_block(),
      layers.GlobalMaxPooling1D(),
      layers.Dense(output, activation='sigmoid'),
  ])


In [37]:
# univariate lstm example
from numpy import array
from keras.models import Sequential
from keras.layers import LSTM, GRU, Input,  Bidirectional
from keras.layers import Dense, Dropout, LeakyReLU, Conv1D, MaxPool1D, GlobalMaxPool1D
from keras.regularizers import l1,l2

def lstm(input_shape, output_shape):
  """Build an uncompiled LSTM classifier with sigmoid outputs.

  Layout: Input -> LSTM(3 units, LeakyReLU(alpha=0.1) activation, last state
  only, L2 activity penalty 1e-4) -> Dense(48, linear) -> Dropout(0.3) ->
  Dense(output_shape, sigmoid).

  Args:
    input_shape: Shape of one input sample, given to the ``Input`` layer.
    output_shape: Number of units in the sigmoid output layer.

  Returns:
    The assembled ``Sequential`` model; compiling is left to the caller.
  """
  network = Sequential()
  stack = (
      Input(shape=input_shape),
      LSTM(
          3,
          activation=LeakyReLU(alpha=0.1),
          return_sequences=False,
          activity_regularizer=l2(0.0001),
      ),
      Dense(48),
      Dropout(0.3),
      Dense(output_shape, activation='sigmoid'),
  )
  for layer in stack:
    network.add(layer)
  return network

In [9]:
from tensorflow.python.ops.script_ops import numpy_function
from keras import backend as K

def calculating_class_weights(y_true, verbose=True):
    """Compute 'balanced' class weights separately for every label column.

    Parameters
    ----------
    y_true : array-like of shape (n_samples, n_labels)
        Binary indicator matrix.  Every column is handed to scikit-learn's
        ``compute_class_weight`` with classes 0 and 1, so each column is
        expected to contain both values (scikit-learn is understood to raise
        otherwise — see its documentation).
    verbose : bool, default True
        When true, print the number of labels, each label's weight pair and
        the final matrix, as the original implementation did.

    Returns
    -------
    numpy.ndarray of shape (n_labels, 2)
        Row ``i`` holds ``[weight of class 0, weight of class 1]`` for
        label ``i``.
    """
    # Local imports keep the function usable regardless of cell run order.
    import numpy as np
    from sklearn.utils.class_weight import compute_class_weight

    y_true = np.asarray(y_true)
    number_dim = y_true.shape[1]
    # scikit-learn's parameter validation expects `classes` as an ndarray;
    # a plain Python list is rejected by newer releases.
    classes = np.array([0, 1])

    if verbose:
        print(number_dim)

    weights = np.empty([number_dim, 2])
    for i in range(number_dim):
        # Computed once per label (previously computed twice: print + store).
        weights[i] = compute_class_weight(
            class_weight='balanced', classes=classes, y=y_true[:, i]
        )
        if verbose:
            print(weights[i])

    if verbose:
        print(weights)
    return weights


def get_weighted_loss(weights):
    """Create a per-label weighted binary cross-entropy loss function.

    ``weights`` is indexed as ``weights[:, 0]`` (factor applied where the
    target is 0) and ``weights[:, 1]`` (factor applied where the target is 1).
    The returned callable scales the element-wise binary cross-entropy by
    those factors and averages over the last axis.
    """
    def weighted_loss(y_true, y_pred):
        # For a 0/1 target exactly one exponent is 1 and the other is 0, so
        # the product selects the weight matching the target value.
        scale = (weights[:, 0] ** (1 - y_true)) * (weights[:, 1] ** (y_true))
        weighted = scale * K.binary_crossentropy(y_true, y_pred)
        return K.mean(weighted, axis=-1)

    return weighted_loss

In [10]:
# gensim provides the downloader used in a later cell to load the
# 'glove-twitter-200' vectors.
!pip install gensim

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [22]:
import gensim
import gensim.downloader
# Load the 'glove-twitter-200' vectors through gensim's downloader API.
# NOTE: get_gloVe (defined in another cell) reads this global by name.
embedding = gensim.downloader.load('glove-twitter-200')



In [12]:
from nltk.tokenize import sent_tokenize, word_tokenize 

def get_embedding_result(word, embedding):
  """Look up ``word`` in ``embedding``, falling back to its "<UNK>" entry.

  ``embedding`` must support ``in`` and item access.  When ``word`` is not
  contained, the entry stored under "<UNK>" is returned instead.
  """
  lookup_key = word if word in embedding else "<UNK>"
  return embedding[lookup_key]


def get_gloVe(text_train, embeddings_index, sequence_size, tokenizer=None):
  """Turn a text into a fixed-length, left-padded list of word vectors.

  The text is tokenized, every token found in ``embeddings_index`` is mapped
  to its vector (unknown tokens are skipped), and at most ``sequence_size``
  vectors are kept.  Shorter sequences are padded at the front with zero
  vectors so the result always has ``sequence_size`` entries.

  Args:
    text_train: The text to embed.
    embeddings_index: Mapping-like object supporting ``in`` and item access
      that returns NumPy vectors (e.g. gensim KeyedVectors).
    sequence_size: Number of vectors in the returned list.
    tokenizer: Optional callable ``text -> iterable of tokens``.  Defaults to
      NLTK's ``word_tokenize``.

  Returns:
    A list of ``sequence_size`` NumPy vectors (zero padding first).

  Raises:
    ValueError: If no token is in the vocabulary, padding is required, and
      ``embeddings_index`` has no ``vector_size`` attribute from which the
      padding shape could be derived.
  """
  tokenize = word_tokenize if tokenizer is None else tokenizer

  vectors = []
  for word in tokenize(text_train):
    if len(vectors) >= sequence_size:
      break
    if word in embeddings_index:
      # Use the embedding that was passed in, not the notebook-level global.
      vectors.append(embeddings_index[word])

  missing = sequence_size - len(vectors)
  if missing <= 0:
    return vectors

  if vectors:
    pad_shape = vectors[0].shape
  else:
    # No known token at all: there is no vector to copy the shape from, so
    # fall back to the embedding's declared dimensionality.
    vector_size = getattr(embeddings_index, "vector_size", None)
    if vector_size is None:
      raise ValueError(
          "no token of the text is in the embedding vocabulary and the "
          "embedding exposes no 'vector_size' to build padding from"
      )
    pad_shape = (vector_size,)

  padding = [np.zeros(pad_shape) for _ in range(missing)]
  return padding + vectors


def get_embeddings(X_train, X_test,  embeddings_index, sequence_size):
  """Embed every text of the train and test collections with get_gloVe.

  Args:
    X_train: Iterable of training texts.
    X_test: Iterable of test texts.
    embeddings_index: Embedding lookup forwarded to get_gloVe.
    sequence_size: Fixed sequence length forwarded to get_gloVe.

  Returns:
    A pair ``(train_result, test_result)`` of lists, one get_gloVe result per
    input text, in input order.
  """
  def embed_all(texts):
    return [get_gloVe(text, embeddings_index, sequence_size) for text in texts]

  train_result = embed_all(X_train)
  test_result = embed_all(X_test)
  return train_result, test_result

In [13]:
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
#### Multi-hot encoding of the labels: one binary column per distinct label ########
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['labels'])

#### Get the text column; it is embedded later, inside the train/test split loop

X = df['text']

In [14]:
# Length of every entry of the text column, then its mean and median
# (truncated to int) as candidate fixed sequence sizes.
# NOTE(review): assuming the column holds strings, len() counts characters,
# whereas get_gloVe compares sequence_size against a count of token vectors —
# confirm the intended unit.
sizes = df['text'].apply(len)
sequence_size_first = int(np.array(sizes).mean())
sequence_size_second = int(np.median(sizes))

In [15]:
# Display the mean-based sequence size.
sequence_size_first

581

In [16]:
# Sanity check: X and y must have the same number of rows.
X.shape, y.shape

((453,), (453, 3))

In [29]:
import tensorflow_hub as hub
from nltk.tokenize import sent_tokenize, word_tokenize 

# Load the Universal Sentence Encoder (version 4) module from TF Hub;
# vectorize_sentences calls `embed` on lists of sentences.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

def vectorize_sentences(X, number):
    """Embed the first ``number`` sentences of every document.

    Each document is split into sentences with ``sent_tokenize``; the first
    ``number`` of them are encoded with the module-level ``embed`` model.
    Documents with fewer sentences are padded at the front with rows of
    zeros of width 512 (presumably the encoder's output width) so that
    every document yields ``number`` rows.

    Args:
        X: Iterable of document strings.
        number: Number of sentence rows per document.

    Returns:
        ``np.array`` stacking one per-document matrix per input document.
    """
    def encode(document):
        leading_sentences = sent_tokenize(document)[:number]
        vectors = embed(leading_sentences).numpy()
        shortfall = number - len(vectors)
        if shortfall <= 0:
            return vectors
        # Left-pad so the real sentences sit at the end of the sequence.
        return np.concatenate((np.zeros((shortfall, 512)), vectors), axis=0)

    return np.array([encode(document) for document in X])


In [35]:
# Mean number of sentences per document (truncated to int); used as the
# fixed number of sentence rows passed to vectorize_sentences.
sentence_length = int(np.mean(df['text'].apply(lambda x : len(sent_tokenize(x)))))

In [None]:
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
import tensorflow as tf
from sklearn.metrics import classification_report, multilabel_confusion_matrix, f1_score, accuracy_score # we can use gmean

from sklearn.metrics import classification_report, multilabel_confusion_matrix, accuracy_score, hamming_loss, jaccard_score, precision_score, recall_score


def to_labels(pos_probs, threshold):
  """Binarize probabilities: 1 where ``pos_probs >= threshold``, otherwise 0."""
  return (pos_probs >= threshold).astype('int')


# Single stratified 80/20 split with a fixed seed.
msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)

# The splitter yields integer positions. Index through a NumPy array so the
# lookup is positional; indexing the pandas Series directly is label-based
# and only coincides with positions for a default RangeIndex.
X_values = np.asarray(X)

for train_index, test_index in msss.split(X_values, np.array(y)):

  y_train, y_test = y[train_index], y[test_index]

  # Alternative word-level features:
  #   get_embeddings(X_train, X_test, embedding, sequence_size_first)
  X_train = vectorize_sentences(X_values[train_index], sentence_length)
  X_test = vectorize_sentences(X_values[test_index], sentence_length)
  print("shapes embeddings", X_train.shape, X_test.shape)
  print("shapes y", y_train.shape, y_test.shape)
  print(X_train.shape)

  # Output width follows the label matrix instead of a hard-coded 3.
  model = lstm(X_train.shape[1:], y_train.shape[1])
  # summary() prints by itself; wrapping it in print() emitted a stray "None".
  model.summary()

  # NOTE(review): the weights are computed (and printed) but the model is
  # compiled with plain binary cross-entropy; get_weighted_loss is not used.
  class_weights = calculating_class_weights(y_train)

  opt = tf.keras.optimizers.Adam(learning_rate=3e-4)
  model.compile(optimizer=opt, loss="binary_crossentropy")
  model.fit(X_train, tf.cast(y_train, tf.float32), epochs=80, validation_data=(X_test, tf.cast(y_test, tf.float32)))

  y_prob = model.predict(X_test)

  # Pick the decision threshold that maximizes weighted F1.
  # NOTE(review): the threshold is tuned on the same test split that is
  # reported below, so the reported scores are optimistic.
  thresholds = np.arange(0, 1, 0.005)
  scores = [f1_score(y_test, to_labels(y_prob, t), average="weighted") for t in thresholds]
  ix = np.argmax(scores)

  # Apply the threshold with the same `>=` rule that was used to score it
  # (previously scored with `>=` but applied with `>`).
  y_predict = to_labels(y_prob, thresholds[ix])

  print("accuracy: ", accuracy_score(y_test, y_predict))
  print("hamming loss: ", hamming_loss(y_test, y_predict))
  print("jaccard score: ", jaccard_score(y_test, y_predict, average='weighted'))
  print(multilabel_confusion_matrix(y_test, y_predict))
  print(classification_report(y_test, y_predict))

shapes embeddings (366, 6, 512) (87, 6, 512)
shapes y (366, 3) (87, 3)
(366, 6, 512)
Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_16 (LSTM)              (None, 3)                 6192      
                                                                 
 dense_22 (Dense)            (None, 48)                192       
                                                                 
 dropout_11 (Dropout)        (None, 48)                0         
                                                                 
 dense_23 (Dense)            (None, 3)                 147       
                                                                 
Total params: 6,531
Trainable params: 6,531
Non-trainable params: 0
_________________________________________________________________
None
3
[1.03977273 0.96315789]
[1.14375    0.88834951]
[0.70114943 1.74285714]
[[1.03977273 0.9631

In [18]:
from sklearn.metrics import classification_report, multilabel_confusion_matrix, f1_score, accuracy_score # we can use gmean


# Print, in order: the per-label classification report, the per-label
# confusion matrices and the subset accuracy for the last computed split.
for metric in (classification_report, multilabel_confusion_matrix, accuracy_score):
    print(metric(y_test, y_predict))

              precision    recall  f1-score   support

           0       0.56      1.00      0.72        48
           1       0.60      1.00      0.75        52
           2       0.35      0.92      0.51        26

   micro avg       0.51      0.98      0.68       126
   macro avg       0.50      0.97      0.66       126
weighted avg       0.53      0.98      0.69       126
 samples avg       0.53      0.99      0.67       126

[[[ 1 38]
  [ 0 48]]

 [[ 0 35]
  [ 0 52]]

 [[17 44]
  [ 2 24]]]
0.13793103448275862
