In [1]:
from functions import *
import wordninja
import requests
import random

# Torch cannot work properly in jupyter notebook
import os
count = 0 
if count == 0:
    os.chdir("test_dir")
    count += 1


  result = re.sub('coroll\.', 'coroll', result)
  result = re.sub('pt\.', 'pt', result)
  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to C:\Users\Zoe
[nltk_data]     Lua\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
os.getcwd()

'c:\\Users\\Zoe Lua\\DSA4266_Grp2\\test_dir'

In [3]:
## CONFIG

df_path = "../Data/full_df.pkl"
X_name = 'processed'
y_name = 'class'

#### For preprocessing
all_maxlen_per_sent = [150]
all_token_max_words = [5000]


In [4]:
## Semantic Dictionaries

def get_synonyms_conceptnet(word):
    synonyms = []
    url = f'http://api.conceptnet.io/c/en/{word}?filter=/c/en'
    response = requests.get(url)
    data = response.json()
    for edge in data['edges']:
        if edge['rel']['label'] == 'Synonym' and edge['start']['language'] == 'en' and edge['end']['language'] == 'en':
            start = edge['start']['label']
            end = edge['end']['label']
            synonyms.append(end if start == word else start)

    if synonyms != []:
        synonym = random.choice(synonyms)
    else:
        synonym = synonyms
    return synonym

def get_synonyms_wordnet(word):
    synonyms = []
    synsets = wordnet.synsets(word)
    for synset in synsets:
        synonyms.extend([lemma.name() for lemma in synset.lemmas() if lemma.name() != word])

    if synonyms != []:
        synonym = random.choice(synonyms)
    else:
        synonym = synonyms
    return synonym

In [5]:
class DataPrep():
    def __init__(self, subset = None, text_prep = 'lem', token_max_words = 5000, maxlen_per_sent = 150, undersample = True):
        """
        subset: X[:subset]
        """
        self.df = pd.read_pickle(df_path)
        self.subset = subset
        self.maxlen_per_sent = maxlen_per_sent

        self.remove_duplicates()
        print('Dupes removed')
        self.X = self.df[X_name]
        self.y = self.df[y_name].apply(lambda x: 1 if x == 'spam' else 0)
        self.token_max_words = token_max_words

        if self.subset:
            self.X = self.X[:self.subset]
            self.y = self.y[:self.subset]
        
        print('Tokenizing..')
        self.tokenize()
        print('Finished Tokenizing')

        print('Initialising word2vec')
        self.word_to_vec_map = self.word2vec()

        print('lemm/stemm')
        if text_prep == 'lem':
            self.X = self.lemming()
        if text_prep == 'stem':
            self.X = self.stemming()

        print('Embedding...')
        self.emb_matrix = self.tok_embedding_mat(alternative = [get_synonyms_conceptnet, get_synonyms_wordnet])
        print('Finished embedding')

        print('Padding')
        X_pad = self.pad()
        print('Finished padding')

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X_pad, self.y, test_size=0.33, random_state=42)

        if undersample:
            print('Undersampling..')
            print(Counter(self.y_train))
            self.X_train, self.y_train = self.undersample(self.X_train, self.y_train)
            print(Counter(self.y_train))




    def remove_duplicates(self):
    
        ## First remove all those X values with differing binary y values
        occurrences = self.df.groupby([X_name, y_name]).size().reset_index(name='count')
        duplicates = occurrences[occurrences.duplicated(subset=X_name, keep=False)]
        for index, row in duplicates.iterrows():
            x_value = row[X_name]
            max_count = occurrences[(occurrences[X_name] == x_value)].max()['count']
            occurrences.drop(occurrences[(occurrences[X_name] == x_value) & (occurrences['count'] != max_count)].index, inplace=True)

        ## Remove duplicates
        self.df = occurrences.drop_duplicates(subset = X_name).reset_index(drop = True)
    
    def tokenize(self, join = False):
        def tokenize_helper(text, join = False):
            stop_words = set(stopwords.words('english'))
            tokens = word_tokenize(text)
            tokens = [word.lower() for word in tokens if word.lower() not in stop_words]

            if join:
                tokens = ' '.join([''.join(c for c in word if c not in string.punctuation) for word in tokens if word])
        
            return tokens
        
        self.X = self.X.apply(lambda x: tokenize_helper(x, join))

    ## Embedders
        
    def word2vec(self):
        from gensim.models.word2vec import Word2Vec
        import gensim.downloader as api

        word_to_vec_map = api.load("word2vec-google-news-300")

        return word_to_vec_map
    
    
    ## Stemming/ Lemmetization

    def stemming(self):
        ps = PorterStemmer()

        def stem(row):
            print(row)
            stemmed = []
            for word in row:
                stemmed += [ps.stem(word)]
            print('STEMMED:', stemmed)

            return stemmed

        return self.X.apply(stem)
    

    def lemming(self):

        def lem(row):
            lemmatizer = WordNetLemmatizer()
            lemmed = [lemmatizer.lemmatize(word) for word in row]
            # print(row)
            # print(lemmed,"\n")
            return lemmed

        return self.X.apply(lem)
    
    def tok_embedding_mat(self, alternative):
        """
        embedder: word2vec
        alternative: list of callable to find synonyms from, inorder of precedence
        """
        

        self.tokenizer = text.Tokenizer(num_words=self.token_max_words)
        self.tokenizer.fit_on_texts(self.X)

        self.sequences = self.tokenizer.texts_to_sequences(self.X)

        self.word_index = self.tokenizer.word_index
        self.vocab_len = len(self.word_index) + 1
        self.embed_vector_len = self.word_to_vec_map['moon'].shape[0]

        emb_matrix = np.zeros((self.vocab_len, self.embed_vector_len))


        for word, index in tqdm.tqdm(self.word_index.items(), total = len(self.word_index)):
            try:
                embedding_vector = self.word_to_vec_map[word]
                emb_matrix[index-1, :] = embedding_vector
            except:
                for dictionary in alternative:
                    try: 
                        synonym = dictionary(word)
                        if synonym:
                            # print(f'Found synonym: {synonym} for word: {word}')
                            embedding_vector = self.word_to_vec_map[synonym] 
                            emb_matrix[index-1, :] = embedding_vector
                            break
                    except:
                        continue
        pd.to_pickle(emb_matrix, f"../embeddings/emb_matrix_x{self.subset}_tok_{self.maxlen_per_sent}_len{self.token_max_words}.pkl")

        return emb_matrix


    def pad(self):
        X_pad = pad_sequences(self.sequences, maxlen = self.maxlen_per_sent)
        return X_pad

    def undersample(self):
        undersampler = RandomUnderSampler(random_state=42)
        X_resampled, y_resampled = undersampler.fit_resample(self.X_train, self.y_train)

        return X_resampled, y_resampled


class Train(DataPrep):
    def __init__(self, nodes = 256, subset = None, text_prep = 'lem', token_max_words = 5000, maxlen_per_sent = 150, undersample = True):
        super().__init__(subset, text_prep, token_max_words, maxlen_per_sent, undersample)

        self.nodes = nodes

        self.model = Sequential()
        self.model.add(Embedding(input_dim= self.vocab_len, output_dim= self.embed_vector_len, input_shape = (self.maxlen_per_sent,), trainable=False, embeddings_initializer = initializers.Constant(self.emb_matrix)))
        self.model.add(LSTM(self.nodes))
        self.model.add(Dense(1, activation = 'sigmoid'))

        self.model.compile(optimizer='adam',
                    loss='binary_crossentropy',
                    metrics=['accuracy'])

        # Train model
        self.model.fit(self.X_train, self.y_train, epochs=10, batch_size=1, verbose=1)  

    def predict(self, verbose = False):

        loss, accuracy = self.model.evaluate(self.X_test, self.y_test)
        print("Test Accuracy:", accuracy)

        # Make predictions
        predictions = self.model.predict(self.X_test)

        y_hat = [1 if i> 0.5 else 0 for i in predictions]

        if verbose:
            print("Classification Report:")
            print(classification_report(self.y_test, y_hat))

            print("Confusion Matrix:")
            print(confusion_matrix(self.y_test, y_hat))


class optimize():
    pass
    


In [6]:
test = Train(subset = 500)


Dupes removed
Tokenizing..
Finished Tokenizing
Initialising word2vec
lemm/stemm
Embedding...


100%|██████████| 11219/11219 [2:45:27<00:00,  1.13it/s]  


Finished embedding
Padding
Finished padding
Undersampling..
Counter({0: 196, 1: 139})


TypeError: DataPrep.undersample() takes 1 positional argument but 3 were given

In [None]:
test.predict()

[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 133ms/step - accuracy: 0.8667 - loss: 0.6358
Test Accuracy: 0.8666666746139526
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 182ms/step


In [None]:
# Define the objective function
def objective(trial):
    # Define hyperparameters to optimize
    units = trial.suggest_categorical("units", [32, 64, 128])
    epochs = trial.suggest_categorical("epochs", [10, 20, 30])
    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128])
    
    model = Sequential()
    self.model.add(Embedding(input_dim= self.vocab_len, output_dim= self.embed_vector_len, input_shape = (self.maxlen_per_sent,), trainable=False, embeddings_initializer = initializers.Constant(self.emb_matrix)))
    self.model.add(LSTM(self.nodes))
    self.model.add(Dense(1, activation = 'sigmoid'))

    self.model.compile(optimizer='adam',
                    loss='binary_crossentropy',
                    metrics=['accuracy'])

    self.model.fit(self.X_train, self.y_train, epochs=10, batch_size=1, verbose=1)  

    # Evaluate the model
    _, accuracy = model.evaluate(X_val, y_val, verbose=0)

    return accuracy

# Define data and other constants
timesteps = ...
features = ...
num_classes = ...

# Create study object and optimize hyperparameters
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)

# Get best hyperparameters and results
best_trial = study.best_trial
best_params = best_trial.params
best_accuracy = best_trial.value

print("Best hyperparameters:", best_params)
print("Best accuracy:", best_accuracy)

## To try:

- GloVe + LSTM + Ray Tune (Hyperpara Tune)
<!-- - GloVe + LSTM  -->
- subword tokenization

# Semantic Embedders

## GloVe

In [12]:
# from gensim.models import KeyedVectors
# words_to_index = tokenizer.word_index

# def read_glove_vector(glove_vec):
#   with open(glove_vec, 'r', encoding='UTF-8') as f:
#     words = set()
#     word_to_vec_map = {}
#     for line in f:
#       w_line = line.split()
#       curr_word = w_line[0]
#       word_to_vec_map[curr_word] = np.array(w_line[1:], dtype=np.float64)

#   return word_to_vec_map

# def glove_embed(path = '../GloVe/glove.6B.50d.txt', words_to_index):
#   """
#   path: Path to glove txt file
#   words_to_index: tokenizer.word_index
#   """
#   word_to_vec_map = read_glove_vector(path)
#   vocab_len = len(words_to_index)
#   embed_vector_len = word_to_vec_map['moon'].shape[0]

#   emb_matrix = np.zeros((vocab_len, embed_vector_len))

#   for word, index in words_to_index.items():
#     embedding_vector = word_to_vec_map.get(word)
#     if embedding_vector is not None:
#       emb_matrix[index, :] = embedding_vector


## Concept Net
Takes really long to load too

In [14]:
import requests
import random

def get_synonyms_conceptnet(word):
    synonyms = []
    url = f'http://api.conceptnet.io/c/en/{word}?filter=/c/en'
    response = requests.get(url)
    data = response.json()
    for edge in data['edges']:
        if edge['rel']['label'] == 'Synonym' and edge['start']['language'] == 'en' and edge['end']['language'] == 'en':
            start = edge['start']['label']
            end = edge['end']['label']
            synonyms.append(end if start == word else start)

    if synonyms != []:
        synonym = random.choice(synonyms)
    else:
        synonym = synonyms
    return synonym


# Example usage
# word = 'happy'
# synonyms = get_synonyms_conceptnet(word)
# print(synonyms)


In [15]:
# obj = requests.get("http://api.conceptnet.io/c/en/example").json()

## Stemming & Lemmatization

In [17]:
# from nltk.stem import PorterStemmer
# nltk.download("punkt")

# ps = PorterStemmer()

# def stem(row):
#     print(row)
#     stemmed = []
#     for word in row:
#         stemmed += [ps.stem(word)]
#     print('STEMMED:', stemmed)

#     return stemmed

# X.apply(stem)

In [18]:
def lem(row):
    lemmatizer = WordNetLemmatizer()
    lemmed = [lemmatizer.lemmatize(word) for word in row]
    # print(row)
    # print(lemmed,"\n")
    return lemmed

lemmed_X = X.apply(lem)

## Embedding

In [39]:
from tensorflow.keras.preprocessing.text import Tokenizer

def tok_embedding_mat(X, y, alternative, token_max_words = 5000 ):
    """
    embedder: word2vec or GloVe
    alternative: list of callable to find synonyms from, inorder of precedence
    """
    

    tokenizer = Tokenizer(num_words=token_max_words)
    tokenizer.fit_on_texts(X)

    sequences = tokenizer.texts_to_sequences(X)

    word_index = tokenizer.word_index
    vocab_len = len(word_index) + 1
    embed_vector_len = word_to_vec_map['moon'].shape[0]

    emb_matrix = np.zeros((vocab_len, embed_vector_len))


    for word, index in word_index.items():
      try:
        embedding_vector = word_to_vec_map[word]
        emb_matrix[index-1, :] = embedding_vector
      except:
        for dictionary in alternative:
            try:
              synonym = dictionary(word)
              if synonym:
                  print(f'Found synonym: {synonym} for word: {word}')
                  embedding_vector = word_to_vec_map[synonym] 
                  emb_matrix[index-1, :] = embedding_vector
                  break
            except:
               continue

    return tokenizer, emb_matrix




In [40]:
tokenizer, emb_matrix = tok_embedding_mat(lemmed_X, df['class'], [get_synonyms_conceptnet, get_synonyms_wordnet])

Found synonym: canton for word: guangzhou
Found synonym: Guangdong province for word: guangdong
Found synonym: Guangdong for word: guangdong
Found synonym: swine fever for word: csf
Found synonym: Nanking for word: nanjing
Found synonym: Humboldt for word: humboldt
Found synonym: Hangchow for word: hangzhou
Found synonym: Hangchow for word: hangzhou
Found synonym: key performance indicator for word: kpi
Found synonym: Chungking for word: chongqing
Found synonym: Tientsin for word: tianjin
Found synonym: Mukden for word: shenyang
Found synonym: Wuhan for word: wuhan
Found synonym: Yunnan province for word: yunnan
Found synonym: Yunnan for word: yunnan
Found synonym: Taipeh for word: taipei
Found synonym: chang jiang for word: yangtze
Found synonym: Yangtze_River for word: yangtze
Found synonym: Hunan province for word: hunan
Found synonym: Hunan_province for word: hunan
Found synonym: dalinian for word: dalian
Found synonym: Dalian for word: dalian
Found synonym: glutamine for word: gln

In [41]:
emb_matrix

array([[-0.23339844,  0.0189209 , -0.10302734, ..., -0.20214844,
        -0.18652344,  0.22070312],
       [-0.03564453, -0.13378906, -0.07324219, ...,  0.02954102,
        -0.08496094, -0.22363281],
       [-0.04736328,  0.1875    ,  0.0022583 , ..., -0.0035553 ,
        -0.0625    , -0.05566406],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.15917969,  0.06787109,  0.01477051, ..., -0.03295898,
         0.03662109,  0.08984375],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [42]:
pd.to_pickle(emb_matrix, "../embeddings/emb_matrix_x_2000.pkl")

In [44]:
sequences  = tokenizer.texts_to_sequences(X)
word_index = tokenizer.word_index
vocab_len = len(word_index) + 1
embed_vector_len = word_to_vec_map['moon'].shape[0]

In [55]:
X_pad = pad_sequences(sequences, maxlen=150)
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.33, random_state=42)

## LSTM

In [46]:
len(sequences)

2000

In [56]:
y_train

83      1
938     0
1045    0
391     1
1057    0
       ..
1158    0
1331    0
882     1
1498    0
1154    1
Name: class, Length: 1340, dtype: int64

In [57]:
model = Sequential()
model.add(Embedding(input_dim= vocab_len, output_dim= embed_vector_len, input_shape = (150,), trainable=False, embeddings_initializer = initializers.Constant(emb_matrix)))
model.add(LSTM(256))
model.add(Dense(1, activation = 'sigmoid'))

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train model
model.fit(X_train, y_train, epochs=10, batch_size=1, verbose=1)

  super().__init__(**kwargs)


Epoch 1/10
[1m1340/1340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m285s[0m 211ms/step - accuracy: 0.8360 - loss: 0.3885
Epoch 2/10
[1m1340/1340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m248s[0m 185ms/step - accuracy: 0.9879 - loss: 0.0399
Epoch 3/10
[1m1340/1340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m256s[0m 191ms/step - accuracy: 0.9974 - loss: 0.0095
Epoch 4/10
[1m1340/1340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m304s[0m 227ms/step - accuracy: 1.0000 - loss: 0.0014
Epoch 5/10
[1m1340/1340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m312s[0m 233ms/step - accuracy: 1.0000 - loss: 1.3424e-04
Epoch 6/10
[1m1340/1340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m284s[0m 212ms/step - accuracy: 1.0000 - loss: 3.6908e-05
Epoch 7/10
[1m1340/1340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m292s[0m 218ms/step - accuracy: 1.0000 - loss: 1.6421e-05
Epoch 8/10
[1m1340/1340[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m275s[0m 205ms/step - accuracy: 1.

<keras.src.callbacks.history.History at 0x28265412960>

In [58]:
from sklearn.metrics import classification_report, confusion_matrix

loss, accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", accuracy)

# Make predictions
predictions = model.predict(X_test)

y_hat = [1 if i> 0.5 else 0 for i in predictions]

## Matrix
print("Classification Report:")
print(classification_report(y_test, y_hat))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_hat))

[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 118ms/step - accuracy: 0.9501 - loss: 0.3285
Test Accuracy: 0.9651514887809753
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 118ms/step
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.97      0.97       451
           1       0.94      0.95      0.95       209

    accuracy                           0.97       660
   macro avg       0.96      0.96      0.96       660
weighted avg       0.97      0.97      0.97       660

Confusion Matrix:
[[438  13]
 [ 10 199]]


To do:
- Find ideal tokenizer MAX_WORDS
- Find ideal padding length/dimensions
- Find ideal LSTM Nodes
- Find idea lepochs