import numpy as np
import pandas as pd
import seaborn as sns
import spacy
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from tqdm.auto import tqdm
# Load spaCy's large English pipeline; it supplies the word vectors used below.
nlp = spacy.load("en_core_web_lg")

The vectors are generated by spaCy, which obtained them from a pre-trained GloVe model.

<center><h1>N-grams with Word embeddings</h1></center>

How can we incorporate N-gram features into our word embeddings? We use **convolution**.

The window size indicates the N in N-gram. For example, a kernel of size `3x300` corresponds to a 3-gram model, and so on.

In [2]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
# English stop-word list from the NLTK corpus (needs the "stopwords" data package).
stop = nltk.corpus.stopwords.words("english")

In [3]:
# Map Penn Treebank POS tags (as returned by nltk.pos_tag) to WordNet POS
# constants. A value is either a single WordNet constant, a list of candidate
# constants (the first one is the one used by get_lemma), or None for tags
# whose tokens should not be lemmatized.
tag_map = {
    'CC': None,                    # coordinating conjunction (and, but, or)
    'CD': wn.NOUN,                 # cardinal number (one, two)
    'DT': None,                    # determiner (a, the)
    'EX': wn.ADV,                  # existential "there" (there)
    'FW': None,                    # foreign word (mea culpa)
    'IN': wn.ADV,                  # preposition / subordinating conjunction (of, in, by)
    'JJ': [wn.ADJ, wn.ADJ_SAT],    # adjective (yellow)
    'JJR': [wn.ADJ, wn.ADJ_SAT],   # adjective, comparative (larger)
    'JJS': [wn.ADJ, wn.ADJ_SAT],   # adjective, superlative (wildest)
    'LS': None,                    # list item marker (1, 2, One)
    'MD': None,                    # modal (can, should)
    'NN': wn.NOUN,                 # noun, singular or mass (llama)
    'NNS': wn.NOUN,                # noun, plural (llamas)
    'NNP': wn.NOUN,                # proper noun, singular (IBM)
    'NNPS': wn.NOUN,               # proper noun, plural (Carolinas)
    'PDT': [wn.ADJ, wn.ADJ_SAT],   # predeterminer (all, both)
    'POS': None,                   # possessive ending ('s)
    'PRP': None,                   # personal pronoun (I, you, he)
    'PRP$': None,                  # possessive pronoun (your, one's); key was misspelled 'prp$'
    'RB': wn.ADV,                  # adverb (quickly, never)
    'RBR': wn.ADV,                 # adverb, comparative (faster)
    'RBS': wn.ADV,                 # adverb, superlative (fastest)
    'RP': [wn.ADJ, wn.ADJ_SAT],    # particle (up, off)
    'SYM': None,                   # symbol (+, %, &)
    'TO': None,                    # "to" (to)
    'UH': None,                    # interjection (uh, oops)
    'VB': wn.VERB,                 # verb, base form (eat)
    'VBD': wn.VERB,                # verb, past tense (ate)
    'VBG': wn.VERB,                # verb, gerund / present participle (eating)
    'VBN': wn.VERB,                # verb, past participle (eaten)
    'VBP': wn.VERB,                # verb, non-3rd-person singular present (eat)
    'VBZ': wn.VERB,                # verb, 3rd-person singular present (eats)
}

In [4]:
lemma = WordNetLemmatizer()


def get_lemma(text):
    """Lemmatize the whitespace-separated tokens of *text*.

    Each token is POS-tagged with ``pos_tag`` and its Treebank tag is looked
    up in ``tag_map``. Tokens whose tag is missing from the map, or mapped to
    ``None``, are dropped. When the map holds a list of WordNet POS
    candidates, the first one is used.

    Args:
        text: A string; it is split on whitespace before tagging.

    Returns:
        A list with the lemma of every token that was kept, in input order.
    """
    lemmas = []
    for token, tag in pos_tag(text.split()):
        wn_pos = tag_map.get(tag)
        if wn_pos is None:
            # Unknown tag or a tag deliberately excluded from lemmatization.
            continue
        if not isinstance(wn_pos, str):
            wn_pos = wn_pos[0]
        # No blanket try/except here: a real failure (e.g. the WordNet corpus
        # not being installed) must surface instead of silently yielding [].
        lemmas.append(lemma.lemmatize(token, pos=wn_pos))
    return lemmas

In [5]:
try:
    from termcolor import colored
except ImportError:  # colour is purely cosmetic; degrade to plain text
    def colored(text, *args, **kwargs):
        """Return *text* as a plain string when termcolor is unavailable."""
        return str(text)


def _is_blank(value):
    """Return True for missing values and for empty or whitespace-only strings."""
    if isinstance(value, str):
        return value == '' or value.isspace()
    return bool(pd.api.types.is_scalar(value) and pd.isna(value))


def cleaning(df):
    """Drop blank rows and duplicate rows from a single-column DataFrame.

    A row is blank when its value is an empty or whitespace-only string, or a
    missing value (None/NaN), which is how ``pd.read_csv`` represents an empty
    field. A short summary of what was removed is printed.

    Args:
        df: DataFrame with exactly one column (each ``itertuples`` row is
            unpacked as ``(index, value)``).

    Returns:
        A new DataFrame without blank or duplicate rows; *df* is not modified.
    """
    # remove blank rows
    blanks = [idx for idx, value in df.itertuples() if _is_blank(value)]
    if blanks:
        df = df.drop(blanks)
        print(f"found {colored(len(blanks),'red')} blanks")
    else:
        print("no blanks found")

    # remove duplicates (count once, reuse for the message and the decision)
    n_duplicates = int(df.duplicated().sum())
    if n_duplicates:
        print(f"dropped {colored(n_duplicates,'red')} : values")
        df = df.drop_duplicates()
    else:
        print("no duplicates found")
    return df

In [6]:
import re
import string
# Patterns used by preprocessing(); raw strings avoid invalid-escape warnings.
_EMAIL_RE = re.compile(r"\S+@\S+")
# NOTE: intended to strip URLs, but it removes every token that contains an
# interior dot (e.g. "3.5" or "end.Next"); kept as in the original pipeline.
_DOTTED_TOKEN_RE = re.compile(r"\S+\.\S+")
_NON_LETTER_RE = re.compile(r"[^a-z]+")


def _normalize_text(text, stop_words):
    """Lowercase *text* and keep only alphabetic, non-stop-word tokens longer than 2 chars."""
    text = text.lower()
    text = _EMAIL_RE.sub('', text)
    text = _DOTTED_TOKEN_RE.sub('', text)
    # Punctuation, digits, newlines and tabs all become token separators.
    text = _NON_LETTER_RE.sub(' ', text)
    # Filtering per token removes 1- and 2-letter words wherever they occur;
    # the earlier ' \w\w ' regex missed them at string edges and when adjacent.
    return " ".join(
        token for token in text.split()
        if len(token) > 2 and token not in stop_words
    )


def preprocessing(df, col):
    """Normalize and lemmatize the text column *col* of *df* in place.

    Steps per message: lowercase, remove e-mail addresses and dotted tokens,
    keep letters a-z only, drop stop words and words of one or two letters,
    then lemmatize with ``get_lemma``.

    Args:
        df: DataFrame holding the text; column *col* is overwritten.
        col: Name of the string column to clean.

    Returns:
        The same DataFrame object, with *col* replaced by the cleaned text.
    """
    stop_words = frozenset(stop)  # constant-time membership instead of a list scan
    df[col] = df[col].apply(
        lambda text: " ".join(get_lemma(_normalize_text(text, stop_words)))
    )
    return df

In [7]:
# remove empty strings  
# remove duplicates 
# remove 1 character words
# remove 2 characters words

In [8]:
# Load the tab-separated SMS spam collection.
df = pd.read_csv('smsspamcollection.tsv', sep='\t')

In [9]:
# Features are the raw message text; targets are the labels.
x, y = df['message'], df['label']

In [10]:
# Hold out 33% of the messages for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

In [11]:
# Wrap each split in a one-column DataFrame, as expected by cleaning().
x_train, x_test = (
    pd.DataFrame(part, columns=['message']) for part in (X_train, X_test)
)

In [12]:
# Drop blank and duplicate messages from both splits (train first, then test).
cleaned_train, cleaned_test = (cleaning(frame) for frame in (x_train, x_test))

no blanks found
dropped [31m210[0m : values
no blanks found
dropped [31m65[0m : values


In [21]:
import warnings

# Silence every warning for the rest of the session.
warnings.filterwarnings(action='ignore')

In [22]:
# Normalize and lemmatize the message column of both splits.
prep_train, prep_test = (
    preprocessing(frame, col='message')
    for frame in (cleaned_train, cleaned_test)
)

In [23]:
# Keep only the labels of the rows that survived cleaning (aligned by index label).
y_train = y_train.loc[prep_train.index]
y_test = y_test.loc[prep_test.index]

In [24]:
# 

In [25]:
def get_longest_text(texts):
    """Return the largest whitespace-token count among *texts* (0 when empty)."""
    return max((len(entry.split()) for entry in texts), default=0)

In [26]:
# Number of whitespace tokens in the longest training message.
longest_input = get_longest_text(prep_train['message'])
longest_input

61

In [27]:
# tensor layout: (number of messages, number of tokens, vector size)
# each message is padded to the longest message (61 tokens, see above) --> each token has a 300-dimensional vector
#

In [28]:
# Zero-padded embedding tensors of shape (n_messages, longest_input, 300).
# float32 matches the dtype of spaCy's token vectors and halves the memory
# needed compared with the float64 default of np.zeros.
train_emb = np.zeros((len(prep_train), longest_input, 300), dtype=np.float32)
test_emb = np.zeros((len(prep_test), longest_input, 300), dtype=np.float32)

In [29]:
train_emb[0]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(61, 300))

In [30]:
from tqdm.auto import tqdm

# Fill row i of train_emb with the token vectors of training message i.
# spaCy's tokenizer can produce more tokens than whitespace splitting (which
# is how longest_input was measured), so each document is capped at the
# tensor's time dimension to avoid an IndexError.
max_tokens = train_emb.shape[1]
for i, doc in tqdm(enumerate(nlp.pipe(prep_train['message'])), total=len(prep_train)):
    for j, token in enumerate(doc[:max_tokens]):
        train_emb[i, j] = token.vector

  0%|          | 0/3523 [00:00<?, ?it/s]

In [31]:
from tqdm.auto import tqdm

# Fill row i of test_emb with the token vectors of test message i.
# The time dimension was sized from the training set, so a longer test message
# (or finer spaCy tokenization) is truncated instead of raising an IndexError.
max_tokens = test_emb.shape[1]
for i, doc in tqdm(enumerate(nlp.pipe(prep_test['message'])), total=len(prep_test)):
    for j, token in enumerate(doc[:max_tokens]):
        test_emb[i, j] = token.vector

  0%|          | 0/1774 [00:00<?, ?it/s]

In [32]:
import tensorflow as tf
from tensorflow.keras.layers import Input,Reshape,Conv2D,MaxPool2D,Flatten,Dense,concatenate
from tensorflow.keras.models import Model

In [33]:
from sklearn.preprocessing import LabelEncoder

In [34]:
# Encode the string labels as integers; the encoder is fitted on the training
# labels only and then reused for the test labels.
lb = LabelEncoder().fit(y_train)
y_train, y_test = lb.transform(y_train), lb.transform(y_test)

# Try LSTM

In [37]:
import tensorflow as tf
from tensorflow.keras import layers, Sequential
from tensorflow.keras.preprocessing import sequence

In [38]:
# inputs = Input((longest_input, 300))
# lstm_1=layers.LSTM(64,return_sequences=True)(inputs)
# outputs=layers.Dense(1,activation='sigmoid')(lstm_1)
# model=Model(inputs=inputs,outputs=outputs)

In [39]:
# Stacked-LSTM binary classifier fed directly with the pre-computed spaCy
# embeddings, i.e. inputs shaped (n_samples, n_time_steps, embed_dim);
# no Embedding layer is needed.
lstm_stack = [
    layers.LSTM(32, return_sequences=True),  # one 32-dim output per time step
    layers.LSTM(16, return_sequences=True),  # one 16-dim output per time step
    layers.Dropout(0.3),
    layers.LSTM(8),                          # final state only: (None, 8)
    layers.Dropout(0.3),
    layers.Dense(1, activation='sigmoid'),   # single sigmoid output unit
]
model = Sequential(lstm_stack)

In [41]:
# Binary classification set-up: sigmoid output + binary cross-entropy.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [42]:
# Train on the padded embedding tensor, holding out 20% of it for validation.
model.fit(
    train_emb,
    y_train,
    epochs=10,
    batch_size=64,
    validation_split=0.2,
)

Epoch 1/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 46ms/step - accuracy: 0.8545 - loss: 0.5092 - val_accuracy: 0.8596 - val_loss: 0.4001
Epoch 2/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step - accuracy: 0.9262 - loss: 0.2514 - val_accuracy: 0.9674 - val_loss: 0.1582
Epoch 3/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - accuracy: 0.9617 - loss: 0.1442 - val_accuracy: 0.9504 - val_loss: 0.1463
Epoch 4/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step - accuracy: 0.9748 - loss: 0.1148 - val_accuracy: 0.9589 - val_loss: 0.1559
Epoch 5/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - accuracy: 0.9737 - loss: 0.1042 - val_accuracy: 0.9674 - val_loss: 0.1311
Epoch 6/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step - accuracy: 0.9851 - loss: 0.0811 - val_accuracy: 0.9674 - val_loss: 0.1281
Epoch 7/10
[1m45/45[0m [32m━━━━

<keras.src.callbacks.history.History at 0x1eedaf8acf0>

In [43]:
# Bidirectional-GRU binary classifier on the same pre-computed embeddings,
# i.e. inputs shaped (n_samples, n_time_steps, embed_dim).
gru_stack = [
    layers.Bidirectional(layers.GRU(64, return_sequences=True)),
    layers.Bidirectional(layers.GRU(64)),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
]
model2 = Sequential(gru_stack)
# 98%

In [44]:
# Same binary-classification training set-up as the LSTM model.
model2.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [45]:
# Train the GRU model, holding out 20% of the training tensor for validation.
model2.fit(
    train_emb,
    y_train,
    epochs=10,
    batch_size=64,
    validation_split=0.2,
)

Epoch 1/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 83ms/step - accuracy: 0.9056 - loss: 0.2525 - val_accuracy: 0.9631 - val_loss: 0.1100
Epoch 2/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 67ms/step - accuracy: 0.9705 - loss: 0.0810 - val_accuracy: 0.9702 - val_loss: 0.1029
Epoch 3/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 66ms/step - accuracy: 0.9833 - loss: 0.0502 - val_accuracy: 0.9773 - val_loss: 0.0973
Epoch 4/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 67ms/step - accuracy: 0.9894 - loss: 0.0339 - val_accuracy: 0.9716 - val_loss: 0.1011
Epoch 5/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 72ms/step - accuracy: 0.9933 - loss: 0.0211 - val_accuracy: 0.9702 - val_loss: 0.1066
Epoch 6/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 67ms/step - accuracy: 0.9950 - loss: 0.0133 - val_accuracy: 0.9674 - val_loss: 0.1318
Epoch 7/10
[1m45/45[0m [32m━━━━

<keras.src.callbacks.history.History at 0x1eedb254690>

In [3]:
# Toy corpus for the character-level language-model example.
text = 'Once upon a time, there was a little village...'

In [4]:
text

'Once upon a time, there was a little village...'

In [7]:
# Sorted list of the distinct characters in the corpus (the vocabulary).
distinct_chars = set(text)
chars = sorted(distinct_chars)

In [36]:
len(chars)

21

In [11]:
# Character <-> integer id lookups, plus the corpus encoded as ids.
char2idx = dict(zip(chars, range(len(chars))))
idx2char = np.array(chars)
text_as_int = np.array(list(map(char2idx.__getitem__, text)))

In [12]:
text_as_int

array([ 3, 12,  5,  6,  0, 18, 14, 13, 12,  0,  4,  0, 17,  9, 11,  6,  1,
        0, 17,  8,  6, 15,  6,  0, 20,  4, 16,  0,  4,  0, 10,  9, 17, 17,
       10,  6,  0, 19,  9, 10, 10,  4,  7,  6,  2,  2,  2])

In [93]:
# Slide a window of seq_length ids over the corpus every `step` positions;
# the id right after each window is the prediction target.
seq_length = 40
step = 3
window_starts = range(0, len(text_as_int) - seq_length, step)
sequences = [text_as_int[start:start + seq_length] for start in window_starts]
next_chars = [text_as_int[start + seq_length] for start in window_starts]

In [94]:
# Stack the windows and their targets into arrays (rebinds x and y).
x = np.array(sequences)
y = np.array(next_chars)

In [95]:
x.shape

(3, 40)

In [97]:
y.shape


(3,)

In [29]:
x

array([[ 3, 12,  5,  6,  0, 18, 14, 13, 12,  0,  4,  0, 17,  9, 11,  6,
         1,  0, 17,  8,  6, 15,  6,  0, 20,  4, 16,  0,  4,  0, 10,  9,
        17, 17, 10,  6,  0, 19,  9, 10],
       [ 6,  0, 18, 14, 13, 12,  0,  4,  0, 17,  9, 11,  6,  1,  0, 17,
         8,  6, 15,  6,  0, 20,  4, 16,  0,  4,  0, 10,  9, 17, 17, 10,
         6,  0, 19,  9, 10, 10,  4,  7],
       [14, 13, 12,  0,  4,  0, 17,  9, 11,  6,  1,  0, 17,  8,  6, 15,
         6,  0, 20,  4, 16,  0,  4,  0, 10,  9, 17, 17, 10,  6,  0, 19,
         9, 10, 10,  4,  7,  6,  2,  2]])

In [33]:
# Build stacked LSTM model for next-character prediction.
# NOTE: this rebinds `model` and `vocab_size` used by earlier cells.
vocab_size = len(chars)
embedding_dim = 32
rnn_units = 64
model = Sequential([
    # The Embedding `input_length` argument is deprecated in Keras 3; the
    # fixed sequence length is declared with an explicit Input layer instead.
    layers.Input(shape=(seq_length,)),
    layers.Embedding(vocab_size, embedding_dim),
    layers.LSTM(rnn_units, return_sequences=True),
    layers.Dropout(0.2),
    layers.LSTM(rnn_units),
    layers.Dropout(0.2),
    # One probability per character in the vocabulary.
    layers.Dense(vocab_size, activation='softmax')
])

In [34]:
# Targets are integer character ids, hence the sparse categorical loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.fit(x, y, batch_size=8, epochs=100)

Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - loss: 3.0474
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - loss: 3.0380
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step - loss: 3.0297
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - loss: 3.0200
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - loss: 3.0030
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - loss: 3.0009
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 131ms/step - loss: 2.9706
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step - loss: 2.9548
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - loss: 2.9103
Epoch 10/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - loss: 2.8794
Epoch 11/1

<keras.src.callbacks.history.History at 0x262b148d160>

# Another Way of using GloVe  

In [None]:
# Spacy Glove

In [None]:
# Tokenizer was never imported in this notebook; import it where it is used.
from tensorflow.keras.preprocessing.text import Tokenizer

# Build the word index from the preprocessed training messages. The text
# column produced by preprocessing() is 'message' ('text' does not exist).
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(prep_train['message'])
# +1 because word indices start at 1; index 0 is left for padding.
vocab_size = len(tokenizer.word_index) + 1

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# max_len was never defined; pad to the longest training message measured
# earlier (NOTE(review): confirm this is the intended sequence length).
max_len = longest_input

# The preprocessed text lives in the 'message' column ('text' does not exist).
X_train_seq = tokenizer.texts_to_sequences(prep_train['message'])
X_test_seq  = tokenizer.texts_to_sequences(prep_test['message'])

# Pad with zeros after the real tokens.
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_pad  = pad_sequences(X_test_seq,  maxlen=max_len, padding='post')

In [None]:
print("Creating embedding matrix...")
# Row i holds the spaCy vector of the word whose tokenizer index is i.
# Row 0 (padding) and words without a vector stay all-zero. The width of 300
# matches the embedding tensors built earlier in this notebook.
embedding_matrix = np.zeros((vocab_size, 300), dtype=np.float32)
hits = 0
misses = 0
for word, i in tokenizer.word_index.items():
    if i >= vocab_size:
        continue
    # Look the word up in the vocabulary instead of running the full pipeline:
    # nlp(word) builds a Doc, whose has_vector is true whenever the model
    # ships vectors at all, so misses would never be counted.
    lexeme = nlp.vocab[word]
    if lexeme.has_vector:
        embedding_matrix[i] = lexeme.vector
        hits += 1
    else:
        misses += 1

print(f"Converted {hits} words. Missed {misses} words.")

In [None]:
from tensorflow.keras import regularizers

# n_classes was never defined; derive it from the fitted label encoder.
# NOTE(review): with integer labels this softmax head presumably needs
# sparse_categorical_crossentropy at compile time. Confirm.
n_classes = len(lb.classes_)

# Bidirectional LSTM on top of a frozen embedding layer initialised from
# embedding_matrix (NOTE: this rebinds `model2` from the GRU experiment).
model2=Sequential([

    layers.Embedding(input_dim=vocab_size,output_dim=300,weights=[embedding_matrix],trainable=False),
    layers.Bidirectional(layers.LSTM(128,return_sequences=False, recurrent_dropout=0.2)),    
    layers.Dropout(0.3),
    layers.Dense(64, activation='relu'),
    layers.Dense(n_classes, activation='softmax')

])