In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm.auto import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import spacy
nlp = spacy.load('en_core_web_lg')

vectors generated by Spacy gained from pre-trained GloVe model

In [2]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
stop=nltk.corpus.stopwords.words('english')

In [3]:
# creat tag map between treebank and wordnet
# tree bank : collection of syntactically annotated sentences
tag_map = {
"CC": None, #formatted. conjunction (and, but, or)
'CD': wn.NOUN, # original number (one,two)
'DT': None, # delimiter (a, the)
'EX': wn.ADV, #and my "there" (there)
"FW": None, # foreign word (mea culpa)
"IN": wn.ADV,# preposition/subjunctive conjunction (of, in,by) 
'JJ': [wn.ADJ, wn.ADJ_SAT],# adjective # (yellow)
'JJR': [wn.ADJ, wn.ADJ_SAT], # add, comparator (larger)
"JJS": [wn.ADJ, wn.ADJ_SAT], # add, superlative (wildest)
"LS": None, #list item tag(1,2,One)
'MD': None, # modal (can, should)
'NN': wn.NOUN, #noun, sing. or nugget (llama)
'NNS': wn.NOUN, #noun, plural (Llamas)
"NNP": wn.NOUN, # proper noun, vocals. (IBM)
'NNPS': wn.NOUN, # proper noun, plural (Carolina)
"PDT": [wn.ADJ, wn.ADJ_SAT], # predeterminer (all, both)
'POS':None, # end of ownership
'PRP': None, # personal pronoun (I, you, he)
'prp$': None, # possessive pronoun (your, one's)
"RB": wn.ADV, #adv (quickly, never)
'RBR': wn.ADV, # adverb, comparative (faster)
'RBS':wn.ADV,# adverb, superlative (fastest)
'RP': [wn.ADJ, wn.ADJ_SAT], # particle (up, off)
'SYM': None, #symbol, # (+, %, &)
'TO': None, # 'to' (to)
'UH': None, # interjection (uh, oops)
'VB': wn.VERB,# verb base form # (eat)
'VBD': wn.VERB, # verb past tense (eat)
'VBG': wn.VERB, #verb gerund (to eat)
'VBN': wn.VERB, # past participle (eaten)
"VBP": wn.VERB, #non-3sg pres verb (eat)
'VBZ': wn.VERB,# verb зsg pres (egts) 
}

In [4]:
lemma=WordNetLemmatizer()
def get_lemma(text):
    l=[]
    tags=pos_tag(text.split())
    for token,tag in tags:
        try:
             l.append(lemma.lemmatize(token,pos=tag_map[tag][0]))
        except:
            continue
    return l

In [5]:
from termcolor import colored
def cleaning(df):
    """
    - remove empty strings
    - remove duplicates
    """
    # remove empty string
    blanks=[]
    for i ,v in df.itertuples():
        if pd.isna(v) or (isinstance(v, str) and (v.strip() == "")):
            blanks.append(i)
    if blanks:
        df=df.drop(blanks)
        print(f"found {colored(len(blanks),'red')} blanks")
    else:
        print("no blanks found")

    # remove duplicates
    if df.duplicated().sum() !=0:
        print(f"dropped {colored(df.duplicated().sum(),'red')} : values")
        df=df.drop_duplicates()
    else:
        print("no duplicates found")
    return df

In [6]:
import re
import string
def preprocessing(df,col):

     # split quotation marks and words
    df[col]=df[col].apply(lambda x: re.sub(r"([?.!,¿])", r" \1 ", x))
    # lowercasing
    df[col]=df[col].apply(lambda x: x.lower())
    # remove emails
    df[col]=df[col].apply(lambda x: re.sub("\S+\@\S+",'',x))
    # remove URls
    df[col]=df[col].apply(lambda x: re.sub("\S+\.\S+",'',x))
    # remove punctations
    # df[col]=df[col].apply(lambda x: re.sub(f'[{re.escape(string.punctuation)}]',' ',x))
    # remove stopwords
    # df[col]=df[col].apply(lambda x :" ".join(t for t in x.lower().split() if t not in stop))
    # remove \n or \t
    df[col]=df[col].apply(lambda x: re.sub('\n|\t',' ',x))
    # remove 2 characters words
    df[col]=df[col].apply(lambda x: re.sub(r' \w\w ',' ',x) )
    # remove 1 character words
    df[col]=df[col].apply(lambda x: re.sub(r' \w ',' ',x) )
    # remove extra spaces more than or equals "2" 
    df[col]=df[col].apply(lambda x: re.sub(" {2,}",' ',x))
    # select alphapetical only
    df[col]=df[col].apply(lambda x: re.sub("[^a-z]+",' ',x) )
    # strip string
    df[col]=df[col].apply(lambda x: x.strip())
    # get lemma
    df[col]=df[col].apply(lambda x: " ".join(get_lemma(x)))
    return df

In [7]:
# remove empty strings  
# remove duplicates 
# remove 1 character words
# remove 2 characters words

In [9]:
df=pd.read_csv(r'smsspamcollection.tsv',sep='\t')

In [10]:
x=df['message']
y=df['label']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.33,random_state=42)

In [12]:
x_train=pd.DataFrame(X_train,columns=['message'])
x_test=pd.DataFrame(X_test,columns=['message'])

In [13]:
cleaned_train=cleaning(x_train)
cleaned_test=cleaning(x_test)

no blanks found
dropped [31m210[0m : values
no blanks found
dropped [31m65[0m : values


In [14]:
import warnings
warnings.filterwarnings('ignore')

In [15]:
prep_train=preprocessing(cleaned_train,col='message')
prep_test=preprocessing(cleaned_test,col='message')

In [16]:
y_train=y_train[prep_train.index]
y_test=y_test[prep_test.index]

In [17]:
from sklearn.preprocessing import LabelEncoder

lb=LabelEncoder()
y_train=lb.fit_transform(y_train)
y_test=lb.transform(y_test)

In [18]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [19]:
#does we cound vacab before cleaning and removing stopwords or without removing stopwords

In [20]:
x

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: message, Length: 5572, dtype: object

In [22]:
# import re
# text = ' '.join(x)
# words = re.findall(r'\b\w+\b', text.lower())
# unique_words = set(words)
# vocab_size = len(unique_words)

In [27]:
# vocab_size

In [28]:
# vocab_size=10000
maxlen=66
tokenizer=Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(prep_train['message'])
vocab_size = len(tokenizer.word_index) + 1
vocab_size

5431

In [29]:
sequences=tokenizer.texts_to_sequences(prep_train['message'])
# (num_samples, num_timesteps)
padded_sequences=pad_sequences(sequences,padding='post',maxlen=maxlen)

In [30]:
padded_sequences.shape

(3523, 66)

In [31]:
padded_sequences

array([[ 268,   11,  682, ...,    0,    0,    0],
       [   3,   31,  959, ...,    0,    0,    0],
       [1730, 1348,   30, ...,    0,    0,    0],
       ...,
       [  72,   56,   12, ...,    0,    0,    0],
       [1046,  620, 1316, ...,    0,    0,    0],
       [  12,   14,   35, ...,    0,    0,    0]],
      shape=(3523, 66), dtype=int32)

In [32]:
import tensorflow as tf
from tensorflow.keras import layers, Sequential
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence

In [33]:
model=Sequential([
    # input shape (None, 66, 10000)
    # (n_samples,n_time_steps,embed_dimen)
    # layers.Embedding(input_dim=vocab_size,output_dim=64,input_length=66) #input_length is optional,# (None, 66, 300)
    layers.Embedding(input_dim=vocab_size,output_dim=100),#output (None, 66, 300)
    layers.LSTM(64),#(None, 66) 
    layers.Dense(32,activation='relu'),
    # layers.Dropout(0.3),
    layers.Dense(1,activation='sigmoid')

])
# 85%

In [34]:
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.fit(padded_sequences,y_train,validation_split=0.2,epochs=10,batch_size=64)

Epoch 1/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 35ms/step - accuracy: 0.8520 - loss: 0.4215 - val_accuracy: 0.8596 - val_loss: 0.4082
Epoch 2/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 26ms/step - accuracy: 0.8705 - loss: 0.3878 - val_accuracy: 0.8596 - val_loss: 0.4089
Epoch 3/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step - accuracy: 0.8705 - loss: 0.3880 - val_accuracy: 0.8596 - val_loss: 0.4095
Epoch 4/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step - accuracy: 0.8705 - loss: 0.3887 - val_accuracy: 0.8596 - val_loss: 0.4058
Epoch 5/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - accuracy: 0.8705 - loss: 0.3900 - val_accuracy: 0.8596 - val_loss: 0.4061
Epoch 6/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - accuracy: 0.8705 - loss: 0.3878 - val_accuracy: 0.8596 - val_loss: 0.4057
Epoch 7/10
[1m45/45[0m [32m━━━━

<keras.src.callbacks.history.History at 0x153d9fca660>

In [35]:
model2=Sequential([
    # input shape (None, 66, 300)
    # (n_samples,n_time_steps,embed_dimen)
    layers.Embedding(input_dim=vocab_size,output_dim=100),#output (None, 66, 300)
    layers.Bidirectional(layers.GRU(64,return_sequences=True)),
    layers.Bidirectional(layers.GRU(64)),
    layers.Dense(32,activation='relu'),
    layers.Dense(1,activation='sigmoid')

])
# 98%

In [36]:
model2.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model2.fit(padded_sequences,y_train,validation_split=0.2,epochs=10,batch_size=64)

Epoch 1/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 89ms/step - accuracy: 0.8875 - loss: 0.3005 - val_accuracy: 0.9603 - val_loss: 0.1218
Epoch 2/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 70ms/step - accuracy: 0.9876 - loss: 0.0491 - val_accuracy: 0.9787 - val_loss: 0.0835
Epoch 3/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 75ms/step - accuracy: 0.9968 - loss: 0.0173 - val_accuracy: 0.9816 - val_loss: 0.0849
Epoch 4/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 77ms/step - accuracy: 0.9975 - loss: 0.0080 - val_accuracy: 0.9716 - val_loss: 0.1077
Epoch 5/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 73ms/step - accuracy: 0.9996 - loss: 0.0014 - val_accuracy: 0.9816 - val_loss: 0.0961
Epoch 6/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 71ms/step - accuracy: 0.9993 - loss: 0.0018 - val_accuracy: 0.9830 - val_loss: 0.0952
Epoch 7/10
[1m45/45[0m [32m━━━

<keras.src.callbacks.history.History at 0x153da0ea490>

In [37]:
y_train

array([0, 0, 0, ..., 0, 0, 0], shape=(3523,))

In [38]:
model.summary()

# Try using embedding (None,300,1)
make each message being represented with one vector of size 300

In [86]:
train_v=np.zeros((len(prep_train),300))
test_v=np.zeros((len(prep_test),300))

In [87]:
from tqdm.auto import tqdm

In [88]:
for i,doc in tqdm(enumerate(nlp.pipe(prep_train['message'])),total=len(prep_train)):
    train_v[i,:] =doc.vector

  0%|          | 0/3523 [00:00<?, ?it/s]

In [89]:
train_v

array([[-0.0279815 , -0.269225  , -0.23353499, ...,  0.100813  ,
        -0.03346   ,  0.26445001],
       [-0.1392675 ,  0.1478274 , -0.12691452, ..., -0.2005147 ,
         0.13427116,  0.10181274],
       [ 0.15467668, -0.021534  , -0.00230667, ...,  0.01061   ,
         0.07159001, -0.10689136],
       ...,
       [-0.0696772 ,  0.10397701, -0.16257419, ..., -0.0448302 ,
         0.1180622 ,  0.0576344 ],
       [-0.28794026,  0.00544374, -0.11241525, ...,  0.09337795,
         0.12980551,  0.21480799],
       [-0.0931615 ,  0.2744745 , -0.42537424, ..., -0.12483674,
         0.08389725,  0.14581725]], shape=(3523, 300))

In [90]:
for i,doc in tqdm(enumerate(nlp.pipe(prep_test['message'])),total=len(prep_test)):
    test_v[i,:] =doc.vector

  0%|          | 0/1774 [00:00<?, ?it/s]

In [91]:
train_v=np.expand_dims(train_v,axis=-1)

In [93]:
train_v.shape

(3523, 300, 1)

In [94]:
# model=Sequential([
#     # input shape (None, 66, 300)
#     # (n_samples,n_time_steps,embed_dimen)
#     # layers.Embedding(input_dim=vocab_size,output_dim=100),#output (None, 66, 300)
#     layers.Bidirectional(layers.GRU(64,return_sequences=True)),
#     layers.Bidirectional(layers.GRU(64)),
#     layers.Dense(32,activation='relu'),
#     layers.Dense(1,activation='sigmoid')

# ])

In [190]:
model=Sequential([
    # input shape (None, 66, 10000)
    # (n_samples,n_time_steps,embed_dimen)
    # layers.Embedding(input_dim=vocab_size,output_dim=128,input_length=66),# (None, 66, 300)
    layers.LSTM(32,return_sequences=True),#(None, 66) 
    layers.LSTM(16,return_sequences=True),#(None, 66) 
    layers.Dropout(0.3),
    layers.LSTM(8),
    layers.Dropout(0.3),
    layers.Dense(1,activation='sigmoid')

])

In [95]:
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [96]:
model.fit(train_v,y_train,validation_split=0.2,epochs=10,batch_size=64)

Epoch 1/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 347ms/step - accuracy: 0.8669 - loss: 0.4339 - val_accuracy: 0.8596 - val_loss: 0.3966
Epoch 2/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 336ms/step - accuracy: 0.8705 - loss: 0.3704 - val_accuracy: 0.8596 - val_loss: 0.3645
Epoch 3/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 338ms/step - accuracy: 0.8705 - loss: 0.3291 - val_accuracy: 0.8596 - val_loss: 0.3086
Epoch 4/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 343ms/step - accuracy: 0.8705 - loss: 0.2954 - val_accuracy: 0.8582 - val_loss: 0.2894
Epoch 5/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 349ms/step - accuracy: 0.8896 - loss: 0.2532 - val_accuracy: 0.8922 - val_loss: 0.2485
Epoch 6/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 345ms/step - accuracy: 0.9017 - loss: 0.2337 - val_accuracy: 0.8950 - val_loss: 0.2229
Epoch 7/10
[1m45/45[

<keras.src.callbacks.history.History at 0x17f447f89d0>