In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm.auto import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import spacy
nlp = spacy.load('en_core_web_lg')

In [2]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
stop=nltk.corpus.stopwords.words('english')

In [3]:
# creat tag map between treebank and wordnet
# tree bank : collection of syntactically annotated sentences
tag_map = {
"CC": None, #formatted. conjunction (and, but, or)
'CD': wn.NOUN, # original number (one,two)
'DT': None, # delimiter (a, the)
'EX': wn.ADV, #and my "there" (there)
"FW": None, # foreign word (mea culpa)
"IN": wn.ADV,# preposition/subjunctive conjunction (of, in,by) 
'JJ': [wn.ADJ, wn.ADJ_SAT],# adjective # (yellow)
'JJR': [wn.ADJ, wn.ADJ_SAT], # add, comparator (larger)
"JJS": [wn.ADJ, wn.ADJ_SAT], # add, superlative (wildest)
"LS": None, #list item tag(1,2,One)
'MD': None, # modal (can, should)
'NN': wn.NOUN, #noun, sing. or nugget (llama)
'NNS': wn.NOUN, #noun, plural (Llamas)
"NNP": wn.NOUN, # proper noun, vocals. (IBM)
'NNPS': wn.NOUN, # proper noun, plural (Carolina)
"PDT": [wn.ADJ, wn.ADJ_SAT], # predeterminer (all, both)
'POS':None, # end of ownership
'PRP': None, # personal pronoun (I, you, he)
'prp$': None, # possessive pronoun (your, one's)
"RB": wn.ADV, #adv (quickly, never)
'RBR': wn.ADV, # adverb, comparative (faster)
'RBS':wn.ADV,# adverb, superlative (fastest)
'RP': [wn.ADJ, wn.ADJ_SAT], # particle (up, off)
'SYM': None, #symbol, # (+, %, &)
'TO': None, # 'to' (to)
'UH': None, # interjection (uh, oops)
'VB': wn.VERB,# verb base form # (eat)
'VBD': wn.VERB, # verb past tense (eat)
'VBG': wn.VERB, #verb gerund (to eat)
'VBN': wn.VERB, # past participle (eaten)
"VBP": wn.VERB, #non-3sg pres verb (eat)
'VBZ': wn.VERB,# verb зsg pres (egts) 
}

In [4]:
lemma=WordNetLemmatizer()
def get_lemma(text):
    l=[]
    tags=pos_tag(text.split())
    for token,tag in tags:
        try:
             l.append(lemma.lemmatize(token,pos=tag_map[tag][0]))
        except:
            continue
    return l

In [5]:
from termcolor import colored
def cleaning(df):
    """
    - remove empty strings
    - remove duplicates
    """
    # remove empty string
    blanks=[]
    for i ,v in df.itertuples():
        if v.isspace() ==True or v=='':
            blanks.append(i)
    if len(blanks)>0:
        df=df.drop(blanks)
        print(f"found {colored(len(blanks),'red')} blanks")
    else:
        print("no blanks found")

    # remove duplicates
    if df.duplicated().sum() !=0:
        print(f"dropped {colored(df.duplicated().sum(),'red')} : values")
        df=df.drop_duplicates()
    else:
        print("no duplicates found")
    return df

In [6]:
import re
import string
def preprocessing(df,col):
    # lowercasing
    df[col]=df[col].apply(lambda x: " ".join(t.lower() for t in x.split()))
    # remove emails
    df[col]=df[col].apply(lambda x: re.sub("\S+\@\S+",'',x))
    # remove URls
    df[col]=df[col].apply(lambda x: re.sub("\S+\.\S+",'',x))
    # remove punctations
    df[col]=df[col].apply(lambda x: re.sub(f'[{re.escape(string.punctuation)}]',' ',x))
    # remove stopwords
    df[col]=df[col].apply(lambda x :" ".join(t for t in x.lower().split() if t not in stop))
    # remove \n or \t
    df[col]=df[col].apply(lambda x: re.sub('\n|\t',' ',x))
    # remove 2 characters words
    df[col]=df[col].apply(lambda x: re.sub(r' \w\w ',' ',x) )
    # remove 1 character words
    df[col]=df[col].apply(lambda x: re.sub(r' \w ',' ',x) )
    # remove extra spaces more than or equals "2" 
    df[col]=df[col].apply(lambda x: re.sub(" {2,}",' ',x))
    # select alphapetical only
    df[col]=df[col].apply(lambda x: re.sub("[^a-z]+",' ',x) )
    # strip string
    df[col]=df[col].apply(lambda x: x.strip())
    # get lemma
    df[col]=df[col].apply(lambda x: " ".join(get_lemma(x)))
    return df

In [7]:
# remove empty strings  
# remove duplicates 
# remove 1 character words
# remove 2 characters words

In [8]:
df=pd.read_csv(r'smsspamcollection.tsv',sep='\t')

In [9]:
x=df['message']
y=df['label']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.33,random_state=42)

In [11]:
x_train=pd.DataFrame(X_train,columns=['message'])
x_test=pd.DataFrame(X_test,columns=['message'])

In [12]:
cleaned_train=cleaning(x_train)
cleaned_test=cleaning(x_test)

no blanks found
dropped [31m210[0m : values
no blanks found
dropped [31m65[0m : values


In [13]:
import warnings
warnings.filterwarnings('ignore')

In [14]:
prep_train=preprocessing(cleaned_train,col='message')
prep_test=preprocessing(cleaned_test,col='message')

In [15]:
y_train=y_train[prep_train.index]
y_test=y_test[prep_test.index]

In [16]:
def get_longest_seq(texts):
    longest=0
    for text in texts:
        text_len=len(text)
        longest=max(text_len,longest)
    return longest

In [17]:
longest_input=get_longest_seq(prep_train['message'])
longest_input

378

In [18]:
import string

In [19]:
unique_chars = len(string.ascii_letters+string.digits)


In [20]:
from collections import defaultdict
char2idx = defaultdict(lambda:0)

In [21]:
char2idx

defaultdict(<function __main__.<lambda>()>, {})

In [22]:
for idx, char in enumerate(string.ascii_letters+string.digits, 1):
    char2idx[char] = idx

In [23]:
def encode_text(text):
    text_v = np.zeros((longest_input, unique_chars))
    for i, char in enumerate(text):
        if i<longest_input:
            text_v[i, char2idx[char]] = 1
    return text_v 

In [24]:
encode_text("aahelo")

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(378, 62))

In [25]:
def encode_texts(texts):
    # (None,378,62)
    text_v = np.zeros((len(texts), longest_input, unique_chars))
    for i, text in enumerate(texts):
        text_v[i] = encode_text(text)
    return text_v

In [26]:
train_encoded=encode_texts(prep_train['message'])
test_encoded=encode_texts(prep_test['message'])

In [27]:
train_encoded.shape

(3523, 378, 62)

In [28]:
from sklearn.preprocessing import LabelEncoder
lb=LabelEncoder()
y_train=lb.fit_transform(y_train)
y_test=lb.transform(y_test)

In [29]:
import tensorflow as tf
from tensorflow.keras import layers, Sequential

In [30]:
model=Sequential([
    # input shape (None, 66, 10000)
    # (n_samples,n_time_steps,embed_dimen)
    # layers.Embedding(input_dim=vocab_size,output_dim=128,input_length=66),# (None, 66, 300)
    layers.LSTM(32,return_sequences=True),#(None, 66) 
    layers.Dropout(0.1),
    layers.LSTM(16),
    layers.Dropout(0.1),
    layers.Dense(1,activation='sigmoid')

])
# 85%

In [32]:
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.fit(train_encoded,y_train,validation_split=0.2,epochs=10,batch_size=64)

Epoch 1/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 141ms/step - accuracy: 0.8705 - loss: 0.4953 - val_accuracy: 0.8596 - val_loss: 0.4075
Epoch 2/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 135ms/step - accuracy: 0.8705 - loss: 0.3901 - val_accuracy: 0.8596 - val_loss: 0.4099
Epoch 3/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 144ms/step - accuracy: 0.8705 - loss: 0.3879 - val_accuracy: 0.8596 - val_loss: 0.4058
Epoch 4/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 150ms/step - accuracy: 0.8705 - loss: 0.3907 - val_accuracy: 0.8596 - val_loss: 0.4094
Epoch 5/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 136ms/step - accuracy: 0.8705 - loss: 0.3866 - val_accuracy: 0.8596 - val_loss: 0.4058
Epoch 6/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 141ms/step - accuracy: 0.8705 - loss: 0.3883 - val_accuracy: 0.8596 - val_loss: 0.4059
Epoch 7/10
[1m45/45[0m [3

<keras.src.callbacks.history.History at 0x21c18978ad0>

In [34]:
model2=Sequential([
    # input shape (None, 66, 300)
    # (n_samples,n_time_steps,embed_dimen)
    # layers.Embedding(input_dim=vocab_size,output_dim=100),#output (None, 66, 300)
    layers.Bidirectional(layers.GRU(64,return_sequences=True)),
    layers.Bidirectional(layers.GRU(64)),
    layers.Dense(32,activation='relu'),
    layers.Dense(1,activation='sigmoid')

])
# 90%

In [36]:
model2.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model2.fit(train_encoded,y_train,validation_split=0.2,epochs=10,batch_size=64)

Epoch 1/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 465ms/step - accuracy: 0.8666 - loss: 0.4428 - val_accuracy: 0.8596 - val_loss: 0.3967
Epoch 2/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 460ms/step - accuracy: 0.8705 - loss: 0.3626 - val_accuracy: 0.8596 - val_loss: 0.3816
Epoch 3/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 456ms/step - accuracy: 0.8765 - loss: 0.3405 - val_accuracy: 0.8667 - val_loss: 0.3654
Epoch 4/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 459ms/step - accuracy: 0.8801 - loss: 0.3341 - val_accuracy: 0.8638 - val_loss: 0.3573
Epoch 5/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 471ms/step - accuracy: 0.8875 - loss: 0.3187 - val_accuracy: 0.8865 - val_loss: 0.3103
Epoch 6/10
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 461ms/step - accuracy: 0.8701 - loss: 0.3882 - val_accuracy: 0.8596 - val_loss: 0.3676
Epoch 7/10
[1m45/45[

<keras.src.callbacks.history.History at 0x21c1a2d7ed0>

In [66]:
model.evaluate(test_encoded, y_test)

[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step - accuracy: 0.8692 - loss: 0.3879


[0.3878634572029114, 0.869222104549408]

In [67]:
prediction=model.predict(test_encoded)

[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step  


In [68]:
prediction

array([[0.13140504],
       [0.13140504],
       [0.13140504],
       ...,
       [0.13140504],
       [0.13140506],
       [0.13140504]], shape=(1774, 1), dtype=float32)

In [69]:
p=np.where(prediction>0.5,1,0)

In [70]:
print(classification_report(y_test, p))

              precision    recall  f1-score   support

           0       0.87      1.00      0.93      1542
           1       0.00      0.00      0.00       232

    accuracy                           0.87      1774
   macro avg       0.43      0.50      0.47      1774
weighted avg       0.76      0.87      0.81      1774



In [71]:
lb.classes_

array(['ham', 'spam'], dtype=object)

In [72]:
def predict_char_embd(text):
    text_v = np.zeros((1, longest_input, unique_chars))
    for i, token in enumerate(text):
        if i>longest_input:#>= to work on long text
            break
        text_v[0, i] = char2idx[token]
    # predict the input 
    prediction = model.predict(text_v)
    p=np.where(prediction>0.5,1,0)
    label = lb.classes_[p]
    return label

In [73]:
predict_char_embd("how mcuh for the french course hack hack hack >>>>> ?")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step


array([['ham']], dtype=object)

In [74]:
x=x_train.iloc[50].message

x=x+x+x+x+x+x

len(x)

342

In [75]:
predict_char_embd(x)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step


array([['ham']], dtype=object)

In [76]:

x=x+x+x+x+x+x

len(x)

2052

In [77]:
predict_char_embd(x)


IndexError: index 378 is out of bounds for axis 1 with size 378