In [18]:
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm.auto import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import spacy
nlp = spacy.load('en_core_web_lg')

vectors generated by Spacy gained from pre-trained GloVe model

In [6]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
stop=nltk.corpus.stopwords.words('english')

In [7]:
# creat tag map between treebank and wordnet
# tree bank : collection of syntactically annotated sentences
tag_map = {
"CC": None, #formatted. conjunction (and, but, or)
'CD': wn.NOUN, # original number (one,two)
'DT': None, # delimiter (a, the)
'EX': wn.ADV, #and my "there" (there)
"FW": None, # foreign word (mea culpa)
"IN": wn.ADV,# preposition/subjunctive conjunction (of, in,by) 
'JJ': [wn.ADJ, wn.ADJ_SAT],# adjective # (yellow)
'JJR': [wn.ADJ, wn.ADJ_SAT], # add, comparator (larger)
"JJS": [wn.ADJ, wn.ADJ_SAT], # add, superlative (wildest)
"LS": None, #list item tag(1,2,One)
'MD': None, # modal (can, should)
'NN': wn.NOUN, #noun, sing. or nugget (llama)
'NNS': wn.NOUN, #noun, plural (Llamas)
"NNP": wn.NOUN, # proper noun, vocals. (IBM)
'NNPS': wn.NOUN, # proper noun, plural (Carolina)
"PDT": [wn.ADJ, wn.ADJ_SAT], # predeterminer (all, both)
'POS':None, # end of ownership
'PRP': None, # personal pronoun (I, you, he)
'prp$': None, # possessive pronoun (your, one's)
"RB": wn.ADV, #adv (quickly, never)
'RBR': wn.ADV, # adverb, comparative (faster)
'RBS':wn.ADV,# adverb, superlative (fastest)
'RP': [wn.ADJ, wn.ADJ_SAT], # particle (up, off)
'SYM': None, #symbol, # (+, %, &)
'TO': None, # 'to' (to)
'UH': None, # interjection (uh, oops)
'VB': wn.VERB,# verb base form # (eat)
'VBD': wn.VERB, # verb past tense (eat)
'VBG': wn.VERB, #verb gerund (to eat)
'VBN': wn.VERB, # past participle (eaten)
"VBP": wn.VERB, #non-3sg pres verb (eat)
'VBZ': wn.VERB,# verb зsg pres (egts) 
}

In [8]:
lemma=WordNetLemmatizer()
def get_lemma(text):
    l=[]
    tags=pos_tag(text.split())
    for token,tag in tags:
        try:
             l.append(lemma.lemmatize(token,pos=tag_map[tag][0]))
        except:
            continue
    return l

In [9]:
from termcolor import colored
def cleaning(df):
    """
    - remove empty strings
    - remove duplicates
    """
    # remove empty string
    blanks=[]
    for i ,v in df.itertuples():
        if v.isspace() ==True or v=='':
            blanks.append(i)
    if len(blanks)>0:
        df=df.drop(blanks)
        print(f"found {colored(len(blanks),'red')} blanks")
    else:
        print("no blanks found")

    # remove duplicates
    if df.duplicated().sum() !=0:
        print(f"dropped {colored(df.duplicated().sum(),'red')} : values")
        df=df.drop_duplicates()
    else:
        print("no duplicates found")
    return df

In [None]:
w = re.sub(r"([?.!,¿])", r" \1 ", w)

In [10]:
import re
import string
def preprocessing(df,col):
    # lowercasing
    df[col]=df[col].apply(lambda x: " ".join(t.lower() for t in x.split()))
    # remove emails
    df[col]=df[col].apply(lambda x: re.sub("\S+\@\S+",'',x))
    # remove URls
    df[col]=df[col].apply(lambda x: re.sub("\S+\.\S+",'',x))
    # remove punctations
    df[col]=df[col].apply(lambda x: re.sub(f'[{re.escape(string.punctuation)}]',' ',x))
    # remove stopwords
    df[col]=df[col].apply(lambda x :" ".join(t for t in x.lower().split() if t not in stop))
    # remove \n or \t
    df[col]=df[col].apply(lambda x: re.sub('\n|\t',' ',x))
    # remove 2 characters words
    df[col]=df[col].apply(lambda x: re.sub(r' \w\w ',' ',x) )
    # remove 1 character words
    df[col]=df[col].apply(lambda x: re.sub(r' \w ',' ',x) )
    # remove extra spaces more than or equals "2" 
    df[col]=df[col].apply(lambda x: re.sub(" {2,}",' ',x))
    # select alphapetical only
    df[col]=df[col].apply(lambda x: re.sub("[^a-z]+",' ',x) )
    # strip string
    df[col]=df[col].apply(lambda x: x.strip())
    # get lemma
    df[col]=df[col].apply(lambda x: " ".join(get_lemma(x)))
    return df

In [11]:
# remove empty strings  
# remove duplicates 
# remove 1 character words
# remove 2 characters words

In [20]:
df=pd.read_csv(r'smsspamcollection.tsv',sep='\t')

In [21]:
x=df['message']
y=df['label']

In [63]:
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.33,random_state=42)

In [64]:
x_train=pd.DataFrame(X_train,columns=['message'])
x_test=pd.DataFrame(X_test,columns=['message'])

In [65]:
cleaned_train=cleaning(x_train)
cleaned_test=cleaning(x_test)

no blanks found
dropped [31m210[0m : values
no blanks found
dropped [31m65[0m : values


In [66]:
import warnings
warnings.filterwarnings('ignore')

In [67]:
prep_train=preprocessing(cleaned_train,col='message')
prep_test=preprocessing(cleaned_test,col='message')

In [68]:
y_train=y_train[prep_train.index]
y_test=y_test[prep_test.index]

In [69]:
train_v=np.zeros((len(prep_train),300))
test_v=np.zeros((len(prep_test),300))

In [70]:
from tqdm.auto import tqdm

In [71]:
for i,doc in tqdm(enumerate(nlp.pipe(prep_train['message'])),total=len(prep_train)):
    train_v[i,:] =doc.vector

  0%|          | 0/3523 [00:00<?, ?it/s]

In [72]:
for i,doc in tqdm(enumerate(nlp.pipe(prep_test['message'])),total=len(prep_test)):
    test_v[i,:] =doc.vector

  0%|          | 0/1774 [00:00<?, ?it/s]

In [73]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

In [74]:
clf=LinearSVC()
clf.fit(train_v,y_train)

In [75]:
print(classification_report(y_test, clf.predict(test_v)))

              precision    recall  f1-score   support

         ham       0.98      0.97      0.98      1542
        spam       0.82      0.89      0.85       232

    accuracy                           0.96      1774
   macro avg       0.90      0.93      0.91      1774
weighted avg       0.96      0.96      0.96      1774



In [83]:
from sklearn.preprocessing import LabelEncoder

In [84]:
lb=LabelEncoder()
y_train=lb.fit_transform(y_train)
y_test=lb.transform(y_test)

In [85]:
import xgboost

In [86]:
clf=xgboost.XGBClassifier()

In [87]:
clf.fit(train_v,y_train)

In [88]:
print(classification_report(y_test, clf.predict(test_v)))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      1542
           1       0.96      0.89      0.92       232

    accuracy                           0.98      1774
   macro avg       0.97      0.94      0.96      1774
weighted avg       0.98      0.98      0.98      1774



In [102]:
from sklearn.linear_model import LogisticRegression

In [104]:
clf=LogisticRegression()

In [105]:
clf.fit(train_v,y_train)

In [106]:
print(classification_report(y_test, clf.predict(test_v)))

              precision    recall  f1-score   support

           0       0.97      0.98      0.98      1542
           1       0.85      0.82      0.84       232

    accuracy                           0.96      1774
   macro avg       0.91      0.90      0.91      1774
weighted avg       0.96      0.96      0.96      1774

