In [1]:
import warnings
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
import nltk

# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('averaged_perceptron_tagger')

import re
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer

from imblearn.combine import SMOTETomek 

from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

In [3]:
df = pd.read_csv(r"C:\Users\Jaiprakash\OneDrive\Desktop\TestProject\SpamHamClassifier\SpamHam.txt", sep="\t", names=["Label", "messages"])

## Pre-processing the data
### Some common Problems:
- Remove Punctuation
- Replace short words
- Making text lower case
- Remove stopwords
- Lemmatization

In [4]:
# Replacing punctuations with space
df["message"] = df["messages"].copy()
df['message'] = df['message'].str.replace("[^a-zA-Z0-9]", " ")
df["message"]

0       Go until jurong point  crazy   Available only ...
1                           Ok lar    Joking wif u oni   
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor    U c already then say   
4       Nah I don t think he goes to usf  he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will   b going to esplanade fr home 
5569    Pity    was in mood for that  So   any other s...
5570    The guy did some bitching but I acted like i d...
5571                           Rofl  Its true to its name
Name: message, Length: 5572, dtype: object

In [5]:
#Remove short words and convert words to lower case
df["message"]=df["message"].apply(lambda row: " ".join([word.lower() for word in row.split() if len(word)> 2]))

In [6]:
#Remove stop words
stop_words = stopwords.words("english")
def remove_stopwords(sentence):
    sentence_list = word_tokenize(sentence)
    sentence_new = [word for word in sentence_list if word not in stop_words]
    return sentence_new
df["message"] = df["message"].apply(lambda row: " ".join(remove_stopwords(row)))

In [7]:
# Begin Lemmatization 
# function to convert nltk tag to wordnet tag
lemmatizer = WordNetLemmatizer()

# Finds the part of speech tag & Convert the detailed POS tag into a shallow information
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None

# lemmatize sentence using pos tag
def lemmatize_sentence(sentence):
    # word tokenize -> pos tag (detailed) -> wordnet tag (shallow pos) -> lemmatizer -> 
    # root word tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  
    #tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            #if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:        
            #else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)

df["message"] = df["message"].apply(lambda row: lemmatize_sentence(row))

In [8]:
df["message"]

0       jurong point crazy available bugis great world...
1                                      lar joking wif oni
2       free entry wkly comp win cup final tkts 21st m...
3                           dun say early hor already say
4                     nah think go usf life around though
                              ...                        
5567    2nd time try contact 750 pound prize claim eas...
5568                                    go esplanade home
5569                                 pity mood suggestion
5570    guy bitch act like interested buying something...
5571                                       rofl true name
Name: message, Length: 5572, dtype: object

## Word Embedding:
- Count/Frequency(OHE, BOW, TF-IDF),
- Deep Learning trained models(Word2Vec).

#### Eg: 
This mobile is good, This mobile is not good, This mobile words fine & Affordable.
- In BOW and TF-IDF Approach, the semantic meaning of word is not captures.
- Hence we can use Word2Vec model, which can capture Semanctic meaning and also reduce:
    - Sparcity(Vectors having 1 or 0),
    - Dimentions of Vector.
- We can either use pretrained models like: 'word2vec-google-news-300' which converts a word into 300 dimensions.
- Or we can train our model from scratch

In [9]:
import gensim.downloader as api
# wv = api.load('word2vec-google-news-300')

In [10]:
# !pip install gensim
from gensim.utils import simple_preprocess

words = []
for row in df["message"]:
    words.append(simple_preprocess(row))

## Word2Vec

In [11]:
#Training Word2Vec Model from scratch
import gensim
model = gensim.models.Word2Vec(words, window=6, min_count=2)

In [12]:
model.wv.index_to_key

['get',
 'call',
 'come',
 'day',
 'free',
 'know',
 'go',
 'good',
 'like',
 'send',
 'time',
 'love',
 'want',
 'say',
 'text',
 'tell',
 'take',
 'think',
 'need',
 'one',
 'see',
 'txt',
 'today',
 'make',
 'stop',
 'home',
 'reply',
 'lor',
 'sorry',
 'still',
 'mobile',
 'back',
 'dont',
 'well',
 'phone',
 'week',
 'new',
 'please',
 'later',
 'pls',
 'work',
 'miss',
 'give',
 'ask',
 'dear',
 'msg',
 'message',
 'night',
 'wait',
 'thing',
 'try',
 'great',
 'much',
 'hope',
 'claim',
 'leave',
 'hey',
 'number',
 'min',
 'happy',
 'meet',
 'wat',
 'way',
 'yes',
 'www',
 'find',
 'friend',
 'late',
 'let',
 'na',
 'prize',
 'wan',
 'right',
 'win',
 'tomorrow',
 'already',
 'pick',
 'cash',
 'amp',
 'life',
 'yeah',
 'really',
 'feel',
 'tone',
 'babe',
 'keep',
 'sleep',
 'care',
 'morning',
 'last',
 'even',
 'service',
 'thanks',
 'buy',
 'anything',
 'com',
 'would',
 'contact',
 'year',
 'start',
 'use',
 'lol',
 'also',
 'nokia',
 'every',
 'look',
 'wish',
 'sure',
 'u

In [13]:
model.corpus_count

5572

In [14]:
model.epochs

5

In [15]:
model.wv.similar_by_word("happy")

[('day', 0.9994715452194214),
 ('good', 0.9994264841079712),
 ('love', 0.9993661046028137),
 ('dear', 0.9993555545806885),
 ('friend', 0.9993463754653931),
 ('hope', 0.9993451237678528),
 ('even', 0.999329149723053),
 ('heart', 0.9993277788162231),
 ('wish', 0.9993218183517456),
 ('one', 0.999321460723877)]

In [16]:
model.wv["night"].shape

(100,)

In [17]:
def avg_word2vec(doc):  
    return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key],axis=0)

In [18]:
!pip install tqdm



In [19]:
words[1]

['lar', 'joking', 'wif', 'oni']

In [20]:
from tqdm import tqdm
X=[]
for i in tqdm(range(len(words))):
    X.append(avg_word2vec(words[i]))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|███████████████████████████████████████████████████████████████████████████| 5572/5572 [00:00<00:00, 12383.26it/s]


In [21]:
y = df.Label.map({'spam' : 1, 'ham' : 0})

In [22]:
model.wv.similar_by_word("price")

[('cash', 0.9993554949760437),
 ('reply', 0.9993322491645813),
 ('txt', 0.9993255138397217),
 ('box', 0.9993114471435547),
 ('send', 0.9992994070053101),
 ('holiday', 0.9992895722389221),
 ('text', 0.999282956123352),
 ('www', 0.9992719292640686),
 ('call', 0.9992600679397583),
 ('msg', 0.999239444732666)]