In [1]:
import gensim
from gensim.models import Word2Vec, KeyedVectors


import sys
import os

# Move up one directory (adjust `..` based on depth)
root_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Add to sys.path
sys.path.append(root_dir)

print(f"Root directory added to sys.path: {root_dir}")

from common_codes.utils import Utils
import pandas as pd
import numpy as np


Root directory added to sys.path: /Users/murtuzasaifee/Documents/Personal/Codes/MLWorkspace/python_ml_nlp/src


In [2]:
messages = pd.read_csv(Utils.get_file_path('SMSSpamCollection'), sep='\t', names=['label', 'message'])
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
## Data Cleaning and Preprocessing

import re
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

lemmatizer = nltk.WordNetLemmatizer()

corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()
    
    review = [lemmatizer.lemmatize(word) for word in review]
    review = ' '.join(review)
    corpus.append(review)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/murtuzasaifee/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
[[i,j,k] for i,j,k in zip(list(map(len,corpus)),corpus, messages['message']) if i<1]

[[0, '', '645'], [0, '', ':) '], [0, '', ':-) :-)']]

In [5]:
## Also we can use to lower case and have the words for each sentence

from nltk import sent_tokenize
from gensim.utils import simple_preprocess

words=[]
for sent in corpus:
    sent_token=sent_tokenize(sent)
    for sent in sent_token:
        words.append(simple_preprocess(sent))
        
        
words

[['go',
  'until',
  'jurong',
  'point',
  'crazy',
  'available',
  'only',
  'in',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'there',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'oni'],
 ['free',
  'entry',
  'in',
  'wkly',
  'comp',
  'to',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'st',
  'may',
  'text',
  'fa',
  'to',
  'to',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  'apply',
  'over'],
 ['dun', 'say', 'so', 'early', 'hor', 'already', 'then', 'say'],
 ['nah',
  'don',
  'think',
  'he',
  'go',
  'to',
  'usf',
  'he',
  'life',
  'around',
  'here',
  'though'],
 ['freemsg',
  'hey',
  'there',
  'darling',
  'it',
  'been',
  'week',
  'now',
  'and',
  'no',
  'word',
  'back',
  'like',
  'some',
  'fun',
  'you',
  'up',
  'for',
  'it',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'to',
  'send',
  'to',
  'rcv'],
 ['even',
  'my',
  'brother',
  'is',
  'not',
  'like',
  'to',
  'spea

In [6]:
## Lets train word2vec model from scratch

model = gensim.models.Word2Vec(words)

## To get all the vocalbulary
model.wv.index_to_key

['to',
 'you',
 'the',
 'it',
 'and',
 'in',
 'is',
 'me',
 'my',
 'for',
 'your',
 'call',
 'of',
 'that',
 'have',
 'on',
 'now',
 'are',
 'can',
 'so',
 'but',
 'not',
 'or',
 'we',
 'do',
 'get',
 'at',
 'ur',
 'will',
 'if',
 'be',
 'with',
 'no',
 'just',
 'this',
 'gt',
 'lt',
 'go',
 'how',
 'up',
 'when',
 'ok',
 'day',
 'what',
 'free',
 'from',
 'all',
 'out',
 'know',
 'll',
 'come',
 'like',
 'good',
 'time',
 'am',
 'then',
 'got',
 'wa',
 'there',
 'he',
 'love',
 'text',
 'only',
 'want',
 'send',
 'one',
 'need',
 'txt',
 'today',
 'by',
 'going',
 'don',
 'stop',
 'home',
 'she',
 'about',
 'lor',
 'sorry',
 'see',
 'still',
 'mobile',
 'take',
 'back',
 'da',
 'reply',
 'dont',
 'our',
 'think',
 'tell',
 'week',
 'hi',
 'phone',
 'they',
 'new',
 'please',
 'later',
 'pls',
 'any',
 'her',
 'ha',
 'co',
 'did',
 'been',
 'msg',
 'min',
 'some',
 'an',
 'night',
 'make',
 'dear',
 'who',
 'here',
 'message',
 'say',
 'well',
 'where',
 're',
 'thing',
 'much',
 'oh',

In [7]:
model.corpus_count

5569

In [8]:
model.epochs

5

In [9]:
model.wv.similar_by_word('good')

[('my', 0.9991448521614075),
 ('night', 0.9988651871681213),
 ('day', 0.998776376247406),
 ('all', 0.9987455010414124),
 ('thing', 0.9986538887023926),
 ('and', 0.9986380338668823),
 ('well', 0.9986335635185242),
 ('happy', 0.9985867142677307),
 ('morning', 0.9985832571983337),
 ('hope', 0.9985809922218323)]

In [10]:

def avg_word2vec(doc):
    # remove out-of-vocabulary words
    #sent = [word for word in doc if word in model.wv.index_to_key]
    #print(sent)
    
    return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key],axis=0)
                #or [np.zeros(len(model.wv.index_to_key))], axis=0)

In [11]:
!pip install tqdm



In [12]:
from tqdm import tqdm

#apply for the entire sentences
import numpy as np
X=[]
for i in tqdm(range(len(words))):
    X.append(avg_word2vec(words[i]))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 5569/5569 [00:00<00:00, 24343.96it/s]


In [13]:
X

[array([-0.19768243,  0.23205064,  0.1119635 ,  0.0949717 ,  0.08960452,
        -0.48824236,  0.16883916,  0.4709699 , -0.24746735, -0.11500701,
        -0.19556947, -0.3834689 , -0.0866647 ,  0.11324724,  0.1699792 ,
        -0.1575021 ,  0.13385098, -0.31221345, -0.06513942, -0.53660595,
         0.18610425,  0.10472994,  0.07995072, -0.1993881 , -0.02120191,
        -0.01880861, -0.19882776, -0.20438512, -0.26430959,  0.03411366,
         0.32607985,  0.02310064,  0.12066553, -0.18580097, -0.13917422,
         0.3688136 ,  0.07463004, -0.11470792, -0.09945092, -0.4795422 ,
         0.1015646 , -0.25038278, -0.20496924,  0.03601195,  0.16582017,
         0.00763975, -0.12351135, -0.06229073,  0.22243218,  0.13675074,
         0.17848654, -0.19393975, -0.05381218,  0.06300893, -0.08687491,
         0.03650757,  0.1723307 ,  0.0261496 , -0.379018  ,  0.1703771 ,
        -0.01874622,  0.16271764, -0.00778936, -0.14038147, -0.26806065,
         0.25672698,  0.08918218,  0.22429699, -0.3

In [14]:
## Output Features

## This we are doing because in X there were 3 data which were less than corpus. 
# That is because we have removed the special characters and the words were not there. 
# So we are removing those rows from the y as well.
y = messages[list(map(lambda x: len(x)>0 ,corpus))]
y = pd.get_dummies(y['label']).astype(int)
y=y.iloc[:,0].values

In [15]:
y.shape

(5569,)

In [16]:
y

array([1, 1, 0, ..., 1, 1, 1])

In [17]:
len(X)

5569

In [18]:
X[0].reshape(1,-1).shape

(1, 100)

In [19]:
## this is the final independent features
df = pd.concat([pd.DataFrame(X[i].reshape(1, -1)) for i in range(len(X))], ignore_index=True)

df.head()


  df = pd.concat([pd.DataFrame(X[i].reshape(1, -1)) for i in range(len(X))], ignore_index=True)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.197682,0.232051,0.111964,0.094972,0.089605,-0.488242,0.168839,0.47097,-0.247467,-0.115007,...,0.347369,0.158862,0.078788,0.057889,0.437545,0.174328,0.183319,-0.192928,0.152751,0.008612
1,-0.181826,0.202818,0.09517,0.081306,0.083538,-0.424606,0.136306,0.416765,-0.217056,-0.094045,...,0.310786,0.132381,0.063801,0.044606,0.371392,0.148378,0.161423,-0.177806,0.14046,0.001181
2,-0.210248,0.249268,0.120321,0.11622,0.071807,-0.52389,0.17041,0.466809,-0.259569,-0.138648,...,0.338186,0.160678,0.078387,0.044574,0.448849,0.155519,0.13466,-0.228468,0.1841,0.02502
3,-0.27115,0.312161,0.146613,0.127665,0.122477,-0.65768,0.222386,0.643094,-0.338982,-0.147043,...,0.475223,0.212694,0.102724,0.083747,0.58533,0.242966,0.266078,-0.263699,0.209002,0.002578
4,-0.232043,0.257157,0.132045,0.104998,0.110395,-0.557448,0.185524,0.544166,-0.288893,-0.130961,...,0.404905,0.179118,0.090718,0.073397,0.499896,0.206559,0.219005,-0.233777,0.172007,0.001721


In [20]:
df['Output']=y

In [21]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Output
0,-0.197682,0.232051,0.111964,0.094972,0.089605,-0.488242,0.168839,0.47097,-0.247467,-0.115007,...,0.158862,0.078788,0.057889,0.437545,0.174328,0.183319,-0.192928,0.152751,0.008612,1
1,-0.181826,0.202818,0.09517,0.081306,0.083538,-0.424606,0.136306,0.416765,-0.217056,-0.094045,...,0.132381,0.063801,0.044606,0.371392,0.148378,0.161423,-0.177806,0.14046,0.001181,1
2,-0.210248,0.249268,0.120321,0.11622,0.071807,-0.52389,0.17041,0.466809,-0.259569,-0.138648,...,0.160678,0.078387,0.044574,0.448849,0.155519,0.13466,-0.228468,0.1841,0.02502,0
3,-0.27115,0.312161,0.146613,0.127665,0.122477,-0.65768,0.222386,0.643094,-0.338982,-0.147043,...,0.212694,0.102724,0.083747,0.58533,0.242966,0.266078,-0.263699,0.209002,0.002578,1
4,-0.232043,0.257157,0.132045,0.104998,0.110395,-0.557448,0.185524,0.544166,-0.288893,-0.130961,...,0.179118,0.090718,0.073397,0.499896,0.206559,0.219005,-0.233777,0.172007,0.001721,1


In [22]:
df.isnull().sum()

0         12
1         12
2         12
3         12
4         12
          ..
96        12
97        12
98        12
99        12
Output     0
Length: 101, dtype: int64

In [23]:
df.dropna(inplace=True)

In [34]:
## Independent Feature
X=df.drop('Output',axis=1)

y=df['Output']

In [35]:
df.isnull().sum()

0         0
1         0
2         0
3         0
4         0
         ..
96        0
97        0
98        0
99        0
Output    0
Length: 101, dtype: int64

In [36]:
## Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20)

In [37]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
2570,-0.25326,0.300062,0.148712,0.133322,0.108465,-0.627051,0.211874,0.581502,-0.321264,-0.154135,...,0.423251,0.198516,0.098488,0.05877,0.551657,0.204135,0.200148,-0.259798,0.208074,0.016528
5141,-0.277448,0.300754,0.159381,0.112174,0.136762,-0.651797,0.218385,0.641232,-0.338205,-0.157692,...,0.492217,0.218415,0.101977,0.074789,0.581028,0.256976,0.250681,-0.275048,0.194407,0.009778
3277,-0.248171,0.267293,0.139691,0.102085,0.121794,-0.583328,0.192527,0.569927,-0.294525,-0.140439,...,0.434953,0.184343,0.097936,0.081871,0.52747,0.229724,0.246221,-0.237127,0.162021,0.006611
376,-0.254234,0.254244,0.13405,0.095884,0.13675,-0.592656,0.181828,0.59245,-0.306321,-0.14465,...,0.460038,0.186965,0.088985,0.081641,0.524063,0.247159,0.256935,-0.251636,0.168259,0.007158
4355,-0.261441,0.291714,0.149407,0.120994,0.119319,-0.626212,0.211821,0.609247,-0.324853,-0.157248,...,0.455515,0.204039,0.105097,0.078663,0.558047,0.239337,0.244556,-0.258466,0.193645,0.006007


In [38]:
y.info()

<class 'pandas.core.series.Series'>
Index: 5557 entries, 0 to 5568
Series name: Output
Non-Null Count  Dtype
--------------  -----
5557 non-null   int64
dtypes: int64(1)
memory usage: 86.8 KB


In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5557 entries, 0 to 5568
Columns: 101 entries, 0 to Output
dtypes: float32(100), int64(1)
memory usage: 2.2 MB


In [40]:
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier()

classifier.fit(X_train,y_train)
y_pred=classifier.predict(X_test)

from sklearn.metrics import accuracy_score,classification_report
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

0.9712230215827338
              precision    recall  f1-score   support

           0       0.92      0.85      0.88       141
           1       0.98      0.99      0.98       971

    accuracy                           0.97      1112
   macro avg       0.95      0.92      0.93      1112
weighted avg       0.97      0.97      0.97      1112

