### Train Word2Vec from scratch

In [391]:
%pip install gensim
import gensim
from gensim.models import word2vec, KeyedVectors
import os, re
import pandas as pd
import numpy as np

Note: you may need to restart the kernel to use updated packages.


In [392]:
messages = pd.read_csv('SMSSpamCollection.csv', sep='\t', names=['labels', 'message'])
messages, messages.shape

(     labels                                            message
 0       ham  Go until jurong point, crazy.. Available only ...
 1       ham                      Ok lar... Joking wif u oni...
 2      spam  Free entry in 2 a wkly comp to win FA Cup fina...
 3       ham  U dun say so early hor... U c already then say...
 4       ham  Nah I don't think he goes to usf, he lives aro...
 ...     ...                                                ...
 5567   spam  This is the 2nd time we have tried 2 contact u...
 5568    ham               Will Ã¼ b going to esplanade fr home?
 5569    ham  Pity, * was in mood for that. So...any other s...
 5570    ham  The guy did some bitching but I acted like i'd...
 5571    ham                         Rofl. Its true to its name
 
 [5572 rows x 2 columns],
 (5572, 2))

In [393]:
# model_directory = r"E:\NLP\gensim"
# model_path = os.path.join(model_directory, "word2vec-google-news-300.model")
# wv = KeyedVectors.load(model_path)

In [394]:
# vec = wv['king']
# vec.shape

In [395]:
import nltk
nltk.download("wordnet")
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\GOURAV\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [396]:
lemmatizer = WordNetLemmatizer()

In [397]:
corpus = []
for i in range(len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review ]
    review = ' '.join(review)
    corpus.append(review)

In [398]:
[[i,j,k] for i,j,k in zip(list(map(len,corpus)),corpus, messages['message']) if i<1]

[[0, '', '645'], [0, '', ':) '], [0, '', ':-) :-)']]

In [399]:
corpus

['go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat',
 'ok lar joking wif u oni',
 'free entry in a wkly comp to win fa cup final tkts st may text fa to to receive entry question std txt rate t c s apply over s',
 'u dun say so early hor u c already then say',
 'nah i don t think he go to usf he life around here though',
 'freemsg hey there darling it s been week s now and no word back i d like some fun you up for it still tb ok xxx std chgs to send to rcv',
 'even my brother is not like to speak with me they treat me like aid patent',
 'a per your request melle melle oru minnaminunginte nurungu vettam ha been set a your callertune for all caller press to copy your friend callertune',
 'winner a a valued network customer you have been selected to receivea prize reward to claim call claim code kl valid hour only',
 'had your mobile month or more u r entitled to update to the latest colour mobile with camera for free call the mobile up

In [400]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [435]:
words = []
for sent in corpus:
    sent_token = sent_tokenize(sent)
    print(sent_token)
    for sent in sent_token:
        words.append(simple_preprocess(sent))

['go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat']
['ok lar joking wif u oni']
['free entry in a wkly comp to win fa cup final tkts st may text fa to to receive entry question std txt rate t c s apply over s']
['u dun say so early hor u c already then say']
['nah i don t think he go to usf he life around here though']
['freemsg hey there darling it s been week s now and no word back i d like some fun you up for it still tb ok xxx std chgs to send to rcv']
['even my brother is not like to speak with me they treat me like aid patent']
['a per your request melle melle oru minnaminunginte nurungu vettam ha been set a your callertune for all caller press to copy your friend callertune']
['winner a a valued network customer you have been selected to receivea prize reward to claim call claim code kl valid hour only']
['had your mobile month or more u r entitled to update to the latest colour mobile with camera for free call the mobile up

In [402]:
words

[['go',
  'until',
  'jurong',
  'point',
  'crazy',
  'available',
  'only',
  'in',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'there',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'oni'],
 ['free',
  'entry',
  'in',
  'wkly',
  'comp',
  'to',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'st',
  'may',
  'text',
  'fa',
  'to',
  'to',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  'apply',
  'over'],
 ['dun', 'say', 'so', 'early', 'hor', 'already', 'then', 'say'],
 ['nah',
  'don',
  'think',
  'he',
  'go',
  'to',
  'usf',
  'he',
  'life',
  'around',
  'here',
  'though'],
 ['freemsg',
  'hey',
  'there',
  'darling',
  'it',
  'been',
  'week',
  'now',
  'and',
  'no',
  'word',
  'back',
  'like',
  'some',
  'fun',
  'you',
  'up',
  'for',
  'it',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'to',
  'send',
  'to',
  'rcv'],
 ['even',
  'my',
  'brother',
  'is',
  'not',
  'like',
  'to',
  'spea

In [403]:
import gensim

In [404]:
model = gensim.models.Word2Vec(words, vector_size=100)

In [405]:
model.wv.index_to_key

['to',
 'you',
 'the',
 'it',
 'and',
 'in',
 'is',
 'me',
 'my',
 'for',
 'your',
 'call',
 'of',
 'that',
 'have',
 'on',
 'now',
 'are',
 'can',
 'so',
 'but',
 'not',
 'or',
 'we',
 'do',
 'get',
 'at',
 'ur',
 'will',
 'if',
 'be',
 'with',
 'no',
 'just',
 'this',
 'gt',
 'lt',
 'go',
 'how',
 'up',
 'when',
 'ok',
 'day',
 'what',
 'free',
 'from',
 'all',
 'out',
 'know',
 'll',
 'come',
 'like',
 'good',
 'time',
 'am',
 'then',
 'got',
 'wa',
 'there',
 'he',
 'love',
 'text',
 'only',
 'want',
 'send',
 'one',
 'need',
 'txt',
 'today',
 'by',
 'going',
 'don',
 'stop',
 'home',
 'she',
 'about',
 'lor',
 'sorry',
 'see',
 'still',
 'mobile',
 'take',
 'back',
 'da',
 'reply',
 'dont',
 'our',
 'think',
 'tell',
 'week',
 'hi',
 'phone',
 'they',
 'new',
 'please',
 'later',
 'pls',
 'any',
 'her',
 'ha',
 'co',
 'did',
 'been',
 'msg',
 'min',
 'some',
 'an',
 'night',
 'make',
 'dear',
 'who',
 'here',
 'message',
 'say',
 'well',
 'where',
 're',
 'thing',
 'much',
 'oh',

In [406]:
model.corpus_count

5569

In [407]:
model.wv.similar_by_word('good')

[('day', 0.9987229704856873),
 ('my', 0.9987215995788574),
 ('well', 0.9987139105796814),
 ('hope', 0.9985823035240173),
 ('all', 0.9984821081161499),
 ('great', 0.9984737038612366),
 ('where', 0.998400092124939),
 ('not', 0.9983817338943481),
 ('but', 0.9983750581741333),
 ('hey', 0.9983746409416199)]

In [408]:
def avg_word2vec(doc):
    word_vectors = [model.wv[word] for word in doc if word in model.wv.index_to_key]
    return np.mean(word_vectors, axis=0) if word_vectors else np.zeros(model.wv.vector_size)

In [409]:
%pip install tqdm
from tqdm import tqdm

Note: you may need to restart the kernel to use updated packages.


In [410]:
X = []
for i in range(len(words)):
    X.append(avg_word2vec(words[i]))

In [411]:
X

[array([-1.71901003e-01,  2.38518789e-01,  1.15987293e-01,  1.01317674e-01,
         9.50029790e-02, -4.87670988e-01,  1.70652792e-01,  4.69117463e-01,
        -2.79838264e-01, -1.19746052e-01, -1.60718039e-01, -3.62952590e-01,
        -5.32832630e-02,  1.25012636e-01,  1.83655515e-01, -1.51157409e-01,
         1.29657716e-01, -2.93809354e-01, -6.71653748e-02, -5.10018587e-01,
         2.03251019e-01,  1.29632756e-01,  7.37347305e-02, -2.25308046e-01,
        -2.81593483e-02, -1.67162456e-02, -2.15953380e-01, -1.83421135e-01,
        -2.51972079e-01,  3.10592316e-02,  3.06342036e-01,  2.09046975e-02,
         1.10848173e-01, -1.77659884e-01, -1.37495577e-01,  3.82333010e-01,
         7.22519606e-02, -1.21558428e-01, -1.13294728e-01, -4.76530761e-01,
         1.22841492e-01, -2.54581034e-01, -1.81726277e-01,  2.45621838e-02,
         1.47301033e-01,  1.14536192e-02, -1.21417269e-01, -3.99263501e-02,
         2.02758700e-01,  1.39902681e-01,  1.71564624e-01, -1.85631886e-01,
        -6.6

In [412]:
X[0]

array([-1.71901003e-01,  2.38518789e-01,  1.15987293e-01,  1.01317674e-01,
        9.50029790e-02, -4.87670988e-01,  1.70652792e-01,  4.69117463e-01,
       -2.79838264e-01, -1.19746052e-01, -1.60718039e-01, -3.62952590e-01,
       -5.32832630e-02,  1.25012636e-01,  1.83655515e-01, -1.51157409e-01,
        1.29657716e-01, -2.93809354e-01, -6.71653748e-02, -5.10018587e-01,
        2.03251019e-01,  1.29632756e-01,  7.37347305e-02, -2.25308046e-01,
       -2.81593483e-02, -1.67162456e-02, -2.15953380e-01, -1.83421135e-01,
       -2.51972079e-01,  3.10592316e-02,  3.06342036e-01,  2.09046975e-02,
        1.10848173e-01, -1.77659884e-01, -1.37495577e-01,  3.82333010e-01,
        7.22519606e-02, -1.21558428e-01, -1.13294728e-01, -4.76530761e-01,
        1.22841492e-01, -2.54581034e-01, -1.81726277e-01,  2.45621838e-02,
        1.47301033e-01,  1.14536192e-02, -1.21417269e-01, -3.99263501e-02,
        2.02758700e-01,  1.39902681e-01,  1.71564624e-01, -1.85631886e-01,
       -6.63656890e-02,  

In [428]:
## independent features
X_new = np.array(X)
X_new.shape, len(X_new)

((5569, 100), 5569)

In [414]:
X_new[0]

array([-1.71901003e-01,  2.38518789e-01,  1.15987293e-01,  1.01317674e-01,
        9.50029790e-02, -4.87670988e-01,  1.70652792e-01,  4.69117463e-01,
       -2.79838264e-01, -1.19746052e-01, -1.60718039e-01, -3.62952590e-01,
       -5.32832630e-02,  1.25012636e-01,  1.83655515e-01, -1.51157409e-01,
        1.29657716e-01, -2.93809354e-01, -6.71653748e-02, -5.10018587e-01,
        2.03251019e-01,  1.29632756e-01,  7.37347305e-02, -2.25308046e-01,
       -2.81593483e-02, -1.67162456e-02, -2.15953380e-01, -1.83421135e-01,
       -2.51972079e-01,  3.10592316e-02,  3.06342036e-01,  2.09046975e-02,
        1.10848173e-01, -1.77659884e-01, -1.37495577e-01,  3.82333010e-01,
        7.22519606e-02, -1.21558428e-01, -1.13294728e-01, -4.76530761e-01,
        1.22841492e-01, -2.54581034e-01, -1.81726277e-01,  2.45621838e-02,
        1.47301033e-01,  1.14536192e-02, -1.21417269e-01, -3.99263501e-02,
        2.02758700e-01,  1.39902681e-01,  1.71564624e-01, -1.85631886e-01,
       -6.63656890e-02,  

In [415]:
df = pd.DataFrame(X_new)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.171901,0.238519,0.115987,0.101318,0.095003,-0.487671,0.170653,0.469117,-0.279838,-0.119746,...,0.353402,0.151133,0.039316,0.056658,0.416593,0.180812,0.147586,-0.197433,0.144407,0.006079
1,-0.162513,0.209077,0.10008,0.088069,0.090368,-0.427797,0.137277,0.417016,-0.246956,-0.097988,...,0.319145,0.125738,0.030251,0.044053,0.356651,0.156388,0.131867,-0.183201,0.133663,-0.001021
2,-0.180867,0.253368,0.120615,0.118939,0.080597,-0.525572,0.17293,0.465684,-0.295507,-0.141446,...,0.345733,0.152629,0.039045,0.042281,0.425459,0.170894,0.098163,-0.227028,0.169088,0.022337
3,-0.23999,0.318394,0.152425,0.138025,0.132262,-0.659956,0.222052,0.64035,-0.385476,-0.155073,...,0.486194,0.200453,0.047928,0.083667,0.553422,0.251471,0.216784,-0.272988,0.197372,-0.000314
4,-0.205227,0.262383,0.13632,0.112498,0.11902,-0.559396,0.187444,0.544887,-0.328734,-0.138495,...,0.413885,0.170021,0.045125,0.073063,0.473733,0.215058,0.179271,-0.239839,0.162844,-0.002379


In [416]:
df.isnull().sum()

0     0
1     0
2     0
3     0
4     0
     ..
95    0
96    0
97    0
98    0
99    0
Length: 100, dtype: int64

In [427]:
valid_indices = [i for i, text in enumerate(corpus) if len(text) > 0]
y = messages.iloc[valid_indices]['labels']
y = pd.get_dummies(y, dtype=int)
y = y.iloc[:, 0].values
type(y),len(valid_indices)

5569

In [418]:
df['output'] = y
X=df.drop(columns=['output'], axis=1)
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.171901,0.238519,0.115987,0.101318,0.095003,-0.487671,0.170653,0.469117,-0.279838,-0.119746,...,0.353402,0.151133,0.039316,0.056658,0.416593,0.180812,0.147586,-0.197433,0.144407,0.006079
1,-0.162513,0.209077,0.100080,0.088069,0.090368,-0.427797,0.137277,0.417016,-0.246956,-0.097988,...,0.319145,0.125738,0.030251,0.044053,0.356651,0.156388,0.131867,-0.183201,0.133663,-0.001021
2,-0.180867,0.253368,0.120615,0.118939,0.080597,-0.525572,0.172930,0.465684,-0.295507,-0.141446,...,0.345733,0.152629,0.039045,0.042281,0.425459,0.170894,0.098163,-0.227028,0.169088,0.022337
3,-0.239990,0.318394,0.152425,0.138025,0.132262,-0.659956,0.222052,0.640350,-0.385476,-0.155073,...,0.486194,0.200453,0.047928,0.083667,0.553422,0.251471,0.216784,-0.272988,0.197372,-0.000314
4,-0.205227,0.262383,0.136320,0.112498,0.119020,-0.559396,0.187444,0.544887,-0.328734,-0.138495,...,0.413885,0.170021,0.045125,0.073063,0.473733,0.215058,0.179271,-0.239839,0.162844,-0.002379
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5564,-0.209392,0.301750,0.155592,0.134916,0.106652,-0.599706,0.211712,0.556008,-0.343741,-0.157726,...,0.422699,0.184844,0.053844,0.054834,0.509177,0.205104,0.138313,-0.250284,0.184009,0.025478
5565,-0.213828,0.280625,0.136676,0.128272,0.115799,-0.595075,0.201423,0.566000,-0.356460,-0.143571,...,0.430385,0.187020,0.033664,0.063535,0.496051,0.216838,0.163350,-0.259224,0.183298,0.009906
5566,-0.238537,0.323800,0.152022,0.126118,0.120387,-0.651460,0.229385,0.636964,-0.377807,-0.167890,...,0.480744,0.212837,0.057023,0.090006,0.561101,0.252854,0.216566,-0.257525,0.185173,0.004197
5567,-0.211637,0.285901,0.139618,0.121601,0.110417,-0.592179,0.201427,0.567805,-0.340724,-0.152995,...,0.423110,0.184598,0.054165,0.074561,0.503731,0.223222,0.174562,-0.240238,0.169524,0.001651


In [419]:
y=df['output']
y

0       1
1       1
2       0
3       1
4       1
       ..
5564    0
5565    1
5566    1
5567    1
5568    1
Name: output, Length: 5569, dtype: int32

In [420]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [434]:
X_train.shape

(4455, 100)

In [422]:
y_train

3918    1
3986    0
2758    1
4796    0
4503    1
       ..
3772    1
5191    1
5226    1
5390    1
860     1
Name: output, Length: 4455, dtype: int32

In [429]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=100, random_state=42)

In [430]:
classifier.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [431]:
y_pred = classifier.predict(X_test)

In [432]:
from sklearn.metrics import accuracy_score,classification_report
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.93      0.87      0.90       159
           1       0.98      0.99      0.98       955

    accuracy                           0.97      1114
   macro avg       0.96      0.93      0.94      1114
weighted avg       0.97      0.97      0.97      1114

0.9730700179533214
