# Smote: equilibrio de clases

In [46]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from keras.preprocessing.text import Tokenizer
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
from keras_preprocessing.sequence import pad_sequences
import gensim
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument
import re
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import nltk

import warnings
warnings.filterwarnings("ignore")


In [41]:
df = pd.read_csv('https://raw.githubusercontent.com/lihkir/Data/main/spam_text_class.csv',delimiter=',',encoding='latin-1')
df = df[['Category','Message']]
df = df[pd.notnull(df['Message'])]
df.index = range(5572)

In [39]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


 this function cleans and preprocesses text data by removing HTML tags, replacing triple pipes with spaces, replacing URLs with $'<URL>'$, converting text to lowercase, and removing 'x' characters.

In [47]:
from bs4 import BeautifulSoup

def cleanText(text):
    text = BeautifulSoup(text, "lxml").text
    text = re.sub(r'\|\|\|', r' ', text) 
    text = re.sub(r'http\S+', r'<URL>', text)
    text = text.lower()
    text = text.replace('x', '')
    return text
df['Message'] = df['Message'].apply(cleanText)

In [48]:
train, test = train_test_split(df, test_size=0.000001 , random_state=42)

In [23]:
def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) <= 0:
                continue
            tokens.append(word.lower())
    return tokens

train_tagged = train.apply(lambda r: TaggedDocument(words=tokenize_text(r['Message']), tags=[r.Category]), axis=1)
test_tagged = test.apply(lambda r: TaggedDocument(words=tokenize_text(r['Message']), tags=[r.Category]), axis=1)

# Número máximo de palabras a utilizar (más frecuentes)
max_fatures = 500000

#Número máximo de palabras en cada reclamación.

MAX_SEQUENCE_LENGTH = 50

tokenizer = Tokenizer(num_words=max_fatures, split=' ', filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df['Message'].values)
X = tokenizer.texts_to_sequences(df['Message'].values)
X = pad_sequences(X)
print('Found %s unique tokens.' % len(X))

Found 5572 unique tokens.


In [24]:
X = tokenizer.texts_to_sequences(df['Message'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (5572, 50)


In [25]:
train_tagged.values

array([TaggedDocument(words=['and', 'also', 'i', "'ve", 'sorta', 'blown', 'him', 'off', 'a', 'couple', 'times', 'recently', 'so', 'id', 'rather', 'not', 'tet', 'him', 'out', 'of', 'the', 'blue', 'looking', 'for', 'weed'], tags=['ham']),
       TaggedDocument(words=['mmm', 'thats', 'better', 'now', 'i', 'got', 'a', 'roast', 'down', 'me', '!', 'iâ\x92d', 'b', 'better', 'if', 'i', 'had', 'a', 'few', 'drinks', 'down', 'me', '2', '!', 'good', 'indian', '?'], tags=['ham']),
       TaggedDocument(words=['mm', 'have', 'some', 'kanji', 'dont', 'eat', 'anything', 'heavy', 'ok'], tags=['ham']),
       ...,
       TaggedDocument(words=['prabha', '..', 'i', "'m", 'soryda', '..', 'realy', '..', 'frm', 'heart', 'i', "'m", 'sory'], tags=['ham']),
       TaggedDocument(words=['nt', 'joking', 'seriously', 'i', 'told'], tags=['ham']),
       TaggedDocument(words=['did', 'he', 'just', 'say', 'somebody', 'is', 'named', 'tampa'], tags=['ham'])],
      dtype=object)

In [26]:
d2v_model = Doc2Vec(dm=1, dm_mean=1, vector_size=20, window=8, min_count=1, workers=1, alpha=0.065, min_alpha=0.065)

In [27]:
d2v_model.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 5571/5571 [00:00<00:00, 927719.36it/s]




In [28]:
%%time
for epoch in range(30):
    d2v_model.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    d2v_model.alpha -= 0.002
    d2v_model.min_alpha = d2v_model.alpha

100%|██████████| 5571/5571 [00:00<00:00, 651947.98it/s]


100%|██████████| 5571/5571 [00:00<00:00, 1277484.42it/s]
100%|██████████| 5571/5571 [00:00<?, ?it/s]
100%|██████████| 5571/5571 [00:00<00:00, 337266.07it/s]
100%|██████████| 5571/5571 [00:00<00:00, 1113325.12it/s]
100%|██████████| 5571/5571 [00:00<00:00, 356163.58it/s]
100%|██████████| 5571/5571 [00:00<00:00, 1005312.03it/s]
100%|██████████| 5571/5571 [00:00<00:00, 686542.31it/s]
100%|██████████| 5571/5571 [00:00<00:00, 1112053.47it/s]
100%|██████████| 5571/5571 [00:00<?, ?it/s]
100%|██████████| 5571/5571 [00:00<?, ?it/s]
100%|██████████| 5571/5571 [00:00<00:00, 10837879.21it/s]
100%|██████████| 5571/5571 [00:00<?, ?it/s]
100%|██████████| 5571/5571 [00:00<?, ?it/s]
100%|██████████| 5571/5571 [00:00<?, ?it/s]
100%|██████████| 5571/5571 [00:00<00:00, 356049.61it/s]
100%|██████████| 5571/5571 [00:00<?, ?it/s]
100%|██████████| 5571/5571 [00:00<?, ?it/s]
100%|██████████| 5571/5571 [00:00<?, ?it/s]
100%|██████████| 5571/5571 [00:00<00:00, 1561929.65it/s]
100%|██████████| 5571/5571 [00:00<00

CPU times: total: 7.05 s
Wall time: 16.9 s


In [31]:
len(d2v_model.wv.vocab)

AttributeError: The vocab attribute was removed from KeyedVector in Gensim 4.0.0.
Use KeyedVector's .key_to_index dict, .index_to_key list, and methods .get_vecattr(key, attr) and .set_vecattr(key, attr, new_val) instead.
See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4

In [29]:
print(d2v_model)

Doc2Vec<dm/m,d20,n5,w8,s0.001>


In [30]:
embedding_matrix = np.zeros((len(d2v_model.wv.vocab)+ 1, 20))

AttributeError: The vocab attribute was removed from KeyedVector in Gensim 4.0.0.
Use KeyedVector's .key_to_index dict, .index_to_key list, and methods .get_vecattr(key, attr) and .set_vecattr(key, attr, new_val) instead.
See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4