# Text Classification 

## SkipGram Model - Word2Vec implementation

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import Counter

### Dataset: India News Headlines

link : https://www.kaggle.com/datasets/therohk/india-headlines-news-dataset 

In [41]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

In [42]:
# data = np.loadtxt('./data/india-news-headlines.csv', dtype=str)
# data = data[1:]
df = pd.read_csv('./data/india-news-headlines.csv')
df.head( )

Unnamed: 0,publish_date,headline_category,headline_text
0,20010102,unknown,Status quo will not be disturbed at Ayodhya; s...
1,20010102,unknown,Fissures in Hurriyat over Pak visit
2,20010102,unknown,America's unwanted heading for India?
3,20010102,unknown,For bigwigs; it is destination Goa
4,20010102,unknown,Extra buses to clear tourist traffic


In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3876557 entries, 0 to 3876556
Data columns (total 3 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   publish_date       int64 
 1   headline_category  object
 2   headline_text      object
dtypes: int64(1), object(2)
memory usage: 88.7+ MB


In [44]:
df = df.dropna()
df = df.drop_duplicates()
df = df.reset_index(drop=True)
df.shape

(3850912, 3)

In [45]:
sentances = df['headline_text'].tolist()
type(sentances)
sentances[:5]

list

['Status quo will not be disturbed at Ayodhya; says Vajpayee',
 'Fissures in Hurriyat over Pak visit',
 "America's unwanted heading for India?",
 'For bigwigs; it is destination Goa',
 'Extra buses to clear tourist traffic']

In [46]:
import re 
def pre_process_data(text):
    """Normalise a headline before tokenisation.

    The text is lower-cased, then every character that is neither a word
    character nor whitespace is deleted outright (no replacement space), so
    ``"America's"`` becomes ``"americas"``.
    """
    return re.sub(r'[^\w\s]', '', text.lower())

sentances = [pre_process_data(text) for text in sentances] 
sentances[:5]


['status quo will not be disturbed at ayodhya says vajpayee',
 'fissures in hurriyat over pak visit',
 'americas unwanted heading for india',
 'for bigwigs it is destination goa',
 'extra buses to clear tourist traffic']

In [47]:
sentances = sentances[:15000]

In [48]:
d = 128  # Embedding dimension
m = 2    # Context window size
batch_size = 32
epochs = 100
learning_rate = 0.001

In [49]:
# tokenized_sentences = [sentence.split() for sentence in sentances]
# tokenized_sentences[:4] , len(tokenized_sentences)

In [50]:
# words = [word for sentence in tokenized_sentences for word in sentence]
# words[:4] , len(words)

In [51]:
# word_counts = Counter(words)
# vocab = {word: idx for idx, (word, _) in enumerate(word_counts.most_common())}
# vocab_size = len(vocab)
# vocab_size

In [52]:
# pairs = []
# for sentence in tokenized_sentences:
#     indices = [vocab[word] for word in sentence]
#     for i in range(len(indices)):
#         target = indices[i]
#         context = indices[max(0, i - m):i] + indices[i + 1:min(len(indices), i + m + 1)]
#         for ctx in context:
#             pairs.append((target, ctx))
# pairs[:4] , len(pairs)

In [53]:
# idx2word = {idx: word for word, idx in vocab.items()}

In [56]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [57]:
class SkipGramDataset(Dataset):
    """Skip-gram ``(target, context)`` index pairs built from raw sentences.

    Sentences are tokenised with ``str.split()``. The vocabulary maps each
    word to an integer id, assigned in ``Counter.most_common()`` order (most
    frequent word gets id 0).

    Args:
        data: iterable of sentence strings.
        window_size: number of neighbours taken on each side of a target
            word. When omitted, the notebook-level ``m`` is used, so existing
            ``SkipGramDataset(sentences)`` calls keep their behaviour.

    Attributes:
        words: flat list of every token, in corpus order.
        word_counts: ``Counter`` of token frequencies.
        vocab: ``word -> id`` mapping.
        vocab_size: number of distinct words.
        word_dict: inverse ``id -> word`` mapping.
        pairs: list of ``(target_id, context_id)`` tuples.
        window_size: the window actually used to build ``pairs``.

    Raises:
        ValueError: if the window size is negative.
    """

    def __init__(self, data, window_size=None):
        # Only touch the global `m` when no explicit window is supplied.
        self.window_size = m if window_size is None else window_size
        if self.window_size < 0:
            # A negative window would make the slices below wrap around.
            raise ValueError("window_size must be non-negative")

        tokenized_sentences = [sentence.split() for sentence in data]
        self.words = [word for sentence in tokenized_sentences for word in sentence]
        self.word_counts = Counter(self.words)
        self.vocab = {word: idx for idx, (word, _) in enumerate(self.word_counts.most_common())}
        self.vocab_size = len(self.vocab)
        self.word_dict = {idx: word for word, idx in self.vocab.items()}

        self.pairs = []
        for sentence in tokenized_sentences:
            indices = [self.vocab[word] for word in sentence]
            for i, target in enumerate(indices):
                # Slicing clamps at the sentence end, so no explicit min() is needed on the right.
                left = indices[max(0, i - self.window_size):i]
                right = indices[i + 1:i + 1 + self.window_size]
                for ctx in left + right:
                    self.pairs.append((target, ctx))

    def __len__(self):
        """Number of (target, context) pairs."""
        return len(self.pairs)

    def __getitem__(self, idx):
        """Return the pair at ``idx`` as two scalar ``long`` tensors on the global ``device``."""
        target, ctx = self.pairs[idx]
        # Allocate straight on `device` instead of building on the CPU and copying.
        return (torch.tensor(target, dtype=torch.long, device=device),
                torch.tensor(ctx, dtype=torch.long, device=device))

    def idx2word(self, idx):
        """Map a vocabulary id back to its word (``KeyError`` for unknown ids)."""
        return self.word_dict[idx]
    
dataset = SkipGramDataset(sentances)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
len(dataset)

274470

In [58]:
dataset.vocab_size

16115

In [None]:
class Word2Vec(nn.Module):
    """Skip-gram model with separate target ("in") and context ("out") embedding tables.

    Args:
        vocab_size: number of rows in each embedding table.
        embedding_dim: width of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.in_embed = nn.Embedding(vocab_size, embedding_dim)
        self.out_embed = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, target, context):
        """Score every target in the batch against every context in the batch.

        Args:
            target: tensor of target word ids.
            context: tensor of context word ids.

        Returns:
            ``scores`` where ``scores[i, j]`` is the dot product of the i-th
            target embedding with the j-th context embedding.
        """
        # Embedding outputs already live on the same device as the module's
        # weights, so no explicit `.to(device)` (and no dependency on a
        # notebook-level global) is needed here.
        target_embed = self.in_embed(target)      # (batch_size, d)
        context_embed = self.out_embed(context)   # (batch_size, d)
        return torch.matmul(target_embed, context_embed.T)  # (batch_size, batch_size)

# Place the model on the selected `device` rather than a hard-coded 'cuda':
# `device` falls back to the CPU when CUDA is unavailable, and a literal
# 'cuda' would raise on such machines.
model = Word2Vec(dataset.vocab_size, d).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [60]:
import datetime

# Short trial run; this rebinds the notebook-level `epochs`.
epochs = 2
start_train = datetime.datetime.now()

for epoch in range(epochs):
    total_loss = 0
    start_epoch = datetime.datetime.now()
    for target, context in dataloader:
        optimizer.zero_grad()
        scores = model(target, context)
        # Row i of `scores` is trained to pick column i (its own context) over
        # the other contexts in the batch. Build the labels directly on the
        # scores' device instead of creating them on the CPU and copying
        # every step.
        labels = torch.arange(len(target), device=scores.device)
        loss = criterion(scores, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    end_epoch = datetime.datetime.now()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(dataloader):.4f} Time : {end_epoch - start_epoch}")
end_train = datetime.datetime.now()
print(f"Total training time: {end_train - start_train}")

# Get final embeddings.
# `.detach()` is the supported way to get a no-grad view of the weights
# (`.data` bypasses autograd's safety checks). The view still shares storage
# with the model, so further training updates it in place.
embeddings = model.in_embed.weight.detach()
print("Embeddings shape:", embeddings.shape)

Epoch 1/2, Loss: 18.6476
Epoch 2/2, Loss: 12.7575
Embeddings shape: torch.Size([16115, 128])


In [61]:
word = "india"
embedding = embeddings[dataset.vocab[word]]
embedding

tensor([ 0.8281,  0.2208,  0.2066, -0.3692, -0.2861, -0.5029, -0.9231, -0.0129,
        -0.2746,  0.6212, -0.2126, -0.0185,  0.1650, -0.1373,  0.1493,  0.2634,
        -0.3162,  0.2602, -0.2889, -0.0176, -0.1919,  0.1611, -0.4257,  0.3053,
        -0.1197,  0.8984, -0.3155, -0.3865,  0.1984,  0.2698, -0.1225,  0.1945,
        -0.2526, -0.2370,  0.2030,  0.0103, -0.3100,  0.1015, -1.0637,  0.6350,
         0.2944,  0.1062, -0.8214, -0.5063,  0.0519,  0.1600,  0.4495,  0.6756,
        -0.0444,  0.7521, -0.5282, -0.3213, -0.1128, -0.8681,  0.0423,  0.2985,
         0.4212,  0.8281, -0.9976,  0.3831,  0.2669,  0.9346,  0.1325,  0.4874,
        -0.2070, -0.1826, -0.1507,  0.3772,  0.6873, -0.3246, -0.0659,  0.7672,
         0.3524,  0.5708, -0.4227,  0.0271, -0.3807,  0.8528,  0.0433, -0.2789,
         0.7951, -0.4810, -0.1612, -0.2830, -0.4202, -0.3223, -0.4604, -0.3238,
        -0.0356,  1.0936, -0.4676,  0.2899,  0.1627, -0.0911,  0.6669, -0.2958,
         1.6538,  0.7600, -0.0513, -0.64

In [63]:
def find_similar(word, top_k=5):
    """Return the ``top_k`` vocabulary words with the largest dot product to ``word``.

    Reads the notebook-level ``embeddings`` tensor and ``dataset`` vocabulary.

    Args:
        word: query word; must be a key of ``dataset.vocab``.
        top_k: maximum number of neighbours to return.

    Returns:
        List of ``(word, score)`` tuples, best first, never containing the
        query word itself.

    Raises:
        KeyError: if ``word`` is not in the vocabulary.
    """
    with torch.no_grad():
        query_idx = dataset.vocab[word]
        similarities = torch.matmul(embeddings, embeddings[query_idx])
        # With an unnormalised dot product the query is not guaranteed to
        # rank first (a longer vector can beat it), so dropping rank 0 could
        # discard a real neighbour and keep the query. Mask it explicitly.
        # `similarities` is a fresh tensor, so the write leaves the embeddings alone.
        similarities[query_idx] = float('-inf')
        k = max(0, min(top_k, similarities.numel() - 1))
        values, indices = torch.topk(similarities, k)
        return [(dataset.idx2word(idx.item()), val.item()) for val, idx in zip(values, indices)]
    
find_similar("india")

[('testify', 22.02251625061035),
 ('humiliation', 20.87239646911621),
 ('e', 20.559024810791016),
 ('hoitytoity', 20.296947479248047),
 ('mouma', 20.176151275634766)]

In [65]:
find_similar("goa")

[('trainers', 30.954187393188477),
 ('priests', 29.73798942565918),
 ('restriction', 29.651779174804688),
 ('allegations', 27.303354263305664),
 ('f', 26.921924591064453)]

In [71]:
import datetime

# Full training run; this rebinds the notebook-level `epochs`.
epochs = 100
start_train = datetime.datetime.now()

for epoch in range(epochs):
    total_loss = 0
    start_epoch = datetime.datetime.now()
    for target, context in dataloader:
        optimizer.zero_grad()
        scores = model(target, context)
        # Row i of `scores` is trained to pick column i (its own context) over
        # the other contexts in the batch. Build the labels directly on the
        # scores' device instead of creating them on the CPU and copying
        # every step.
        labels = torch.arange(len(target), device=scores.device)
        loss = criterion(scores, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    end_epoch = datetime.datetime.now()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(dataloader):.4f} Time : {end_epoch - start_epoch}")
end_train = datetime.datetime.now()
print(f"Total training time: {end_train - start_train}")

# Get final embeddings.
# `.detach()` is the supported way to get a no-grad view of the weights
# (`.data` bypasses autograd's safety checks). The view still shares storage
# with the model.
embeddings = model.in_embed.weight.detach()
print("Embeddings shape:", embeddings.shape)

Epoch 1/100, Loss: 5.9219 Time : 0:00:58.164656
Epoch 2/100, Loss: 4.8245 Time : 0:01:03.099172
Epoch 3/100, Loss: 4.0234 Time : 0:00:57.366843
Epoch 4/100, Loss: 3.4184 Time : 0:01:04.583211
Epoch 5/100, Loss: 2.9561 Time : 0:00:58.608970
Epoch 6/100, Loss: 2.6140 Time : 0:01:03.982722
Epoch 7/100, Loss: 2.3386 Time : 0:00:54.838242
Epoch 8/100, Loss: 2.1318 Time : 0:01:03.527977
Epoch 9/100, Loss: 1.9658 Time : 0:00:56.782572
Epoch 10/100, Loss: 1.8342 Time : 0:01:02.251838
Epoch 11/100, Loss: 1.7314 Time : 0:00:57.739489
Epoch 12/100, Loss: 1.6460 Time : 0:01:02.666478
Epoch 13/100, Loss: 1.5748 Time : 0:00:55.932510
Epoch 14/100, Loss: 1.5190 Time : 0:01:02.170413
Epoch 15/100, Loss: 1.4716 Time : 0:00:55.336758
Epoch 16/100, Loss: 1.4305 Time : 0:01:03.820962
Epoch 17/100, Loss: 1.3986 Time : 0:00:57.933283
Epoch 18/100, Loss: 1.3705 Time : 0:01:00.363841
Epoch 19/100, Loss: 1.3478 Time : 0:00:56.754024
Epoch 20/100, Loss: 1.3255 Time : 0:01:02.396801
Epoch 21/100, Loss: 1.3109 Ti

* 100 epochs
* Total training time: 2:02:26.107574
* Embeddings shape: torch.Size([16115, 128])

In [72]:
word = "india"
embedding = embeddings[dataset.vocab[word]]
embedding

tensor([ 0.0194,  0.3255,  0.0346,  0.0183,  0.1571,  0.0103, -0.1187,  0.0466,
         0.4334,  0.0987, -0.3504, -0.3079, -0.0638,  0.2358,  0.0409,  0.0637,
        -0.2056,  0.0773,  0.2327,  0.0081, -0.1010,  0.1506, -0.2580, -0.1388,
        -0.1459,  0.3029, -0.0941,  0.0629, -0.3272, -0.4064, -0.3322,  0.4888,
         0.0069, -0.0429, -0.3430,  0.2600,  0.0395,  0.0536, -0.2117,  0.1122,
         0.1337, -0.0182, -0.0374,  0.2303, -0.0293,  0.0690,  0.1730, -0.2067,
        -0.1530,  0.4344, -0.0104, -0.1476,  0.0397,  0.2683,  0.2841,  0.3988,
         0.3831,  0.2640, -0.2403,  0.0968, -0.2361,  0.2960,  0.4244,  0.0815,
        -0.2283, -0.1886,  0.0197, -0.0378,  0.1208, -0.2162, -0.0120, -0.0064,
        -0.0338, -0.0991,  0.1906, -0.1373, -0.1937,  0.2037, -0.5391,  0.0145,
        -0.0810,  0.0641, -0.1783, -0.3374, -0.1284,  0.0445,  0.0844, -0.3269,
         0.1566,  0.1763, -0.1409,  0.2428,  0.4478,  0.0144,  0.1651, -0.1897,
         0.2562,  0.0517,  0.0119, -0.37

In [73]:
find_similar("india")

[('nawruz', 9.902406692504883),
 ('singareni', 9.812178611755371),
 ('septuplets', 9.148886680603027),
 ('diagnostic', 9.141351699829102),
 ('plough', 8.840997695922852)]

In [90]:
def find_similar_cosine(word, top_k=5):
    """Return the ``top_k`` vocabulary words most cosine-similar to ``word``.

    Reads the notebook-level ``embeddings`` tensor and ``dataset`` vocabulary.

    Args:
        word: query word; must be a key of ``dataset.vocab``.
        top_k: maximum number of neighbours to return.

    Returns:
        List of ``(word, cosine)`` tuples, best first, never containing the
        query word itself.

    Raises:
        KeyError: if ``word`` is not in the vocabulary.
    """
    with torch.no_grad():
        query_idx = dataset.vocab[word]
        vec = embeddings[query_idx]
        # Clamp the denominator so an all-zero embedding row yields 0 instead of NaN.
        norms = torch.clamp(torch.norm(embeddings, dim=1) * torch.norm(vec), min=1e-12)
        similarities = torch.matmul(embeddings, vec) / norms
        # Mask the query explicitly instead of assuming it is rank 0; ties or
        # duplicate vectors could otherwise leave it in the results.
        similarities[query_idx] = float('-inf')
        k = max(0, min(top_k, similarities.numel() - 1))
        values, indices = torch.topk(similarities, k)
        return [(dataset.idx2word(idx.item()), val.item()) for val, idx in zip(values, indices)]

find_similar_cosine("india")

[('us', 0.4843333661556244),
 ('pm', 0.44458457827568054),
 ('it', 0.4277379512786865),
 ('talks', 0.41505417227745056),
 ('its', 0.400892972946167)]

In [83]:
find_similar_cosine("goa")

[('ministers', 0.33838722109794617),
 ('fashions', 0.3322593569755554),
 ('set', 0.3311415910720825),
 ('at', 0.32622039318084717),
 ('new', 0.3222562074661255)]

In [84]:
find_similar_cosine("man")

[('and', 0.4343026280403137),
 ('as', 0.42826682329177856),
 ('set', 0.407269150018692),
 ('down', 0.4007364809513092),
 ('chief', 0.40044018626213074)]

In [138]:
w1 =  embeddings[dataset.vocab["man"]]
w2 =  embeddings[dataset.vocab["king"]]
w3 =  embeddings[dataset.vocab["queen"]]
w4 = w1 - w2 + w3
def find_similar_cosine_vec(vec, top_k=5):
    """Rank the vocabulary by cosine similarity to an arbitrary vector.

    Reads the notebook-level ``embeddings`` tensor and ``dataset`` vocabulary.
    ``top_k + 1`` candidates are requested and none are dropped, so the
    result holds ``top_k + 1`` ``(word, cosine)`` tuples, best first.
    """
    with torch.no_grad():
        dots = torch.matmul(embeddings, vec)
        scale = torch.norm(embeddings, dim=1) * torch.norm(vec)
        best = torch.topk(dots / scale, top_k + 1)
        return [
            (dataset.idx2word(word_id.item()), score.item())
            for score, word_id in zip(best.values, best.indices)
        ]

find_similar_cosine_vec(w4)

[('queen', 0.7592395544052124),
 ('fests', 0.33734554052352905),
 ('expired', 0.31791630387306213),
 ('landmine', 0.31200072169303894),
 ('lethal', 0.29223430156707764),
 ('2612', 0.2892310321331024)]

In [137]:
def analogy(word1, word2, word3):
    """Answer ``word1 : word2 :: word3 : ?`` with the trained embeddings.

    Builds the query vector ``word2 - word1 + word3``, ranks the vocabulary
    with ``find_similar_cosine_vec`` and removes the three input words from
    the ranking. Returns the remaining ``(word, cosine)`` tuples.
    """
    query_words = (word1, word2, word3)
    v1, v2, v3 = (embeddings[dataset.vocab[w]] for w in query_words)
    candidates = find_similar_cosine_vec(v2 - v1 + v3)
    return [pair for pair in candidates if pair[0] not in query_words]

analogy("king", "man" ,"queen")

[('fests', 0.33734554052352905),
 ('expired', 0.31791630387306213),
 ('landmine', 0.31200072169303894),
 ('lethal', 0.29223430156707764),
 ('2612', 0.2892310321331024)]

In [76]:
# folder = './model_parameters/'
# np.save(folder + 'embeddings.npy', embeddings.cpu().numpy()) 
# np.save(folder + 'vocab.npy', dataset.vocab)
# np.save(folder + 'word_dict.npy', dataset.word_dict)
# np.save(folder + 'word_counts.npy', dataset.word_counts)
# np.save(folder + 'pairs.npy', dataset.pairs)
# np.save(folder + 'words.npy', dataset.words)

In [134]:
folder = './model_parameters/'
embeddings = torch.tensor(np.load(folder + 'embeddings.npy')) 

## SkipGram - pretrained model

In [60]:
import gensim 
from gensim.models import KeyedVectors
import gensim.downloader as api

def download_word2vec_model(model_name="word2vec-google-news-300"):
    """Fetch a pretrained model through ``gensim.downloader`` and return its path.

    The name is checked against the catalogue from ``api.info()``; an unknown
    name raises ``ValueError``. Any exception raised along the way (that
    ``ValueError`` included) is reported with ``print`` and then re-raised.
    """
    try:
        available_models = api.info()['models'].keys()
        # Happy path first: only names present in the catalogue are loaded.
        if model_name in available_models:
            print(f"Downloading {model_name}...")
            model_path = api.load(model_name, return_path=True)
            print(f"Model downloaded successfully to: {model_path}")
            return model_path

        raise ValueError(
            f"Model '{model_name}' not found. Available models: {', '.join(available_models)}"
        )
    except Exception as e:
        print(f"Error downloading model: {str(e)}")
        raise

word2vec_path = download_word2vec_model()
embeddings = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

Downloading word2vec-google-news-300...
Model downloaded successfully to: C:\Users\myalla/gensim-data\word2vec-google-news-300\word2vec-google-news-300.gz


In [92]:
embeddings.most_similar("india")

[('indian', 0.6967039704322815),
 ('usa', 0.6836211085319519),
 ('pakistan', 0.681516706943512),
 ('chennai', 0.6675503253936768),
 ('america', 0.6589399576187134),
 ('sri_lanka', 0.64982008934021),
 ('canada', 0.6490967869758606),
 ('australia', 0.6368584036827087),
 ('mexico', 0.6239137649536133),
 ('uk', 0.6221641898155212)]

In [96]:
w1 = embeddings.get_vector("king")
w2 = embeddings.get_vector("man")
w3 = embeddings.get_vector("woman")

w4 = w1 - w2 + w3

embeddings.similar_by_vector(w4)

[('king', 0.8449392318725586),
 ('queen', 0.7300517559051514),
 ('monarch', 0.645466148853302),
 ('princess', 0.6156251430511475),
 ('crown_prince', 0.5818676352500916),
 ('prince', 0.5777117609977722),
 ('kings', 0.5613663792610168),
 ('sultan', 0.5376775860786438),
 ('Queen_Consort', 0.5344247817993164),
 ('queens', 0.5289887189865112)]

In [105]:
def analogy_gensim(word1, word2, word3):
    """Answer ``word1 : word2 :: word3 : ?`` with the pretrained vectors.

    Builds ``word2 - word1 + word3`` from ``embeddings.get_vector`` lookups,
    queries ``embeddings.similar_by_vector`` with its default result count,
    and filters the three input words out of the hits.
    """
    query_words = (word1, word2, word3)
    v1, v2, v3 = (embeddings.get_vector(w) for w in query_words)
    neighbours = embeddings.similar_by_vector(v2 - v1 + v3)
    return [hit for hit in neighbours if hit[0] not in query_words]

analogy_gensim("king", "man" ,"queen")

[('woman', 0.7186801433563232),
 ('girl', 0.5882835388183594),
 ('lady', 0.5754351615905762),
 ('teenage_girl', 0.5700528025627136),
 ('teenager', 0.5378326177597046),
 ('schoolgirl', 0.497780978679657),
 ('policewoman', 0.49065014719963074),
 ('blonde', 0.4870774745941162),
 ('redhead', 0.4778464436531067)]

In [116]:
analogy_gensim("paris", "france" ,"tokyo")[:3]
analogy_gensim("usa", "dollar","india")[:3]
analogy_gensim("germany", "hitler" ,"india")[:3]
analogy_gensim("usa", "english" ,"india")[:3]

[('japan', 0.5214436054229736),
 ('hong_kong', 0.46592071652412415),
 ('japanese', 0.4565887153148651)]

[('rupee', 0.6182144284248352),
 ('greenback', 0.5907650589942932),
 ('Japanese_Yen', 0.5396061539649963)]

[('bjp', 0.5662038326263428),
 ('sonia', 0.5460007786750793),
 ('modi', 0.5448266863822937)]

[('English', 0.5612783432006836),
 ('Hindi', 0.5493810772895813),
 ('Institute_ITRI_eng', 0.5471854209899902)]

## Using the pretrained model to classify e-commerce product descriptions

In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

### Dataset: E-commerce product descriptions

In [36]:
dataset_file= './../datasets/Ecommerce/ecommerceDataset.csv'
df = pd.read_csv(dataset_file, header=None)
df.head()

Unnamed: 0,0,1
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


In [37]:
df.columns = ['type', 'text']
df.head()

Unnamed: 0,type,text
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


In [38]:
df.value_counts('type')

type
Household                 19313
Books                     11820
Electronics               10621
Clothing & Accessories     8671
Name: count, dtype: int64

In [51]:
import re
from nltk.corpus import stopwords
def preprocessing_text(text):
    """Strip punctuation and collapse whitespace runs into single spaces.

    Args:
        text: input string (already lower-cased by the caller in this notebook).

    Returns:
        The text with every character that is neither a word character nor
        whitespace removed, and each run of whitespace replaced by one space.
    """
    text = re.sub(r'[^\w\s]', '', text)
    # `\s+` (one or more whitespace chars), not the character class `[\s+]`,
    # which matches a single whitespace char or a literal '+' and therefore
    # never collapsed runs.
    text = re.sub(r'\s+', ' ', text)
    return text
df['pre_text'] = df['text'].astype(str)
df['pre_text'] = df['pre_text'].str.lower()
df['pre_text'] = df['pre_text'].apply(preprocessing_text)
df.head()

Unnamed: 0,type,text,pre_text
0,Household,Paper Plane Design Framed Wall Hanging Motivat...,paper plane design framed wall hanging motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",saf floral framed painting wood 30 inch x 10 i...
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...,saf uv textured modern art print framed painti...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1...",saf flower print framed painting synthetic 135...
4,Household,Incredible Gifts India Wooden Happy Birthday U...,incredible gifts india wooden happy birthday u...


In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27802 entries, 0 to 27801
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   type      27802 non-null  object
 1   text      27802 non-null  object
 2   pre_text  27802 non-null  object
dtypes: object(3)
memory usage: 651.7+ KB


In [53]:
df['pre_text'][0]

'paper plane design framed wall hanging motivational office decor art prints 87 x 87 inch  set of 4 painting made up in synthetic frame with uv textured print which gives multi effects and attracts towards it this is an special series of paintings which makes your wall very beautiful and gives a royal touch this painting is ready to hang you would be proud to possess this unique painting that is a niche apart we use only the most modern and efficient printing technology on our prints with only the and inks and precision epson roland and hp printers this innovative hd printing technique results in durable and spectacular looking prints of the highest that last a lifetime we print solely with topnotch 100 inks to achieve brilliant and true colours due to their high level of uv resistance our prints retain their beautiful colours for many years add colour and style to your living space with this digitally printed painting some are for pleasure and some for eternal blissso bring home this 

In [54]:
df = df.dropna()
df = df.drop_duplicates()
df = df.reset_index(drop=True)
df.shape

(27802, 3)

In [55]:
X = df['pre_text'].tolist()
y = df['type'].tolist()
len(X), len(y)

(27802, 27802)

In [61]:
# word2vec_path = download_word2vec_model()
# embeddings = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
dim = embeddings.vector_size
dim

300

In [75]:
def preprocess(text):
    """Split ``text`` into whitespace-separated tokens.

    Args:
        text: input string.

    Returns:
        List of tokens; empty for an empty or all-whitespace string.
    """
    # `\s+`, not the character class `[\s+]`: the latter also matched a
    # literal '+', silently splitting tokens such as "c++".
    text = re.sub(r'\s+', ' ', text)
    return text.split()

# Represent each document by the mean of the pretrained vectors of its
# in-vocabulary tokens. Documents that are not strings, or that have no
# in-vocabulary token, are recorded in `skipped` and omitted from `Xd`.
Xd, skipped = [], []
for i, (doc, label) in enumerate(zip(X, y)):
    if type(doc) is not str:
        print(doc, type(doc), i, 'skipped', label)
        skipped.append(i)
        continue

    tokens = preprocess(doc)
    total = np.zeros(dim)
    n_found = 0
    for token in tokens:
        try:
            total += embeddings.get_vector(token)
        except KeyError:
            # Token missing from the pretrained vocabulary: ignore it.
            continue
        n_found += 1

    if n_found == 0:
        print(tokens, i, 'skipped', label)
        skipped.append(i)
        continue

    total /= n_found
    Xd.append(total)

Xd = np.array(Xd)
Xd.shape, len(skipped)

['paper', 'plane', 'design', 'framed', 'wall', 'hanging', 'motivational', 'office', 'decor', 'art', 'prints', '87', 'x', '87', 'inch', 'set', 'of', '4', 'painting', 'made', 'up', 'in', 'synthetic', 'frame', 'with', 'uv', 'textured', 'print', 'which', 'gives', 'multi', 'effects', 'and', 'attracts', 'towards', 'it', 'this', 'is', 'an', 'special', 'series', 'of', 'paintings', 'which', 'makes', 'your', 'wall', 'very', 'beautiful', 'and', 'gives', 'a', 'royal', 'touch', 'this', 'painting', 'is', 'ready', 'to', 'hang', 'you', 'would', 'be', 'proud', 'to', 'possess', 'this', 'unique', 'painting', 'that', 'is', 'a', 'niche', 'apart', 'we', 'use', 'only', 'the', 'most', 'modern', 'and', 'efficient', 'printing', 'technology', 'on', 'our', 'prints', 'with', 'only', 'the', 'and', 'inks', 'and', 'precision', 'epson', 'roland', 'and', 'hp', 'printers', 'this', 'innovative', 'hd', 'printing', 'technique', 'results', 'in', 'durable', 'and', 'spectacular', 'looking', 'prints', 'of', 'the', 'highest', '

((27786, 300), 16)

In [76]:
y = np.array(y)
y.shape

(27802,)

In [77]:
mask = np.ones(len(y), dtype=bool)
mask[skipped] = False
y[mask].shape, Xd.shape

((27786,), (27786, 300))

In [78]:
dataset_file= './../datasets/Ecommerce/'

np.save(dataset_file + 'X.npy', Xd)
np.save(dataset_file + 'y.npy', y[mask])

In [79]:
X = np.load(dataset_file + 'X.npy')
y = np.load(dataset_file + 'y.npy')

# Fix the seed so the split, and the accuracy reported below, is reproducible
# across runs. Stratify on the labels because the four classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((22228, 300), (5558, 300), (22228,), (5558,))

In [87]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [88]:
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred) * 100

92.92911119107593