In [4]:
import json

# File path for JSON dataset
file_path = 'data.json'

# Load JSON data
with open(file_path, 'r') as file:
    data = json.load(file)

# Extracting the 'text' content from the dataset
texts = [item['_source']['text'] for item in data] if isinstance(data, list) else [data['_source']['text']]
print(texts)

['Dear sir \nGood day to you. We are Wholesale manufacturer and exporter in customized soccer uniform,baseball uniform and sports wear and undergarments etc.\n\n\n1. We are expert team of managers, merchandisers, designers and most of all great professional stitching labour.\n\n2. fast turnaround time.\n\n3. artworks within 12-24hours.\n\n4. warranty on our made goods\n\n5. unbeatable quality and unbeatable prices\n\n6. wide range of customization\n\n7. we offer free samples\n\nFeel free to tell us if you need any customized artwork or customized sample.\n\nHoping to hear from you soon on my offer!\n\nThank you so much!\n\nasquareindustry524@gmail.com\n\nRegards:\nAsquareindustry \nAlso contact us on WhatsApp: +923321208372']


In [2]:
#!pip install gensim transformers sentence-transformers

Collecting transformers
  Obtaining dependency information for transformers from https://files.pythonhosted.org/packages/15/fc/7b6dd7e1adc0a6407b845ed4be1999e98b6917d0694e57316d140cc85484/transformers-4.39.3-py3-none-any.whl.metadata
  Downloading transformers-4.39.3-py3-none-any.whl.metadata (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting sentence-transformers
  Obtaining dependency information for sentence-transformers from https://files.pythonhosted.org/packages/ba/20/7ef81df2e07322d95332d07c1c38c597f543c1f666d689a3153ba6fa09e3/sentence_transformers-2.6.1-py3-none-any.whl.metadata
  Downloading sentence_transformers-2.6.1-py3-none-any.whl.metadata (11 kB)
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers)
  Obtaining dependency information for huggingface-hub<1.0,>=0.19.3 from https://files.pythonhosted.org/packages/05/c0/779afbad8e75565c09ffa24a88b5dd7e293c92b74eb09d

In [5]:
# Data Cleaning and Preprocessing
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# nltk.download('punkt')
# nltk.download('stopwords')

def preprocess_text(text):
    # Tokenization: Split the text into words/tokens.
    tokens = word_tokenize(text)
    
    # Lowercasing: Convert all characters to lowercase to ensure uniformity.
    tokens = [word.lower() for word in tokens]
    
    # Removing Punctuation: Strip punctuation from each word.
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    
    # Removing Non-Alphabetic Characters: Filter out any tokens that are not alphabetic.
    words = [word for word in stripped if word.isalpha()]
    
    # Stopwords Removal: Filter out stopwords like 'the', 'is', etc.
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]
    
    # The result is a list of clean, lowercased, alphabetic tokens without any stopwords.
    return words


# Apply preprocessing to each text
processed_data = [preprocess_text(text) for text in texts]
print(processed_data)

[['dear', 'sir', 'good', 'day', 'wholesale', 'manufacturer', 'exporter', 'customized', 'soccer', 'uniform', 'baseball', 'uniform', 'sports', 'wear', 'undergarments', 'etc', 'expert', 'team', 'managers', 'merchandisers', 'designers', 'great', 'professional', 'stitching', 'labour', 'fast', 'turnaround', 'time', 'artworks', 'within', 'warranty', 'made', 'goods', 'unbeatable', 'quality', 'unbeatable', 'prices', 'wide', 'range', 'customization', 'offer', 'free', 'samples', 'feel', 'free', 'tell', 'us', 'need', 'customized', 'artwork', 'customized', 'sample', 'hoping', 'hear', 'soon', 'offer', 'thank', 'much', 'gmailcom', 'regards', 'asquareindustry', 'also', 'contact', 'us', 'whatsapp']]


In [11]:
# Word2Vec
from gensim.models import Word2Vec

# Creating the Word2Vec model
model = Word2Vec(processed_data, vector_size=100, window=5, min_count=1, workers=4)


Word2Vec<vocab=58, vector_size=100, alpha=0.025>


In [18]:
# BERT
import torch
from transformers import BertTokenizer, BertModel

# Function to convert tokens back to string
def tokens_to_string(tokens):
    return ' '.join(tokens)

# Assuming you have your preprocessed data in processed_data
for tokens in processed_data:
    text = tokens_to_string(tokens)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    
    # Encoding the text
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    
    # Get embeddings from BERT model
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Extract the embeddings for the [CLS] token
    embeddings = outputs.last_hidden_state[:, 0, :]
    print(embeddings)


tensor([[ 6.3844e-02,  1.0504e-01,  4.0992e-02, -4.2579e-02, -2.3354e-01,
         -3.9933e-01,  1.1702e-01,  4.6059e-01, -7.6370e-02, -1.8849e-01,
          1.2808e-01,  3.3672e-02,  1.8109e-01,  8.5573e-02,  4.0654e-02,
          1.6405e-01, -4.2237e-01,  5.8249e-01,  2.5443e-01,  3.0992e-01,
         -2.7429e-01, -9.7950e-01,  2.1268e-01, -3.9676e-01,  1.7533e-03,
         -4.0917e-01, -3.8727e-02, -3.5095e-01, -7.8682e-02,  1.0377e-01,
          3.8977e-01,  3.3523e-01, -3.9679e-02, -5.1565e-01,  3.6672e-01,
         -3.2922e-01,  3.8769e-01, -1.5823e-01,  2.9197e-01,  1.5891e-01,
         -1.4887e-02,  2.9068e-01,  6.2196e-01,  1.5573e-01, -6.9632e-02,
         -1.0264e-01, -3.0597e+00, -1.4376e-01, -4.2515e-02, -3.5150e-01,
          5.5508e-01, -2.2956e-01,  2.7049e-01,  1.6017e-01,  2.8476e-01,
          5.1459e-01, -6.0721e-01,  2.0733e-01, -9.8158e-02,  4.4031e-01,
         -1.0629e-02,  1.9578e-02, -2.8376e-01,  9.5047e-03,  2.3789e-01,
          3.4480e-01, -1.6752e-01,  5.

In [15]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

embeddings = model.encode(processed_data)

print(embeddings)


[[-8.22104812e-02  3.90439481e-02  6.34335354e-03  9.53142438e-03
  -3.43552255e-03 -5.94012626e-02  5.88293187e-03 -2.69118380e-02
  -1.90554757e-03 -2.55809408e-02 -2.82144453e-02  4.21938449e-02
  -1.21377734e-02  2.96761356e-02  5.66073656e-02  7.18065202e-02
   8.29539355e-03 -5.61062433e-02  3.21218595e-02 -2.16934104e-02
   2.98571456e-02  4.57499623e-02  3.66484486e-02 -3.93893868e-02
  -3.98137532e-02 -2.53301132e-02  6.96491543e-03  5.64314425e-02
  -2.51339134e-02  5.26237208e-03  2.23284960e-02  4.52725403e-02
  -2.12371051e-02  6.86998814e-02 -5.28390966e-02 -3.86430398e-02
   6.96342788e-04 -1.33246072e-02 -6.85782361e-05 -3.45795304e-02
  -1.72064523e-03 -4.65044342e-02  1.36138853e-02 -6.78495411e-03
   8.92007947e-02  3.84858996e-02 -3.83521356e-02 -7.44843706e-02
  -2.98544904e-03 -2.62982305e-02 -8.28009695e-02 -9.89479423e-02
  -6.65626824e-02  3.30890389e-03  1.07534409e-01 -4.89039533e-02
   3.78467403e-02 -4.80679348e-02  6.90913722e-02 -1.91480294e-03
  -2.88868