This notebook attempts to preprocess provided posts, tokenize and use an embedding model to generate 

In [17]:
import datetime
import json
import csv

# File path for JSON dataset
file_path = 'unimelb_group_17_sample_data.json'

# Load JSON data
with open(file_path, 'r') as file:
    data = json.load(file)

texts = []

for item in data["data"]:
    if "text" in item['_source']["content_types"]: #ignore posts that do not have text content

        # Convert Unix timestamp to datetime object
        dt = datetime.datetime.fromtimestamp(item['_source']['action_time'] / 1000)  # Divide by 1000 to convert milliseconds to seconds

        # Format the datetime object as a string
        year = dt.strftime("%Y")  # Customize the format as per your requirement
        month = dt.strftime("%m")
        day = dt.strftime("%d")
        texts.append({"id": item['_id'], "year": year, "month": month, "day": day,"record_type": item['_source']['record_type'], "text": item['_source']['text']})

for text in texts:
    print("##########################")
    print(text)

# File path for the CSV file
csv_file_path = 'texts.csv'

# Write texts to CSV file
with open(csv_file_path, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['id', 'year','month','day','record_type', 'text'])  # Write header
    for text in texts:
        writer.writerow([text['id'], text['year'], text['month'], text['day'], text['record_type'], text['text']])  # Write data


##########################
{'id': 'acdf2096275061f2019bb708822c5aa9', 'year': '2023', 'month': '08', 'day': '25', 'record_type': 'message', 'text': 'Dear sir \nGood day to you. #helloworld We are Wholesale manufacturer and exporter in customized soccer uniform,baseball uniform and sports wear and undergarments etc.\n\n\n1. We are expert team of managers, merchandisers, designers and most of all great professional stitching labour.\n\n2. fast turnaround time.\n\n3. artworks within 12-24hours.\n\n4. warranty on our made goods\n\n5. unbeatable quality and unbeatable prices\n\n6. wide range of customization\n\n7. we offer free samples\n\nFeel free to tell us if you need any customized artwork or customized sample.\n\nHoping to hear from you soon on my offer!\n\nThank you so much!\n\nasquareindustry524@gmail.com\n\nRegards:\nAsquareindustry \nAlso contact us on WhatsApp: +923321208372'}
##########################
{'id': '0dd4388f06e4c5987e1f83e94e35a968', 'year': '2016', 'month': '04', 'day

In [2]:
#!pip install gensim transformers sentence-transformers

Collecting transformers
  Obtaining dependency information for transformers from https://files.pythonhosted.org/packages/15/fc/7b6dd7e1adc0a6407b845ed4be1999e98b6917d0694e57316d140cc85484/transformers-4.39.3-py3-none-any.whl.metadata
  Downloading transformers-4.39.3-py3-none-any.whl.metadata (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting sentence-transformers
  Obtaining dependency information for sentence-transformers from https://files.pythonhosted.org/packages/ba/20/7ef81df2e07322d95332d07c1c38c597f543c1f666d689a3153ba6fa09e3/sentence_transformers-2.6.1-py3-none-any.whl.metadata
  Downloading sentence_transformers-2.6.1-py3-none-any.whl.metadata (11 kB)
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers)
  Obtaining dependency information for huggingface-hub<1.0,>=0.19.3 from https://files.pythonhosted.org/packages/05/c0/779afbad8e75565c09ffa24a88b5dd7e293c92b74eb09d

In [46]:
# Data Cleaning and Preprocessing
import string
import re
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, TweetTokenizer 

tweet_tokenizer = TweetTokenizer()

lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

# nltk.download('punkt')
# nltk.download('stopwords')

# Typical cleaning techniques
def clean_text(text):
    text = re.sub("<[^>]+>", "", text).strip()     # Renove HTML markup - if any 
    text = text.lower()     # Convert text to lowercase
    # text = re.sub(r'\d+', '', text)     # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))     # Remove punctuation
    text = re.sub(r"[^\w\s-]+", "", text).strip()
    text = re.sub(r'\s+', ' ', text).strip()     # Remove extra whitespaces
    return text

def lemmatize(word):
    lemma = lemmatizer.lemmatize(word,'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word,'n')
    return lemma


def preprocess_text(text):
    # # Tokenization: Split the text into words/tokens.
    tokens = tweet_tokenizer.tokenize(text)
    
    # Lowercasing: Convert all characters to lowercase to ensure uniformity.
    tokens = [word.lower() for word in tokens]
    
    # Removing Punctuation: Strip punctuation from each word.
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    
    # Removing Non-Alphabetic Characters: Filter out any tokens that are not alphabetic.
    # words = [word for word in stripped if word.isalpha()]
    
    # Stopwords Removal: Filter out stopwords like 'the', 'is', etc.

    # The result is a list of clean, lowercased, alphabetic tokens without any stopwords.
    return stripped 


# Apply preprocessing to each text
processed_data = [preprocess_text(text) for text in texts]
# for text in processed_data:
#     print("##########################")
#     print(text)

#NEED to improve #hashtag

In [11]:
# Word2Vec
from gensim.models import Word2Vec

# Creating the Word2Vec model
model = Word2Vec(processed_data, vector_size=100, window=5, min_count=1, workers=4)


Word2Vec<vocab=58, vector_size=100, alpha=0.025>


In [18]:
# # BERT
# import torch
# from transformers import BertTokenizer, BertModel

# # Function to convert tokens back to string
# def tokens_to_string(tokens):
#     return ' '.join(tokens)

# # Assuming you have your preprocessed data in processed_data
# for tokens in processed_data:
#     text = tokens_to_string(tokens)
#     tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#     model = BertModel.from_pretrained('bert-base-uncased')
    
#     # Encoding the text
#     inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    
#     # Get embeddings from BERT model
#     with torch.no_grad():
#         outputs = model(**inputs)
    
#     # Extract the embeddings for the [CLS] token
#     embeddings = outputs.last_hidden_state[:, 0, :]
#     print(embeddings)


tensor([[ 6.3844e-02,  1.0504e-01,  4.0992e-02, -4.2579e-02, -2.3354e-01,
         -3.9933e-01,  1.1702e-01,  4.6059e-01, -7.6370e-02, -1.8849e-01,
          1.2808e-01,  3.3672e-02,  1.8109e-01,  8.5573e-02,  4.0654e-02,
          1.6405e-01, -4.2237e-01,  5.8249e-01,  2.5443e-01,  3.0992e-01,
         -2.7429e-01, -9.7950e-01,  2.1268e-01, -3.9676e-01,  1.7533e-03,
         -4.0917e-01, -3.8727e-02, -3.5095e-01, -7.8682e-02,  1.0377e-01,
          3.8977e-01,  3.3523e-01, -3.9679e-02, -5.1565e-01,  3.6672e-01,
         -3.2922e-01,  3.8769e-01, -1.5823e-01,  2.9197e-01,  1.5891e-01,
         -1.4887e-02,  2.9068e-01,  6.2196e-01,  1.5573e-01, -6.9632e-02,
         -1.0264e-01, -3.0597e+00, -1.4376e-01, -4.2515e-02, -3.5150e-01,
          5.5508e-01, -2.2956e-01,  2.7049e-01,  1.6017e-01,  2.8476e-01,
          5.1459e-01, -6.0721e-01,  2.0733e-01, -9.8158e-02,  4.4031e-01,
         -1.0629e-02,  1.9578e-02, -2.8376e-01,  9.5047e-03,  2.3789e-01,
          3.4480e-01, -1.6752e-01,  5.

In [48]:
pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl.metadata (11 kB)
Collecting transformers<5.0.0,>=4.34.0 (from sentence-transformers)
  Downloading transformers-4.40.1-py3-none-any.whl.metadata (137 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.0/138.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.3.0-cp38-none-macosx_11_0_arm64.whl.metadata (26 kB)
Collecting huggingface-hub>=0.15.1 (from sentence-transformers)
  Downloading huggingface_hub-0.23.0-py3-none-any.whl.metadata (12 kB)
Collecting Pillow (from sentence-transformers)
  Downloading pillow-10.3.0-cp38-cp38-macosx_11_0_arm64.whl.metadata (9.2 kB)
Collecting filelock (from huggingface-hub>=0.15.1->sentence-transformers)
  Downloading filelock-3.14.0-py3-none-any.whl.metadata (2.8 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub>=0.15.1->sentence-transformers)
  Dow

In [50]:
from sentence_transformers import SentenceTransformer

sentences = ["This is an example sentence", "Each sentence is converted"]
model = SentenceTransformer('all-MiniLM-L6-v2')

embeddings = model.encode(sentences)

print(embeddings)


[[ 6.76569194e-02  6.34958968e-02  4.87130880e-02  7.93049857e-02
   3.74480411e-02  2.65279552e-03  3.93749215e-02 -7.09845219e-03
   5.93614466e-02  3.15370038e-02  6.00980744e-02 -5.29051870e-02
   4.06067446e-02 -2.59308498e-02  2.98427679e-02  1.12691044e-03
   7.35149086e-02 -5.03819622e-02 -1.22386597e-01  2.37028301e-02
   2.97265332e-02  4.24769297e-02  2.56337728e-02  1.99516793e-03
  -5.69190569e-02 -2.71598455e-02 -3.29035483e-02  6.60248622e-02
   1.19007140e-01 -4.58791330e-02 -7.26215169e-02 -3.25839631e-02
   5.23413792e-02  4.50553223e-02  8.25298205e-03  3.67023945e-02
  -1.39415069e-02  6.53919503e-02 -2.64272671e-02  2.06361787e-04
  -1.36643453e-02 -3.62809934e-02 -1.95043236e-02 -2.89738607e-02
   3.94270830e-02 -8.84090811e-02  2.62425351e-03  1.36714010e-02
   4.83062677e-02 -3.11565958e-02 -1.17329143e-01 -5.11690564e-02
  -8.85287672e-02 -2.18961760e-02  1.42986327e-02  4.44167666e-02
  -1.34814540e-02  7.43392482e-02  2.66382638e-02 -1.98762100e-02
   1.79191

In [54]:
embeddings[1]

array([ 8.64385813e-02,  1.02762625e-01,  5.39454259e-03,  2.04439019e-03,
       -9.96338855e-03,  2.53855139e-02,  4.92875539e-02, -3.06265913e-02,
        6.87254667e-02,  1.01366024e-02,  7.75397718e-02, -9.00807083e-02,
        6.10612938e-03, -5.69898561e-02,  1.41714755e-02,  2.80491374e-02,
       -8.68464559e-02,  7.64399171e-02, -1.03491239e-01, -6.77438229e-02,
        6.99946880e-02,  8.44251141e-02, -7.24918908e-03,  1.04770260e-02,
        1.34020504e-02,  6.77576736e-02, -9.42085907e-02, -3.71689871e-02,
        5.22617772e-02, -3.10853329e-02, -9.63406488e-02,  1.57716852e-02,
        2.57866681e-02,  7.85245076e-02,  7.89949223e-02,  1.91516839e-02,
        1.64356660e-02,  3.10084550e-03,  3.81311290e-02,  2.37090886e-02,
        1.05389543e-02, -4.40644771e-02,  4.41738702e-02, -2.58727577e-02,
        6.15378730e-02, -4.05427702e-02, -8.64140466e-02,  3.19722854e-02,
       -8.90694733e-04, -2.44437177e-02, -9.19721723e-02,  2.33939793e-02,
       -8.30293372e-02,  

# Store embeddings in a vector DB