In [1]:
import os
import re
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from tqdm.auto import tqdm
from gensim.models import KeyedVectors, FastText
import multiprocessing
import numpy as np
from collections import Counter

class Model:
    """Pipeline that trains FastText embeddings on one tweet dump.

    The intended call order is ``load_data`` -> ``preprocess`` ->
    ``load_pretrained_vectors`` -> ``train_embeddings`` -> ``save_model``.
    Each stage stores its result on the instance so later stages (and the
    inspection helpers ``preview_tokens``, ``analyze_word`` and
    ``word_frequency``) can read it.
    """

    def __init__(self, filename, data_dir='/content/drive/MyDrive/SLANGuage_Data/text_only'):
        """Remember where the data lives; nothing is read from disk yet.

        Args:
            filename: Base name of the dataset (without ``.json``). It is also
                reused as the base name of the saved model.
            data_dir: Directory that contains ``<filename>.json``.
        """
        self.filename = filename
        self.data_path = os.path.join(data_dir, f"{filename}.json")
        self.data = None                # DataFrame set by load_data()
        self.tokenized_data = None      # list of token lists set by preprocess()
        self.fasttext_model = None      # FastText model set by train_embeddings()
        self.pretrained_vectors = None  # KeyedVectors set by load_pretrained_vectors()

    def load_data(self):
        """Read the line-delimited JSON file at ``self.data_path`` into ``self.data``.

        A column labelled ``0`` is renamed to ``'text'``, which is the column
        ``preprocess`` works on. The first rows are printed for inspection.
        """
        frame = pd.read_json(self.data_path, lines=True)
        # Re-bind instead of renaming in place so the freshly read frame is
        # never half-modified if this method is re-run.
        self.data = frame.rename(columns={0: 'text'})
        print("Data loaded successfully.")
        print(self.data.head())

    @staticmethod
    def _strip_mentions(text):
        """Replace every ``@mention`` with a single space.

        The pattern deliberately does not require trailing whitespace, so a
        mention that ends the tweet is removed as well.
        """
        return re.sub(r'@\S+', ' ', text)

    @staticmethod
    def _strip_links(text):
        """Replace http/https URLs (and the whitespace after them) with a space."""
        return re.sub(r'https?://\S+\s*', ' ', text)

    @staticmethod
    def _strip_special_tokens(text):
        """Drop the ``[CLS]``/``[SEP]`` markers, trim the ends and lowercase."""
        return text.replace('[CLS]', '').replace('[SEP]', '').strip().lower()

    @staticmethod
    def _collapse_whitespace(text):
        """Collapse every run of whitespace into one space."""
        return re.sub(r'\s+', ' ', text)

    @staticmethod
    def _apply(series, func):
        """Apply ``func`` element-wise, with a progress bar when one is available.

        ``progress_apply`` only exists on pandas objects after ``tqdm.pandas()``
        has been called somewhere; when it has not, fall back to plain
        ``apply`` so this class does not depend on another cell's side effect.
        """
        runner = getattr(series, 'progress_apply', None)
        if runner is None:
            runner = series.apply
        return runner(func)

    def preprocess(self):
        """Clean ``self.data['text']`` and tokenize it into ``self.tokenized_data``.

        Cleaning runs as four passes (mentions, links, special tokens,
        whitespace) and overwrites the ``'text'`` column with the result.
        Tokenization keeps words (with an optional ``'``-suffix) and single
        punctuation marks, and drops the retweet marker ``rt``.
        """
        print("Starting preprocessing...")

        cleaning_steps = (
            ("Removing @usernames...", self._strip_mentions),
            ("Removing links...", self._strip_links),
            ("Cleaning special tokens...", self._strip_special_tokens),
            ("Normalizing spaces...", self._collapse_whitespace),
        )
        text = self.data['text']
        for message, step in cleaning_steps:
            print(message)
            text = self._apply(text, step)
        self.data['text'] = text

        # Tokenization with its own progress bar
        print("Tokenizing and removing 'RT'...")
        tokenizer = RegexpTokenizer(r"\w+(?:'\w+)?|[^\w\s]")
        self.tokenized_data = self._apply(
            self.data['text'],
            lambda x: [token for token in tokenizer.tokenize(x) if token.lower() != 'rt'],
        ).tolist()

        print("Preprocessing completed.")

    def preview_tokens(self, len=2):
        """Print the first ``len`` tokenized tweets.

        The parameter name shadows the builtin ``len`` inside this method; it
        is kept as-is so existing keyword callers keep working. Nothing is
        returned -- the preview is printed.
        """
        print("Previewing tokenized data...")
        print(self.tokenized_data[:len])

    def load_pretrained_vectors(self, vector_path='/content/drive/MyDrive/SLANGuage_Data/cc.en.300.kv'):
        """Load pre-trained vectors from ``vector_path`` into ``self.pretrained_vectors``.

        The file is opened memory-mapped read-only (``mmap='r'``).
        """
        print("Loading pre-trained FastText vectors...")
        self.pretrained_vectors = KeyedVectors.load(vector_path, mmap='r')
        print("Pre-trained vectors loaded successfully.")

    def _seed_from_pretrained(self, vector_size):
        """Copy pre-trained vectors into the freshly built model's vocabulary.

        Raises:
            ValueError: If the pre-trained vectors do not have ``vector_size``
                dimensions.
        """
        pretrained = self.pretrained_vectors
        if pretrained.vector_size != vector_size:
            raise ValueError(
                f"Pre-trained vectors have {pretrained.vector_size} dimensions "
                f"but the model was created with vector_size={vector_size}."
            )

        wv = self.fasttext_model.wv
        # gensim's FastText trains `vectors_vocab` (plus the n-gram buckets) and
        # rebuilds the composite `vectors` from them after training, so writing
        # only through `wv[word] = ...` would be discarded. Seed the trainable
        # matrix as well when the attribute exists.
        trainable = getattr(wv, 'vectors_vocab', None)
        known = pretrained.key_to_index
        # Walk the (much smaller) corpus vocabulary rather than building a set
        # out of the whole pre-trained vocabulary.
        for word, index in list(wv.key_to_index.items()):
            if word not in known:
                continue
            vector = pretrained[word]
            wv[word] = vector
            if trainable is not None:
                trainable[index] = vector

    def train_embeddings(self, vector_size=300, window=5, min_count=25, epochs=40):
        """Train a skip-gram FastText model on ``self.tokenized_data``.

        Words shared with ``self.pretrained_vectors`` start from the
        pre-trained vector; if no pre-trained vectors were loaded the model is
        trained from its default initialization.

        Args:
            vector_size: Embedding dimensionality.
            window: Context window size.
            min_count: Words seen fewer times than this are ignored.
            epochs: Number of passes over the corpus.

        Raises:
            RuntimeError: If ``preprocess`` has not been run yet.
            ValueError: If the pre-trained vectors have a different dimensionality.
        """
        if self.tokenized_data is None:
            raise RuntimeError("No tokenized data: call preprocess() before train_embeddings().")

        print("Initializing FastText model...")
        num_cores = multiprocessing.cpu_count()
        self.fasttext_model = FastText(vector_size=vector_size, window=window, min_count=min_count, sg=1, workers=num_cores)

        # Build vocabulary
        print("Building vocabulary...")
        self.fasttext_model.build_vocab(corpus_iterable=self.tokenized_data)

        # Intersect with pre-trained vectors
        if self.pretrained_vectors is None:
            print("No pre-trained vectors loaded; skipping intersection.")
        else:
            print("Intersecting with pre-trained vectors...")
            self._seed_from_pretrained(vector_size)

        # Train model
        print(f"Training FastText model for {epochs} epochs...")
        self.fasttext_model.train(
            corpus_iterable=self.tokenized_data,
            total_examples=self.fasttext_model.corpus_count,
            epochs=epochs
        )
        print("Training completed.")

    def save_model(self, save_dir='/content/drive/MyDrive/SLANGuage_Data/embeddings'):
        """Save the trained model as ``<save_dir>/<filename>.model``.

        ``save_dir`` is created first if it does not exist.
        """
        os.makedirs(save_dir, exist_ok=True)
        save_path = os.path.join(save_dir, f"{self.filename}.model")
        self.fasttext_model.save(save_path)
        print(f"Model saved at {save_path}.")

    def analyze_word(self, word, top_n=10):
        """Print the ``top_n`` nearest neighbours of ``word`` with their similarity."""
        print(f"Finding top {top_n} most similar words for '{word}'...")
        neighbors = self.fasttext_model.wv.most_similar(word, topn=top_n)
        for neighbor, similarity in neighbors:
            print(f"{neighbor}: {similarity:.4f}")

    def word_frequency(self, word):
        """Count, print and return how often ``word`` occurs in ``self.tokenized_data``."""
        # Count per tweet instead of flattening the corpus and building a
        # Counter of every token just to look up a single word.
        freq = sum(tweet.count(word) for tweet in self.tokenized_data)
        print(f"Frequency of '{word}': {freq}")
        return freq

In [2]:
# Colab-only setup: mount Google Drive at /content/drive. The default data,
# vector and model paths used by Model all live under this mount point, so this
# cell has to run before any Model method that touches the disk.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from tqdm.auto import tqdm
tqdm.pandas()  # Registers Series.progress_apply so preprocessing shows progress bars

# One entry per dataset to process; each is the base name of a JSON file.
year_month_dates = ['2019-06-01']

for year_month_date in year_month_dates:
  print('Starting process for',year_month_date)
  # Load and Process Data
  model = Model(year_month_date)
  model.load_data()
  model.preprocess()
  # preview_tokens() prints the preview itself and returns None, so wrapping it
  # in print() only added a stray "None" line to the output.
  model.preview_tokens()

  # Train Model
  model.load_pretrained_vectors()
  model.train_embeddings()
  model.save_model()

Starting process for 2019-06-01
Data loaded successfully.
                                                text
0  RT @tomcoates: @realDonaldTrump Appointed mass...
1                 @hansqrrl YES STAN MONSTA X (≧∇≦)b
2                           @shrutithenaik Thank you
3  RT @CFCBrano: Hazard compilation of him throug...
4  Major investment platforms provide regular mut...
Starting preprocessing...
Removing @usernames...


  0%|          | 0/1038317 [00:00<?, ?it/s]

Removing links...


  0%|          | 0/1038317 [00:00<?, ?it/s]

Cleaning special tokens...


  0%|          | 0/1038317 [00:00<?, ?it/s]

Normalizing spaces...


  0%|          | 0/1038317 [00:00<?, ?it/s]

Tokenizing and removing 'RT'...


  0%|          | 0/1038317 [00:00<?, ?it/s]

Preprocessing completed.
Previewing tokenized data...
[['appointed', 'massively', 'anti', '-', 'lgbt', 'judges'], ['yes', 'stan', 'monsta', 'x', '(', '≧', '∇', '≦', ')', 'b']]
None
Loading pre-trained FastText vectors...
Pre-trained vectors loaded successfully.
Initializing FastText model...
Building vocabulary...
Intersecting with pre-trained vectors...
Training FastText model for 40 epochs...
Training completed.
Model saved at /content/drive/MyDrive/SLANGuage_Data/embeddings/2019-06-01.model.
