In [1]:
# Twitter database - word embeddings
import itertools
import pandas as pd
import numpy as np
import re
import os
from tqdm import tqdm
from nltk.stem import WordNetLemmatizer, SnowballStemmer

# Functions
# Shared Snowball stemmer for English, reused by every stemming call below
stemmer = SnowballStemmer(language="english")

# Word normalisation: lemmatize first, then stem
def stem_and_lemmatize(text:str) -> str:
    """Return the stem of the verb lemma of ``text``.

    The word is first lemmatized with WordNet, treating it as a verb
    (``pos='v'``), and the resulting lemma is then passed through the
    module-level Snowball ``stemmer``.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    stemmed = stemmer.stem(verb_lemma)
    return stemmed

def unique_word_dict(text: list) -> dict:
    """Map every distinct word in ``text`` to an integer index.

    Args:
        text: Words to index. Any iterable of sortable, hashable items
            works (it is consumed once to build a set), duplicates are
            ignored.

    Returns:
        A dict ``{word: index}`` where indices run from 0 to
        ``len(result) - 1`` and follow the sorted order of the words.
        An empty input gives an empty dict.
    """
    # Sorting makes the word -> index assignment deterministic across runs.
    unique_words = sorted(set(text))

    # The key must be the loop variable `word`; the previous code used an
    # unrelated name `w`, which is either undefined or a leaked notebook
    # global, so the mapping was never built correctly.
    return {word: i for i, word in enumerate(unique_words)}

def preprocess_texts(text_list: list) -> list:
    """Clean a list of tweets and split each one into normalised tokens.

    For every entry the text is lowercased, URLs and @-mentions are replaced
    by a space, every character that is not an ASCII letter (digits
    included) is replaced by a space, runs of three or more identical
    characters are shortened to two, and each remaining whitespace-separated
    word is passed through ``stem_and_lemmatize``.

    Args:
        text_list: Tweets to process. Entries that are not strings are
            converted with ``str()``; ``None`` and float NaN (what pandas
            produces for an empty CSV field) are treated as empty text.

    Returns:
        A list with one list of tokens per input entry, in the same order.
        Entries with no usable words yield an empty list.
    """
    # Compile once; the same patterns are applied to every tweet.
    url_pattern = re.compile(r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)")
    user_pattern = re.compile(r"@[^\s]+")
    alpha_pattern = re.compile(r"[^a-zA-Z]")
    sequence_pattern = re.compile(r"(.)\1\1+")
    seq_replace_pattern = r"\1\1"

    full_tweet_list = []
    for text in text_list:
        # Missing values must not crash on .lower() nor turn into the
        # literal word "nan"/"none"; they simply produce no tokens.
        is_missing = text is None or (isinstance(text, float) and text != text)
        cleaned = "" if is_missing else str(text).lower()

        # Remove URLs from the tweet text
        cleaned = url_pattern.sub(' ', cleaned)
        # Remove usernames from the tweet text
        cleaned = user_pattern.sub(' ', cleaned)
        # Remove everything that is not a letter (digits and symbols alike)
        cleaned = alpha_pattern.sub(' ', cleaned)
        # Replace 3 or more consecutive identical characters with 2
        cleaned = sequence_pattern.sub(seq_replace_pattern, cleaned)

        full_tweet_list.append(
            [stem_and_lemmatize(word) for word in cleaned.split()]
        )

    return full_tweet_list

def preprocess_single_tweet(text: str) -> list:
    """Clean one tweet and return its filtered, normalised tokens.

    The text is lowercased, URLs and @-mentions are replaced by a space,
    every character that is not an ASCII letter (digits included) is
    replaced by a space, and runs of three or more identical characters are
    shortened to two. Words that are gensim stop words or have three or
    fewer characters are dropped; the rest are passed through
    ``stem_and_lemmatize``.

    Args:
        text: The raw tweet text.

    Returns:
        The list of processed tokens, in their original order.
    """
    # The notebook's import cell does not import gensim, so the bare
    # `gensim.` reference used previously raised NameError. Importing here
    # keeps the dependency local to the one function that needs it.
    from gensim.parsing.preprocessing import STOPWORDS

    # Lowercase the tweet
    lc_text = text.lower()

    # Regex patterns
    url_pattern        = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
    user_pattern       = r"@[^\s]+"
    alpha_pattern      = "[^a-zA-Z]"
    sequence_pattern   = r"(.)\1\1+"
    seq_replace_pattern = r"\1\1"

    # Remove URLs from the tweet text
    lc_text = re.sub(url_pattern, ' ', lc_text)

    # Remove usernames from the tweet text
    lc_text = re.sub(user_pattern, ' ', lc_text)

    # Remove everything that is not a letter (digits and symbols alike)
    lc_text = re.sub(alpha_pattern, ' ', lc_text)

    # Replace 3 or more consecutive identical characters with 2
    lc_text = re.sub(sequence_pattern, seq_replace_pattern, lc_text)

    # The stop-word and length filters look at the cleaned word *before*
    # it is stemmed.
    processed_text = []
    for word in lc_text.split():
        if word not in STOPWORDS and len(word) > 3:
            processed_text.append(stem_and_lemmatize(word))

    return processed_text

In [2]:
# Load the tweet texts: read the CSV and keep the 'tweet_text' column as a plain list
tweets = list(pd.read_csv('twitter_database.csv')['tweet_text'])

In [3]:
# Clean every tweet and split it into stemmed/lemmatized tokens
processed_tweets = preprocess_texts(tweets)

# Empty accumulator for the word pairs collected later in the notebook
word_lists = list()

In [4]:
# Build the list of [focus word, context word] pairs
WINDOW_SIZE = 5    # Size of the context window (words on each side)

# Start from an empty list so that re-running this cell does not append
# a second copy of every pair.
word_lists = []

for tweet in processed_tweets:
    for i, word in enumerate(tweet):
        for w in range(WINDOW_SIZE):
            # Context word (w + 1) positions ahead, if the tweet is long enough.
            # The bound must use the current offset `w`, not WINDOW_SIZE;
            # otherwise words near the end of a tweet get no forward context.
            if i + 1 + w < len(tweet):
                word_lists.append([word, tweet[i + 1 + w]])
            # Context word (w + 1) positions behind, if it exists
            if i - w - 1 >= 0:
                word_lists.append([word, tweet[i - w - 1]])

In [5]:
# Flatten the per-tweet token lists into one stream of tokens.
# Note: this is a one-shot iterator; building the dictionary below exhausts it.
all_processed = itertools.chain.from_iterable(processed_tweets)

# Word -> index mapping over the whole corpus
unique_words = unique_word_dict(all_processed)

# The vocabulary as a list (dictionary order)
unique_word_list = list(unique_words)

# Number of features = vocabulary size
words_count = len(unique_word_list)