In [104]:
from nltk import ngrams
from typing import List, Tuple, Optional, Set
from nltk.tokenize import word_tokenize
from collections import Counter
import random

## 1. Load the data and retrieve tweets tokens

First we need to clean the data: let's remove the `"` character from the tweets, because some of them have it at the start of the tweet but are missing it at the end. Left in place, these unbalanced quotes would cause the dataframe tweet column to be mixed up with the following columns.

In [105]:
# Read the raw CSV line by line and drop every double-quote character while
# doing so; the result is one quote-free string per line of the file
# (header line included).
with open('resources/tweets.csv', 'r', encoding="utf8") as f:
    tweets = [line.replace('"', '') for line in f]

Now we can retrieve the tweets tokens. We will use the `word_tokenize` function from the `nltk.tokenize` module to tokenize the tweets.
We also add the start and end tokens to each tweet and merge all tokens into one list.

In [106]:
# split csv data columns and select the one with tweet texts
# Skip the header line and keep only the second comma-separated field of every
# row, which is the field used as the tweet text.
# NOTE(review): a plain split(",") also cuts the text at the first comma that
# appears inside a tweet -- confirm this truncation is acceptable.
tweet_texts = [row.split(",")[1] for row in tweets[1:]]

# One token list per tweet: word_tokenize turns each text into a list of strings.
tweets_tokens = [word_tokenize(text) for text in tweet_texts]
print(tweets_tokens[0])

# Wrap every tweet in explicit start / end markers.
tweets_tokens = [['<s>', *tweet_tokens, '</s>'] for tweet_tokens in tweets_tokens]

# Flatten the per-tweet lists into a single stream of tokens.
tokens = [tok for tweet_tokens in tweets_tokens for tok in tweet_tokens]
print(tokens[:15])

['LOSER', '!', 'https', ':', '//t.co/p5imhMJqS1']
['<s>', 'LOSER', '!', 'https', ':', '//t.co/p5imhMJqS1', '</s>', '<s>', 'Most', 'of', 'the', 'money', 'raised', 'by', 'the']


## 2. Generate n-grams

In [107]:
# Materialise every consecutive pair and triple of tokens from the flat token
# stream, so they can be indexed and counted afterwards.
bigrams, trigrams = list(ngrams(tokens, 2)), list(ngrams(tokens, 3))

## 3. Count n-grams frequencies

For each n-gram we use the `Counter` class from the `collections` module to count the frequency of each n-gram.
Each n-gram is stored in a dictionary where the key is the n-gram and the value is its frequency.

In [108]:
# Map every distinct n-gram to the number of times it occurs.
bigrams_freq, trigrams_freq = Counter(bigrams), Counter(trigrams)

# Quick sanity check: show the first three bigrams of the stream with their counts.
for i in range(3):
    print(f"Bigram {i}: {bigrams[i]} - Frequency: {bigrams_freq[bigrams[i]]}")

Bigram 0: ('<s>', 'LOSER') - Frequency: 1
Bigram 1: ('LOSER', '!') - Frequency: 2
Bigram 2: ('!', 'https') - Frequency: 10


## 4. Generate tweets

We generate tweets using the bigrams and trigrams frequencies. 

Once we have the list of tokens for each tweet, we can join them in a natural, human-readable way (e.g. no space before a punctuation mark and no space after an `@` or `#` symbol).

In [109]:
# Tokens that are glued to the text before them (no leading space is inserted).
punctuation: Set[str] = {".", ",", "!", "?", ")", "]", "}", ":", ";", "'", "\"", "’", "‘",
                         "”", "–", "—", "…", "•", "·", "``", "''"}


def join_tokens(tweet_tokens: List[str]) -> str:
    """
    Join a list of tokens into a single, naturally spaced string.

    Spacing rules:
    - tokens found in `punctuation` are appended without a leading space;
    - the opening tokens "@", "(", "#" and "“" get a leading space and suppress
      the space before the next regular token;
    - every other token is preceded by a single space.

    The sentence markers "<s>" and "</s>" are dropped while joining. Removing
    them only after the string has been built would leave the spaces that
    surrounded them behind, so a marker pair in the middle of the sequence
    would show up as a run of blanks in the result.

    :param List[str] tweet_tokens: tokens to join, optionally containing "<s>" / "</s>" markers.

    :return: the joined text with leading and trailing whitespace removed
        (an empty string for an empty or marker-only input).
    """
    markers = ("<s>", "</s>")
    openers = ("@", "(", "#", "“")

    # Collect the pieces and join once instead of growing a string in the loop.
    pieces: List[str] = []
    skip_space = False
    for token in tweet_tokens:
        if token in markers:
            # A marker is a tweet boundary: emit nothing, and do not let an
            # opener seen before the boundary glue itself to the next tweet.
            skip_space = False
            continue
        if token in punctuation:
            pieces.append(token)
        elif token in openers:
            pieces.append(" " + token)
            skip_space = True
        elif skip_space:
            pieces.append(token)
            skip_space = False
        else:
            pieces.append(" " + token)
    return "".join(pieces).strip()

To generate tweets from bigrams the following choices are made:
- If the start word is not provided, the first word will be the start token.
- If a token that is never followed by another token (other than the end token) in the corpus is chosen, the tweet generation will stop. We have no criteria to choose the next token in this case.
- If the next token is the end token and at least 80% of `num_words` have been generated, the tweet generation will stop.

In [110]:
def generate_tweets_from_bigrams(num_words: Optional[int] = 20, start_word: Optional[str] = "<s>") -> str:
    """
    Generate a tweet by sampling a chain of words from the bigram frequencies.

    Starting from `start_word`, the next word is drawn at random among the words
    that follow the current word in `bigrams_freq`, weighted by how often each
    bigram occurs. Generation stops when:
    - `num_words` words have been drawn, or
    - the current word has no recorded successor, or
    - the end token "</s>" is drawn while the tweet already holds more than
      80% of `num_words` tokens (an earlier "</s>" is appended like any other
      token and generation continues).

    :param Optional[int] num_words: maximum number of words to draw after the start word.
        `None` falls back to the default of 20.
    :param Optional[str] start_word: first token of the tweet. `None` falls back to
        the start token "<s>".

    :return: the generated tokens joined into a string by `join_tokens`.
    """
    # Both parameters are declared Optional, so treat None as "use the default"
    # instead of failing in range() or seeding the tweet with a None token.
    if num_words is None:
        num_words = 20
    if start_word is None:
        start_word = "<s>"

    # Index the successors of every word once per call rather than scanning the
    # whole counter for each generated word. Iteration order is preserved, so
    # the weighted draw sees the same candidates in the same order as a scan.
    successors = {}
    for (first, second), count in bigrams_freq.items():
        next_words, weights = successors.setdefault(first, ([], []))
        next_words.append(second)
        weights.append(count)

    current_word = start_word
    tweet = [current_word]
    for _ in range(num_words):
        if current_word not in successors:
            # nothing ever follows the current word: there is no basis to continue
            break

        # the higher the bigram frequency, the more likely its second word is chosen
        next_words, weights = successors[current_word]
        next_word = random.choices(next_words, weights=weights, k=1)[0]

        if next_word == "</s>" and len(tweet) > num_words * 0.8:
            # end token reached and the tweet is long enough: stop here
            break

        tweet.append(next_word)
        current_word = next_word
    return join_tokens(tweet)

To generate tweets from trigrams the following choices are made:
- If the start words are not provided, the first word will be the start token and the second word will be chosen based on the frequency of the trigrams that start with the start token.
- If a token that is never followed by another token (other than the end token) in the corpus is chosen, the tweet generation will stop. We have no criteria to choose the next token in this case.
- If the next token is the end token and at least 80% of `num_words` have been generated, the tweet generation will stop.

In [111]:
def generate_tweets_from_trigrams(
        num_words: Optional[int] = 20,
        start_words: Optional[Tuple[str, Optional[str]]] = ("<s>", None)) -> str:
    """
    Generate a tweet by sampling a chain of words from the trigram frequencies.

    The last two tokens of the tweet select the candidate trigrams; the third
    word of one of them is drawn at random, weighted by trigram frequency, and
    appended. Generation stops when:
    - `num_words` words have been drawn, or
    - no trigram starts with the current pair of words, or
    - the end token "</s>" is drawn while the tweet already holds more than
      80% of `num_words` tokens (an earlier "</s>" is appended like any other
      token and generation continues).

    :param Optional[int] num_words: maximum number of words to draw after the two start words.
        `None` falls back to the default of 20.
    :param Optional[Tuple[str, Optional[str]]] start_words: the first two tokens of the tweet.
        If the second one is `None` it is drawn from the trigrams that start with the first
        one, weighted by their frequency. `None` falls back to the default ("<s>", None).

    :return: the generated tokens joined into a string by `join_tokens`. If the second word
        has to be drawn but no trigram starts with the first word, only the first word is returned.
    """
    # Both parameters are declared Optional, so treat None as "use the default".
    if num_words is None:
        num_words = 20
    if start_words is None:
        start_words = ("<s>", None)
    first_word, second_word = start_words[0], start_words[1]

    if second_word is None:
        # draw the second word from the trigrams that open with the first word
        openers = [trigram for trigram in trigrams_freq if trigram[0] == first_word]
        if not openers:
            # random.choices raises IndexError on an empty population; stop
            # gracefully instead, as the main loop does when it runs out of candidates
            return join_tokens([first_word])
        chosen = random.choices(openers, weights=[trigrams_freq[trigram] for trigram in openers], k=1)[0]
        second_word = chosen[1]

    # Index the continuations of every word pair once per call rather than
    # scanning the whole counter for each generated word. Iteration order is
    # preserved, so the weighted draw sees the same candidates in the same order.
    continuations = {}
    for trigram, count in trigrams_freq.items():
        next_words, weights = continuations.setdefault(trigram[:2], ([], []))
        next_words.append(trigram[2])
        weights.append(count)

    # Always a tuple, so it matches the index keys even if the caller passed a list.
    current_words = (first_word, second_word)
    tweet = list(current_words)

    for _ in range(num_words):
        if current_words not in continuations:
            # no trigram starts with the current pair: there is no basis to continue
            break

        # the higher the trigram frequency, the more likely its third word is chosen
        next_words, weights = continuations[current_words]
        next_word = random.choices(next_words, weights=weights, k=1)[0]

        if next_word == "</s>" and len(tweet) > num_words * 0.8:
            # end token reached and the tweet is long enough: stop here
            break

        tweet.append(next_word)
        current_words = (current_words[1], next_word)
    return join_tokens(tweet)

## 5. Test the tweet generation

In [112]:
# Sample one tweet from the bigram model, drawing at most 20 words.
# The random generator is not seeded in this cell, so each run prints a different tweet.
test_tweet_bigram: str = generate_tweets_from_bigrams(num_words=20)
print(test_tweet_bigram)

@N_R_Mandela: //t.co/lJcHG0IHaM via @CheriJacobus. Plus $ 3.9 billion (or fired? I wear a loser


In [113]:
# Sample one tweet from the trigram model, drawing at most 20 words.
# The random generator is not seeded in this cell, so each run prints a different tweet.
test_tweet_trigram: str = generate_tweets_from_trigrams(num_words=20)
print(test_tweet_trigram)

@pzarrella21: @realDonaldTrump: It should be out of the money raised by the people that got them there
