In [20]:
import random
from collections import Counter
from typing import List, Mapping, Optional, Tuple

import pandas as pd
from nltk import ngrams
from nltk.tokenize import word_tokenize

## 1. Load the data and tokenize the tweets

In [21]:
# Read the tweets CSV and pull out the contents of its 'text' column.
df = pd.read_csv('resources/tweets.csv')
tweets_texts: List[str] = df['text'].tolist()
# Tokenize every tweet on its own: one token list per tweet.
tweets_tokens = list(map(word_tokenize, tweets_texts))
# Flatten the per-tweet token lists into a single token stream.
tokens = [tok for tweet_toks in tweets_tokens for tok in tweet_toks]


## 2. Generate n-grams

In [22]:
# Materialise every window of 2 and of 3 consecutive tokens.
# `tokens` is the flattened stream of all tweets, so a window may span
# the end of one tweet and the start of the next.
bigrams, trigrams = (list(ngrams(tokens, size)) for size in (2, 3))

## 3. Count n-grams frequencies

In [23]:
# Map each distinct n-gram tuple to the number of times it occurs.
bigrams_freq, trigrams_freq = Counter(bigrams), Counter(trigrams)

## 4. Generate tweets

In [24]:
def generate_tweets_from_bigrams(
    start_word: str,
    num_words: int = 20,
    bigram_freq: Optional[Mapping[Tuple[str, str], int]] = None,
    rng: Optional[random.Random] = None,
) -> str:
    """Generate one tweet by walking a frequency-weighted bigram chain.

    Starting from ``start_word``, the next word is repeatedly sampled from
    the bigrams whose first element is the current word, with probability
    proportional to each bigram's count.

    Args:
        start_word: First token of the tweet; always present in the result.
        num_words: Maximum number of tokens to append after ``start_word``
            (the result therefore holds at most ``num_words + 1`` tokens).
        bigram_freq: Mapping of ``(first, second)`` bigrams to their counts.
            When omitted, the notebook-level ``bigrams_freq`` is used.
        rng: Optional ``random.Random`` instance; pass a seeded one to get
            reproducible output. When omitted, the module-level ``random``
            generator is used.

    Returns:
        The generated tokens joined with single spaces. Generation stops
        early when the current word never appears as the first element of
        any bigram.
    """
    # Fall back to the notebook-level counts so existing calls keep working.
    freq = bigrams_freq if bigram_freq is None else bigram_freq
    # Both the `random` module and `random.Random` instances expose `choices`.
    sampler = random if rng is None else rng

    # Index the successors of every word once, instead of rescanning all
    # bigrams for each generated word. Insertion order of `freq` is kept, so
    # for a given RNG state the same word is picked as with a linear scan.
    successors = {}
    for (first, second), count in freq.items():
        next_words, weights = successors.setdefault(first, ([], []))
        next_words.append(second)
        weights.append(count)

    current_word = start_word
    tweet = [current_word]
    for _ in range(num_words):
        if current_word not in successors:
            # No bigram starts with the current word: the chain cannot continue.
            break
        next_words, weights = successors[current_word]
        # Frequency-weighted draw: more frequent bigrams are more likely.
        current_word = sampler.choices(next_words, weights=weights, k=1)[0]
        tweet.append(current_word)
    return ' '.join(tweet)

In [33]:
# Sample one tweet that starts with 'I' and appends at most 15 more tokens.
# Sampling is random, so the printed text changes from run to run.
test_tweet = generate_tweets_from_bigrams(start_word='I', num_words=15)
print(test_tweet)

I 'll show you do with them '' Happy 4th of fate . I barely know
