In [165]:
from nltk import ngrams
import pandas as pd
from typing import List, Tuple, Optional
from nltk.tokenize import word_tokenize
from collections import Counter
import random

## 1. Load the data and retrieve tweets tokens

In [166]:
df = pd.read_csv('resources/tweets.csv')
tweets_texts: List[str] = df['text'].tolist()
tweets_tokens = [word_tokenize(tweet) for tweet in tweets_texts]
# add start and end tokens to each tweet
tweets_tokens = [['<s>'] + tweet + ['</s>'] for tweet in tweets_tokens]
# merge all tokens into one list
tokens = [token for tweet in tweets_tokens for token in tweet]

print(tweets_tokens[0])

['<s>', 'LOSER', '!', 'https', ':', '//t.co/p5imhMJqS1', '</s>']


## 2. Generate n-grams

In [167]:
bigrams = list(ngrams(tokens, 2))
trigrams = list(ngrams(tokens, 3))

## 3. Count n-grams frequencies

In [168]:
bigrams_freq = Counter(bigrams)
trigrams_freq = Counter(trigrams)

## 4. Generate tweets

In [169]:
def generate_tweets_from_bigrams(num_words: int = 20, start_word: Optional[str] = "<s>") -> str:
    current_word = start_word
    tweet = [current_word]
    for _ in range(num_words):
        # get all bigrams that start with the current word
        candidates = [bigram for bigram in bigrams_freq if bigram[0] == current_word]
        if len(candidates) == 0:
            # if there are no bigrams which start with current word we stop generating the tweet
            break
        # get next bigram based on frequency, the higher the frequency the more likely the bigram will be chosen
        next_bigram = random.choices(candidates, weights=[bigrams_freq[bigram] for bigram in candidates], k=1)[0]
        next_word = next_bigram[1]
        if next_word == "</s>":
            # if the next word is the end token we stop generating the tweet
            break
        tweet.append(next_word)
        current_word = next_word
    return ' '.join(tweet).removeprefix('<s>')

In [170]:
def generate_tweets_from_trigrams(start_words: Tuple[str, str], num_words: int = 20) -> str:
    current_words = start_words
    tweet = list(current_words)
    for _ in range(num_words):
        # get all trigrams that start with the current words
        candidates = [trigram for trigram in trigrams_freq if trigram[:2] == current_words]
        if len(candidates) == 0:
            # if there are no trigrams which start with current word we stop generating the tweet
            break
        # get next trigram based on frequency, the higher the frequency the more likely the trigram will be chosen
        next_trigram = random.choices(candidates, weights=[trigrams_freq[trigram] for trigram in candidates], k=1)[0]
        next_word = next_trigram[2]
        tweet.append(next_word)
        current_words = (current_words[1], next_word)
    return ' '.join(tweet)

In [174]:
test_tweet_bigram = generate_tweets_from_bigrams(15)
print(test_tweet_bigram)

 Don than to fire @ SeanMcQuade : Show me someone tweets that all haters (


In [172]:
test_tweet_trigram = generate_tweets_from_trigrams(('Fake', 'News'), 15)
print(test_tweet_trigram)

Fake News of big ratings loser CNN . https : //t.co/fxZh8zNhks </s> <s> The Fake News of
