In [128]:
from nltk import ngrams
import pandas as pd
from typing import List, Tuple, Optional, Set
from nltk.tokenize import word_tokenize
from collections import Counter
import random

## 1. Load the data and retrieve tweet tokens

In [129]:
# Clean the raw CSV before parsing it.
# Some tweets open with a double quote that is never closed, which makes the CSV
# parser merge the tweet text with the columns that follow it. Dropping every
# double quote from every line avoids that.
with open('resources/tweets.csv', 'r', encoding="utf8") as f:
    tweets = [line.replace('"', '') for line in f]

# Write the cleaned lines to a new file, in their original order.
with open('resources/tweets_clean.csv', 'w', encoding="utf8") as f:
    f.writelines(tweets)

In [130]:
# Parse the cleaned CSV and keep the raw text of every tweet.
df = pd.read_csv('resources/tweets_clean.csv')
tweets_texts: List[str] = df['text'].tolist()

# Tokenize each tweet and wrap it in the start (<s>) and end (</s>) markers in one pass.
tweets_tokens = [['<s>', *word_tokenize(text), '</s>'] for text in tweets_texts]

# Flatten the per-tweet token lists into one continuous token stream.
tokens = [tok for sentence in tweets_tokens for tok in sentence]

# Quick sanity check: show the tokens of the first tweet.
print(tweets_tokens[0])

['<s>', 'LOSER', '!', 'https', ':', '//t.co/p5imhMJqS1', '</s>']


In [131]:
# Dump the tokenized tweets to disk: one tweet per line, tokens separated by single spaces.
with open('resources/tweets_tokens.txt', 'w', encoding="utf8") as f:
    f.writelines(' '.join(tweet_tokens) + '\n' for tweet_tokens in tweets_tokens)

## 2. Generate n-grams

In [132]:
# Slide windows of size 2 and 3 over the flat token stream. Because `tokens` is the
# concatenation of all tweets, some n-grams span the </s> / <s> boundary between tweets.
bigrams, trigrams = (list(ngrams(tokens, size)) for size in (2, 3))

## 3. Count n-gram frequencies

In [133]:
# Map every distinct n-gram (a tuple of tokens) to the number of times it occurs.
bigrams_freq, trigrams_freq = map(Counter, (bigrams, trigrams))

## 4. Generate tweets

In [134]:
# Tokens that are glued to the text before them (no space is inserted in front).
punctuation: Set = {".", ",", "!", "?", ")", "]", "}", ":", ";", "'", "\"", "’", "‘",
               "”", "–", "—", "…", "•", "·", "``", "''"}

def join_tokens(tweet_tokens: List[str]) -> str:
    """
    Rebuild a readable string from a list of tokens.

    :param List[str] tweet_tokens: tokens to join, possibly including the "<s>" and "</s>" markers.

    :return: the joined text with the "<s>" / "</s>" markers removed and outer whitespace stripped.
    """
    # Tokens that are preceded by a space but glued to the token that follows them.
    openers = ("@", "(", "#", "“")
    pieces = []
    glue_next = False
    for tok in tweet_tokens:
        if tok in punctuation:
            # Attach directly to the previous text; the pending glue flag is left untouched.
            pieces.append(tok)
            continue
        if tok in openers:
            pieces.append(" " + tok)
            glue_next = True
            continue
        # Ordinary token: separated by a space unless an opener came just before it.
        pieces.append(tok if glue_next else " " + tok)
        glue_next = False
    text = "".join(pieces)
    # Markers are removed from the final string; the spaces placed around them are kept.
    for marker in ("<s>", "</s>"):
        text = text.replace(marker, "")
    return text.strip()

In [135]:
def generate_tweets_from_bigrams(num_words: Optional[int] = 20, start_word: Optional[str] = "<s>") -> str:
    """
    Generate a tweet by repeatedly sampling the next word from the bigram frequencies.

    :param Optional[int] num_words: maximum number of sampling steps, i.e. words appended after the start word. ``None`` falls back to the default of 20.
    :param Optional[str] start_word: first token of the tweet. ``None`` falls back to the start token "<s>".

    :return: the generated tokens joined into a string by `join_tokens`.
    """
    # Both parameters are declared Optional, so map None to the documented defaults
    # instead of failing later in range() / string concatenation.
    if num_words is None:
        num_words = 20
    if start_word is None:
        start_word = "<s>"

    # Index the successors of every word once per call instead of rescanning all bigrams
    # at every step. Insertion order of `bigrams_freq` is preserved, so for a given random
    # seed the sampled words are the same as with a full scan.
    successors = {}
    for (first, second), count in bigrams_freq.items():
        words, weights = successors.setdefault(first, ([], []))
        words.append(second)
        weights.append(count)

    current_word = start_word
    tweet = [current_word]
    for _ in range(num_words):
        if current_word not in successors:
            # no bigram starts with the current word: stop generating the tweet
            break
        words, weights = successors[current_word]
        # the higher the bigram frequency, the more likely its second word is chosen
        next_word = random.choices(words, weights=weights, k=1)[0]
        if next_word == "</s>" and len(tweet) > num_words * 0.8:
            # an end token drawn once the tweet is more than 80% of the requested length ends it;
            # earlier end tokens are kept and generation continues
            break
        tweet.append(next_word)
        current_word = next_word
    return join_tokens(tweet)

In [136]:
def generate_tweets_from_trigrams(
        num_words: Optional[int] = 20,
        start_words: Optional[Tuple[str, str]] = ("<s>", None)) -> str:
    """
    Generate a tweet by repeatedly sampling the next word from the trigram frequencies.

    :param Optional[int] num_words: maximum number of sampling steps, i.e. words appended after the two start words. ``None`` falls back to the default of 20.
    :param Optional[Tuple[str,str]] start_words: the first two words of the tweet (any two-item sequence is accepted). The default ("<s>", None) starts from the start token and samples the second word from the trigrams that begin with it, weighted by frequency. ``None`` falls back to that default.

    :return: the generated tokens joined into a string by `join_tokens`. If the second word must be sampled but no trigram starts with the first word, only the first word is returned.
    """
    # Both parameters are declared Optional, so map None to the documented defaults.
    if num_words is None:
        num_words = 20
    if start_words is None:
        start_words = ("<s>", None)
    # Trigram prefixes are tuples; normalising here lets a list such as ["I", "am"] match them.
    start_words = tuple(start_words)

    if start_words[1] is None:
        # the second word is not provided: sample it from the trigrams that start with the first word
        second_words, second_weights = [], []
        for trigram, count in trigrams_freq.items():
            if trigram[0] == start_words[0]:
                second_words.append(trigram[1])
                second_weights.append(count)
        if not second_words:
            # random.choices cannot sample from an empty population; stop early instead of raising
            return join_tokens([start_words[0]])
        start_words = (start_words[0], random.choices(second_words, weights=second_weights, k=1)[0])

    # Index the continuations of every two-word prefix once per call instead of rescanning all
    # trigrams at every step. Insertion order of `trigrams_freq` is preserved, so for a given
    # random seed the sampled words are the same as with a full scan.
    continuations = {}
    for trigram, count in trigrams_freq.items():
        words, weights = continuations.setdefault(trigram[:2], ([], []))
        words.append(trigram[2])
        weights.append(count)

    current_words = start_words
    tweet = list(current_words)
    for _ in range(num_words):
        entry = continuations.get(current_words)
        if entry is None:
            # no trigram starts with the current two words: stop generating the tweet
            break
        words, weights = entry
        # the higher the trigram frequency, the more likely its third word is chosen
        next_word = random.choices(words, weights=weights, k=1)[0]
        if next_word == "</s>" and len(tweet) > num_words * 0.8:
            # an end token drawn once the tweet is more than 80% of the requested length ends it;
            # earlier end tokens are kept and generation continues
            break
        tweet.append(next_word)
        current_words = (current_words[1], next_word)
    return join_tokens(tweet)

In [137]:
# Sample a long text from the bigram model (at most 150 sampling steps).
# Sampling is random, so the text differs between runs unless the RNG is seeded.
test_tweet_bigram = generate_tweets_from_bigrams(num_words=150)
print(test_tweet_bigram)

@krauthammer is now dismissed or in the beginning have nothing but my friend @realDonaldTrump @BridgetGonzale3: //t.co/mbnmf8D8jI   @RepMattGaetz: @politico is a huge defeat by these clowns and yet Fox should be gone!   Why does n't believe in politics yet they still a very bright and @realDonaldTrump who have it loser!  . is firing sleepy eyes Chuck Todd in Vietnam) is the planet Donald Trump I notice you can still follow loser-boredom without being in months and Miss U.S.A. and winners for our true!   Mini Mike Bloomberg called many years along with 400 million dollars & amp; haters would never called his seat. Most people who fraudulently made up Trump from other reason they throw towards you keep tweeting the FBI. I agree - a bully a man I


In [138]:
# Sample a long text from the trigram model (at most 250 sampling steps).
# Sampling is random, so the text differs between runs unless the RNG is seeded.
test_tweet_trigram = generate_tweets_from_trigrams(num_words=250)
print(test_tweet_trigram)

Every time I speak of the highest -and you all know it! Always respect FIGHTERS over overrated loser POLITITIANS!!!   Losers and haterseven you as low and dumb as you are jealous of his experience-unlike the haters and losers like @RepSwalwell (who got ZERO as presidential candidate before quitting) Pramila Jayapal David Cicilline and others who are Radical Left Story about Doral bedbugs but Bret Stephens is loaded up with a wonderful family. Michael is a stone cold loser who made up stories in order to sell the @UnionLeader. It 's a loser.   What my father really gave me is a stone cold loser who hates Michael a fine person with a wonderful family. Michael is a loser who deserves her comeuppance. @Lord_Sugar .... but you wouldn’ t. Like it or not it’ s a winner. EVERYONE knows that.Some LOSERS do n't) say I never called my friend @HowardStern a loser- he’ s phony lawsuit against Trump U was decimated by the RINO losers of the highest -and you all know it!   Sorry losers and haters nev