In [1]:
import re
import nltk
import heapq
import random
import pandas as pd
from collections import Counter

# Fetch the Punkt data that nltk.word_tokenize relies on; nothing is
# re-downloaded when the package is already up to date.
nltk.download('punkt')

# Special tokens shared by the tokenizer, the bigram model and the generator.
UNKNOWN = "<UNK>"  # stands in for words filtered out of the vocabulary
START_TOKEN = "<s>"  # prepended to every sentence before bigram extraction
END_TOKEN = "</s>"  # appended to every sentence; generation stops on it
NUMBER = "NUM"  # placeholder substituted for every run of digits

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tegua\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def get_corpus(path='tweets.csv', column='content'):
    """Load one column of a tweets CSV file.

    Args:
        path: Location of the CSV file. Defaults to 'tweets.csv' in the
            current working directory.
        column: Name of the column to return. Defaults to 'content'.

    Returns:
        pandas.Series holding the requested column.

    Raises:
        KeyError: if ``column`` is not a column of the file.
    """
    tweets = pd.read_csv(path)[column]
    # The label says "shape", but the value printed is Series.size (the
    # number of entries). The wording is kept so recorded output still matches.
    print('The shape of tweets: {}'.format(tweets.size))
    return tweets

In [3]:
# Load the tweet texts; get_corpus also prints how many entries were read.
tweets_corpus = get_corpus()

The shape of tweets: 41122


In [4]:
# Preview the first five tweets to sanity-check the load.
tweets_corpus.head(5)

0    Be sure to tune in and watch Donald Trump on L...
1    Donald Trump will be appearing on The View tom...
2    Donald Trump reads Top Ten Financial Tips on L...
3    New Blog Post: Celebrity Apprentice Finale and...
4    "My persona will never be that of a wallflower...
Name: content, dtype: object

In [5]:
def tokenize_sentences(tweets):
    """Normalise tweets and split them into tokenized sentences.

    Each tweet is lower-cased, stripped of URLs and of every character other
    than ASCII letters, digits, '.', '?', '!' and space. Runs of digits are
    replaced with NUMBER. '!' is treated like '.', and the text is split into
    sentences on '.'. A '?' is kept in the text but does not end a sentence.
    Each sentence is tokenized with nltk.word_tokenize; sentences that yield
    no tokens are dropped.

    Args:
        tweets: Iterable of tweet texts. Entries that are not strings (for
            example NaN from a CSV with missing values) are skipped.

    Returns:
        A tuple ``(tokenized_sentences, all_words)`` where
        ``tokenized_sentences`` is a list of token lists and ``all_words`` is
        the flat list of every token in order of appearance.
    """
    tokenized_sentences = []
    all_words = []

    for tweet in tweets:
        # Missing values arrive from pandas as float NaN and carry no text.
        if not isinstance(tweet, str):
            continue

        text = tweet.lower()
        text = re.sub(r'http\S+', '', text)
        # Convert newlines/tabs to spaces BEFORE the character filter below;
        # otherwise the filter deletes them and glues neighbouring words.
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r"[^a-zA-Z0-9.?! ]+", "", text)
        text = re.sub(r"[0-9]+", NUMBER, text)
        text = text.replace("!", ".")

        for sentence in text.split("."):
            tokens = nltk.word_tokenize(sentence)
            if tokens:
                tokenized_sentences.append(tokens)
                all_words.extend(tokens)

    return tokenized_sentences, all_words

In [6]:
# Tokenize the whole corpus; the cell displays the number of sentences found.
tokenized_sentences, all_words = tokenize_sentences(tweets_corpus)
len(tokenized_sentences)

90884

In [7]:
def get_filtered_unigrams(all_words):
    """Count tokens and keep only those that occur more than twice.

    Args:
        all_words: Iterable of tokens.

    Returns:
        dict mapping each retained token to its occurrence count.
    """
    kept = {}
    for token, occurrences in Counter(all_words).items():
        # Tokens seen once or twice are dropped from the vocabulary.
        if occurrences > 2:
            kept[token] = occurrences
    return kept

In [8]:
# Vocabulary with counts: only words seen more than twice are kept.
unigrams = get_filtered_unigrams(all_words)

In [9]:
def replace_with_unk(sentences, unigrams):
    """Overwrite every out-of-vocabulary token with UNKNOWN.

    The token lists inside ``sentences`` are modified in place.

    Args:
        sentences: List of token lists.
        unigrams: Container of known tokens (membership is tested with ``in``).

    Returns:
        The same ``sentences`` object that was passed in.
    """
    for tokens in sentences:
        for position, token in enumerate(tokens):
            if token not in unigrams:
                tokens[position] = UNKNOWN
    return sentences

In [10]:
# replace_with_unk rewrites the token lists in place and returns the same
# object, so the original out-of-vocabulary tokens are gone after this cell.
tokenized_sentences = replace_with_unk(tokenized_sentences, unigrams)

In [11]:
def generate_bigrams(sentences):
    """Count adjacent token pairs across all sentences.

    Every sentence is wrapped in START_TOKEN / END_TOKEN before pairs are
    taken, so sentence-initial and sentence-final transitions are counted.

    Args:
        sentences: Iterable of token lists.

    Returns:
        dict mapping ``(first, second)`` tuples to their occurrence count.
    """
    pair_counts = Counter()
    for tokens in sentences:
        padded = [START_TOKEN] + tokens + [END_TOKEN]
        # zip of the sequence with itself shifted by one yields each adjacent pair.
        pair_counts.update(zip(padded, padded[1:]))
    return dict(pair_counts)

In [12]:
# Bigram counts over the UNK-substituted sentences.
bigrams = generate_bigrams(tokenized_sentences)

In [13]:
def estimate_probability(word, prev, unigrams, bigrams, k=1.0, vocabulary_size=None):
    """Add-k smoothed estimate of P(word | prev) from bigram counts.

    Computes ``(count(prev, word) + k) / (count(prev) + k * V)`` where ``V``
    is the vocabulary size. Using the vocabulary size (rather than the number
    of distinct bigrams) is what makes the estimates sum to 1 over the
    vocabulary whenever ``count(prev)`` equals the total count of bigrams
    that start with ``prev``.

    Args:
        word: Candidate next token.
        prev: Preceding token.
        unigrams: dict mapping token -> count. A token that is not a key
            (which includes the special tokens when they are not counted
            there) is treated as having count 0.
        bigrams: dict mapping ``(prev, word)`` tuples -> count.
        k: Smoothing constant added to every bigram count. Defaults to 1.0.
        vocabulary_size: Number of distinct tokens that can follow ``prev``.
            Defaults to ``len(unigrams) + 3``.

    Returns:
        The smoothed probability as a float.
    """
    if vocabulary_size is None:
        # get_probabilities scores every unigram key plus the start, end and
        # unknown tokens, hence the three extra vocabulary entries.
        vocabulary_size = len(unigrams) + 3

    prev_count = unigrams.get(prev, 0)
    ngram_count = bigrams.get((prev, word), 0)

    return (ngram_count + k) / (prev_count + k * vocabulary_size)

In [14]:
def get_probabilities(prev, unigrams, bigrams):
    """Score every vocabulary token as a continuation of ``prev``.

    The vocabulary is every key of ``unigrams`` followed by START_TOKEN,
    END_TOKEN and UNKNOWN.

    Args:
        prev: Preceding token.
        unigrams: dict mapping token -> count.
        bigrams: dict mapping ``(prev, word)`` tuples -> count.

    Returns:
        dict mapping each vocabulary token to the value returned by
        estimate_probability for it.
    """
    candidates = [*unigrams.keys(), START_TOKEN, END_TOKEN, UNKNOWN]
    return {
        candidate: estimate_probability(candidate, prev, unigrams, bigrams)
        for candidate in candidates
    }

In [15]:
def suggest_word(prev_token, unigrams, bigrams):
    """Return the most probable token to follow ``prev_token``.

    When several tokens share the highest probability, the first one in
    vocabulary order wins.

    Args:
        prev_token: Preceding token.
        unigrams: dict mapping token -> count.
        bigrams: dict mapping ``(prev, word)`` tuples -> count.

    Returns:
        The best-scoring token, or None when no token scores above zero.
    """
    best_word, best_prob = None, 0
    scored = get_probabilities(prev_token, unigrams, bigrams)

    for candidate, score in scored.items():
        if not (score > best_prob):
            continue
        best_word, best_prob = candidate, score

    return best_word

In [16]:
def suggest_n_words(prev_token, unigrams, bigrams, n):
    """Pick, uniformly at random, one of the ``n`` likeliest next tokens.

    UNKNOWN and START_TOKEN are never suggested: neither is a usable word in
    generated text. When fewer than ``n`` continuations were actually
    observed, the remaining slots are filled by equally-scored unseen tokens
    in vocabulary order.

    Args:
        prev_token: Preceding token.
        unigrams: dict mapping token -> count.
        bigrams: dict mapping ``(prev, word)`` tuples -> count.
        n: How many top-scoring candidates to choose among (at least 1).

    Returns:
        One token drawn at random from the top ``n`` candidates.

    Raises:
        ValueError: if ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")

    probs = get_probabilities(prev_token, unigrams, bigrams)
    excluded = (UNKNOWN, START_TOKEN)
    candidates = [
        (word, prob) for word, prob in probs.items() if word not in excluded
    ]

    # Rank (word, prob) pairs rather than keying a dict by probability, so
    # words with identical scores cannot overwrite one another.
    top = heapq.nlargest(n, candidates, key=lambda pair: pair[1])

    return random.choice([word for word, _ in top])

In [17]:
def format_to_sentence(generated_list):
    """Join tokens into a sentence string.

    The first token is passed through ``str.capitalize`` and a trailing '.'
    is appended. The input list is left untouched.

    Args:
        generated_list: Sequence of token strings.

    Returns:
        The tokens joined by single spaces and followed by '.'; an empty
        input yields '.'.
    """
    # Work on a copy so the caller's list is not modified as a side effect.
    words = list(generated_list)
    if words:
        words[0] = words[0].capitalize()
    return " ".join(words) + "."

In [18]:
def generate_sentence(initial_word, max_length=100):
    """Generate a list of words by walking the bigram model.

    Starting from ``initial_word``, each next token is drawn with
    suggest_n_words (using the module-level ``unigrams`` and ``bigrams``)
    until END_TOKEN is produced or ``max_length`` words have been emitted.
    Number placeholders are rendered as random values in the output.

    Args:
        initial_word: Token to start the sentence with.
        max_length: Upper bound on the number of words generated, guarding
            against a walk that never reaches END_TOKEN. Defaults to 100.

    Returns:
        List of generated word strings (END_TOKEN is not included).
    """
    token = initial_word
    generated = []

    while token != END_TOKEN and len(generated) < max_length:
        if token == NUMBER:
            rendered = str(random.randint(1, 3000))
        elif token == "NUMpm":
            rendered = str(random.randint(1, 12)) + "pm"
        else:
            rendered = token
        generated.append(rendered)
        # Condition the next draw on the model token, not on the rendered
        # digits: the rendered number never occurs in the bigram table.
        token = suggest_n_words(token, unigrams, bigrams, 5)

    return generated

In [19]:
# Generate one sentence seeded with 'i'. No random seed is set and word
# choice uses the random module, so the output can differ between runs.
generated = generate_sentence('i')
format_to_sentence(generated)

'I am.'