In [71]:
#Yields lists of words representing 1 line each
def file_to_words(filename):
    #Only alphabets and lowercase
    def clean_word(word):
        return "".join([c if (c.isalpha() or c == "'") else '' for c in word]).lower()
        
    #open file
    with open(filename) as infile:
        #Go through lines
        for line in infile:
            #Trim and split
            words = line.strip().split()
            #No [Singer] tags
            no_bracket_words = [word for word in words if word[0] != '[']
            
            #check if empty
            if not no_bracket_words:
                continue
            #Clean words
            cleaned_words = [clean_word(word) for word in no_bracket_words]
            #yield cleaned words
            for word in cleaned_words:
                yield word
            #yield newline
            yield '\n'

In [72]:
list(file_to_words('lyrics/Imagine-dragons-on-top-of-the-world-lyrics.txt'))[:10]

['', '\n', 'if', 'you', 'love', 'somebody', '\n', 'better', 'tell', 'them']

In [73]:
import os
def get_all_filenames(directory):
    #https://stackoverflow.com/questions/120656/directory-listing-in-python
    return os.listdir(directory)

In [74]:
get_all_filenames('lyrics')[:10]

['Imagine-dragons-leave-me-lyrics.txt',
 'Imagine-dragons-uptight-lyrics.txt',
 'Imagine-dragons-on-top-of-the-world-lyrics.txt',
 'Imagine-dragons-volume-drops-lyrics.txt',
 'Imagine-dragons-gold-lyrics.txt',
 'Imagine-dragons-amsterdam-lyrics.txt',
 'Imagine-dragons-february-lyrics.txt',
 'Imagine-dragons-battle-cry-lyrics.txt',
 'Imagine-dragons-pistol-whip-lyrics.txt',
 'Imagine-dragons-thief-lyrics.txt']

In [75]:
def words_to_bigrams(words):
    #Use zip to combine, shifted left, shifted right
    return zip(words[:-1], words[1:])

In [76]:
list(words_to_bigrams(['if', 'you', 'love', 'somebody', '\n', 'better', 'tell', 'them']))

[('if', 'you'),
 ('you', 'love'),
 ('love', 'somebody'),
 ('somebody', '\n'),
 ('\n', 'better'),
 ('better', 'tell'),
 ('tell', 'them')]

In [77]:
def add_bigrams_to_dict(d, bigrams):
    #go through bigrams
    for bigram in bigrams:
        #Extract words
        first, second = bigram
        #If first isnt there
        if first not in d:
            d[first] = {}
        #If second isnt there
        if second not in d[first]:
            d[first][second] = 0
        #Increment
        d[first][second] += 1
                

In [78]:
from random import randint
def random_weighted_key(d):
    #calculate total sum
    total_value = sum(d.values())
    rand_value = randint(0, total_value)
    #iterate through keys and values
    for k, v in d.items():
        #Update rand value
        rand_value -= v
        #See if we've reached the right number
        if rand_value <= 0:
            return k
        
    

In [82]:
def build_count_dict(directory):
    count_dict = {}
    for filename in get_all_filenames(directory):
        full_filename = directory + '/' + filename
        bigrams = words_to_bigrams(list(file_to_words(full_filename)))
        add_bigrams_to_dict(count_dict, bigrams)
    return count_dict

In [84]:
build_count_dict('lyrics')

{'': {'': 21,
  '\n': 208,
  'am': 9,
  'boys': 3,
  'chorus': 1,
  "i'm": 6,
  "it's": 1,
  'mirrors': 1,
  'nothing': 1,
  'rocks': 1,
  'take': 1,
  'they': 3,
  'whoa': 2},
 'halls': {'\n': 1},
 'under': {'\n': 5},
 'look': {'\n': 2,
  'and': 1,
  'at': 4,
  'back': 2,
  'clear': 5,
  'down': 8,
  'for': 1,
  'how': 9,
  'in': 5,
  'inside': 1,
  'into': 3,
  'look': 4,
  'on': 3,
  'out': 8,
  'so': 1,
  'that': 4,
  'the': 2,
  'through': 2,
  'tired': 1,
  'to': 2,
  'you': 3},
 'limited': {'\n': 2},
 'place': {'\n': 3, 'so': 3, 'that': 4, 'to': 1},
 'thunder': {'\n': 24, 'feel': 9, 'lost': 1, 'roars': 1, 'thunder': 6},
 'dug': {'your': 1},
 'night': {'\n': 31,
  'and': 1,
  'every': 33,
  'go': 2,
  "i'll": 9,
  'light': 1,
  'stands': 1,
  'too': 1,
  'with': 1},
 'mumble': {'\n': 1},
 'growing': {'old': 1, 'older': 1},
 'crystallized': {'\n': 1},
 'searching': {'\n': 1, 'to': 1},
 'led': {'me': 6},
 'lying': {'in': 1},
 'pane': {'\n': 2},
 'cries': {'from': 1},
 "king's": {'l

In [83]:

count_dict = build_count_dict('lyrics')

NUM_WORDS = 100
START_WORD = '\n'

cur_word = START_WORD
for i in range(NUM_WORDS):
    print(cur_word, end=" ")
    cur_word = random_weighted_key(count_dict[cur_word])


 don't look tired of a million opportunities 
 my yesterday 
 rock bottom of the reason 
 open up to be asked me love love you were the thunder feel it and i keep your head up hill from here we start 
 was only comes a fool 
 oh don't work i can 
 maybe i'm just levitate 
 for this is it on me and speak 
 everything 
 i am me voir très bien ouh la lala la 
 i will not enough 
 whenever i'm just one 
 and now no these eyes they keep 