In [107]:
from nltk.tokenize import RegexpTokenizer
from nltk import ngrams, FreqDist
from nltk.corpus import stopwords
from pprint import pprint

In [118]:
#read file line by line
def read_lines(filename):
    with open(filename,'r') as fin:
        return fin.readlines()

In [49]:
# group lines of one verse together
def group_lines(lines):
    songs = []
    index_of_last_new_line = -1
    for i,line in enumerate(lines):
        if line == '\n':
            songs.append(' '.join(lines[(index_of_last_new_line+1):i]))
            index_of_last_new_line = i
    return songs

In [50]:
lines = read_lines('data/kanye_verses.txt')

In [51]:
songs = group_lines(lines)

In [52]:
songs = [s.replace('\n','') for s in songs]

In [53]:
def tokenize_song(song):
    tokenizer = RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(song)

In [89]:
tokenized_songs = [tokenize_song(s) for s in songs]

In [90]:
len(tokenized_songs)

363

In [101]:
def freq_counter(song,ngram_size,most_common_size):
    most_common_ngrams = FreqDist(ngrams(song, ngram_size)).most_common(most_common_size)
    most_common_ngrams = [(' '.join(tuple_),count) for tuple_,count in most_common_ngrams]
    return most_common_ngrams

In [113]:
def remove_stop_words(song):
    stop_words = set(stopwords.words('english'))
    return [w for w in song if not w in stop_words]

In [104]:
most_common_size = 5

In [119]:
songs_freq_record = {i:{n:freq_counter(song,n,most_common_size) for n in range(1,4)} for i,song in enumerate(tokenized_songs)}

`songs_freq_record` has most common words and phrases of each verse. Stopwords included

In [108]:
pprint(songs_freq_record)

{0: {1: [('the', 8), ('I', 8), ('in', 4), ('up', 3), ('like', 3)],
     2: [('I threw', 2),
         ('threw suicides', 2),
         ('suicides on', 2),
         ('on the', 2),
         ('I m', 2)],
     3: [('I threw suicides', 2),
         ('threw suicides on', 2),
         ('suicides on the', 2),
         ('Let the suicide', 1),
         ('the suicide doors', 1)]},
 1: {1: [('I', 7), ('my', 6), ('you', 5), ('What', 5), ('m', 4)],
     2: [('I m', 4),
         ('What s', 4),
         ('ain t', 2),
         ('Cause I', 2),
         ('She said', 1)],
     3: [('She said Ye', 1),
         ('said Ye can', 1),
         ('Ye can we', 1),
         ('can we get', 1),
         ('we get married', 1)]},
 2: {1: [('I', 14), ('a', 9), ('the', 8), ('at', 5), ('m', 5)],
     2: [('I m', 5),
         ('m talking', 2),
         ('get money', 2),
         ('I just', 2),
         ('he s', 2)],
     3: [('I m talking', 2),
         ('Break records at', 1),
         ('records at Louis', 1),
         ('at

           ('was in', 2)],
       3: [('realize I guess', 2),
           ('realize They got', 2),
           ('They got the', 2),
           ('When it feel', 1),
           ('it feel like', 1)]},
 136: {1: [('I', 8), ('the', 7), ('a', 4), ('s', 4), ('in', 3)],
       2: [('in the', 2),
           ('a new', 2),
           ('with her', 2),
           ('I m', 2),
           ('that s', 2)],
       3: [('I paid for', 1),
           ('paid for them', 1),
           ('for them titties', 1),
           ('them titties get', 1),
           ('titties get your', 1)]},
 137: {1: [('I', 9), ('as', 6), ('never', 4), ('ll', 3), ('was', 3)],
       2: [('I ll', 3),
           ('ll never', 3),
           ('never be', 2),
           ('black as', 2),
           ('was I', 2)],
       3: [('I ll never', 3),
           ('ll never be', 2),
           ('up You see', 2),
           ('You see how', 2),
           ('see how I', 2)]},
 138: {1: [('I', 6), ('the', 5), ('to', 4), ('up', 3), ('talk', 2)],
       2: [

           ('picture cut', 1)],
       3: [('When you take', 1),
           ('you take the', 1),
           ('take the picture', 1),
           ('the picture cut', 1),
           ('picture cut off', 1)]},
 269: {1: [('the', 6), ('I', 3), ('it', 3), ('over', 3), ('all', 3)],
       2: [('over and', 2),
           ('and over', 2),
           ('like a', 2),
           ('we need', 2),
           ('need something', 2)],
       3: [('over and over', 2),
           ('we need something', 2),
           ('Everybody saying please', 1),
           ('saying please don', 1),
           ('please don t', 1)]},
 270: {1: [('We', 5), ('I', 5), ('we', 4), ('t', 3), ('over', 3)],
       2: [('We like', 2),
           ('Except without', 2),
           ('without the', 2),
           ('had some', 2),
           ('We lost', 1)],
       3: [('Except without the', 2),
           ('We lost a', 1),
           ('lost a four', 1),
           ('a four leaf', 1),
           ('four leaf clover', 1)]},
 271: {1: [('I'

In [120]:
songs_freq_record_without_stop_words = {i: freq_counter(remove_stop_words(song),1,most_common_size) for i,song in enumerate(tokenized_songs)}

`songs_freq_record_without_stop_words` has most common word of each verse. Stopwords not included

In [117]:
songs_freq_record_without_stop_words

{0: [('I', 8), ('like', 3), ('level', 3), ('threw', 2), ('suicides', 2)],
 1: [('I', 7), ('What', 5), ('said', 2), ('married', 2), ('like', 2)],
 2: [('I', 14), ('talking', 5), ('home', 2), ('got', 2), ('business', 2)],
 3: [('I', 4), ('reputation', 4), ('round', 3), ('shit', 3), ('girl', 2)],
 4: [('I', 8), ('hey', 5), ('first', 4), ('remember', 3), ('But', 3)],
 5: [('I', 3), ('like', 2), ('No', 2), ('It', 2), ('something', 2)],
 6: [('I', 11), ('want', 5), ('get', 3), ('She', 3), ('even', 3)],
 7: [('I', 2), ('nigga', 2), ('racism', 2), ('That', 2), ('want', 2)],
 8: [('I', 20), ('know', 7), ('niggas', 7), ('new', 6), ('slaves', 6)],
 9: [('I', 12), ('wait', 4), ('And', 3), ('could', 3), ('need', 2)],
 10: [('And', 3), ('Before', 3), ('somebody', 2), ('first', 2), ('When', 2)],
 11: [('could', 3), ('somebody', 2), ('first', 2), ('tried', 2), ('And', 2)],
 12: [('He', 3), ('yeah', 3), ('get', 2), ('baby', 2), ('I', 2)],
 13: [('black', 11), ('I', 3), ('My', 2), ('see', 2), ('come', 2

In [121]:
flat_tokenzied_songs = [w for song in tokenized_songs for w in song]

In [123]:
len(flat_tokenzied_songs)

54380

In [128]:
freq_record_of_all_songs = {n:freq_counter(flat_tokenzied_songs,n,most_common_size) for n in range(1,4)}

`freq_record_of_all_songs` has most common words and phrases of all dataset. Stopwords included

In [125]:
freq_record_of_all_songs

{1: [('I', 2586), ('the', 1876), ('you', 1170), ('a', 998), ('to', 969)],
 2: [('I m', 523),
  ('don t', 262),
  ('ain t', 247),
  ('in the', 194),
  ('it s', 147)],
 3: [('I don t', 78),
  ('I ain t', 70),
  ('I can t', 42),
  ('you ain t', 34),
  ('don t know', 33)]}

In [132]:
freq_record_of_all_songs_without_stop_words = freq_counter(remove_stop_words(flat_tokenzied_songs),1,most_common_size)

`freq_record_of_all_songs_without_stop_words` has most common words of all dataset. Stopwords not included

In [133]:
freq_record_of_all_songs_without_stop_words

[('I', 2586), ('like', 480), ('And', 457), ('got', 327), ('know', 323)]