In [1]:
import pandas as pd
import numpy as np

In [4]:
from string import punctuation
from itertools import chain
from collections import Counter

class Cover:
    """Encode a text corpus as lists of integer token ids.

    Documents are tokenised by splitting on single spaces (no lowercasing or
    punctuation stripping). Tokens that occur at least
    ``min_occurrence_count`` times across the corpus receive their own id,
    assigned in order of first appearance starting at 1. Every rarer token is
    encoded as ``UNKNOWN_TOKEN_ID``.
    """

    # Id shared by all tokens below the occurrence threshold. Real tokens are
    # numbered from 1 so they can never collide with it.
    UNKNOWN_TOKEN_ID = 0

    def __init__(self, window_size=5, min_occurrence_count=1):
        """Create an empty encoder.

        Args:
            window_size: Stored on the instance; no method in this class
                reads it yet (presumably meant for the co-occurrence step).
            min_occurrence_count: Minimum corpus-wide count a token needs in
                order to get its own id.
        """
        self.token_to_id = {}
        self.window_size = window_size
        self.min_occurrence_count = min_occurrence_count
        self.transformed_data = []
        self.corpus = None

    def import_data(self, filename, column_name):
        """Load one column of a CSV file as the corpus.

        The file is read as latin-1 and every cell of ``column_name`` is
        converted to ``str``. Prints the document count and the first
        document.

        Args:
            filename: Path of the CSV file to read.
            column_name: Name of the column holding the documents.
        """
        data_frame = pd.read_csv(filename, encoding='latin-1')
        self.corpus = data_frame[column_name].astype(str).tolist()
        print("Corpus has {} documents".format(len(self.corpus)))
        print(self.corpus[0])

    def _get_or_set_token_to_id(self, word):
        """Return the id of ``word``, registering a new id on first sight.

        New ids are ``len(self.token_to_id) + 1`` so that 0 stays reserved
        for ``UNKNOWN_TOKEN_ID``.
        """
        try:
            return self.token_to_id[word]
        except KeyError:
            idx = len(self.token_to_id) + 1
            self.token_to_id[word] = idx
            return idx

    def fit_transform(self, corpus=None):
        """Build the vocabulary and encode the corpus as token ids.

        Args:
            corpus: Optional iterable of document strings. When given it
                replaces ``self.corpus``; when omitted the corpus loaded by
                ``import_data`` is used.

        Returns:
            The encoded corpus (also stored in ``self.transformed_data``): one
            list of ids per document. Returns ``None`` after printing a hint
            when no corpus is available.
        """
        if corpus is not None:
            self.corpus = list(corpus)

        if self.corpus is None:
            print("Please load corpus first!!")
            return None

        # Must be a list, not a generator: it is iterated twice (once to
        # count, once to encode) and a generator would be exhausted after
        # the counting pass, leaving the encoded corpus empty.
        tokenised_documents = [document.split(' ') for document in self.corpus]
        print("Done tokenising")

        token_counts = Counter(chain.from_iterable(tokenised_documents))
        frequent_tokens = {
            token
            for token, count in token_counts.items()
            if count >= self.min_occurrence_count
        }

        print("print created word occurs")

        self.transformed_data = [
            [
                self._get_or_set_token_to_id(word)
                if word in frequent_tokens
                else self.UNKNOWN_TOKEN_ID
                for word in sentence
            ]
            for sentence in tokenised_documents
        ]

        print("Corpus has {} documents".format(len(self.transformed_data)))
        return self.transformed_data

    def build_cooccur_matrix(self):
        """Unfinished stub for the co-occurrence matrix; returns ``None``.

        Only creates two empty placeholders and discards them.
        """
        # TODO: not implemented yet -- presumably this should be filled from
        # self.transformed_data using self.window_size; confirm the design.
        ij_list = []
        # np.fromiter requires an explicit dtype; omitting it raises TypeError.
        cooccur_matrix = np.fromiter((), dtype=np.int64)
        

In [5]:
import time
# Time how long it takes to load the lyrics CSV and encode it as token ids.
cover = Cover()
column_name = 'lyrics'
filename = '/opt/training/data/raw/lyrics.csv'

start_time = time.time()
cover.import_data(filename, column_name)  # prints corpus size and first document
cover.fit_transform()                     # result is stored on cover.transformed_data
end_time = time.time()

print("Time taken is {}".format(end_time - start_time))

Corpus has {} documents 362237
Oh baby, how you doing?
You know I'm gonna cut right to the chase
Some women were made but me, myself
I like to think that I was created for a special purpose
You know, what's more special than you? You feel me
It's on baby, let's get lost
You don't need to call into work 'cause you're the boss
For real, want you to show me how you feel
I consider myself lucky, that's a big deal
Why? Well, you got the key to my heart
But you ain't gonna need it, I'd rather you open up my body
And show me secrets, you didn't know was inside
No need for me to lie
It's too big, it's too wide
It's too strong, it won't fit
It's too much, it's too tough
He talk like this 'cause he can back it up
He got a big ego, such a huge ego
I love his big ego, it's too much
He walk like this 'cause he can back it up
Usually I'm humble, right now I don't choose
You can leave with me or you could have the blues
Some call it arrogant, I call it confident
You decide when you find on what I'm w

In [None]:
from nltk.corpus import gutenberg

# Encode Shakespeare's Macbeth (NLTK Gutenberg corpus) and time the run.
cover = Cover()
texts = gutenberg.sents('shakespeare-macbeth.txt')
# gutenberg.sents yields lists of words; Cover splits documents on single
# spaces, so re-join each sentence with spaces first.
sentences = [" ".join(list_of_words) for list_of_words in texts]

# Cover.fit_transform() reads the corpus from the instance, so hand the
# sentences over via the `corpus` attribute instead of as a call argument.
cover.corpus = sentences

start_time = time.time()
cover.fit_transform()
end_time = time.time()

# The encoded documents are stored on the instance.
data = cover.transformed_data
print("Time taken is {}".format(end_time-start_time))
print(data[1000])