In [1]:
import os
# Use one interpreter path for both PySpark environment settings.
PYTHON_EXECUTABLE = '/usr/bin/python3'
os.environ.update({
    'PYSPARK_PYTHON': PYTHON_EXECUTABLE,
    'PYSPARK_DRIVER_PYTHON': PYTHON_EXECUTABLE,
})

# Echo both settings back (driver first), one per line.
print(
    os.environ.get('PYSPARK_DRIVER_PYTHON'),
    os.environ.get('PYSPARK_PYTHON'),
    sep='\n',
)

/usr/bin/python3
/usr/bin/python3


In [2]:
import pandas as pd
import numpy as np
import pyspark as spark
from pyspark.sql import SparkSession

# Obtain a session named "Cover" via getOrCreate(); note that this rebinds
# the name `spark`, which the imports above also use as the pyspark alias.
spark = (
    SparkSession.builder
    .appName("Cover")
    .getOrCreate()
)

In [162]:
from string import punctuation
from itertools import chain
from collections import Counter
from pyspark.sql.types import StringType, ArrayType, StructType, StructField
from pyspark.sql.functions import udf, col, explode, monotonically_increasing_id 

class Cover:
    """Encode a text corpus as lists of integer token ids.

    Attributes:
        token_to_id: dict mapping each kept token to its integer id. Ids
            start at 1; ``UNKNOWN_TOKEN_ID`` (0) is reserved for tokens
            filtered out by ``min_occurrence_count``.
        window_size: stored at construction; no method in this class reads
            it yet.
        min_occurrence_count: minimum corpus-wide count a token needs in
            order to receive its own id.
        transformed_data: list of id lists, one per document, filled in by
            ``fit_transform``.
        corpus: list of document strings, or ``None`` until data is loaded.
    """

    # Id emitted for tokens that occur fewer than min_occurrence_count times.
    # Real tokens are numbered from 1 so they can never collide with it.
    UNKNOWN_TOKEN_ID = 0

    def __init__(self, window_size=5, min_occurrence_count=1):
        self.token_to_id = {}
        self.window_size = window_size
        self.min_occurrence_count = min_occurrence_count
        self.transformed_data = []
        self.corpus = None

    def import_data(self, filename, column_name):
        """Load one CSV column into ``self.corpus`` as a list of strings.

        Args:
            filename: path of a CSV file, read with latin-1 encoding.
            column_name: name of the column holding the documents.
        """
        data_frame = pd.read_csv(filename, encoding='latin-1')
        self.corpus = data_frame[column_name].astype(str).tolist()
        print("Corpus has {} documents".format(len(self.corpus)))
        # An empty file has no first document to preview.
        if self.corpus:
            print(self.corpus[0])

    def _get_or_set_token_to_id(self, word):
        """Return the id of ``word``, assigning the next free id if it is new."""
        if word not in self.token_to_id:
            # +1 keeps id 0 free for UNKNOWN_TOKEN_ID.
            self.token_to_id[word] = len(self.token_to_id) + 1
        return self.token_to_id[word]

    def fit_transform(self, corpus=None):
        """Tokenise the corpus and replace every token by its integer id.

        Args:
            corpus: optional iterable of document strings. When given it
                replaces ``self.corpus``; when omitted the corpus loaded
                earlier (e.g. by ``import_data``) is used.

        Returns:
            ``self.transformed_data`` (one list of ids per document), or
            ``None`` when no corpus is available.
        """
        if corpus is not None:
            self.corpus = list(corpus)
        if self.corpus is None:
            print("Please load corpus first!!")
            return None

        # NOTE(review): strip(punctuation) trims only the two ends of the
        # whole document, so punctuation inside the text stays attached to
        # its token - confirm this is the intended tokenisation.
        tokenised_documents = [
            document.lower().strip(punctuation).split(' ')
            for document in self.corpus
        ]
        print("Done tokenising")

        token_counts = Counter(chain.from_iterable(tokenised_documents))
        frequent_tokens = {
            token
            for token, count in token_counts.items()
            if count >= self.min_occurrence_count
        }
        print("print created word occurs")

        self.transformed_data = [
            [
                self._get_or_set_token_to_id(word)
                if word in frequent_tokens
                else self.UNKNOWN_TOKEN_ID
                for word in sentence
            ]
            for sentence in tokenised_documents
        ]

        print("Corpus has {} documents".format(len(self.transformed_data)))
        return self.transformed_data

    def fit_transform_duo(self, filename, column_name="Lyrics"):
        """Count word occurrences in a CSV column with Spark and show them.

        Relies on the notebook-level ``spark`` session.

        Args:
            filename: path of a CSV file with a header row; malformed rows
                are dropped.
            column_name: name of the text column to tokenise.

        Returns:
            The Spark DataFrame with columns ``word``, ``count`` and ``id``,
            sorted by ascending count. Its first rows are also printed.
        """
        dataframe = (
            spark.read.format("csv")
            .option("header", "True")
            .option("mode", "DROPMALFORMED")
            .load(filename)
        )

        # Keep the lambda self-contained (it uses only `punctuation`), since
        # Spark has to serialise it to run it on the executors.
        tokenise = udf(
            lambda x: x.lower().strip(punctuation).split(' ') if x else [],
            ArrayType(StringType()),
        )

        df = dataframe.withColumn("tokens", tokenise(column_name))

        words = (
            df.withColumn('word', explode(col('tokens')))
            .groupBy('word')
            .count()
            .sort('count', ascending=True)
        )

        # Per the PySpark docs these ids are unique and increasing but not
        # guaranteed to be consecutive.
        words_with_id = words.withColumn('id', monotonically_increasing_id())

        words_with_id.show()
        return words_with_id

    def build_cooccur_matrix(self):
        """Scaffold for a co-occurrence matrix; nothing is computed yet.

        NOTE(review): this method appears unfinished - it only creates two
        empty containers and returns nothing.
        """
        ij_list = []
        # np.fromiter requires an explicit dtype; without one it raises
        # TypeError.
        cooccur_matrix = np.fromiter((), dtype=np.int64)
        

In [163]:
import time
filename = '/opt/training/data/raw/billboard_lyrics_1964-2015.csv'
column_name = 'lyrics'
cover = Cover()

# perf_counter() is monotonic and high-resolution, so the elapsed time cannot
# be skewed by a system clock adjustment the way time.time() can.
start_time = time.perf_counter()
# Pandas-based alternative, currently disabled:
#cover.import_data(filename, column_name)
cover.fit_transform_duo(filename)
end_time = time.perf_counter()
print("Time taken is {}".format(end_time-start_time))

+-------------+-----+---+
|         word|count| id|
+-------------+-----+---+
|           l7|    1|  0|
|     shooooes|    1|  1|
|     mindtake|    1|  2|
|     jugglers|    1|  3|
|       versos|    1|  4|
|       biting|    1|  5|
|       lathen|    1|  6|
|      methose|    1|  7|
|    underyeah|    1|  8|
|    blackroof|    1|  9|
|      coldhow|    1| 10|
|      shedded|    1| 11|
|        ain̢t|    1| 12|
|       slaver|    1| 13|
|differentlyby|    1| 14|
|       spared|    1| 15|
|     choiceme|    1| 16|
|    whycloser|    1| 17|
|     newlywed|    1| 18|
|    candletop|    1| 19|
+-------------+-----+---+
only showing top 20 rows

Time taken is 3.2167532444000244


In [None]:
from nltk.corpus import gutenberg

cover = Cover()
texts = gutenberg.sents('shakespeare-macbeth.txt')
# gutenberg.sents yields token lists; re-join each into one sentence string.
sentences = [" ".join(list_of_words) for list_of_words in texts]

start_time = time.perf_counter()
# Supply the corpus through the attribute fit_transform() reads, then pick
# the encoded documents up from transformed_data.
cover.corpus = sentences
cover.fit_transform()
data = cover.transformed_data
end_time = time.perf_counter()
print("Time taken is {}".format(end_time-start_time))
print(data[1000])