In [1]:
import os
# Point both the Spark workers and the driver at the same Python interpreter.
python_executable = '/usr/bin/python3'
for variable in ('PYSPARK_PYTHON', 'PYSPARK_DRIVER_PYTHON'):
    os.environ[variable] = python_executable

# Echo the settings back (driver first, then workers) to confirm they are set.
for variable in ('PYSPARK_DRIVER_PYTHON', 'PYSPARK_PYTHON'):
    print(os.environ.get(variable))

/usr/bin/python3
/usr/bin/python3


In [2]:
import pandas as pd
import numpy as np
import pyspark as spark
from pyspark.sql import SparkSession

# Start a Spark session (or reuse the running one) under the app name "Cover".
# NOTE: this assignment rebinds `spark`, replacing the `pyspark` module alias
# created by `import pyspark as spark` earlier in this cell.
spark = (
    SparkSession.builder
    .appName("Cover")
    .getOrCreate()
)

In [188]:
from string import punctuation
from itertools import chain
from collections import Counter
from pyspark.sql.types import StringType, ArrayType, StructType, StructField
from pyspark.sql.functions import udf, col, explode, monotonically_increasing_id, when

class Cover:
    """Builds a word vocabulary with integer ids from a text corpus held in Spark.

    Typical use: ``import_data(path)`` to load a CSV into ``self.corpus``, then
    ``fit_transform()`` to count words and assign ids.
    """

    def __init__(self, window_size=5, min_occurrence_count=1, text_column="Lyrics"):
        """Store the configuration; no data is loaded here.

        Args:
            window_size: Stored on the instance; no method of this class reads it yet.
            min_occurrence_count: Smallest number of occurrences a word needs to
                keep its own id in ``fit_transform``; rarer words get id 0.
            text_column: Name of the corpus column holding the text to tokenise.
        """
        self.token_to_id = {}        # never filled by the methods of this class
        self.window_size = window_size
        self.min_occurrence_count = min_occurrence_count
        self.text_column = text_column
        self.transformed_data = []   # never filled by the methods of this class
        self.corpus = None           # Spark DataFrame, set by import_data()

    @staticmethod
    def _tokenise(text):
        """Lower-case ``text``, split on whitespace and trim punctuation per token.

        Returns an empty list for ``None`` or empty text. Tokens that consist of
        punctuation only are dropped. Kept free of ``self`` so that Spark can
        ship it to the workers without serialising the instance.
        """
        if not text:
            return []
        trimmed = (raw.strip(punctuation) for raw in text.lower().split())
        return [token for token in trimmed if token]

    def import_data(self, filename):
        """Read a CSV file with a header row into ``self.corpus``.

        Uses the notebook-level ``spark`` session. Malformed rows are dropped
        (``DROPMALFORMED``). Prints the number of rows that were loaded.

        Args:
            filename: Path of the CSV file to load.
        """
        self.corpus = (
            spark.read.format("csv")
            .option("header", "True")
            .option("mode", "DROPMALFORMED")
            .load(filename)
        )
        print("Corpus has {} documents".format(self.corpus.count()))

    def fit_transform(self):
        """Count every word of the corpus and give each sufficiently frequent word an id.

        Displays the first rows of the result with ``show()``.

        Returns:
            A Spark DataFrame with the columns ``word``, ``count`` and ``id``,
            or ``None`` (after printing a hint) when no corpus has been loaded.
        """
        if self.corpus is None:
            print("No corpus loaded; call import_data() before fit_transform().")
            return None

        tokenise = udf(self._tokenise, ArrayType(StringType()))
        tokenised_dataframe = self.corpus.withColumn("tokens", tokenise(self.text_column))

        # One row per distinct word with its total count, rarest words first.
        words_dataframe = (
            tokenised_dataframe
            .withColumn('word', explode(col('tokens')))
            .groupBy('word')
            .count()
            .sort('count', ascending=True)
        )

        # NOTE: monotonically_increasing_id() yields unique, increasing values,
        # but Spark does not guarantee that they are consecutive.
        words_with_id_dataframe = words_dataframe.withColumn('id', monotonically_increasing_id() + 1)

        # Words below the minimum share id 0; a word seen exactly
        # min_occurrence_count times meets the minimum and keeps its id.
        is_rare = col('count') < self.min_occurrence_count
        filtered_words_with_id_dataframe = words_with_id_dataframe.withColumn(
            'id', when(is_rare, 0).otherwise(col('id'))
        )

        filtered_words_with_id_dataframe.show()
        return filtered_words_with_id_dataframe

    def build_cooccur_matrix(self):
        """Placeholder for building the co-occurrence matrix.

        Returns:
            An empty one-dimensional float64 NumPy array; the actual
            computation has not been written yet.
        """
        # np.fromiter requires an explicit dtype; omitting it raises TypeError.
        cooccur_matrix = np.fromiter((), dtype=np.float64)
        return cooccur_matrix
        

In [189]:
import time
# Input CSV with the Billboard lyrics; `column_name` is defined but not read in this cell.
filename = '/opt/training/data/raw/billboard_lyrics_1964-2015.csv'
column_name = 'lyrics'

# min_occurrence_count=5 sets the rare-word threshold used by fit_transform().
cover = Cover(min_occurrence_count=5)

# Time the load and the vocabulary build together.
start_time = time.time()
cover.import_data(filename)
cover.fit_transform()
end_time = time.time()

elapsed = end_time - start_time
print("Time taken is {}".format(elapsed))

Corpus has 5100 documents
+-------------+-----+---+
|         word|count| id|
+-------------+-----+---+
|           l7|    1|  0|
|     shooooes|    1|  0|
|     mindtake|    1|  0|
|     jugglers|    1|  0|
|       versos|    1|  0|
|       biting|    1|  0|
|       lathen|    1|  0|
|      methose|    1|  0|
|    underyeah|    1|  0|
|    blackroof|    1|  0|
|      coldhow|    1|  0|
|      shedded|    1|  0|
|        ain̢t|    1|  0|
|       slaver|    1|  0|
|differentlyby|    1|  0|
|       spared|    1|  0|
|     choiceme|    1|  0|
|    whycloser|    1|  0|
|     newlywed|    1|  0|
|    candletop|    1|  0|
+-------------+-----+---+
only showing top 20 rows

Time taken is 3.2542426586151123


In [None]:
from nltk.corpus import gutenberg

# Run the vocabulary build on the sentences of Macbeth instead of the lyrics CSV.
cover = Cover()
texts = gutenberg.sents('shakespeare-macbeth.txt')
sentences = [" ".join(list_of_words) for list_of_words in texts]

# fit_transform() takes no arguments and reads cover.corpus, so the sentences
# are loaded into a one-column DataFrame first. The column is named "Lyrics"
# because that is the column fit_transform() tokenises.
cover.corpus = spark.createDataFrame([(sentence,) for sentence in sentences], ["Lyrics"])

start_time = time.time()
# fit_transform() displays the vocabulary table itself via show().
data = cover.fit_transform()
end_time = time.time()
print("Time taken is {}".format(end_time - start_time))