In [1]:
import os

# Point both the PySpark workers and the driver at the same interpreter.
os.environ.update({
    'PYSPARK_PYTHON': '/usr/bin/python3',
    'PYSPARK_DRIVER_PYTHON': '/usr/bin/python3',
})

# Echo the settings back (driver first, then workers) as a quick sanity check.
print(
    os.environ.get('PYSPARK_DRIVER_PYTHON'),
    os.environ.get('PYSPARK_PYTHON'),
    sep='\n',
)

/usr/bin/python3
/usr/bin/python3


In [2]:
import pandas as pd
import numpy as np
import pyspark as spark
from pyspark.sql import SparkSession

# Get the active SparkSession for this notebook, creating one named "Cover"
# if none exists yet.
# NOTE: this rebinds `spark`, replacing the `import pyspark as spark` module
# alias from the import lines above with the session object.
spark = (
    SparkSession.builder
    .appName("Cover")
    .getOrCreate()
)

In [12]:
from string import punctuation
from itertools import chain
from collections import Counter
from pyspark.sql.types import StringType, ArrayType, StructType, StructField, IntegerType
from pyspark.sql.functions import udf, col, explode, monotonically_increasing_id, when, split, lower

class Cover:
    """Windowed co-occurrence counting over a text corpus held in Spark.

    Usage: call ``import_data`` to load a CSV into ``self.corpus``, then call
    ``fit_transform`` with the name of the text column.

    The class relies on the notebook-level ``spark`` session being defined
    before ``import_data`` is called.
    """

    def __init__(self, window_size=5, min_occurrence_count=1):
        """Store the hyper-parameters; no data is loaded here.

        Args:
            window_size: how many tokens to the right of each token are paired
                with it when counting co-occurrences.
            min_occurrence_count: words whose corpus count is less than or
                equal to this value are mapped to id 0.
        """
        self.window_size = window_size
        self.min_occurrence_count = min_occurrence_count
        self.transformed_data = []  # not populated by any method of this class
        self.corpus = None  # set by import_data

    def import_data(self, filename):
        """Load a headered CSV into ``self.corpus`` and print its row count.

        Malformed rows are dropped by the reader (``mode=DROPMALFORMED``).

        Args:
            filename: path handed to the Spark CSV reader.
        """
        self.corpus = spark.read.format("csv").option("header", "True").option("mode", "DROPMALFORMED").load(filename)
        print("Corpus has {} documents".format(self.corpus.count()))

    def fit_transform(self, column_name):
        """Tokenise a text column, map tokens to ids and count id pairs.

        Steps:
          1. Lower-case the text, strip punctuation and split on single spaces
             into a 'tokens' column.
          2. Count every word in the corpus and number the words 1..N in
             ascending order of count; words whose count is
             <= ``min_occurrence_count`` get id 0 instead.
          3. Replace tokens by ids in a 'transform' column.
          4. For every document, count id pairs inside a sliding window of
             ``window_size`` tokens into a 'matrix' column, stored as an array
             of ``(i, j, weight)`` structs with ``i <= j``.

        The vocabulary size and a progress message are printed, and the first
        10 rows of 'matrix' are shown.

        Args:
            column_name: name of the text column in ``self.corpus``.

        Returns:
            The corpus DataFrame extended with the 'tokens', 'transform' and
            'matrix' columns, or None when no corpus has been loaded.
        """
        if self.corpus is None:
            print("No corpus loaded - call import_data() first")
            return None

        # Copy what the executor-side functions need into locals: a closure
        # that captured `self` would also try to serialise self.corpus.
        min_count = self.min_occurrence_count
        window_size = self.window_size
        count_pairs = Cover.get_ngrams

        strip_punctuation = str.maketrans('', '', punctuation)
        tokenise = udf(
            lambda text: text.lower().translate(strip_punctuation).split(' ') if text else [],
            ArrayType(StringType()))
        tokenised_dataframe = self.corpus.withColumn('tokens', tokenise(column_name))

        words_dataframe = tokenised_dataframe.withColumn('word', explode(col('tokens')))\
                             .groupBy('word')\
                             .count()\
                             .sort('count', ascending=True)

        # zipWithIndex numbers the sorted rows 0..N-1 without gaps, unlike
        # monotonically_increasing_id, whose values jump between partitions.
        # Row fields are read by key because Row is a tuple subclass and
        # `row.count` would resolve to the tuple method.
        token_to_id = (
            words_dataframe.rdd
            .zipWithIndex()
            .map(lambda pair: (pair[0]['word'],
                               pair[1] + 1 if pair[0]['count'] > min_count else 0))
            .collectAsMap()
        )

        print("There are {} unique words".format(len(token_to_id)))

        # Ids are integers, so declare the element type accordingly.
        get_id = udf(lambda tokens: [token_to_id.get(token, 0) for token in tokens],
                     ArrayType(IntegerType()))
        transformed_dataframe = tokenised_dataframe.withColumn('transform', get_id('tokens'))

        print("Transformed tokens to id!")

        def pairs_for_document(ids):
            # A frozenset key holds one element when both ids are equal, so
            # min/max recover (i, j) in both the one- and two-element case.
            return [(min(pair), max(pair), weight)
                    for pair, weight in count_pairs(ids, window_size).items()]

        pair_schema = ArrayType(StructType([
            StructField('i', IntegerType()),
            StructField('j', IntegerType()),
            StructField('weight', IntegerType()),
        ]))
        ngrams = udf(pairs_for_document, pair_schema)

        matrix = transformed_dataframe.withColumn('matrix', ngrams('transform'))

        matrix.select('matrix').show(10, False)
        return matrix

    def build_cooccur_matrix(self):
        """Placeholder for assembling a corpus-wide co-occurrence matrix.

        The earlier stub called ``np.fromiter(())`` without the required
        ``dtype`` argument and therefore always failed with a TypeError; the
        missing implementation is now reported explicitly.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("build_cooccur_matrix is not implemented yet")

    @staticmethod
    def get_ngrams(indexes, window_size=3):
        """Count unordered id pairs that occur within a sliding window.

        Each element is paired with up to ``window_size`` elements to its
        right. A pair whose right element is ``d`` positions away adds ``d``
        to that pair's total (1 for adjacent elements).

        Args:
            indexes: sequence of hashable ids for one document.
            window_size: number of right-hand neighbours paired with each id.

        Returns:
            A Counter mapping ``frozenset((left, right))`` to the summed
            weight; missing pairs read as 0.
        """
        ngrams = Counter()
        for i, left_index in enumerate(indexes):
            window = indexes[i + 1:i + 1 + window_size]
            for distance, right_index in enumerate(window):
                ngrams[frozenset((left_index, right_index))] += distance + 1
        return ngrams
        

In [6]:
from itertools import islice
sentence = "Hi my name is, hi my age is"
sentence_list = sentence.lower().split(' ')

# dict.fromkeys de-duplicates while keeping first-seen order, so every word
# gets the same id on every run; list(set(...)) ordered the words by string
# hash, which differs between kernels.
word_to_index = list(dict.fromkeys(sentence_list))
# Build the word -> position lookup once instead of scanning the list with
# list.index for every token.
_index_of_word = {word: index for index, word in enumerate(word_to_index)}
transform_sentence_list = [_index_of_word[word] for word in sentence_list]



In [13]:
import time
filename = '/opt/training/data/raw/billboard_lyrics_1964-2015.csv'
column_name = 'lyrics'
cover = Cover(min_occurrence_count=5)

# perf_counter is the clock intended for measuring elapsed intervals; the
# wall clock returned by time.time() can be adjusted while the job runs.
start_time = time.perf_counter()
cover.import_data(filename)
cover.fit_transform(column_name)
end_time = time.perf_counter()

print("Time taken is {}".format(end_time-start_time))

Corpus has 5100 documents
There are 42181 unique words
Transformed tokens to id!
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
from nltk.corpus import gutenberg

cover = Cover()
texts = gutenberg.sents('shakespeare-macbeth.txt')
sentences = [" ".join(list_of_words) for list_of_words in texts]

# Cover.fit_transform takes the name of a text column in cover.corpus, not a
# list of sentences, so wrap the sentences in a one-column DataFrame first.
cover.corpus = spark.createDataFrame([(sentence,) for sentence in sentences], ['sentence'])

# perf_counter is the clock intended for measuring elapsed intervals.
start_time = time.perf_counter()
data = cover.fit_transform('sentence')
end_time = time.perf_counter()
print("Time taken is {}".format(end_time-start_time))
# NOTE(review): fit_transform may only display its result and return None, in
# which case there is nothing to index. When something is returned it is
# assumed to be a Spark DataFrame - TODO confirm - and row 1000 is printed.
if data is not None:
    rows = data.take(1001)
    if len(rows) > 1000:
        print(rows[1000])