# Train Cover Model

## Set up Spark 

In [1]:
import os
from pyspark.sql import SparkSession
from pyspark import SparkContext

# Tell PySpark which interpreter to launch for the workers and for the driver.
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/usr/bin/python3'

# Extend PYTHONPATH with the project root. os.environ stores strings verbatim
# (there is no shell-style "$VAR" expansion), so the current value has to be
# read explicitly. Empty entries are dropped and the root is only appended
# once, which keeps the cell safe to re-run.
pythonpath_entries = [
    entry
    for entry in os.environ.get('PYTHONPATH', '').split(os.pathsep)
    if entry
]
if '/opt/training' not in pythonpath_entries:
    pythonpath_entries.append('/opt/training')
os.environ['PYTHONPATH'] = os.pathsep.join(pythonpath_entries)

# Work from the project root so the relative `src.*` imports below resolve.
os.chdir('/opt/training')

# Reuse an already-running session if there is one, otherwise start "Cover".
spark_session = SparkSession\
    .builder\
    .appName("Cover")\
    .getOrCreate()

# Register the project modules with the Spark context so tasks can import them.
spark_session.sparkContext.addPyFile("/opt/training/src/modelling/Cover.py")
spark_session.sparkContext.addPyFile("/opt/training/src/processing/utils.py")

## Set up Cover

In [3]:
from src.modelling import Cover
from src.processing import utils
# Input corpus and the name of the text column passed to the model below.
filename = '/opt/training/data/raw/billboard_lyrics_1964-2015.csv'
column_name = 'lyrics'

# Build the Cover model on top of the shared Spark session.
cover = Cover.Cover(
    embedding_size=300,
    spark_session=spark_session,
)

## Training

In [4]:
import time

# Time the whole pipeline with a monotonic, high-resolution clock.
# time.time() follows the system wall clock, which can be adjusted while the
# job is running; time.perf_counter() is meant for measuring elapsed intervals.
start_time = time.perf_counter()

# Load the corpus, fit/transform the chosen text column, then build the
# co-occurrence matrix.
cover.import_data(filename)
cover.fit_transform(column_name=column_name, min_occurrence_count=5, window_size=5)
cover.build_co_occurrence_matrix()

end_time = time.perf_counter()

# Elapsed time in seconds.
print("Time taken is {}".format(end_time-start_time))

Corpus has 5100 documents
+----+--------------------+--------------------+----+--------------------+------+
|Rank|                Song|              Artist|Year|              Lyrics|Source|
+----+--------------------+--------------------+----+--------------------+------+
|   1|         wooly bully|sam the sham and ...|1965|sam the sham misc...|     3|
|   2|i cant help mysel...|           four tops|1965| sugar pie honey ...|     1|
|   3|i cant get no sat...|  the rolling stones|1965|                    |     1|
|   4| you were on my mind|             we five|1965| when i woke up t...|     1|
|   5|youve lost that l...|the righteous bro...|1965| you never close ...|     1|
|   6|            downtown|        petula clark|1965| when youre alone...|     1|
|   7|                help|         the beatles|1965|help i need someb...|     3|
|   8|cant you hear my ...|     hermans hermits|1965|carterlewis every...|     5|
|   9|crying in the chapel|       elvis presley|1965| you saw me cryin..

In [None]:
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, ArrayType
from pyspark.sql.functions import col, explode, sum as sum_ 

# Toy data: every record is one map whose keys are integer pairs and whose
# values are integer counts. The schema is a bare MapType keyed by int arrays.
pair_count_schema = MapType(ArrayType(IntegerType()), IntegerType())
sample_records = [
    {(1, 3): 1, (2 , 2): 2},
    {(1, 2): 1},
]
df = spark_session.createDataFrame(sample_records, pair_count_schema).toDF("age")

df.show()

# Flatten each map into key/value rows, then total the values per key.
exploded_pairs = df.select(explode(col("age")))
t = exploded_pairs.groupBy(col("key")).agg(sum_("value").alias("value"))

t.show()

# Pull the totals back to the driver as (key, value) tuples.
x = (
    t.select("value", "key")
    .rdd
    .map(lambda row: (row.key, row.value))
    .collect()
)

# Transpose the list of pairs into one tuple of keys and one tuple of values.
x = zip(*x)

index, value = x

print(index)

print(value)



