# Train Cover Model

## Set up Spark 

In [1]:
import os
from pyspark.sql import SparkSession
from pyspark import SparkContext

# Interpreter used by PySpark; both variables are pinned to the same binary.
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/usr/bin/python3'

# os.environ performs no shell expansion, so writing '$PYTHONPATH:...' would
# store the literal text '$PYTHONPATH' instead of the current value. Build the
# search path from the real existing setting, and only append the training
# root when it is missing so that re-running this cell does not duplicate it.
_training_root = '/opt/training'
_python_path = [
    entry
    for entry in os.environ.get('PYTHONPATH', '').split(os.pathsep)
    if entry
]
if _training_root not in _python_path:
    _python_path.append(_training_root)
os.environ['PYTHONPATH'] = os.pathsep.join(_python_path)

# Work from the project root (the `src.*` imports in later cells presumably
# rely on this - confirm against the project layout).
os.chdir(_training_root)

# Obtain the session named "Cover" (getOrCreate reuses an existing one).
spark_session = (
    SparkSession.builder
    .appName("Cover")
    .getOrCreate()
)

# Register the project modules with the SparkContext, in the same order as
# before: the model first, then the processing helpers.
for _py_dependency in (
    "/opt/training/src/modelling/Cover.py",
    "/opt/training/src/processing/utils.py",
):
    spark_session.sparkContext.addPyFile(_py_dependency)

## Set up Cover

In [None]:
from src.modelling import Cover
from src.processing import utils
# Raw lyrics CSV and the column name handed to the training cell below.
filename = '/opt/training/data/raw/lyrics.csv'
column_name = 'Lyrics'

# Model wrapper bound to the Spark session created above.
cover = Cover.Cover(
    spark_session=spark_session,
    embedding_size=300,
)

## Training

In [None]:
import time

# Time the full run: import the data, fit/transform the chosen column, then
# build the co-occurrence matrix.
# perf_counter() is used instead of time() because it is a monotonic,
# high-resolution clock: the measured interval cannot be distorted by system
# clock adjustments during a long-running job.
# NOTE(review): if any of the Cover methods only queue lazy Spark
# transformations, the elapsed time will not include the deferred work -
# confirm against Cover's implementation.
start_time = time.perf_counter()
cover.import_data(filename)
cover.fit_transform(column_name=column_name, min_occurrence_count=5, window_size=5)
cover.build_co_occurrence_matrix()
end_time = time.perf_counter()

print("Time taken is {}".format(end_time - start_time))

In [63]:
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, ArrayType
from pyspark.sql.functions import col, explode, sum as sum_ 

# Scratch experiment: aggregate map-typed counts per (key, type) and pull the
# result back to the driver grouped by type.

# "age" maps an integer-array key to an integer count; "type" is a label.
# Both fields are declared non-nullable.
schema = StructType([
    StructField("age", MapType(ArrayType(IntegerType()), IntegerType()), False),
    StructField("type", StringType(), False)
])

df = spark_session.createDataFrame(
    [
        ({(1, 2): 1, (2 , 2): 2}, "one"),
        ({(1, 2): 1}, "two"),
        ({(2, 2): 3} , "one"),
        ({(2, 2): 3} , "two"),
    ],
    schema,
)

df.show(10, False)

# explode() on the map column yields one row per entry with `key` and `value`
# columns; the values are then summed per (key, type) pair.
t = (
    df.select(explode(col("age")), "type")
    .groupBy(col("key"), col("type"))
    .agg(sum_("value").alias("value"))
)

t.show()

# Collect (key, type, value) triples to the driver, grouped by type.
x = (
    t.select("value", "key", "type")
    .rdd
    .map(lambda row: (row.key, row.type, row.value))
    .groupBy(lambda triple: triple[1])
    .mapValues(list)
    .collect()
)

print(x)

# The order of groups returned by collect() is not guaranteed, so picking a
# group by position (previously x[1]) could return either type from run to
# run. Index the groups by their type label and select "one" explicitly - the
# group the positional lookup happened to yield in the recorded output.
# Each group's triples are transposed into (keys, types, values) columns.
# NOTE(review): row order inside a group is likewise not guaranteed.
columns_by_type = {
    group_type: tuple(zip(*triples)) for group_type, triples in x
}

index, genre, value = columns_by_type["one"]

print(index)

print(value)





+--------------------------+----+
|age                       |type|
+--------------------------+----+
|[[2, 2] -> 2, [1, 2] -> 1]|one |
|[[1, 2] -> 1]             |two |
|[[2, 2] -> 3]             |one |
|[[2, 2] -> 3]             |two |
+--------------------------+----+

+------+----+-----+
|   key|type|value|
+------+----+-----+
|[1, 2]| two|    1|
|[2, 2]| one|    5|
|[2, 2]| two|    3|
|[1, 2]| one|    1|
+------+----+-----+

[('two', [([1, 2], 'two', 1), ([2, 2], 'two', 3)]), ('one', [([2, 2], 'one', 5), ([1, 2], 'one', 1)])]
([2, 2], [1, 2])
(5, 1)
