# Train Cover Model

## Set up Spark 

In [1]:
import os
from pyspark.sql import SparkSession
from pyspark import SparkContext

# Point PySpark at the Python 3 interpreter (PYSPARK_PYTHON / PYSPARK_DRIVER_PYTHON
# are the environment variables PySpark reads to pick its Python executable).
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/usr/bin/python3'

# Append the project root to PYTHONPATH. Python does not expand a shell-style
# "$PYTHONPATH" inside a string literal, so the current value is read from
# os.environ explicitly. Empty entries are dropped and the root is only added
# when missing, which keeps this cell idempotent when it is re-run.
python_path_entries = [
    entry
    for entry in os.environ.get('PYTHONPATH', '').split(os.pathsep)
    if entry
]
if '/opt/training' not in python_path_entries:
    python_path_entries.append('/opt/training')
os.environ['PYTHONPATH'] = os.pathsep.join(python_path_entries)

# Work from the project root from here on.
os.chdir('/opt/training')

# Reuse an already-running session if there is one, otherwise create it.
spark_session = (
    SparkSession
    .builder
    .appName("Cover")
    .getOrCreate()
)

# Register the project modules as Python dependencies of the Spark context.
spark_session.sparkContext.addPyFile("/opt/training/src/modelling/Cover.py")
spark_session.sparkContext.addPyFile("/opt/training/src/processing/utils.py")

## Set up Cover

In [2]:
from src.modelling import Cover
from src.processing import utils
# Columns handed to `cover.fit_transform(column_name=...)` in the training cell.
column_names = ["Lyrics", "Genre"]

# CSV file handed to `cover.import_data(...)` in the training cell.
filename = "/opt/training/data/raw/test-lyrics.csv"

# Cover instance bound to the Spark session created above; `embedding_size`
# is forwarded unchanged to the constructor.
cover = Cover.Cover(
    embedding_size=300,
    spark_session=spark_session,
)

## Training

In [3]:
import time

# Time the full pipeline: data import, fit/transform, co-occurrence matrix build.
# perf_counter() is a monotonic high-resolution clock meant for measuring
# durations; time.time() follows the wall clock and can jump if it is adjusted.
start_time = time.perf_counter()

cover.import_data(filename)
# min_occurrence_count / window_size are passed through to Cover as-is;
# NOTE(review): their exact semantics live in src/modelling/Cover.py.
cover.fit_transform(column_name=column_names, min_occurrence_count=5, window_size=5)
cover.build_co_occurrence_matrix()

end_time = time.perf_counter()

# Elapsed time in seconds.
print("Time taken is {}".format(end_time - start_time))

Corpus has 5 documents
+-----+--------------------+
|Genre|              Lyrics|
+-----+--------------------+
|  Pop|Yeah, breakfast a...|
|  Rap|I'm all in my bag...|
|  Rap|Cole World, Cole ...|
|  Pop|I'm not just tryi...|
|  Rap|Two bad bitches a...|
+-----+--------------------+

+-----+-----+---+
| word|count| id|
+-----+-----+---+
|    i|   53|321|
|  you|   34|320|
|  the|   29|319|
| that|   22|318|
|  and|   20|317|
|   to|   19|316|
|   im|   18|315|
|    a|   16|314|
|   my|   16|313|
|   it|   15|312|
|   me|   14|311|
|   be|   12|310|
|  got|   12|309|
|sorry|   11|308|
|  for|   11|307|
| they|   10|306|
| what|    9|305|
|  ass|    9|304|
|  say|    9|303|
| yeah|    9|302|
+-----+-----+---+
only showing top 20 rows

There are 322 unique tokens
Mapped tokens to unique id
+--------------------+-----+
|              matrix|Genre|
+--------------------+-----+
|[[321, 321] -> 0....|  Pop|
|[[314, 319] -> 0....|  Rap|
|[[311, 317] -> 1....|  Rap|
|[[303, 321] -> 0....|  Pop|

In [None]:
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, ArrayType
from pyspark.sql.functions import col, explode, sum as sum_ 

# Scratch experiment on a toy frame: each row carries a map of
# (int, int) pair -> int plus a string label; the maps are summed per
# (pair, label) and then regrouped per label on the driver.
schema = StructType([
    StructField("age", MapType(ArrayType(IntegerType()), IntegerType()), False),
    StructField("type", StringType(), False)
])

# Map keys are written as tuples because Python dict keys must be hashable.
df = spark_session.createDataFrame(
    [
        ({(1, 2): 1, (2, 2): 2}, "one"),
        ({(1, 2): 1}, "two"),
        ({(2, 2): 3}, "one"),
        ({(2, 2): 3}, "two"),
    ],
    schema,
)

df.show(10, False)

# explode() on a map column produces one row per entry, in columns named
# "key" and "value"; the values are then summed per (key, type).
t = (
    df.select(explode(col("age")), "type")
    .groupBy(col("key"), col("type"))
    .agg(sum_("value").alias("value"))
)

t.show()

# Collect (key, type, value) triples grouped by type. Neither the order of the
# groups nor the order of rows inside a group is guaranteed by Spark, so both
# are sorted to make the positional lookup below reproducible.
grouped = sorted(
    t.select("value", "key", "type")
    .rdd
    .map(lambda row: (row.key, row.type, row.value))
    .groupBy(lambda triple: triple[1])
    .mapValues(sorted)
    .collect(),
    key=lambda pair: pair[0],
)

print(grouped)

# Transpose each group's triples into (keys, types, values). The zip is
# materialised into a tuple so the result can be read more than once.
x = [tuple(zip(*rows)) for _, rows in grouped]

# Second group in sorted order (the "two" rows for the toy data above).
index, genre, value = x[1]

print(index)

print(value)



