# Train Cover Model

## Set up Spark 

In [1]:
import os
from pyspark.sql import SparkSession
from pyspark import SparkContext

# Point both PySpark interpreter variables at the same Python 3 binary.
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/usr/bin/python3'

# Append the project root to PYTHONPATH. Values assigned to os.environ are
# stored verbatim (there is no shell expansion), so writing '$PYTHONPATH'
# would insert that literal text and drop whatever was already configured.
# Read the current value explicitly and skip it when it is unset or empty.
os.environ['PYTHONPATH'] = os.pathsep.join(
    entry for entry in (os.environ.get('PYTHONPATH'), '/opt/training') if entry
)
os.chdir('/opt/training')

# Reuse an already running session if there is one, otherwise start a new one.
spark_session = SparkSession\
    .builder\
    .appName("Cover")\
    .getOrCreate()

# Register the project modules as Python dependencies of the Spark context so
# that tasks running on this context can import them.
spark_session.sparkContext.addPyFile("/opt/training/src/modelling/Cover.py")
spark_session.sparkContext.addPyFile("/opt/training/src/processing/utils.py")

## Set up Cover

In [2]:
from src.modelling import Cover
from src.processing import utils
# Model wrapper bound to the Spark session created in the setup cell.
cover = Cover.Cover(spark_session=spark_session)

# Training input: the raw CSV file and the column name handed to the model
# (presumably the text column of that CSV -- confirm against Cover).
column_name = 'Lyrics'
filename = '/opt/training/data/raw/lyrics.csv'

## Training

In [3]:
import time

# Time the data import plus the fit as a single interval. perf_counter() is a
# monotonic, high-resolution clock, so the measurement cannot be skewed by
# system clock adjustments that happen while the Spark job is running
# (time.time() is wall-clock time and offers no such guarantee).
start_time = time.perf_counter()
cover.import_data(filename)
# NOTE(review): min_occurrence_count and window_size are hard-coded here;
# their exact semantics live in Cover.fit_transform, which is not shown.
cover.fit_transform(column_name=column_name, min_occurrence_count=5, window_size=5)
end_time = time.perf_counter()

# Elapsed time in (fractional) seconds.
print("Time taken is {}".format(end_time-start_time))

Corpus has 368259 documents
+----+-----+-----+
|word|count|   id|
+----+-----+-----+
| the|77848|75400|
|   i|65663|75399|
| you|54140|75398|
|   a|44477|75397|
|  to|36266|75396|
|  in|28737|75395|
| and|27199|75394|
|  me|25524|75393|
|  my|25305|75392|
|  of|24070|75391|
|  on|16356|75390|
|    |16267|75389|
|your|16192|75388|
|  it|15838|75387|
|  is|15529|75386|
|  im|13709|75385|
|that|12413|75384|
|this|11821|75383|
| all|11262|75382|
| for|11151|75381|
+----+-----+-----+
only showing top 20 rows

There are 75400 unique tokens
Mapped tokens to unique id
+--------------------+
|              matrix|
+--------------------+
|[[75321, 75398] -...|
|[[75210, 75040] -...|
|[[75355, 75398] -...|
|[[75399, 75363] -...|
|[[75400, 74992] -...|
|[[75399, 75163] -...|
|[[75334, 75399] -...|
|[[75133, 75133] -...|
|[[75310, 75303] -...|
|                  []|
+--------------------+
only showing top 10 rows

There are 1109264 ij pairs
+--------------+-------------------+
|           key|     

In [12]:
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, ArrayType
from pyspark.sql.functions import col, explode, sum as sum_ 

# Key type of the map column: a struct of two non-nullable integer ids.
schema = StructType([
    StructField(field_name, IntegerType(), False)
    for field_name in ("left_context_id", "right_context_id")
])

# Two toy rows, each a map from an (id, id) pair to an integer count.
df = spark_session.createDataFrame(
    [
        {(1, 2): 1, (2, 2): 2},
        {(1, 2): 1},
    ],
    schema=MapType(schema, IntegerType()),
).toDF("age")

df.show()

# explode() turns every map entry into a (key, value) row; grouping by the
# struct key and summing the values merges the per-row maps into one total
# per pair.
t = (
    df.select(explode(col("age")))
    .groupBy("key")
    .agg(sum_("value"))
)

t.show()


+--------------------+
|                 age|
+--------------------+
|[[2, 2] -> 2, [1,...|
|       [[1, 2] -> 1]|
+--------------------+

+------+----------+
|   key|sum(value)|
+------+----------+
|[2, 2]|         2|
|[1, 2]|         2|
+------+----------+



AnalysisException: "cannot resolve 'CAST(`key` AS INT)' due to data type mismatch: cannot cast struct<left_context_id:int,right_context_id:int> to int;;\n'Project [cast(key#320 as int) AS key#340, sum(value)#327L]\n+- AnalysisBarrier\n      +- Aggregate [key#320], [key#320, sum(cast(value#321 as bigint)) AS sum(value)#327L]\n         +- Project [key#320, value#321]\n            +- Generate explode(age#313), false, [key#320, value#321]\n               +- Project [value#311 AS age#313]\n                  +- LogicalRDD [value#311], false\n"