# Train Cover Model

## Set up Spark 

In [None]:
import os
from pyspark.sql import SparkSession
from pyspark import SparkContext

# Python binary that PySpark should use (same interpreter for driver and workers).
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/usr/bin/python3'

# Add the project root to PYTHONPATH.
# os.environ stores values verbatim (there is no shell expansion), so writing
# '$PYTHONPATH:/opt/training' would create a bogus '$PYTHONPATH' entry and drop
# whatever was configured before. Build the value from the current setting
# instead, and skip the append when the entry is already present so that
# re-running this cell does not keep growing the variable.
python_path_entries = [
    entry
    for entry in os.environ.get('PYTHONPATH', '').split(os.pathsep)
    if entry
]
if '/opt/training' not in python_path_entries:
    python_path_entries.append('/opt/training')
os.environ['PYTHONPATH'] = os.pathsep.join(python_path_entries)
# Run the rest of the notebook from the project root.
# NOTE(review): presumably this is what lets `from src... import ...` resolve
# in the following cells -- confirm.
os.chdir('/opt/training')

# Reuse an existing session if one is active, otherwise create one named "Cover".
spark_session = (
    SparkSession.builder
    .appName("Cover")
    .getOrCreate()
)

# Register the project modules as Python dependencies of the SparkContext so
# that tasks executed on it can import them.
for module_path in (
    "/opt/training/src/modelling/Cover.py",
    "/opt/training/src/processing/utils.py",
):
    spark_session.sparkContext.addPyFile(module_path)

## Set up Cover

In [None]:
from src.modelling import Cover
from src.processing import utils
# Input file handed to cover.import_data() in the training cell.
filename = '/opt/training/data/raw/test-lyrics.csv'

# Model instance bound to the Spark session created above.
# NOTE(review): x_max=100 / alpha=0.75 look like GloVe-style weighting
# parameters -- confirm against the Cover implementation. weight_decay=1 and
# learning_rate=1 are passed through unchanged; verify they are intended.
cover = Cover.Cover(
    spark_session=spark_session,
    embedding_size=300,
    x_max=100,
    alpha=0.75,
    weight_decay=1,
    learning_rate=1,
)

## Training

In [None]:
import time
%load_ext autoreload
%autoreload 2

# Time the pipeline end to end: load the input file, fit on the 'Lyrics'
# column with 'Genre' as covariate, then build the co-occurrence matrix.
#
# perf_counter() is used instead of time() because it is monotonic and has the
# highest available resolution; time() follows the system clock and can jump
# (NTP sync, manual adjustment), which would distort the measured duration.
# NOTE(review): if any of these Cover methods only build lazy Spark plans, the
# figure below excludes the actual computation -- confirm they trigger actions.
start_time = time.perf_counter()
cover.import_data(filename)
cover.fit_transform(
    column_name='Lyrics',
    covariate='Genre',
    min_occurrence_count=5,
    window_size=5,
)
cover.build_co_occurrence_matrix()
end_time = time.perf_counter()

# Elapsed time in seconds (fractional).
print("Time taken is {}".format(end_time - start_time))

In [None]:
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, ArrayType
from pyspark.sql.functions import col, explode, sum as sum_, udf
from itertools import chain

# Scratch experiment on a toy DataFrame: explode a map column whose keys are
# integer arrays, sum the values per (key, type), and flatten the result into
# parallel "index" / "value" sequences.

# "age" maps an integer array to an integer count; "type" is a string label.
schema = StructType([
    StructField("age", MapType(ArrayType(IntegerType()), IntegerType()), False),
    StructField("type", StringType(), False),
])

df = spark_session.createDataFrame(
    [
        ({(1, 2): 1, (2, 2): 2}, "one"),
        ({(1, 2): 1}, "two"),
        ({(2, 2): 3}, "one"),
        ({(2, 2): 3}, "two"),
    ],
    schema,
)

df.show(10, False)

# Distinct labels, sorted: distinct().collect() has no guaranteed order, so
# sorting keeps the label -> index mapping stable from run to run.
lis = sorted(df.select("type").distinct().rdd.flatMap(lambda row: row).collect())

# Label -> integer index.
dicto = {key: value for value, key in enumerate(lis)}

# Declare the return type explicitly. Without it the UDF is typed as string
# and the integer result only survives through an implicit stringification
# followed by a cast back to integer.
type_2_num = udf(lambda label: dicto[label], IntegerType())

print(dicto)

df = df.withColumn('type', type_2_num('type'))

# explode() on the map column yields one row per entry with columns
# "key" and "value"; sum the values for each (key, type) pair.
t = (
    df.select(explode(col("age")), "type")
    .groupBy(col("key"), col("type"))
    .agg(sum_("value").alias("value"))
)

t.show()

print(t.schema)

# One ([type, *key], value) pair per aggregated row.
rows = (
    t.select("value", "key", "type")
    .rdd
    .map(lambda row: ([row['type'], *row['key']], row['value']))
    .collect()
)

# Transpose the list of pairs into (all indices, all values).
x = list(zip(*rows))

print(x)

index, value = x

# zip() already produced tuples, so they can be indexed directly.
print(index[0])

print(value[0])





In [1]:
# Print this kernel's connection details (ports, transport, signing key) so
# that another Jupyter frontend can attach to it.
# NOTE(review): the captured output includes the kernel's HMAC key -- clear
# this cell's output before committing the notebook.
%connect_info

{
  "shell_port": 40765,
  "ip": "127.0.0.1",
  "signature_scheme": "hmac-sha256",
  "stdin_port": 45967,
  "key": "ca2b500b-6ea5542a7633689b24228f4d",
  "control_port": 36359,
  "transport": "tcp",
  "hb_port": 50955,
  "iopub_port": 34819,
  "kernel_name": ""
}

Paste the above JSON into a file, and connect with:
    $> jupyter <app> --existing <file>
or, if you are local, you can connect with just:
    $> jupyter <app> --existing kernel-a08d0a49-4218-4256-90fe-0676755c928a.json
or even just:
    $> jupyter <app> --existing
if this is the most recent Jupyter kernel you have started.
