In [None]:
import pandas as pd 
# Ratings file: '|'-separated, no header row; columns are named here.
train = pd.read_csv(
    'trainIdx2_matrix.txt',
    sep='|',
    header=None,
    names=['userId', 'itemId', 'score'],
)


def _read_id_column(path, id_name):
    """Read only the first '|'-separated field of `path` into a one-column frame named `id_name`."""
    return pd.read_csv(path, sep='|', usecols=[0], header=None, names=[id_name])


# Ids of every track, album, artist and genre; used to classify the train item ids.
trackIds = _read_id_column('trackData2.txt', 'trackId')
albumIds = _read_id_column('albumData2.txt', 'albumId')
artistIds = _read_id_column('artistData2.txt', 'artistId')
genreIds = _read_id_column('genreData2.txt', 'genreId')

In [None]:
def _ratings_of_type(ratings, item_ids, id_name, score_name):
    """Return the rows of `ratings` whose itemId occurs in `item_ids`.

    The `itemId` column is renamed to `id_name` and `score` to `score_name`.
    """
    of_type = ratings[ratings.itemId.isin(item_ids)]
    return of_type.rename(columns={'score': score_name, 'itemId': id_name})


# Classify every rating as a track, album, artist or genre rating.
track_score = _ratings_of_type(train, trackIds['trackId'], 'trackId', 'track_score')
album_score = _ratings_of_type(train, albumIds['albumId'], 'albumId', 'album_score')
artists_score = _ratings_of_type(train, artistIds['artistId'], 'artistId', 'artist_score')
genres_score = _ratings_of_type(train, genreIds['genreId'], 'genreId', 'genre_score')

track_score.head()

Unnamed: 0,userId,trackId,track_score
87,199810,47420,90
88,199810,158436,50
89,199810,256008,70
90,199810,234891,70
130,199810,150186,70


In [None]:
# Load track and album hierarchies

# Both files carry up to 21 genre ids per row; the literal string 'None' marks a missing id.
genre_columns = [f'genreId_{n}' for n in range(1, 22)]

track_heir = pd.read_csv(
    'trackData2.txt',
    sep='|',
    header=None,
    na_values=['None'],
    names=['trackId', 'albumId', 'artistId'] + genre_columns,
)

album_heir = pd.read_csv(
    'albumData2.txt',
    sep='|',
    header=None,
    na_values=['None'],
    names=['albumId', 'artistId'] + genre_columns,
)

# Combine hierarchy with the given track/album score
train_track_heir = pd.merge(track_score, track_heir, how='left', on='trackId')
album_score_heir = pd.merge(album_score, album_heir, how='left', on='albumId')

# sort_values returns a new frame, so the result has to be assigned for the sort to take effect.
album_score_heir = album_score_heir.sort_values(by=['userId', 'albumId'])
album_score_heir.head()

Unnamed: 0,userId,albumId,album_score,artistId,genreId_1,genreId_2,genreId_3,genreId_4,genreId_5,genreId_6,genreId_7,genreId_8,genreId_9,genreId_10,genreId_11,genreId_12,genreId_13,genreId_14,genreId_15,genreId_16,genreId_17,genreId_18,genreId_19,genreId_20,genreId_21
0,199810,26374,50,153568.0,158282.0,81520.0,242383.0,,,,,,,,,,,,,,,,,,
1,199810,204650,50,177418.0,131552.0,,,,,,,,,,,,,,,,,,,,
2,199810,9774,50,79500.0,158282.0,242383.0,207648.0,47898.0,,,,,,,,,,,,,,,,,
3,199811,271229,70,293464.0,158282.0,279143.0,173655.0,280261.0,,,,,,,,,,,,,,,,,
4,199812,112725,100,275191.0,158282.0,207648.0,,,,,,,,,,,,,,,,,,,


### Keep, as a separate frame, only the train albums that are not already inside `train_track_heir`

In [None]:
# Keep only the album ratings whose album does not already occur in train_track_heir.
# The earlier version filtered train_track_heir instead, which replaced the album
# ratings held in album_score_heir with track rows.
# NOTE(review): the match is on albumId alone (not per user), as before - confirm that is intended.
album_already_in_tracks = album_score_heir['albumId'].isin(train_track_heir['albumId'])
album_score_heir = album_score_heir[~album_already_in_tracks]
album_score_heir.head()

Unnamed: 0,userId,trackId,track_score,albumId,artistId,genreId_1,genreId_2,genreId_3,genreId_4,genreId_5,genreId_6,genreId_7,genreId_8,genreId_9,genreId_10,genreId_11,genreId_12,genreId_13,genreId_14,genreId_15,genreId_16,genreId_17,genreId_18,genreId_19,genreId_20,genreId_21
4,199810,150186,70,,,131552.0,61215.0,176858.0,199606.0,289568.0,214110.0,,,,,,,,,,,,,,,
22,199810,176455,50,,,131552.0,,,,,,,,,,,,,,,,,,,,
116,199814,13269,50,,,61215.0,17453.0,256783.0,88853.0,,,,,,,,,,,,,,,,,
143,199815,164183,50,,,131552.0,,,,,,,,,,,,,,,,,,,,
145,199815,281268,40,,,131552.0,,,,,,,,,,,,,,,,,,,,


## For `train_track_heir`, check whether scores exist in the other matrices

In [None]:
# Album ratings whose albumId also occurs in the albumId column of train_track_heir.
album_is_referenced = album_score['albumId'].isin(train_track_heir['albumId'])
track_score_in_albums = album_score[album_is_referenced]
track_score_in_albums.head()

Unnamed: 0,userId,albumId,album_score
134,199810,26374,50
138,199810,204650,50
141,199810,9774,50
280,199811,271229,70
291,199812,112725,100


In [None]:
# Artist ratings whose artistId also occurs in the artistId column of train_track_heir.
artist_is_referenced = artists_score['artistId'].isin(train_track_heir['artistId'])
track_score_in_artists = artists_score[artist_is_referenced]
track_score_in_artists.head()

Unnamed: 0,userId,artistId,artist_score
0,199808,248969,90
1,199808,2663,90
2,199808,28341,90
3,199808,42563,90
4,199808,59092,90


### Combine the matrices together

In [None]:
# Attach the album score (matched on user + album) and then the artist score
# (matched on user + artist) to the track rows. Both joins are outer joins, so
# album/artist ratings without a matching track row are kept as extra rows.
final = (
    train_track_heir
    .merge(track_score_in_albums, how='outer', on=['userId', 'albumId'])
    .merge(track_score_in_artists, how='outer', on=['userId', 'artistId'])
)

final.head()

Unnamed: 0,userId,trackId,track_score,albumId,artistId,genreId_1,genreId_2,genreId_3,genreId_4,genreId_5,genreId_6,genreId_7,genreId_8,genreId_9,genreId_10,genreId_11,genreId_12,genreId_13,genreId_14,genreId_15,genreId_16,genreId_17,genreId_18,genreId_19,genreId_20,genreId_21,album_score,artist_score
0,199810,47420.0,90.0,190891.0,22907.0,243099.0,,,,,,,,,,,,,,,,,,,,,,
1,199810,158436.0,50.0,121272.0,48050.0,214110.0,181006.0,,,,,,,,,,,,,,,,,,,,,70.0
2,199810,256008.0,70.0,56953.0,275191.0,158282.0,242383.0,207648.0,,,,,,,,,,,,,,,,,,,,50.0
3,199810,234891.0,70.0,18215.0,257019.0,131552.0,47898.0,201738.0,88853.0,,,,,,,,,,,,,,,,,,,90.0
4,199810,265026.0,70.0,252707.0,257019.0,131552.0,201738.0,88853.0,,,,,,,,,,,,,,,,,,,,90.0


In [None]:
# Rows without a track id are discarded (open question from the author: reconsider this step).
final = final.dropna(subset=['trackId'])

# Export the id columns only (no scores); missing values are written as the string 'None'.
hierarchy_columns = ['userId', 'trackId', 'albumId', 'artistId']
hierarchy_columns += [f'genreId_{n}' for n in range(1, 22)]
final.to_csv('train_with_empty.csv', columns=hierarchy_columns, na_rep='None', index=False)

In [None]:
# Rows that have no album score yet but do reference an album.
empty_album_scores = final[final.album_score.isna()].dropna(subset=['albumId'])
empty_album_scores[['userId', 'trackId', 'albumId']].to_csv('empty_album_scores.csv', index=False)

# Rows that have no artist score yet but do reference an artist. The null filter must be
# on artistId: filtering on albumId (as this cell did before) dropped rows that have a
# valid artist but no album, and let rows with a missing artistId through.
empty_artist_scores = final[final.artist_score.isna()].dropna(subset=['artistId'])
empty_artist_scores[['userId', 'trackId', 'artistId']].to_csv('empty_artist_scores.csv', index=False)

## Make predictions on the empty values

In [None]:
!pip install pyspark
from pyspark.sql import SparkSession
import pandas

spark = SparkSession\
        .builder\
        .master('local[*]')\
        .appName('Homework9Part1')\
        .config('spark.driver.maxResultSize', '10g')\
        .config('spark.executor.memory' ,'10g')\
        .config('spark.driver.memory', '10g')\
        .getOrCreate()

from pyspark import SparkContext
sc = SparkContext.getOrCreate()

Collecting pyspark
  Downloading pyspark-3.2.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 37 kB/s 
[?25hCollecting py4j==0.10.9.2
  Downloading py4j-0.10.9.2-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 32.0 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.0-py2.py3-none-any.whl size=281805912 sha256=c24ad9189c46f97a2ab68dfb5d9354cf8cea29b8dd7810c4665a4ba04feb1cae
  Stored in directory: /root/.cache/pip/wheels/0b/de/d2/9be5d59d7331c6c2a7c1b6d1a4f463ce107332b1ecd4e80718
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.2 pyspark-3.2.0


In [None]:
# Load the exported (user, track, album) and (user, track, artist) rows into Spark.
# No schema is given, so the cell below this one reports every column as a string.
csv_reader = spark.read.options(sep=',', header=True)
albums = csv_reader.csv('empty_album_scores.csv')
artists = csv_reader.csv('empty_artist_scores.csv')

In [None]:
# Inspect the column types of `albums` as read from CSV (before the integer casts).
albums.printSchema()

root
 |-- userId: string (nullable = true)
 |-- trackId: string (nullable = true)
 |-- albumId: string (nullable = true)



In [None]:
# Inspect the column types of `artists` as read from CSV (before the integer casts).
artists.printSchema()

root
 |-- userId: string (nullable = true)
 |-- trackId: string (nullable = true)
 |-- artistId: string (nullable = true)



In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType

# Replace every id column with its integer-cast version (same column name, same position).
for id_column in ('userId', 'trackId', 'albumId'):
    albums = albums.withColumn(id_column, albums[id_column].cast(IntegerType()))

for id_column in ('userId', 'trackId', 'artistId'):
    artists = artists.withColumn(id_column, artists[id_column].cast(IntegerType()))

In [None]:
# Confirm that the `albums` id columns are now integers.
albums.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- trackId: integer (nullable = true)
 |-- albumId: integer (nullable = true)



In [None]:
# Confirm that the `artists` id columns are now integers.
artists.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- trackId: integer (nullable = true)
 |-- artistId: integer (nullable = true)



In [None]:
from pyspark.ml.recommendation import ALS

# Explicit-feedback ALS on (userId, itemId, score) with rank-5 non-negative factors.
# A fixed seed is set so the random factor initialisation does not differ from run to run.
als = ALS(
    userCol='userId',
    itemCol='itemId',
    ratingCol='score',
    rank=5,
    maxIter=10,
    regParam=0.01,
    nonnegative=True,
    implicitPrefs=False,
    seed=42,
)

In [None]:
# `train` is rebound from a pandas frame to a Spark DataFrame here. The guard keeps the
# cell safe to re-run: without it a second run would pass the Spark DataFrame itself
# back into createDataFrame.
if isinstance(train, pd.DataFrame):
    train = spark.createDataFrame(train)
train.show(5)

+------+------+-----+
|userId|itemId|score|
+------+------+-----+
|199808|248969|   90|
|199808|  2663|   90|
|199808| 28341|   90|
|199808| 42563|   90|
|199808| 59092|   90|
+------+------+-----+
only showing top 5 rows



In [None]:
# Fit the ALS model on the full set of train ratings.
model = als.fit(train)
# Score the same rows the model was fitted on; transform adds a `prediction` column.
train_output = model.transform(train)
train_output.show(5)

+------+------+-----+----------+
|userId|itemId|score|prediction|
+------+------+-----+----------+
|199808|  2663|   90|  89.15709|
|199808| 42563|   90|  89.06621|
|199808| 64052|   90|  92.91526|
|199808|248969|   90| 87.115036|
|199808| 28341|   90|  90.77194|
+------+------+-----+----------+
only showing top 5 rows



In [None]:
 # Set the model's item column to 'itemId' (the same name the ALS estimator was configured with).
 model.setItemCol('itemId')

ALSModel: uid=ALS_2776b37f52e6, rank=5

In [None]:
# The model looks items up in a column named 'itemId', so the album/artist id is renamed
# for scoring and renamed back afterwards; `prediction` becomes the matching score column.
album_inputs = albums.filter('albumId IS NOT NULL').withColumnRenamed('albumId', 'itemId')
prediction_album_train = (
    model.transform(album_inputs)
    .withColumnRenamed('itemId', 'albumId')
    .withColumnRenamed('prediction', 'album_score')
)

artist_inputs = artists.filter('artistId IS NOT NULL').withColumnRenamed('artistId', 'itemId')
prediction_artists_train = (
    model.transform(artist_inputs)
    .withColumnRenamed('itemId', 'artistId')
    .withColumnRenamed('prediction', 'artist_score')
)

# mode='overwrite' keeps the cell re-runnable: the default save mode raises an error when
# the output directory already exists from an earlier run.
prediction_album_train.coalesce(1).write.csv('predicted_albums.csv', mode='overwrite', header=True)
prediction_artists_train.coalesce(1).write.csv('predicted_artists.csv', mode='overwrite', header=True)

## Find the number of genres for each track

In [None]:
# TODO: delete this cell once the CSV round-trip is confirmed to be the same as `final`.
# The string 'None' (written as the NA marker by to_csv) is read back as null.
final2 = (
    spark.read
    .options(sep=',', nullValue='None', header=True)
    .csv('train_with_empty.csv')
)

final2.show(5)

+------+--------+--------+--------+---------+---------+---------+---------+---------+---------+---------+---------+---------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+
|userId| trackId| albumId|artistId|genreId_1|genreId_2|genreId_3|genreId_4|genreId_5|genreId_6|genreId_7|genreId_8|genreId_9|genreId_10|genreId_11|genreId_12|genreId_13|genreId_14|genreId_15|genreId_16|genreId_17|genreId_18|genreId_19|genreId_20|genreId_21|
+------+--------+--------+--------+---------+---------+---------+---------+---------+---------+---------+---------+---------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+
|199810| 47420.0|190891.0| 22907.0| 243099.0|     null|     null|     null|     null|     null|     null|     null|     null|      null|      null|      null|      null|      null|      null|      null|      null|      null|  

In [None]:
# Row keys followed by the 21 genre id columns genreId_1 ... genreId_21.
genre_headers = ['userId', 'trackId']
genre_headers.extend('genreId_{}'.format(position) for position in range(1, 22))

In [None]:
from pyspark.sql.functions import isnull
genres = final2.select(genre_headers)

# Count the non-null genre ids per row. Only the genreId_* columns take part: the earlier
# version also summed null flags for userId and trackId and subtracted from a hard-coded 21,
# which would under-count as soon as one of the key columns were null.
genre_cols = [name for name in genres.columns if name.startswith('genreId_')]
null_genre_count = sum(isnull(genres[name]).cast(IntegerType()) for name in genre_cols)

final_num_genres = genres.select(
    'userId',
    'trackId',
    (len(genre_cols) - null_genre_count).alias('num_genres'),
)

# mode='overwrite' keeps the cell re-runnable when the output directory already exists.
final_num_genres.coalesce(1).write.csv('train_num_genres.csv', mode='overwrite', header=True)
final_num_genres.show(5)

+------+--------+----------+
|userId| trackId|num_genres|
+------+--------+----------+
|199810| 47420.0|         1|
|199810|158436.0|         2|
|199810|256008.0|         3|
|199810|234891.0|         4|
|199810|265026.0|         3|
+------+--------+----------+
only showing top 5 rows



In [None]:
import glob


def _read_spark_csv(output_dir):
    """Load the CSV part file(s) Spark wrote into `output_dir` as one pandas frame.

    Spark names its part files with a fresh random id on every write, so they are
    located by pattern rather than by a hard-coded file name.

    Raises:
        FileNotFoundError: if `output_dir` contains no `part-*.csv` file.
    """
    part_files = sorted(glob.glob(f'{output_dir}/part-*.csv'))
    if not part_files:
        raise FileNotFoundError(f'no part-*.csv file found in {output_dir!r}')
    return pd.concat((pd.read_csv(path) for path in part_files), ignore_index=True)


# Same relative directory names the Spark writers above used (previously hard-coded
# under /content with run-specific part-file names).
predicted_artists = _read_spark_csv('predicted_artists.csv')
predicted_albums = _read_spark_csv('predicted_albums.csv')
num_genres_df = _read_spark_csv('train_num_genres.csv')

In [None]:
# Index all three frames by (userId, trackId) so they can be aligned with `final` by index.
row_key = ['userId', 'trackId']
predicted_artists = predicted_artists.set_index(row_key)
predicted_albums = predicted_albums.set_index(row_key)
# The indexed genre counts live under the name `num_genred_df` (sic); the cell that
# assembles `final` reads that exact name.
num_genred_df = num_genres_df.set_index(row_key)

In [None]:
# Reduce `final` to the three score columns, keyed by (userId, trackId).
kept_columns = ['userId', 'trackId', 'track_score', 'album_score', 'artist_score']
final = final[kept_columns].set_index(['userId', 'trackId'])

# Fill the missing artist and album scores from the predicted values (aligned on the index),
# then attach the genre count.
for score_column, predictions in (
    ('artist_score', predicted_artists),
    ('album_score', predicted_albums),
):
    final[score_column] = final[score_column].fillna(predictions[score_column])
final['num_genres'] = num_genred_df['num_genres']

final.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,track_score,album_score,artist_score,num_genres
userId,trackId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
199810,47420.0,90.0,66.80206,93.95858,1
199810,158436.0,50.0,57.59525,70.0,2
199810,256008.0,70.0,67.10171,50.0,3
199810,234891.0,70.0,55.453636,90.0,4
199810,265026.0,70.0,56.72174,90.0,3


In [None]:
# Per-column null counts before the fill. Only a cell's last expression is displayed,
# so this result is computed but not shown.
final.isna().sum()
# Every value that is still missing (scores and genre count alike) is replaced by 0.
final = final.fillna(0)
# Displayed result: per-column null counts after the fill.
final.isna().sum()

track_score     0
album_score     0
artist_score    0
num_genres      0
dtype: int64

In [None]:
# Write the finished training set; the (userId, trackId) index is written out as the leading columns.
final.to_csv('finalTrainset.csv')