In [2]:
import os
from google.colab import drive
drive.mount('/content/drive')

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from matplotlib.ticker import MaxNLocator
import math

Mounted at /content/drive


In [3]:
!pip install pyspark==3.2.0

Collecting pyspark==3.2.0
  Downloading pyspark-3.2.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 34 kB/s 
[?25hCollecting py4j==0.10.9.2
  Downloading py4j-0.10.9.2-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 45.7 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.0-py2.py3-none-any.whl size=281805912 sha256=cdc7b31b8dd1e3f575a6a41f4fea520876b818555b7908434319743e24e6b8ac
  Stored in directory: /root/.cache/pip/wheels/0b/de/d2/9be5d59d7331c6c2a7c1b6d1a4f463ce107332b1ecd4e80718
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.2 pyspark-3.2.0


In [4]:
from pyspark.sql import SparkSession, functions
from pyspark.ml import Pipeline
from pyspark.ml.feature import MinMaxScaler, VectorAssembler
from pyspark.ml.clustering import KMeans, BisectingKMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.sql.functions import udf
from pyspark.sql.types import *


In [5]:
def createDataframe(path):
    '''
    Load the CSV file at ``path`` into a Spark DataFrame.

    The file is read with a header row and schema inference, brought to the
    driver as a pandas DataFrame, and then rebuilt as a Spark DataFrame from
    its row values and column names.

    Uses the notebook-level ``spark`` session, which must exist before the call.
    '''
    spark_csv = spark.read.csv(path, header=True, inferSchema=True)
    local_frame = pd.DataFrame(spark_csv.toPandas())
    rows = local_frame.values.tolist()
    column_names = list(local_frame.columns)
    return spark.createDataFrame(rows, column_names)

In [6]:
def featureGenerator(df):
    '''
    Build the clustering input from the numeric audio columns of ``df``.

    Each selected column is wrapped into a one-element vector, min-max scaled
    (the scaler is fitted on ``df`` itself), and the scaled columns are then
    assembled into a single vector column.

    Returns a DataFrame with exactly one column, 'features'.
    '''
    columns_to_scale = [
        'energy', 'key', 'loudness', 'speechiness',
        'instrumentalness', 'liveness', 'valence', 'tempo',
    ]
    numeric = df.select(columns_to_scale)

    # All assemblers come first, then all scalers, as pipeline stages.
    stages = []
    for name in columns_to_scale:
        stages.append(VectorAssembler(inputCols=[name], outputCol=name + "_vec"))
    for name in columns_to_scale:
        stages.append(MinMaxScaler(inputCol=name + "_vec", outputCol=name + "_scaled"))

    fitted_pipeline = Pipeline(stages=stages).fit(numeric)
    scaled = fitted_pipeline.transform(numeric)

    # Keep only the scaled outputs; raw and intermediate vector columns are dropped.
    scaled_names = [name for name in scaled.columns if name.endswith('scaled')]
    scaled = scaled.select(scaled_names)

    combiner = VectorAssembler(inputCols=list(scaled.columns), outputCol='features')
    return combiner.transform(scaled).select('features')

In [7]:
def dataProcess(path):
    '''
    Read the CSV at ``path`` and return its 'features' DataFrame.

    The loaded DataFrame is cached before feature generation.
    '''
    return featureGenerator(createDataframe(path).cache())

In [8]:
def modelLoss(data,n,saving=False,save_path=''):
    '''
    Fit a k-means model with ``n`` clusters and score it.

    Parameters
    ----------
    data : Spark DataFrame with a 'features' vector column.
    n : number of clusters.
    saving : when True, persist the KMeans estimator under ``save_path + '/kmeans'``.
    save_path : directory prefix used when ``saving`` is True.

    Returns
    -------
    (model, silhouette) : the fitted KMeansModel and the score produced by
    ClusteringEvaluator with its default settings. The default metric is the
    silhouette, for which larger values mean better-separated clusters, so
    this is a quality score rather than a loss.
    '''
    kmeans = KMeans(k=n, seed=1,maxIter=100)
    model = kmeans.fit(data)
    predictions = model.transform(data)
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(predictions)
    if saving:
        kmeans_path = save_path + '/kmeans'
        # A plain save() raises if the target already exists, which breaks
        # every re-run of the notebook; overwrite the previous copy instead.
        # Note that this persists the (unfitted) estimator configuration,
        # which is what cluster() reads back with KMeans.load.
        kmeans.write().overwrite().save(kmeans_path)
        print('successully saved model at: ',kmeans_path)
    return model,silhouette

In [9]:
def lossPrint(loss):
    '''
    Plot one score per cluster count.

    ``loss[0]`` is drawn at k=2, ``loss[1]`` at k=3, and so on.
    '''
    cluster_counts = list(range(2, len(loss) + 2))
    plt.plot(cluster_counts, loss, label="loss")
    plt.title('loss change vs num of clusters')
    plt.xlabel('num of clusters')
    plt.ylabel('loss')
    # Cluster counts are integers, so force integer tick positions.
    axis = plt.gca().xaxis
    axis.set_major_locator(MaxNLocator(integer=True))
    plt.grid()
    plt.show()

In [10]:
def trainandsave(df_km,save_path,printable=False,k_max=11):
    '''
    Choose the number of clusters and persist the k-means estimator.

    Every k in ``range(2, k_max)`` is fitted and scored with modelLoss. The
    score is a silhouette value, where higher is better, so the k with the
    MAXIMUM score is selected.

    Parameters
    ----------
    df_km : Spark DataFrame with a 'features' column.
    save_path : directory under which '/kmeans' is written.
    printable : when truthy, plot the score for every candidate k.
    k_max : exclusive upper bound of the candidate range (default 11).

    Returns
    -------
    (kmeans_path, best_k) : path of the saved estimator and the chosen k.

    Raises
    ------
    ValueError : if ``k_max`` leaves no candidate k (k_max <= 2).
    '''
    candidate_ks = list(range(2, k_max))
    if not candidate_ks:
        raise ValueError('k_max must be greater than 2 to have any candidate k')
    scores = []
    for k in candidate_ks:
        _,silhouette = modelLoss(df_km,k)
        scores.append(silhouette)
    if printable:
        lossPrint(scores)
    # Silhouette: larger means better-separated clusters, so take the argmax.
    best_k = candidate_ks[scores.index(max(scores))]
    print('The best K is: ',best_k)
    # Re-fit at the chosen k only for its side effect of saving to disk.
    modelLoss(df_km,best_k,saving=True,save_path=save_path)
    kmeans_path = save_path + '/kmeans'
    return kmeans_path,best_k

In [11]:
def cluster(df2_km,model_path,k):
    '''
    Cluster ``df2_km`` with the k-means configuration stored at ``model_path``.

    The stored object is a KMeans estimator, so it is fitted on ``df2_km``
    on every call. Cluster labels produced by two different calls therefore
    come from different fitted models and are not comparable with each other.

    Parameters
    ----------
    df2_km : Spark DataFrame with a 'features' column.
    model_path : path previously written by modelLoss(..., saving=True).
    k : unused; kept so existing callers keep working.

    Returns
    -------
    Cached Spark DataFrame with the columns 'features' and 'prediction'.
    '''
    estimator = KMeans.load(model_path)
    fitted = estimator.fit(df2_km)
    fitted.setPredictionCol("prediction")
    return fitted.transform(df2_km).select("features", "prediction").cache()

In [12]:
def getRatio(data,k):
    '''
    Return the share of rows that fall into each cluster.

    Parameters
    ----------
    data : DataFrame with an integer 'prediction' column.
    k : length of the returned array; every prediction value must be a
        valid index into it (0 <= prediction < k), otherwise indexing fails.

    Returns
    -------
    numpy array of length ``k`` whose entry i is count(prediction == i)
    divided by the total row count. Clusters with no rows get 0. When
    ``data`` has no rows at all, an all-zero array is returned instead of
    the NaN array that a 0/0 division would produce.
    '''
    grouped = data.groupby('prediction').count()
    counts = {row['prediction']: row['count'] for row in grouped.collect()}
    ratio = np.zeros(k)
    for cluster_id, n_rows in counts.items():
        ratio[cluster_id] += n_rows
    total = ratio.sum()
    if total == 0:
        return ratio
    return ratio / total

In [13]:
def sigmoid(x):
    '''
    Logistic function 1 / (1 + e^-x), evaluated without overflow.

    math.exp raises OverflowError for arguments above roughly 709, so the
    direct formula fails for strongly negative ``x``. For x < 0 the
    algebraically equal form e^x / (1 + e^x) is used, where e^x simply
    underflows to 0.0.
    '''
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    z = math.exp(x)
    return z / (1 + z)
def ratingFunc(ratio):
    '''
    Map a relative ratio to a rating on a 0-10 scale.

    * ratio >= 1       -> 10 * sigmoid(ratio - 1)   (5 at ratio == 1, rising towards 10)
    * ratio < 1e-5     -> fixed floor of 1e-2
    * otherwise        -> 10 * sigmoid(1 - 1/ratio) (below 5, falling towards 0)

    Outside the floor case the mapping is symmetric around 5:
    ratingFunc(x) + ratingFunc(1/x) == 10.
    '''
    if ratio >= 1:
        return 10*(sigmoid(ratio-1))
    if ratio < 1e-5:
        return 1e-2
    return 10*(sigmoid(1-1/ratio))

In [14]:
def getRating(all_ratio,liked_ratio):
    '''
    Convert per-cluster ratios into per-cluster ratings.

    Each entry of ``liked_ratio / all_ratio`` is passed through ratingFunc;
    the resulting array of ratings is returned.
    '''
    relative = liked_ratio / all_ratio
    for idx, value in enumerate(relative):
        relative[idx] = ratingFunc(value)
    return relative

In [15]:
def userRating(userdata,alldata,model_path,k):
    '''
    Rate every cluster for one user.

    The per-cluster rating compares how often a cluster occurs among the
    user's songs with how often it occurs in the whole pool. That comparison
    is only meaningful when both data sets are labelled by the SAME fitted
    model, so the stored estimator is fitted once on ``alldata`` and that
    single model assigns clusters to both ``alldata`` and ``userdata``.
    (Fitting separately on each data set yields unrelated cluster indices.)

    Parameters
    ----------
    userdata : Spark DataFrame with a 'features' column (the user's songs).
    alldata : Spark DataFrame with a 'features' column (the whole pool).
    model_path : path of the saved KMeans estimator.
    k : length of the ratio/rating arrays; must exceed every cluster index.

    Returns
    -------
    (userdata, userrate) : the user's songs with 'features' and 'prediction'
    columns (cached), and a numpy array holding one rating per cluster.
    '''
    # NOTE(review): featureGenerator min-max scales each data set on its own,
    # so user and pool features may still be on different scales - confirm.
    estimator = KMeans.load(model_path)
    model = estimator.fit(alldata)
    model.setPredictionCol("prediction")
    userdata = model.transform(userdata).select("features", "prediction").cache()
    alldata = model.transform(alldata).select("features", "prediction").cache()
    userRatio = getRatio(userdata,k)
    allRatio = getRatio(alldata,k)
    userrate = getRating(allRatio,userRatio)
    return userdata,userrate

In [16]:
def predictionToRating(value,rating):
    '''
    Look up the rating stored at position/key ``value`` in ``rating``.
    '''
    looked_up = rating[value]
    return looked_up

In [17]:
def giveRating(rating,liked_data,data_path):
    '''
    Write a 'rating' column into the CSV at ``data_path``.

    Parameters
    ----------
    rating : sequence indexed by cluster id, one rating per cluster.
    liked_data : DataFrame with a 'prediction' column, one row per CSV row.
        Rows are assumed to be in the same order as the CSV - this is not
        checked beyond the row count.
    data_path : CSV file that is read, extended and written back in place.

    Raises
    ------
    ValueError : if the number of predictions differs from the CSV row count.
    '''
    df = pd.read_csv(data_path)
    rows = liked_data.select('prediction').collect()
    # collect() returns row objects; pull the integer cluster id out of each
    # one explicitly instead of indexing ``rating`` with the whole row.
    per_song = [rating[int(row['prediction'])] for row in rows]
    if len(per_song) != len(df):
        raise ValueError(
            'number of predictions (%d) does not match number of rows in %s (%d)'
            % (len(per_song), data_path, len(df)))
    # Plain assignment appends the column on the first run and overwrites it
    # on later runs; df.insert would raise once 'rating' already exists.
    df['rating'] = per_song
    # NOTE: to_csv also writes the index column, so the file gains an extra
    # unnamed column on every round trip.
    df.to_csv(data_path)
    print('Already added rating to: ',data_path)

In [None]:
if __name__ == '__main__':
    # End-to-end run: pick k on the music pool, save the k-means estimator,
    # then rate one user's liked songs and write the ratings back to the CSV.
    music_path = "/content/drive/My Drive/6893project/data/music_pool_csv.csv"
    liked_path = '/content/drive/My Drive/6893project/data/liked_tess.csv'
    save_path = "/content/drive/My Drive/6893project/code" 
    # createDataframe reads this notebook-level `spark` name, so it has to be
    # bound before any of the helpers below are called.
    spark = SparkSession.builder.appName('Clustering using K-Means').getOrCreate()

    # Feature vectors of the whole pool; cached because they are used both
    # for training and for the per-cluster ratio of the pool.
    df_km = dataProcess(music_path)
    df_km = df_km.cache()
    # Third positional argument is `printable`: False skips the score plot.
    kmeans_path,k = trainandsave(df_km,save_path,False)
    user_km = dataProcess(liked_path).cache()
    userdata, userrate = userRating(user_km,df_km,kmeans_path,k)
    # Rewrites liked_path in place with an added 'rating' column.
    giveRating(userrate,userdata,liked_path)

In [None]:
# Label every song of the music pool with its cluster and write the labels
# back into the pool CSV, sorted by cluster.
music_path = "/content/drive/My Drive/6893project/data/music_pool_csv.csv"
kmeans_path = '/content/drive/My Drive/6893project/code/kmeans'
spark = SparkSession.builder.appName('Clustering using K-Means').getOrCreate()
df_km = dataProcess(music_path).cache()
predicted = cluster(df_km,kmeans_path,8)
# collect() returns Row objects; unwrap them so the CSV column holds plain
# integer cluster ids rather than Row(...) values.
prediction = [row['prediction'] for row in predicted.select('prediction').collect()]
# Show a small sample only; printing every label floods the cell output.
print(prediction[:10])
df = pd.read_csv(music_path)
# Sanity check: both numbers should be equal (one label per CSV row).
print(df.shape[0],len(prediction))
# Assignment appends the column on the first run and overwrites it on
# re-runs, where df.insert would raise because the column already exists.
df['prediction'] = prediction
df = df.sort_values(by="prediction")
df.to_csv(music_path)

In [None]:
# Create ten synthetic "users": each is a reproducible 50-song sample of the
# music pool, written to <output_path><i>.csv.
music_path = "/content/drive/My Drive/6893project/data/music_pool_csv.csv"
kmeans_path = '/content/drive/My Drive/6893project/code/kmeans'
output_path = "/content/drive/My Drive/6893project/data/user"
df = pd.read_csv(music_path)
for i in range(10):
    path = '{}{}.csv'.format(output_path, i)
    # random_state=i makes every user's sample different but repeatable.
    new_df = df.sample(n=50, random_state=i)
    new_df.to_csv(path)
    print("data saved at:",path)

In [None]:
import os, glob
path = "/content/drive/My Drive/6893project/data/"
# glob returns matches in arbitrary order. Sort them so that the order is
# the same on every run; a later cell treats all_files[0] specially.
all_files = sorted(glob.glob(os.path.join(path,'user*.csv')))
print(all_files)

In [None]:
# (Re)create the Spark session and the cached feature vectors of the pool.
music_path = "/content/drive/My Drive/6893project/data/music_pool_csv.csv"
spark = SparkSession.builder.appName('Clustering using K-Means').getOrCreate()
df_km = dataProcess(music_path).cache()


In [None]:
# Rate every user file except the first entry of all_files, writing the
# ratings back into each file. Uses df_km and kmeans_path from earlier cells.
for f in all_files:
    if f != all_files[0]:
        user_km = dataProcess(f).cache()
        userdata, userrate = userRating(user_km,df_km,kmeans_path,8)
        giveRating(userrate,userdata,f)

In [None]:
# Stamp every user file with a 'userid' column taken from its file name
# ("user7.csv" -> "7").
for f in all_files:
    df = pd.read_csv(f)
    # The column only exists if this cell has run before; ignore its absence
    # so the first run does not fail.
    df = df.drop(columns=['userid'], errors='ignore')
    # Derive the id from the file name itself rather than from a fixed
    # character offset, which silently breaks if the directory changes.
    user_id = os.path.splitext(os.path.basename(f))[0][len('user'):]
    df.insert(df.shape[1],'userid',value=user_id)
    df.to_csv(f)

In [None]:
# Stack all user files into one table and save it next to them.
frames = [pd.read_csv(f, sep=',') for f in all_files]
df_merged = pd.concat(frames, ignore_index=True)
merged_path = path+'merged.csv'
print(merged_path)
df_merged.to_csv(merged_path)