In [10]:
import sys
sys.path.append('../RecSysRep/')

import Basics.Load as ld

# `ld.getCOOs()` yields five objects, unpacked in this order (presumably scipy
# COO sparse matrices — `ICM_event_all` is converted with `.tocsr()` further down).
(
    URM_all,
    ICM_genre_all,
    ICM_subgenre_all,
    ICM_channel_all,
    ICM_event_all,
) = ld.getCOOs()
# URM_train, URM_val = ld.getSplit(URM_train_val, 5678, 0.8)

# The DataFrame counterparts, unpacked in the same order.
(
    URM_all_dataframe,
    ICM_genre_all_dataframe,
    ICM_subgenre_all_dataframe,
    ICM_channel_all_dataframe,
    ICM_event_all_dataframe,
) = ld.getDataframes()

import numpy as np
from scipy.sparse import *

In [32]:
# Peek at the event ICM in long format: one row per (ItemID, EpisodeID) pair with its Match value.
ICM_event_all_dataframe

Unnamed: 0,ItemID,EpisodeID,Match
0,0,121781,1.0
1,1,150102,1.0
2,1,349614,1.0
3,2,9174,1.0
4,2,20833,1.0
...,...,...,...
358065,18056,169988,1.0
358066,18056,198728,1.0
358067,18057,237262,1.0
358068,18058,96320,1.0


## Calculate array of number of episodes per show

In [11]:
# CSR keeps the row boundaries in `indptr`, so the gap between consecutive
# pointers is the number of stored entries in each row.
# Assumes rows are shows (ItemID) and columns are episodes — TODO confirm
# against ld.getCOOs().
ICM_event_all = ICM_event_all.tocsr()
episodesPerShow = np.diff(ICM_event_all.indptr)

In [12]:
# Quick look at the per-show counts (NumPy abbreviates long arrays with "...").
print(episodesPerShow)

[1 2 3 ... 2 1 2]


In [13]:
# Turn the 1-D count vector into a single-column 2-D array (one row per show),
# the layout handed to KMeans.fit below.
reshapedEPR = episodesPerShow[:, np.newaxis]
print(reshapedEPR)
print(reshapedEPR.shape[0])

[[1]
 [2]
 [3]
 ...
 [2]
 [1]
 [2]]
18059


## Apply K-Means clustering

In [14]:
from sklearn.cluster import KMeans

# K-Means starts from randomly chosen centroids: without a fixed seed the
# cluster ids (and therefore the one-hot columns exported later) change from
# run to run. `n_init` is pinned as well because its default differs across
# scikit-learn releases (10 in older versions, 'auto' in newer ones).
kmeans = KMeans(n_clusters=9, n_init=10, random_state=0).fit(reshapedEPR)

# Inspect the fitted model: cluster id per show, number of labelled shows,
# and the centroid (mean episode count) of each cluster.
print(kmeans.labels_)
print(len(kmeans.labels_))
print(kmeans.cluster_centers_)

[0 0 0 ... 0 0 0]
18059
[[5.64361352e+00]
 [2.69500000e+03]
 [4.45614679e+02]
 [4.03100000e+03]
 [2.32671587e+02]
 [1.46125000e+03]
 [8.52480706e+01]
 [8.09216216e+02]
 [5.77800000e+03]]


In [15]:
import pandas as pd

# One row per show: its episode count and the K-Means cluster it was assigned to.
a = pd.DataFrame({
    'NumEpisodes': episodesPerShow,
    'cluster': kmeans.labels_,
})
a

Unnamed: 0,NumEpisodes,cluster
0,1,0
1,2,0
2,3,0
3,1,0
4,1,0
...,...,...
18054,1,0
18055,1,0
18056,2,0
18057,1,0


## Apply balanced (quantile-based) clustering

In [58]:
# Quantile binning of the episode counts into `num_cluster` bins,
# labelled 0 .. num_cluster - 1.
num_cluster = 3
div = pd.qcut(
    a['NumEpisodes'],
    q=num_cluster,
    labels=list(range(num_cluster)),
)

In [59]:
# Keep the quantile bins in their own frame. Writing them into a["cluster"]
# would overwrite the K-Means labels, and on a top-to-bottom run the one-hot
# and export cells further down would then save these bins under the
# 9-cluster file name.
balanced = a[['NumEpisodes']].assign(cluster=div)
for i in range(num_cluster):
    # describe() summarises numeric columns only, so just NumEpisodes is reported.
    print(balanced.loc[balanced['cluster'] == i].describe())

       NumEpisodes
count  7544.000000
mean      1.465403
std       0.522714
min       0.000000
25%       1.000000
50%       1.000000
75%       2.000000
max       2.000000
       NumEpisodes
count  4685.000000
mean      3.693490
std       0.770375
min       3.000000
25%       3.000000
50%       4.000000
75%       4.000000
max       5.000000
       NumEpisodes
count  5830.000000
mean     56.554202
std     176.347217
min       6.000000
25%       9.000000
50%      15.000000
75%      40.000000
max    5778.000000


## Express features as one-hot encodings

In [16]:
# One indicator column per cluster id. The dtype is pinned because
# pandas >= 2.0 defaults to bool, which would write True/False instead of
# 1/0 flags into the exported CSV.
one_hot = pd.get_dummies(a['cluster'], dtype='uint8')
# Keep NumEpisodes and cluster next to the indicator columns for inspection;
# they are dropped again before saving.
a_onehot = a.join(one_hot)
a_onehot

Unnamed: 0,NumEpisodes,cluster,0,1,2,3,4,5,6,7,8
0,1,0,1,0,0,0,0,0,0,0,0
1,2,0,1,0,0,0,0,0,0,0,0
2,3,0,1,0,0,0,0,0,0,0,0
3,1,0,1,0,0,0,0,0,0,0,0
4,1,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
18054,1,0,1,0,0,0,0,0,0,0,0
18055,1,0,1,0,0,0,0,0,0,0,0
18056,2,0,1,0,0,0,0,0,0,0,0
18057,1,0,1,0,0,0,0,0,0,0,0


In [17]:
# Only the one-hot indicator columns are exported.
toSave = a_onehot.drop(columns=['NumEpisodes', 'cluster'])
print(toSave)

# Note kept from the original cell. It used to be a bare string literal, which
# the notebook evaluated and echoed as the cell result:
#   item_index
#   genre_index
#   ICM[item_index, genre_index]

       0  1  2  3  4  5  6  7  8
0      1  0  0  0  0  0  0  0  0
1      1  0  0  0  0  0  0  0  0
2      1  0  0  0  0  0  0  0  0
3      1  0  0  0  0  0  0  0  0
4      1  0  0  0  0  0  0  0  0
...   .. .. .. .. .. .. .. .. ..
18054  1  0  0  0  0  0  0  0  0
18055  1  0  0  0  0  0  0  0  0
18056  1  0  0  0  0  0  0  0  0
18057  1  0  0  0  0  0  0  0  0
18058  1  0  0  0  0  0  0  0  0

[18059 rows x 9 columns]


'\nitem_index\ngenre_index\n\nICM[item_index,genre_index]\n'

In [18]:
# Persist the indicator columns: header row kept, row index left out.
# NOTE(review): the file name hard-codes "9" clusters, but the number of
# columns follows whatever a['cluster'] held when the one-hot cell ran —
# verify toSave has 9 columns before trusting the name.
toSave.to_csv(
    'ICM_9km_length.csv',
    header=True,
    index=False,
)

# Misc: frequency of episode counts

In [3]:
import collections

# Distinct episode counts that occur across the shows.
uniqueEpisodes = set(episodesPerShow)

# Number of shows for each episode count.
counter = collections.Counter(episodesPerShow)
episodeFrequency = list(counter.items())  # (episode count, number of shows)
# Same pairs with the two fields swapped: (number of shows, episode count).
invEF = [(n_shows, n_episodes) for n_episodes, n_shows in episodeFrequency]

In [None]:
import matplotlib.pyplot as plt
import math
%matplotlib inline

x_axis = list(counter.keys())


plt.figure(figsize=(15,5))
# Plot Histogram on x
plt.bar(x_axis, counter.values(), log = True)
plt.gca().set(title='Frequency Histogram', ylabel='Frequency', xlabel='NumEpisodes');
plt.xlim([0, 4000])