In [None]:
import collections
# Build three structures from the tab-separated triplet file
# (fields: user, song, integer weight):
#   user_dic  -> number of rows seen per user
#   song_dic  -> number of rows seen per song
#   edge_list -> every (user, song, weight) row, in file order
user_dic = collections.defaultdict(int)
song_dic = collections.defaultdict(int)
edge_list = []
with open('data/train_triplets.txt', 'r') as f:
    # Iterate the file object directly so rows are streamed one at a time
    # instead of first materialising every line with readlines().
    for idx, line in enumerate(f):
        if idx % 100000 == 0:
            print('idx:', idx)
        line = line.strip()
        if not line:
            # A blank line (e.g. a trailing newline) has no fields to parse.
            continue
        fields = line.split('\t')
        user, song, weight = fields[0], fields[1], int(fields[2])
        user_dic[user] += 1
        song_dic[song] += 1
        edge_list.append((user, song, weight))


### Visualize User/Song Distribution

In [None]:
# Report the number of distinct users, then how many of them appear in
# more than one row.
print('total number of users:', len(user_dic))
count = sum(1 for listens in user_dic.values() if listens > 1)
print('number of users with listen > 1:', count)

In [None]:
import numpy as np
# 10th / 90th percentile of the per-user and per-song row counts.
# Each distribution is passed to np.percentile once with both quantiles.
user_p10, user_p90 = np.percentile(list(user_dic.values()), [10, 90])
song_p10, song_p90 = np.percentile(list(song_dic.values()), [10, 90])
print('user 10 percentile:', user_p10)
print('user 90 percentile:', user_p90)
print('song 10 percentile:', song_p10)
print('song 90 percentile:', song_p90)

In [None]:
import pandas as pd

# One row per user: the dict key becomes 'user_id' and its count becomes
# 'songs_listened'.
user_df = pd.DataFrame.from_dict(user_dic, orient='index').reset_index()
user_df.columns = ['user_id', 'songs_listened']
user_df.head()

In [None]:
# Keep users whose row count is strictly between 12 and 105.
# NOTE(review): the bounds look like values read off the percentile cell — confirm.
selected_user_df = user_df.loc[
    user_df['songs_listened'].gt(12) & user_df['songs_listened'].lt(105),
    :
]

In [None]:
# One row per song: the dict key becomes 'song_id' and its count becomes
# 'num_of_listened'.
song_df = pd.DataFrame.from_dict(song_dic, orient='index').reset_index()
song_df.columns = ['song_id', 'num_of_listened']
song_df.head()

In [None]:
# Keep songs whose row count is strictly between 2 and 218.
# NOTE(review): the bounds look like values read off the percentile cell — confirm.
selected_song_df = song_df.loc[
    song_df['num_of_listened'].gt(2) & song_df['num_of_listened'].lt(218),
    :
]

In [None]:
# Tabular view of the (user, song, weight) tuples collected in edge_list.
triplet_columns = ['user', 'song', 'weight']
triplet_df = pd.DataFrame(data=edge_list, columns=triplet_columns)
triplet_df.head()

In [None]:
# Keep only the rows whose user AND song both survived the count filters.
selected_triplet_df = triplet_df.loc[
    triplet_df['user'].isin(selected_user_df['user_id'])
    & triplet_df['song'].isin(selected_song_df['song_id']),
    :
]
selected_triplet_df.head()

In [None]:
# Write the filtered triplets to 'selected_triplet.csv' in the working
# directory; index=False leaves the DataFrame index out of the file.
selected_triplet_df.to_csv('selected_triplet.csv', index=False)

In [None]:
import numpy as np

# Summary statistics (mean / median / max / min) of the per-user and
# per-song row counts gathered in user_dic and song_dic.
print('User Distribution')
user_distribution = list(user_dic.values())
print('mean number of listen:', np.mean(user_distribution))
# Label previously read 'median numer of listen:' (typo in the printed output).
print('median number of listen:', np.median(user_distribution))
print('max number of listen:', np.max(user_distribution))
print('min number of listen:', np.min(user_distribution))
print('------------------')
print('Song Distribution')
song_distribution = list(song_dic.values())
print('mean number of listened song:', np.mean(song_distribution))
print('median number of listened song:', np.median(song_distribution))
print('max number of listened song:', np.max(song_distribution))
print('min number of listened song:', np.min(song_distribution))

In [None]:
import matplotlib.pyplot as plt

# Side-by-side histograms (100 bins, log-scaled counts) of the per-user and
# per-song row counts.
fig = plt.figure(figsize=(16, 8))

user_ax = fig.add_subplot(1, 2, 1)
user_ax.set_title('User Distribution')
user_ax.set_xlabel('Number of songs')
user_ax.hist(list(user_dic.values()), bins=100, log=True)

song_ax = fig.add_subplot(1, 2, 2)
song_ax.set_title('Song Distribution')
song_ax.set_xlabel('Number of listened')
song_ax.hist(list(song_dic.values()), bins=100, log=True)

plt.show()


### Create Graph

In [None]:
import networkx as nx

def get_edgelist(path='data/train_triplets.txt', max_edges=101):
    """Read the leading rows of a tab-separated triplet file as edges.

    Each non-blank line is split on tabs; the first two fields are taken as
    the user and song identifiers and the third is parsed as an integer
    weight.

    Args:
        path: File to read. Defaults to the path the notebook has always used.
        max_edges: Maximum number of edges to return, or None for no limit.
            The default of 101 matches the number of rows the previously
            hard-coded cut-off let through.

    Returns:
        A list of (user, song, weight) tuples in file order.

    Raises:
        IndexError: If a non-blank line has fewer than three fields.
        ValueError: If the third field is not an integer.
    """
    edges = []
    with open(path, 'r') as f:
        for line in f:
            if max_edges is not None and len(edges) >= max_edges:
                break
            line = line.strip()
            if not line:
                # A blank line (e.g. a trailing newline) has no fields to parse.
                continue
            fields = line.split('\t')
            user, song, weight = fields[0], fields[1], int(fields[2])
            edges.append((user, song, weight))
    return edges

# Build an undirected graph from the edges returned by get_edgelist(); each
# edge carries its listen weight in the 'weight' attribute.
# NOTE: this rebinds the notebook-level name edge_list, which the loading
# cell at the top of the notebook also assigns.
G = nx.Graph()
edge_list = get_edgelist()
G.add_weighted_edges_from(edge_list, weight='weight')

In [None]:
# Render the current graph G with networkx's default drawing settings.
nx.draw(G)

In [None]:
# Show edge_list as the cell output. In top-to-bottom execution order this is
# the list returned by get_edgelist() in the graph-building cell, not the one
# filled while loading the full file.
edge_list

In [None]:
# Small scratch graph: edges 1-2 and 2-3, each with weight 2.
# NOTE: this rebinds G, replacing the graph built from the triplet edges.
G = nx.Graph()
G.add_weighted_edges_from([(1, 2, 2), (2, 3, 2)], weight='weight')