In [1]:
# Prepare workspace
import pandas as pd
import numpy as np
import json
#import h5py
import matplotlib
import matplotlib.pyplot as plt
import statsmodels.api as sm

# Pin NumPy's global random state so any random draws repeat exactly between runs
RANDOM_SEED = 1
np.random.seed(RANDOM_SEED)

In [2]:
# Notebook settings, gathered on a single attribute-style namespace object
class Args():
    """Empty container; every setting is attached to an instance as an attribute."""

args = Args()

vars(args).update({
    # Raw competition inputs
    'input_actions': 'data/mars_tianchi_user_actions.csv',
    'input_songs': 'data/mars_tianchi_songs.csv',
    # Output locations for the grouped actions
    'output_h5': 'data/mars_tianchi_group_actions.h5',
    'output_json': 'data/mars_tianchi_group_actions.json',
    'output_csv': 'data/mars_tianchi_group_actions.csv',
    # Validation / test split settings
    'val_frac': 0.1,
    'val_along_dimension': 'group',  # possible values include: group, time
    'test_frac': 0.1,
    'num_group': 1,
    # Per-artist daily tables read by the cells below
    'artist_daily_play': 'data/artist_daily_play.csv',
    'artist_daily_download': 'data/artist_daily_download.csv',
    'artist_daily_collect': 'data/artist_daily_collect.csv',
})

In [3]:
# Read the song metadata table. Column names are supplied explicitly,
# so pandas treats the first row of the file as data rather than a header.
song_columns = ['song_id', 'artist_id', 'publish_time', 'song_init_plays', 'lang', 'gender']
song_info = pd.read_csv(args.input_songs, names=song_columns)

In [4]:
# Build the artist-id -> position lookup.
# A matching date -> position map is kept below but disabled; it needs an
# `actions` table that is not loaded in the visible cells.
# dates = np.sort(actions.Ds.unique())
# T = dates.shape[0]
# dates_to_idx = {dates[i] : i for i in xrange(dates.shape[0])}

# Distinct artist ids, in order of first appearance in the song table
artists = song_info.artist_id.unique()
# Number of distinct artists
D = len(artists)
# Position of each artist id inside `artists`
artist_to_idx = {artist_id: position for position, artist_id in enumerate(artists)}

In [5]:
# Load the per-artist daily play / download / collect tables from CSV
def _read_daily_table(csv_path):
    """Read one daily CSV file and use its 'Ds' column as the row index."""
    return pd.read_csv(csv_path).set_index('Ds')

artist_daily_play = _read_daily_table(args.artist_daily_play)
artist_daily_download = _read_daily_table(args.artist_daily_download)
artist_daily_collect = _read_daily_table(args.artist_daily_collect)

In [6]:
# One timestamp per day from 2015-03-01 through 2015-08-30; used as the x-axis of the plots
datetimes = pd.date_range(start='20150301', end='20150830', freq='D')

# The daily tables keep their 'Ds' index. The alternatives below (replacing the
# index with `datetimes`, or re-setting it to 'Ds') are left disabled.
# artist_daily_play.index = datetimes
# artist_daily_download.index = datetimes
# artist_daily_collect.index = datetimes

# artist_daily_play.set_index('Ds', inplace=True)
# artist_daily_download.set_index('Ds', inplace=True)
# artist_daily_collect.set_index('Ds', inplace=True)

In [7]:
# Pick one artist to inspect: position 2, i.e. the third entry of `artists`
artist = artists[2]

# x-axis and the artist's three daily series (plays, downloads, collects)
x = datetimes
y1 = artist_daily_play[artist].tolist()
y2 = artist_daily_download[artist].tolist()
y3 = artist_daily_collect[artist].tolist()

# Draw all three series on one figure: plays red, downloads blue, collects green
fig = plt.figure()
plt.title(artist)
for series, colour in ((y1, 'r'), (y2, 'b'), (y3, 'g')):
    plt.plot(x, series, colour)
# Slant the date tick labels so they do not overlap
fig.autofmt_xdate()

In [8]:
# Decompose the daily play counts `y1` using a 7-day cycle, then show the
# standard decomposition figure.
weekly_period = 7
res = sm.tsa.seasonal_decompose(y1, freq=weekly_period)
res.plot()
plt.show()

In [9]:
# Find songs by album publish date.
# If a singer's identity is in doubt, search http://www.xiami.com/search/album?key=
# and compare the album's publish date and number of songs with the dataset
# to confirm the guess.

song_info_withinrange = song_info.loc[song_info['publish_time'] == 20091027]
# Number of songs published on that date
len(song_info_withinrange)

0

In [10]:
# Collect every song row that belongs to one chosen artist (position 29 in `artists`)

artist = artists[29]
song_info_withinrange = song_info.loc[song_info['artist_id'] == artist]
# Optional: narrow further to a single publish date and inspect the result
# song_info_withinrange = song_info_withinrange[song_info_withinrange['publish_time'] == 20100329]
# len(song_info_withinrange)


In [11]:
# For one artist, print each distinct publish date followed by the number of
# that artist's songs published on it.

artist = artists[29]
song_info_withinrange = song_info.loc[song_info['artist_id'] == artist]

ptime = song_info_withinrange.publish_time.unique()
for publish_date in ptime:
    released_that_day = song_info_withinrange['publish_time'] == publish_date
    print(publish_date)
    print(released_that_day.sum())

20090606
31
20150810
12
20101027
14
20160207
3
20140618
17
20121120
16


In [12]:
# Print the position in `artists` of a known artist id
target_artist_id = '6bb4c3bbdb6f5a96d643320c6b6005f5'
for position, candidate in enumerate(artists):
    if candidate == target_artist_id:
        print(position)

17


In [13]:
# Narrow the candidate singers with specific assumptions, here a fixed
# gender code and language code (the album publish time could be used as well).

song_info_withinrange = song_info[(song_info['gender'] == 3) & (song_info['lang'] == 1)]

artist3 = song_info_withinrange.artist_id.unique()
for candidate_id in artist3:
    print(candidate_id)
    # Position of this artist in `artists`. Both arrays come from the same
    # artist_id column and `artists` is de-duplicated, so exactly one entry matches.
    for position, known_id in enumerate(artists):
        if known_id == candidate_id:
            print('artist_id: %s' % position)
    song_artist_specific = song_info[song_info['artist_id'] == candidate_id]

    # Song count for every language code this artist has songs in
    lan = song_artist_specific.lang.unique()
    for lang_code in lan:
        songs_in_lang = song_artist_specific[song_artist_specific['lang'] == lang_code]
        print('lan_id: %s lan_len: %s' % (lang_code, len(songs_in_lang)))

    print('')
                             

97546d961462a65d2518e4f63660bd09
artist_id: 50
lan_id: 1 lan_len: 71

67ed1f7f1374e64a93bfca4b44f66dd6
artist_id: 82
lan_id: 100 lan_len: 108
lan_id: 1 lan_len: 402
lan_id: 4 lan_len: 31



In [23]:
# Step two, once the singers have been identified:
# overlay the daily news and the song-release dates on the observed play counts and their trend.

# Positions in `artists` of the identified singers --
# 7: The Beatles; 17: Sally Yeh; 37: Wakin Chau; 82: S.H.E
# (Attention: keep the same order as in the spider script.)
artist_index = [7, 17, 37, 82]
artist_index_choose = 0
artist = artists[artist_index[artist_index_choose]]

# Get artist's daily information
x = datetimes
y1 = artist_daily_play.loc[:, artist].tolist()
y2 = artist_daily_download.loc[:, artist].tolist()
y3 = artist_daily_collect.loc[:, artist].tolist()

# Time Series Decomposition of the play counts with a 7-day cycle
res = sm.tsa.seasonal_decompose(y1, freq=7)
artist_trend = res.trend.tolist()
artist_observed = res.observed.tolist()
artist_seasonal = res.seasonal.tolist()

# Create the figure before formatting it, so that autofmt_xdate() acts on THIS
# figure and not on a `fig` left over from an earlier cell.
fig = plt.figure()
plt.title(artist)
plt.plot(x, artist_observed, 'r')  # observed plays
plt.plot(x, artist_trend, 'b')     # trend component
fig.autofmt_xdate()

# Mark this artist's song releases inside the plotted window with dashed lines.
# The window is derived from `datetimes` here rather than taken from
# `song_info_withinrange`: that variable's contents depend on which cell ran
# last, and any publish date outside `x` would make x.get_loc() raise KeyError.
first_day = int(x[0].strftime('%Y%m%d'))
last_day = int(x[-1].strftime('%Y%m%d'))
artist_withinrange = song_info[(song_info['artist_id'] == artist)
                               & (song_info['publish_time'] >= first_day)
                               & (song_info['publish_time'] <= last_day)]
song_publishtime_withinrange = artist_withinrange.publish_time.unique()
for publish_time in song_publishtime_withinrange:
    date_specific = str(publish_time)
    plt.axvline(x[x.get_loc(date_specific)], linestyle='--')

# Mark the days that have at least one news item with green dashed lines.
# assumes artist_day_news.txt has no header row, column 1 = day (YYYYMMDD),
# column 2 = number of news items, and 244 consecutive rows per singer in
# `artist_index` order -- TODO confirm against the spider script's output
news_daily_record = pd.read_csv('artist_day_news.txt', header=None)
news_rows_per_artist = 244
first_row = news_rows_per_artist * artist_index_choose
# Only the first len(x) == 183 rows of the singer's slice are scanned, one per plotted day
for i in range(first_row, first_row + len(x)):
    if news_daily_record[2][i] >= 1:
        specific_day = news_daily_record[1][i]
        plt.axvline(x[x.get_loc(str(specific_day))], linestyle='--', color='g')

plt.show()

# Save figure
#plt.savefig('d:\\test.png')
plt.close()

In [15]:
# List every publish date of one artist (position 68 in `artists`) together with
# the number of that artist's songs published on it. Dates are printed in the
# order `unique()` returns them; nothing is sorted here.
artist = artists[68]
song_info_withinrange = song_info[song_info['artist_id'] == artist]

ptime = song_info_withinrange.publish_time.unique()
for release_day in ptime:
    same_day_songs = song_info_withinrange[song_info_withinrange['publish_time'] == release_day]
    print(release_day)
    print(len(same_day_songs))

20090924
12
20100101
12
20080407
11
20100407
9
20100425
15
20091201
11
20120915
11
20100430
12
20140426
11
20090101
13
20110101
13
20120802
11


In [16]:
# Plot one artist's observed daily plays together with the trend from a 7-day
# decomposition, and mark the dates on which that artist published songs
# inside the plotted window. The artist shown is position 68 in `artists`.

x = datetimes

# Songs published within the days covered by `datetimes`. The bounds come from
# the axis itself so that every selected date can be located with x.get_loc()
# below; a fixed upper bound of 20150831 would admit a day that `datetimes`
# (which ends on 20150830) does not contain.
first_day = int(x[0].strftime('%Y%m%d'))
last_day = int(x[-1].strftime('%Y%m%d'))
song_info_withinrange = song_info[(song_info['publish_time'] >= first_day)
                                  & (song_info['publish_time'] <= last_day)]

artist = artists[68]

# Get artist's daily information
y1 = artist_daily_play.loc[:, artist].tolist()
y2 = artist_daily_download.loc[:, artist].tolist()
y3 = artist_daily_collect.loc[:, artist].tolist()

# Time Series Decomposition of the play counts with a 7-day cycle
res = sm.tsa.seasonal_decompose(y1, freq=7)
artist_trend = res.trend.tolist()
artist_observed = res.observed.tolist()
artist_seasonal = res.seasonal.tolist()

# A single figure is created and formatted after drawing, so no blank figure
# is left open and the date labels of the real plot are the ones slanted.
fig = plt.figure()
plt.title(artist)
plt.plot(x, artist_observed, 'r')  # observed plays
plt.plot(x, artist_trend, 'b')     # trend component
fig.autofmt_xdate()

# Dashed vertical line at each publish date of this artist, if there is any
artist_withinrange = song_info_withinrange[(song_info_withinrange['artist_id'] == artist)]
song_publishtime_withinrange = artist_withinrange.publish_time.unique()
for publish_time in song_publishtime_withinrange:
    date_specific = str(publish_time)
    plt.axvline(x[x.get_loc(date_specific)], linestyle='--')

plt.show()

# Save figure
#plt.savefig('d:\\test.png')
plt.close()

In [16]:
# For every artist, save one figure showing the observed daily plays (red) and
# the trend from a 30-day decomposition (blue), with a dashed line at each
# publish date found for that artist in `song_info_withinrange`.
# NOTE(review): `song_info_withinrange` is whatever the most recently run cell
# left behind; every publish date in it must exist in `datetimes` for
# x.get_loc() to succeed.
# NOTE(review): figures go to the hard-coded Windows path d:\<index>.png, the
# same file names the following cell writes -- consider a configurable folder.
for i, artist in enumerate(artists):
    # Daily series for this artist
    x = datetimes
    y1 = artist_daily_play[artist].tolist()
    y2 = artist_daily_download[artist].tolist()
    y3 = artist_daily_collect[artist].tolist()

    # Decompose the play counts using a 30-day cycle
    res = sm.tsa.seasonal_decompose(y1, freq=30)
    artist_trend = res.trend.tolist()
    artist_observed = res.observed.tolist()
    artist_seasonal = res.seasonal.tolist()

    # Observed values and trend on a fresh figure titled with the artist id
    fig = plt.figure()
    plt.title(artist)
    fig.autofmt_xdate()
    plt.plot(x, artist_observed, 'r')
    plt.plot(x, artist_trend, 'b')

    # One dashed vertical line per publish date of this artist
    artist_withinrange = song_info_withinrange[song_info_withinrange['artist_id'] == artist]
    song_publishtime_withinrange = artist_withinrange.publish_time.unique()
    for j, publish_time in enumerate(song_publishtime_withinrange):
        date_specific = str(publish_time)
        plt.axvline(x[x.get_loc(date_specific)], linestyle='--')

    # Write the image named after the artist's position, then release the figure
    plt.savefig('d:\\%d.png' % i)
    plt.close()

In [15]:
# For every artist, save one figure of the raw daily play (red), download
# (blue) and collect (green) counts, with a dashed line at each publish date
# found for that artist in `song_info_withinrange`. No decomposition is
# computed in this cell.
# NOTE(review): `song_info_withinrange` is whatever the most recently run cell
# left behind; every publish date in it must exist in `datetimes` for
# x.get_loc() to succeed.
# NOTE(review): figures go to the hard-coded Windows path d:\<index>.png, the
# same file names the preceding cell writes, so those images are overwritten.
for i in range(len(artists)):
    artist = artists[i]
    # Get artist's daily information
    x = datetimes
    y1 = artist_daily_play.loc[:, artist].tolist()
    y2 = artist_daily_download.loc[:, artist].tolist()
    y3 = artist_daily_collect.loc[:, artist].tolist()

    # Plot daily information
    fig = plt.figure()
    plt.title(artist)
    plt.plot(x, y1, 'r')
    plt.plot(x, y2, 'b')
    plt.plot(x, y3, 'g')
    fig.autofmt_xdate()

    # Display the song publish times, if there are any.
    # The same variable is now used for the label and for the line position;
    # previously the line was drawn at a stale `date_specific` left over from
    # an earlier cell (or raised NameError on a fresh kernel).
    artist_withinrange = song_info_withinrange[(song_info_withinrange['artist_id'] == artist)]
    song_publishtime_withinrange = artist_withinrange.publish_time.unique()
    for publish_time in song_publishtime_withinrange:
        date_specific = str(publish_time)
        plt.axvline(x[x.get_loc(date_specific)], linestyle='--', label=date_specific)

    # Save figure, then release it so figures do not accumulate across the loop
    plt.savefig('d:\\%d.png' % i)
    plt.close()