In [1]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize

%matplotlib inline

# Load Dataset

In [2]:
# read the raw play log, then parse both timestamp columns in a single step
data = pd.read_json('./data/song.json')
data = data.assign(
    time_played=pd.to_datetime(data['time_played']),
    user_sign_up_date=pd.to_datetime(data['user_sign_up_date'])
)
data.head()

Unnamed: 0,id,song_played,time_played,user_id,user_sign_up_date,user_state
0,GOQMMKSQQH,Hey Jude,2015-06-11 21:51:35,122,2015-05-16,Louisiana
1,HWKKBQKNWI,We Can Work It Out,2015-06-06 16:49:19,3,2015-05-01,Ohio
2,DKQSXVNJDH,Back In the U.S.S.R.,2015-06-14 02:11:29,35,2015-05-04,New Jersey
3,HLHRIDQTUW,P.s. I Love You,2015-06-08 12:26:10,126,2015-05-16,Illinois
4,SUKJCSBCYW,Sgt. Pepper's Lonely Hearts Club Band,2015-06-28 14:57:00,6,2015-05-01,New Jersey


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 6 columns):
id                   4000 non-null object
song_played          4000 non-null object
time_played          4000 non-null datetime64[ns]
user_id              4000 non-null int64
user_sign_up_date    4000 non-null datetime64[ns]
user_state           4000 non-null object
dtypes: datetime64[ns](2), int64(1), object(3)
memory usage: 187.6+ KB


In [4]:
# check missing values
data.isnull().sum()

id                   0
song_played          0
time_played          0
user_id              0
user_sign_up_date    0
user_state           0
dtype: int64

In [5]:
# report how many distinct values each column holds
for column in data.columns:
    distinct_values = data[column].unique()
    print('{0:15s} \t {1:5d}'.format(column, len(distinct_values)))

id              	  4000
song_played     	   100
time_played     	  3997
user_id         	   196
user_sign_up_date 	    20
user_state      	    41


# Analysis

### Question 1

In [6]:
def unique_count(x):
    """Return the number of distinct values in ``x`` (e.g. distinct user_ids)."""
    # np.unique always returns a flat 1-D array, so its first dimension is
    # the number of distinct values
    distinct = np.unique(x)
    return distinct.shape[0]

In [7]:
# unique user count per state, largest first.
# nunique() is the built-in, vectorised way to count distinct values per group;
# it replaces applying a Python-level unique-count function to every group.
state_user_count = (
    data.groupby('user_state')['user_id']
    .nunique()
    .rename('user_count')
    .reset_index()
    .sort_values(by='user_count', ascending=False)
)

In [8]:
# get the top 3 states
state_user_count.head(3)

Unnamed: 0,user_state,user_count
25,New York,23
4,California,21
35,Texas,15


In [9]:
# get the bottom states by unique user count
# (tail(9) rather than tail(3): all nine states shown tie at one user each)
state_user_count.tail(9)

Unnamed: 0,user_state,user_count
24,New Mexico,1
9,Idaho,1
27,North Dakota,1
6,Connecticut,1
12,Iowa,1
32,Rhode Island,1
22,Nebraska,1
2,Arizona,1
13,Kansas,1


### Question 2

Here, user engagement is defined as the average number of plays per user in a given state.

Of course, there are other definitions, such as "average play event per hour" (see [this link](https://github.com/stasi009/TakeHomeDataChallenges/blob/master/08.SongChallenge/song_challenge.ipynb) for details).

In [10]:
# number of play events recorded in each state
state_play_count = (
    data.groupby('user_state')['id']
    .count()
    .rename('play_count')
    .reset_index()
)

# attach the play counts to the per-state user counts
state_user_play = state_user_count.merge(state_play_count, on='user_state')

# plays per user, then order the states from most to least engaged
plays_per_user = state_user_play['play_count'] / state_user_play['user_count']
state_user_play['average_play'] = plays_per_user
state_user_play = state_user_play.sort_values(by='average_play', ascending=False)

In [11]:
# get the top 3 states
state_user_play.head(3)

Unnamed: 0,user_state,user_count,play_count,average_play
38,Nebraska,1,36,36.0
27,Alaska,2,58,29.0
22,South Carolina,3,85,28.333333


In [12]:
# three least engaged states, flipped so the lowest average comes first
state_user_play.tail(3).iloc[::-1]

Unnamed: 0,user_state,user_count,play_count,average_play
40,Kansas,1,8,8.0
30,Virginia,2,17,8.5
17,Minnesota,4,42,10.5


### Question 3

In [13]:
def find_first_user(df):
    """Return the earliest sign-up in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to search; must contain the ``user_id`` and
        ``user_sign_up_date`` columns.

    Returns
    -------
    pandas.Series
        The ``user_id`` and ``user_sign_up_date`` of the row with the smallest
        ``user_sign_up_date`` (the first such row when several tie).
    """
    # idxmin() returns the index *label* of the minimum, which is what .loc
    # expects. argmin() returns an integer position in current pandas, so
    # pairing it with .loc selects the wrong row (or raises KeyError) whenever
    # the index is not 0..n-1 -- which is the case for groupby sub-frames.
    idx = df['user_sign_up_date'].idxmin()
    return df.loc[idx, ['user_id', 'user_sign_up_date']]

In [14]:
# earliest sign-up in every state, listed in order of sign-up date
first_users = data.groupby('user_state').apply(find_first_user)
first_users.sort_values(by='user_sign_up_date')

Unnamed: 0_level_0,user_id,user_sign_up_date
user_state,Unnamed: 1_level_1,Unnamed: 2_level_1
Alabama,5,2015-05-01
Texas,7,2015-05-01
Oregon,1,2015-05-01
Ohio,3,2015-05-01
North Carolina,2,2015-05-01
New Mexico,4,2015-05-01
New Jersey,6,2015-05-01
Pennsylvania,11,2015-05-02
New York,19,2015-05-02
Minnesota,8,2015-05-02


### Question 4

In [15]:
data.head()

Unnamed: 0,id,song_played,time_played,user_id,user_sign_up_date,user_state
0,GOQMMKSQQH,Hey Jude,2015-06-11 21:51:35,122,2015-05-16,Louisiana
1,HWKKBQKNWI,We Can Work It Out,2015-06-06 16:49:19,3,2015-05-01,Ohio
2,DKQSXVNJDH,Back In the U.S.S.R.,2015-06-14 02:11:29,35,2015-05-04,New Jersey
3,HLHRIDQTUW,P.s. I Love You,2015-06-08 12:26:10,126,2015-05-16,Illinois
4,SUKJCSBCYW,Sgt. Pepper's Lonely Hearts Club Band,2015-06-28 14:57:00,6,2015-05-01,New Jersey


There are many possible approaches to this question. The simplest is to recommend the most popular songs. A k-nearest neighbor (KNN) method can also be used. More advanced still, collaborative filtering can be implemented.

Here, I implement a simple version of a collaborative filtering algorithm for song recommendation. More specifically, the similarity of two songs is the cosine similarity of their binary listener vectors, so it is driven by the number of users who listened to both songs.

In [16]:
# Step 1: build the Song-User matrix
# rows are songs, columns are users; a cell is 1 when that user played that
# song at least once and 0 otherwise
song_user = (
    data.groupby(['song_played', 'user_id'])['id']
    .count()
    .unstack(fill_value=0)
    .gt(0)
    .astype(int)
)

song_user.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,191,192,193,194,195,196,197,198,199,200
song_played,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A Day In The Life,0,0,1,1,0,1,0,0,0,0,...,0,0,1,1,0,1,0,0,1,0
A Hard Day's Night,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
A Saturday Club Xmas/Crimble Medley,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ANYTIME AT ALL,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Across The Universe,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Step 2: build song-song similarity matrix
# scale every song's row with sklearn's normalize, then take the pairwise dot
# products of the scaled rows to get one similarity score per pair of songs
song_user_norm = normalize(song_user, axis=1)
similarity = song_user_norm.dot(song_user_norm.T)

# label both axes with the song titles
song_titles = song_user.index
similarity_df = pd.DataFrame(similarity, index=song_titles, columns=song_titles)

similarity_df.head()

song_played,A Day In The Life,A Hard Day's Night,A Saturday Club Xmas/Crimble Medley,ANYTIME AT ALL,Across The Universe,All My Loving,All You Need Is Love,And Your Bird Can Sing,BAD BOY,BALLAD OF JOHN AND YOKO,...,We Can Work It Out,When I'm 64,While My Guitar Gently Weeps,Wild Honey Pie,With a Little Help From My Friends,YOUR MOTHER SHOULD KNOW,Yellow Submarine,Yesterday,You Never Give Me Your Money,You're Going To Lose That Girl
song_played,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A Day In The Life,1.0,0.264392,0.139347,0.148968,0.132196,0.301023,0.295599,0.098533,0.197066,0.201129,...,0.516528,0.056888,0.578459,0.279852,0.399723,0.088131,0.330489,0.365433,0.164222,0.0
A Hard Day's Night,0.264392,1.0,0.0,0.0,0.1,0.146385,0.111803,0.0,0.0,0.091287,...,0.305788,0.129099,0.266996,0.0,0.0,0.0,0.05,0.215003,0.074536,0.0
A Saturday Club Xmas/Crimble Medley,0.139347,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.161165,0.0,0.0,0.182574,0.0,0.0,0.0,0.0,0.0,0.0
ANYTIME AT ALL,0.148968,0.0,0.0,1.0,0.0,0.164957,0.094491,0.125988,0.0,0.0,...,0.172292,0.0,0.188044,0.09759,0.191663,0.0,0.0,0.103835,0.0,0.0
Across The Universe,0.132196,0.1,0.0,0.0,1.0,0.09759,0.0,0.0,0.0,0.0,...,0.101929,0.0,0.133498,0.0,0.0,0.0,0.0,0.06143,0.0,0.0


In [18]:
# Step 3: find the top-k most similar songs
def find_topk(song, similarity, k=1):
    """Return the ``k`` songs most similar to ``song``.

    Parameters
    ----------
    song : str
        Title of the query song; must be a row label of ``similarity``.
    similarity : pandas.DataFrame
        Song-by-song similarity matrix, with song titles on both axes.
    k : int, default 1
        Number of similar songs to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``Song`` and ``Similarity``, ordered from most to least
        similar, with a fresh 0..k-1 index.

    Raises
    ------
    KeyError
        If ``song`` is not a row label of ``similarity``.
    """
    # Remove the query song by label rather than assuming it sorts first:
    # another song with an identical listener set also scores 1.0, so slicing
    # off position 0 could drop that song and keep the query song instead.
    scores = similarity.loc[song].drop(labels=song, errors='ignore')
    top = scores.sort_values(ascending=False).head(k)

    # Build the result explicitly so the column names do not depend on how
    # the similarity matrix's index happens to be named.
    return pd.DataFrame(
        {'Song': top.index, 'Similarity': top.values},
        columns=['Song', 'Similarity'],
    )

In [19]:
# Example: find the top 10 similar song for 'A Day In The Life'
df = find_topk(song='A Day In The Life', similarity=similarity_df, k=10)
df

Unnamed: 0,Song,Similarity
0,Revolution,0.705327
1,Come Together,0.691885
2,Get Back,0.671014
3,Hello Goodbye,0.610658
4,Back In the U.S.S.R.,0.607872
5,Let It Be,0.594578
6,Hey Jude,0.591295
7,Lucy In The Sky With Diamonds,0.580249
8,While My Guitar Gently Weeps,0.578459
9,Here Comes The Sun,0.553557


### Question 5

We need to perform an A/B test ([reference](https://github.com/stasi009/TakeHomeDataChallenges/blob/master/08.SongChallenge/song_challenge.ipynb)):

* randomly split users into two groups, one Control group and one Experiment group
* Control group has no recommendation strategy
* Experiment group is recommended the next song
* after running for some time, perform a one-tailed t-test on 'average #play per hour'
    - $H_0$: population 'average #play per hour' is the same in the two groups
    - $H_a$: experiment group's population 'average #play per hour' is higher than control group's