In [1]:
import pandas as pd
import numpy as np


## Data Preprocessing

In [2]:
# Load the play log, index it by the `id` column, and parse both timestamp
# columns as datetimes so later date comparisons work on real dates.
song = (
    pd.read_json('song.json')
    .set_index('id')
    .assign(
        user_sign_up_date=lambda plays: pd.to_datetime(plays.user_sign_up_date),
        time_played=lambda plays: pd.to_datetime(plays.time_played),
    )
)
song.head()

Unnamed: 0_level_0,user_id,user_state,user_sign_up_date,song_played,time_played
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GOQMMKSQQH,122,Louisiana,2015-05-16,Hey Jude,2015-06-11 21:51:35
HWKKBQKNWI,3,Ohio,2015-05-01,We Can Work It Out,2015-06-06 16:49:19
DKQSXVNJDH,35,New Jersey,2015-05-04,Back In the U.S.S.R.,2015-06-14 02:11:29
HLHRIDQTUW,126,Illinois,2015-05-16,P.s. I Love You,2015-06-08 12:26:10
SUKJCSBCYW,6,New Jersey,2015-05-01,Sgt. Pepper's Lonely Hearts Club Band,2015-06-28 14:57:00


In [3]:
# Print the frame summary: index range, per-column non-null counts and dtypes,
# and memory usage. Used here to confirm there are no missing values.
song.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4000 entries, GOQMMKSQQH to LXJTCLWRUV
Data columns (total 5 columns):
user_id              4000 non-null int64
user_state           4000 non-null object
user_sign_up_date    4000 non-null datetime64[ns]
song_played          4000 non-null object
time_played          4000 non-null datetime64[ns]
dtypes: datetime64[ns](2), int64(1), object(2)
memory usage: 187.5+ KB


## Top 3 and Bottom 3 states in terms of number of users

In [4]:
# Number of distinct users per state, largest first.
# `nunique` is the built-in groupby aggregation for a distinct count, so no
# Python-level lambda has to run once per group. It skips missing values, which
# does not matter here because `user_id` has no nulls.
user_numbers_by_states = (
    song.groupby('user_state')['user_id']
    .nunique()
    .sort_values(ascending=False)
)

In [5]:
# Relies on `user_numbers_by_states` being sorted by user count in descending
# order: the first three labels are the top states, and the last three (read
# from the end backwards) are the bottom states.
print('Users are come from ', user_numbers_by_states.shape[0], 'States')
print(
    'The TOP 3 States in terms of number of users are: ',
    user_numbers_by_states.head(3).index.values,
)
print(
    'The BOTTOM 3 States in terms of number of users are: ',
    user_numbers_by_states.iloc[::-1].head(3).index.values,
)
user_numbers_by_states

Users are come from  41 States
The TOP 3 States in terms of number of users are:  ['New York' 'California' 'Texas']
The BOTTOM 3 States in terms of number of users are:  ['Arizona' 'New Mexico' 'Connecticut']


user_state
New York          23
California        21
Texas             15
Pennsylvania       9
Ohio               9
Florida            7
Illinois           7
Georgia            6
New Jersey         6
North Carolina     6
Massachusetts      6
Michigan           5
Wisconsin          5
Maryland           5
Louisiana          5
Missouri           5
Tennessee          5
Indiana            4
Minnesota          4
Alabama            4
West Virginia      3
Oregon             3
Colorado           3
Kentucky           3
South Carolina     3
Mississippi        3
Oklahoma           2
Utah               2
Virginia           2
Washington         2
Arkansas           2
Alaska             2
North Dakota       1
Kansas             1
Iowa               1
Rhode Island       1
Nebraska           1
Idaho              1
Connecticut        1
New Mexico         1
Arizona            1
Name: user_id, dtype: int64

## Top 3 and Bottom 3 states in terms of user engagement

I use the average number of plays per user as the metric for user engagement.

In [6]:
# Per-state engagement: total plays divided by the number of distinct users.
user_engagement_by_state = pd.DataFrame({
    'numbers_users': user_numbers_by_states,
    # Row count per state; each row of `song` appears to be one play.
    'numbers_songs': song.groupby('user_state').user_id.count(),
})

# Rank on the exact ratio. Truncating to int *before* sorting collapses values
# such as 28.9 and 28.1 into the same number, so the top/bottom states could be
# decided by an arbitrary tie order instead of by the metric itself.
user_engagement_by_state['average_played_per_users'] = (
    user_engagement_by_state.numbers_songs / user_engagement_by_state.numbers_users
)
user_engagement_by_state = user_engagement_by_state.sort_values(
    by='average_played_per_users', ascending=False
)

# Keep the integer column that the notebook displays, but truncate only after
# the row order has been fixed by the exact values.
user_engagement_by_state['average_played_per_users'] = (
    user_engagement_by_state.average_played_per_users.astype('int')
)

In [7]:
# Relies on `user_engagement_by_state` being sorted by engagement in descending
# order: head rows are the most engaged states, and the reversed tail rows are
# the least engaged.
print(
    'The TOP 3 States in terms of engagement of users are: ',
    user_engagement_by_state.head(3).index.values,
)
print(
    'The BOTTOM 3 States in terms of engagement of users are: ',
    user_engagement_by_state.iloc[::-1].head(3).index.values,
)
user_engagement_by_state

The TOP 3 States in terms of engagement of users are:  ['Nebraska' 'Alaska' 'Mississippi']
The BOTTOM 3 States in terms of engagement of users are:  ['Virginia' 'Kansas' 'Minnesota']


Unnamed: 0,numbers_users,numbers_songs,average_played_per_users
Nebraska,1,36,36
Alaska,2,58,29
Mississippi,3,85,28
South Carolina,3,85,28
Rhode Island,1,27,27
Idaho,1,26,26
North Dakota,1,26,26
Kentucky,3,78,26
Alabama,4,104,26
Florida,7,180,25


## First signed-up user in each state

In [8]:
def first_users(df):
    """Return one user_id among the users with the earliest sign-up date in ``df``.

    When several users share the earliest ``user_sign_up_date``, the smallest
    ``user_id`` is returned (``np.unique`` yields the ids in sorted order).

    Parameters
    ----------
    df : pandas.DataFrame
        Rows with at least the ``user_id`` and ``user_sign_up_date`` columns.

    Returns
    -------
    The smallest ``user_id`` found on the earliest sign-up date.
    """
    earliest = df.user_sign_up_date.min()
    signed_up_earliest = df.user_sign_up_date == earliest
    candidate_ids = np.unique(df.loc[signed_up_earliest, 'user_id'])
    return candidate_ids[0]

def first_users(df):
    """Return ``user_id`` and ``user_sign_up_date`` from the earliest sign-up row.

    ``idxmin`` gives the index label of the first row holding the minimum
    ``user_sign_up_date``; that row's two columns are returned.

    NOTE(review): this shares its name with the ``first_users`` defined just
    above; whichever definition is executed last is the one later calls use.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows with at least the ``user_id`` and ``user_sign_up_date`` columns.
    """
    earliest_label = df.user_sign_up_date.idxmin()
    wanted_columns = ['user_id', 'user_sign_up_date']
    return df.loc[earliest_label, wanted_columns]

First signed-up users:

In [9]:
# Run first_users once per state, each time on that state's rows only.
song.groupby('user_state').apply(first_users)

user_state
Alabama             5
Alaska            106
Arizona           105
Arkansas           78
California         39
Colorado          166
Connecticut       127
Florida            41
Georgia            16
Idaho             165
Illinois           45
Indiana           102
Iowa              178
Kansas            177
Kentucky           34
Louisiana          50
Maryland           18
Massachusetts      15
Michigan           13
Minnesota           8
Mississippi        23
Missouri           85
Nebraska          134
New Jersey          6
New Mexico          4
New York           10
North Carolina      2
North Dakota      135
Ohio                3
Oklahoma          119
Oregon              1
Pennsylvania       11
Rhode Island      174
South Carolina     64
Tennessee          70
Texas               7
Utah               29
Virginia          142
Washington        125
West Virginia      60
Wisconsin          32
dtype: int64

## Song Recommendations

In [53]:
# Titles are compared as raw strings here.
# NOTE(review): the printed list contains both 'The Ballad Of John And Yoko' and
# 'BALLAD OF JOHN AND YOKO', so spelling/case variants of one song may be
# counted as separate songs -- confirm whether titles should be normalised.
print('There are ', song['song_played'].unique().size, 'songs in company XYZ')
print('These are: ', song['song_played'].unique(), sep='\n')

There are  100 songs in company XYZ
These are: 
['Hey Jude' 'We Can Work It Out' 'Back In the U.S.S.R.' 'P.s. I Love You'
 "Sgt. Pepper's Lonely Hearts Club Band" 'Sgt. Pepper Inner Groove'
 'Hello Goodbye' 'Cry For A Shadow' 'Revolution' 'Let It Be' 'I Feel Fine'
 'The Fool On The Hill' 'Get Back' 'Come Together' 'She Loves You'
 'While My Guitar Gently Weeps' 'Here Comes The Sun' 'A Day In The Life'
 'Getting Better' "Baby You're A Rich Man" 'The Ballad Of John And Yoko'
 'Lucy In The Sky With Diamonds' "Don't Let Me Down"
 'Reprise / Day in the Life' "Maxwell's Silver Hammer"
 'Across The Universe' 'Ob-la-di, Ob-la-da' 'Yesterday' 'Fixing A Hole'
 'OH DARLING' 'Birthday' 'A Saturday Club Xmas/Crimble Medley'
 'BALLAD OF JOHN AND YOKO' 'Got To Get You Into My Life' 'Wild Honey Pie'
 'IN MY LIFE' 'Strawberry Fields Forever' 'Twist and Shout'
 'I Saw Her Standing There' 'Helter Skelter' 'Something' 'Eleanor Rigby'
 'Things We Said Today' 'Sgt. Pepper/with A Little Help From My Friends'

We select the user who played the chosen song most often, then find the other song that this user played most frequently and recommend it.

In [125]:
song_name = 'We Can Work It Out'

# The listener with the most plays of `song_name`.
userid = (
    song.loc[song.song_played == song_name]
    .groupby('user_id')
    .time_played.count()
    .sort_values(ascending=False)
    .index[0]
)

# That listener's most-played title other than `song_name` is the recommendation.
rec_song = (
    song.loc[(song.song_played != song_name) & (song.user_id == userid)]
    .groupby('song_played')
    .time_played.count()
    .sort_values(ascending=False)
    .index[0]
)
rec_song

'Twist and Shout'

Now, when a user inputs a song's name, we can print the new song that has the highest probability of being played next, based on this method.

#### Count plays of each song by user

In [82]:
# Song-by-user play-count matrix: one row per song title, one column per
# user_id, and 0 where a user has no recorded play of that song.
song_count_by_users = (
    song.groupby(['song_played', 'user_id'])
    .size()
    .unstack('user_id', fill_value=0)
)
song_count_by_users.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,191,192,193,194,195,196,197,198,199,200
song_played,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A Day In The Life,0,0,1,3,0,2,0,0,0,0,...,0,0,3,3,0,2,0,0,2,0
A Hard Day's Night,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
A Saturday Club Xmas/Crimble Medley,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ANYTIME AT ALL,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Across The Universe,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [87]:
from sklearn.preprocessing import normalize

# Scale each song's row of play counts with sklearn's `normalize` (L2 norm by
# default). The product of the scaled matrix with its own transpose is then a
# song-by-song matrix of cosine similarities.
song_count_by_users_norm = normalize(song_count_by_users, axis=1)
similar_song = song_count_by_users_norm @ song_count_by_users_norm.T

In [89]:
# Wrap the similarity values in a frame labelled with the song titles on both
# axes, so a song's similarities can be looked up by title.
similar_song = pd.DataFrame(
    data=similar_song,
    index=song_count_by_users.index,
    columns=song_count_by_users.index,
)
similar_song.head()

song_played,A Day In The Life,A Hard Day's Night,A Saturday Club Xmas/Crimble Medley,ANYTIME AT ALL,Across The Universe,All My Loving,All You Need Is Love,And Your Bird Can Sing,BAD BOY,BALLAD OF JOHN AND YOKO,...,We Can Work It Out,When I'm 64,While My Guitar Gently Weeps,Wild Honey Pie,With a Little Help From My Friends,YOUR MOTHER SHOULD KNOW,Yellow Submarine,Yesterday,You Never Give Me Your Money,You're Going To Lose That Girl
song_played,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A Day In The Life,1.0,0.235702,0.074536,0.119523,0.212132,0.355023,0.329404,0.152145,0.210819,0.172133,...,0.464938,0.030429,0.508964,0.223607,0.359092,0.037268,0.318198,0.35322,0.087841,0.0
A Hard Day's Night,0.235702,1.0,0.0,0.0,0.1,0.136931,0.111803,0.0,0.0,0.091287,...,0.259548,0.129099,0.210099,0.0,0.0,0.0,0.05,0.195468,0.074536,0.0
A Saturday Club Xmas/Crimble Medley,0.074536,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.109435,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0
ANYTIME AT ALL,0.119523,0.0,0.0,1.0,0.0,0.154303,0.094491,0.109109,0.0,0.0,...,0.116991,0.0,0.138107,0.089087,0.183942,0.0,0.0,0.146845,0.0,0.0
Across The Universe,0.212132,0.1,0.0,0.0,1.0,0.091287,0.0,0.0,0.0,0.0,...,0.138426,0.0,0.116722,0.0,0.0,0.0,0.0,0.043437,0.0,0.0


In [129]:
song_name = 'We Can Work It Out'
# Drop the query song by label before ranking. Skipping position 0 of the
# sorted column assumes the song itself always sorts first. That breaks when
# another song ties at similarity 1.0 (the same listener profile) or when
# floating-point rounding puts the self-similarity slightly below another value;
# the song could then be recommended to itself or the true best match skipped.
# The result is still a one-element array of titles.
similar_song[song_name].drop(song_name).sort_values(ascending=False).head(1).index.values

array(['A Day In The Life'], dtype=object)

In [151]:
# Print, for every song in the play log, the single most similar other song.
for title in song.song_played.unique():
    # Exclude the song itself by label instead of skipping sorted position 0:
    # a tie at similarity 1.0 (or rounding of the self-similarity) means the
    # song is not guaranteed to sort first, so a positional skip could print
    # the song as its own recommendation.
    closest = (
        similar_song[title]
        .drop(title)
        .sort_values(ascending=False)
        .head(1)
        .index.values
    )
    print('{0:45s} | '.format(title), closest)

Hey Jude                                      |  ['Come Together']
We Can Work It Out                            |  ['A Day In The Life']
Back In the U.S.S.R.                          |  ['Revolution']
P.s. I Love You                               |  ['Golden Slumbers']
Sgt. Pepper's Lonely Hearts Club Band         |  ['Lovely Rita']
Sgt. Pepper Inner Groove                      |  ['Revolution']
Hello Goodbye                                 |  ['Revolution']
Cry For A Shadow                              |  ["I've Just Seen A Face"]
Revolution                                    |  ['Come Together']
Let It Be                                     |  ['Come Together']
I Feel Fine                                   |  ['Here Comes The Sun']
The Fool On The Hill                          |  ['I Feel Fine']
Get Back                                      |  ['Revolution']
Come Together                                 |  ['Revolution']
She Loves You                                 |  ['Revolution'