# CoderSchool Final Project Moods
## Music Recommendation System

In [1]:
import pandas as pd
import numpy as np

# Part 1 - Data cleaning

In [3]:
full_df = pd.read_json('MasterSongList.json')
full_df.head(3)

Unnamed: 0,_id,album,artist,audio_features,context,decades,genres,lyrics_features,moods,name,new_context,picture,recording_id,sub_context,yt_id,yt_views
0,{'$oid': '52fdfb440b9398049f3d7a8c'},Gangnam Style (강남스타일),PSY,"[11, 0.912744, 0.083704, 132.069, 0.293137, 0....",[work out],[],[pop],"[oppa, gangnam, style, gangnam, style, najeneu...","[energetic, motivational]",Gangnam Style (강남스타일),work out,http://images.musicnet.com/albums/073/463/405/...,50232.0,[working out: cardio],9bZkp7q19f0,2450112089
1,{'$oid': '52fdfb3d0b9398049f3cbc8e'},Native,OneRepublic,"[6, 0.7457039999999999, 0.11995499999999999, 1...",[energetic],[2012],[pop],"[lately, i, ve, been, i, ve, been, losing, sle...",[happy],Counting Stars,energetic,http://images.musicnet.com/albums/081/851/887/...,5839.0,[energy boost],hT_nvWreIhg,1020297206
2,{'$oid': '52fdfb420b9398049f3d3ea5'},Party Rock Anthem,LMFAO,"[5, 0.709932, 0.231455, 130.03, 0.121740999999...","[energetic, energetic, energetic, energetic]",[],[],"[party, rock, yeah, woo, let, s, go, party, ro...","[happy, celebratory, rowdy]",Party Rock Anthem,housework,http://images.musicnet.com/albums/049/414/127/...,52379.0,"[energy boost, pleasing a crowd, housework, dr...",KQ6zr6kCPj8,971128436


In [5]:
cols = ['lyrics_features', 'moods']
lyrics = full_df.copy()
lyrics = lyrics[cols]
lyrics.head()

Unnamed: 0,lyrics_features,moods
0,"[oppa, gangnam, style, gangnam, style, najeneu...","[energetic, motivational]"
1,"[lately, i, ve, been, i, ve, been, losing, sle...",[happy]
2,"[party, rock, yeah, woo, let, s, go, party, ro...","[happy, celebratory, rowdy]"
3,"[alagamun, lan, weh, wakun, heya, hanun, gon, ...","[happy, energetic, celebratory]"
4,"[j, lo, the, other, side, out, my, mine, it, s...",[energetic]


Remove the wrong format

In [7]:
lyrics['lyrics_features'] = lyrics['lyrics_features'].apply(' '.join)
lyrics['moods'] = lyrics['moods'].apply(', '.join)
lyrics.head()

Unnamed: 0,lyrics_features,moods
0,oppa gangnam style gangnam style najeneun ttas...,"energetic, motivational"
1,lately i ve been i ve been losing sleep dreami...,happy
2,party rock yeah woo let s go party rock is in ...,"happy, celebratory, rowdy"
3,alagamun lan weh wakun heya hanun gon alagamun...,"happy, energetic, celebratory"
4,j lo the other side out my mine it s a new gen...,energetic


Replace empty lyrics with NaN and drop them

In [8]:
lyrics['lyrics_features'].replace('', np.nan, inplace=True)
lyrics.shape

(36733, 2)

In [9]:
lyrics.dropna(subset=['lyrics_features'], inplace=True)
lyrics.shape

(20931, 2)

In [10]:
lyrics.reset_index(drop=True, inplace=True)

# Part 2 - Lyrics cleaning

In [11]:
from string import punctuation
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk.stem.snowball import SnowballStemmer

In [12]:
def clean_text(raw_text):
    # Create empty list to receive result
    clean_words = []
    
    # 1. Convert to lower case
    raw_text = raw_text.lower()
    
    # 2. Remove punctuation
    translator = str.maketrans('', '', punctuation)
    raw_text = raw_text.translate(translator)
    split_words = raw_text.split()
    
    # 3 & 4. Remove common words and stem words
    stemmer = SnowballStemmer('english')
    for word in split_words:
        if word not in ENGLISH_STOP_WORDS:
            stemmed_word = stemmer.stem(word)
            clean_words.append(stemmed_word)
            
    return ' '.join(clean_words)

In [13]:
lyrics['clean_lyrics'] = lyrics['lyrics_features'].apply(clean_text)

In [21]:
cols2 = ['clean_lyrics', 'moods']
new_lyrics = lyrics.copy()
new_lyrics = new_lyrics[cols2]
new_lyrics.head()

Unnamed: 0,clean_lyrics,moods
0,oppa gangnam style gangnam style najeneun ttas...,"energetic, motivational"
1,late ve ve lose sleep dream thing babi ve ve p...,happy
2,parti rock yeah woo let s parti rock hous toni...,"happy, celebratory, rowdy"
3,alagamun lan weh wakun heya hanun gon alagamun...,"happy, energetic, celebratory"
4,j lo s new generat mr worldwid parti peopl flo...,energetic


# Part 3 - Try classifiers

Let's start with the following:
- TF-IDF with MultiLabelBinarizer and a Classifier Chain

### TF-IDF RFC Multilabel Classifier Chain

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

In [24]:
tf_idf = vectorizer.fit_transform(new_lyrics['clean_lyrics'])
print(tf_idf[0])

  (0, 44060)	0.22652096467632132
  (0, 24533)	0.3523880733352704
  (0, 58510)	0.15339492137542518
  (0, 41403)	0.04145742039238475
  (0, 62767)	0.04145742039238475
  (0, 30492)	0.020728710196192376
  (0, 67851)	0.1510139764508809
  (0, 33381)	0.04145742039238475
  (0, 27396)	0.020728710196192376
  (0, 67889)	0.020728710196192376
  (0, 2942)	0.05663024116908033
  (0, 48986)	0.020728710196192376
  (0, 30595)	0.03981840690336137
  (0, 5433)	0.03865550832622193
  (0, 43796)	0.04145742039238475
  (0, 55709)	0.03981840690336137
  (0, 62793)	0.020728710196192376
  (0, 25185)	0.09954601725840342
  (0, 5521)	0.020728710196192376
  (0, 41540)	0.10756078878017733
  (0, 53233)	0.20728710196192376
  (0, 42050)	0.020728710196192376
  (0, 55628)	0.020728710196192376
  (0, 31908)	0.018196685414216394
  (0, 67077)	0.020728710196192376
  :	:
  (0, 40712)	0.020728710196192376
  (0, 38863)	0.018876747056360114
  (0, 49017)	0.020728710196192376
  (0, 24662)	0.020728710196192376
  (0, 66369)	0.0207287101961

In [25]:
X = tf_idf
y = new_lyrics['moods']

Let's use the multilabel binarizer

In [27]:
from sklearn.preprocessing import MultiLabelBinarizer

In [28]:
mlb = MultiLabelBinarizer()
y_bina = mlb.fit_transform(y)
y_bina

array([[1, 1, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 1],
       [1, 1, 1, ..., 1, 0, 1],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [1, 1, 0, ..., 1, 0, 1]])

In [30]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y_bina, test_size=0.1, random_state=101)

SVC with Classifier chain

In [33]:
from sklearn.multioutput import ClassifierChain
from sklearn.ensemble import RandomForestClassifier

In [34]:
chain = ClassifierChain(RandomForestClassifier())

In [None]:
chain.fit(X_train, y_train)
predictions = chain.predict(X_test)
print(classification_report(y_test, predictions))