# CoderSchool Final Project Moods
## Music Recommendation System

In [310]:
import pandas as pd
import numpy as np

# Part 1 - Data cleaning

In [80]:
full_df = pd.read_json('MasterSongList.json')
full_df.head(3)

Unnamed: 0,_id,album,artist,audio_features,context,decades,genres,lyrics_features,moods,name,new_context,picture,recording_id,sub_context,yt_id,yt_views
0,{'$oid': '52fdfb440b9398049f3d7a8c'},Gangnam Style (강남스타일),PSY,"[11, 0.912744, 0.083704, 132.069, 0.293137, 0....",[work out],[],[pop],"[oppa, gangnam, style, gangnam, style, najeneu...","[energetic, motivational]",Gangnam Style (강남스타일),work out,http://images.musicnet.com/albums/073/463/405/...,50232.0,[working out: cardio],9bZkp7q19f0,2450112089
1,{'$oid': '52fdfb3d0b9398049f3cbc8e'},Native,OneRepublic,"[6, 0.7457039999999999, 0.11995499999999999, 1...",[energetic],[2012],[pop],"[lately, i, ve, been, i, ve, been, losing, sle...",[happy],Counting Stars,energetic,http://images.musicnet.com/albums/081/851/887/...,5839.0,[energy boost],hT_nvWreIhg,1020297206
2,{'$oid': '52fdfb420b9398049f3d3ea5'},Party Rock Anthem,LMFAO,"[5, 0.709932, 0.231455, 130.03, 0.121740999999...","[energetic, energetic, energetic, energetic]",[],[],"[party, rock, yeah, woo, let, s, go, party, ro...","[happy, celebratory, rowdy]",Party Rock Anthem,housework,http://images.musicnet.com/albums/049/414/127/...,52379.0,"[energy boost, pleasing a crowd, housework, dr...",KQ6zr6kCPj8,971128436


In [81]:
cols = ['lyrics_features', 'moods']
lyrics = full_df.copy()
lyrics = lyrics[cols]
lyrics.head()

Unnamed: 0,lyrics_features,moods
0,"[oppa, gangnam, style, gangnam, style, najeneu...","[energetic, motivational]"
1,"[lately, i, ve, been, i, ve, been, losing, sle...",[happy]
2,"[party, rock, yeah, woo, let, s, go, party, ro...","[happy, celebratory, rowdy]"
3,"[alagamun, lan, weh, wakun, heya, hanun, gon, ...","[happy, energetic, celebratory]"
4,"[j, lo, the, other, side, out, my, mine, it, s...",[energetic]


Remove the wrong format

In [82]:
lyrics['lyrics_features'] = lyrics['lyrics_features'].apply(' '.join)
# lyrics['moods'] = lyrics['moods'].apply(', '.join)
lyrics.head()

Unnamed: 0,lyrics_features,moods
0,oppa gangnam style gangnam style najeneun ttas...,"[energetic, motivational]"
1,lately i ve been i ve been losing sleep dreami...,[happy]
2,party rock yeah woo let s go party rock is in ...,"[happy, celebratory, rowdy]"
3,alagamun lan weh wakun heya hanun gon alagamun...,"[happy, energetic, celebratory]"
4,j lo the other side out my mine it s a new gen...,[energetic]


Replace empty lyrics with NaN and drop them

In [83]:
lyrics['lyrics_features'].replace('', np.nan, inplace=True)
lyrics.shape

(36733, 2)

In [84]:
lyrics.dropna(subset=['lyrics_features'], inplace=True)
lyrics.shape

(20931, 2)

In [85]:
lyrics.reset_index(drop=True, inplace=True)

In [86]:
lyrics['moods'] = lyrics['moods'].apply(lambda y: np.nan if len(y)==0 else y)
lyrics.dropna(subset=['moods'], inplace=True)
lyrics.shape

(20931, 2)

### Moods cleaning

In [87]:
moods_list = lyrics['moods'].tolist()
moods_set = set(x for i in moods_list for x in i)
moods_set

{'aggressive',
 'angsty',
 'atmospheric',
 'campy',
 'celebratory',
 'classy',
 'cocky',
 'cold',
 'earthy',
 'energetic',
 'funky',
 'gloomy',
 'happy',
 'hypnotic',
 'introspective',
 'lush',
 'mellow',
 'motivational',
 'nocturnal',
 'raw',
 'rowdy',
 'sad',
 'seductive',
 'sexual',
 'soothing',
 'spacey',
 'sprightly',
 'sweet',
 'trashy',
 'trippy',
 'visceral',
 'warm'}

Moods distribution

In [88]:
def number_moods(mood):
    counter=0
    for i in lyrics['moods']:
        if mood in i:
            counter += 1
    return counter

for i in moods_set:
    mood_count = number_moods(i)
    print(i, ": ", mood_count)

nocturnal :  1334
sweet :  814
sprightly :  1733
celebratory :  1479
cold :  830
campy :  636
energetic :  2305
visceral :  1112
earthy :  873
funky :  2072
classy :  492
seductive :  1419
soothing :  1374
trashy :  477
cocky :  1438
raw :  1301
sexual :  505
warm :  1495
rowdy :  1721
spacey :  514
mellow :  2856
lush :  1326
happy :  1757
trippy :  780
motivational :  925
aggressive :  1683
gloomy :  750
angsty :  1205
atmospheric :  1155
sad :  1249
introspective :  1417
hypnotic :  286


We notice that the moods are more balanced than the genres

# Part 2 - Lyrics cleaning

In [426]:
def clean_text(raw_text):
    from string import punctuation
    from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
    from nltk.stem.snowball import SnowballStemmer

    # Create empty list to receive result
    clean_words = []
    
    # 1. Convert to lower case
    raw_text = raw_text.lower()
    
    # 2. Remove punctuation
    translator = str.maketrans('', '', punctuation)
    raw_text = raw_text.translate(translator)
    split_words = raw_text.split()
    
    # 3 & 4. Remove common words and stem words
    stemmer = SnowballStemmer('english')
    for word in split_words:
        if word not in ENGLISH_STOP_WORDS:
            stemmed_word = stemmer.stem(word)
            clean_words.append(stemmed_word)
            
    return ' '.join(clean_words)

In [427]:
pickle.dump(clean_text, open('clean_text_function.pickle', 'wb'))

In [91]:
lyrics['clean_lyrics'] = lyrics['lyrics_features'].apply(clean_text)

In [92]:
cols2 = ['clean_lyrics', 'moods']
new_lyrics = lyrics.copy()
new_lyrics = new_lyrics[cols2]
new_lyrics.head()

Unnamed: 0,clean_lyrics,moods
0,oppa gangnam style gangnam style najeneun ttas...,"[energetic, motivational]"
1,late ve ve lose sleep dream thing babi ve ve p...,[happy]
2,parti rock yeah woo let s parti rock hous toni...,"[happy, celebratory, rowdy]"
3,alagamun lan weh wakun heya hanun gon alagamun...,"[happy, energetic, celebratory]"
4,j lo s new generat mr worldwid parti peopl flo...,[energetic]


# Part 3 - Try classifiers

Let's start with the following:
- TF-IDF with MultiLabelBinarizer and a Classifier Chain

# TF-IDF

In [93]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

In [94]:
tf_idf = vectorizer.fit_transform(new_lyrics['clean_lyrics'])
print(tf_idf[0])

  (0, 44060)	0.22652096467632132
  (0, 24533)	0.3523880733352704
  (0, 58510)	0.15339492137542518
  (0, 41403)	0.04145742039238475
  (0, 62767)	0.04145742039238475
  (0, 30492)	0.020728710196192376
  (0, 67851)	0.1510139764508809
  (0, 33381)	0.04145742039238475
  (0, 27396)	0.020728710196192376
  (0, 67889)	0.020728710196192376
  (0, 2942)	0.05663024116908033
  (0, 48986)	0.020728710196192376
  (0, 30595)	0.03981840690336137
  (0, 5433)	0.03865550832622193
  (0, 43796)	0.04145742039238475
  (0, 55709)	0.03981840690336137
  (0, 62793)	0.020728710196192376
  (0, 25185)	0.09954601725840342
  (0, 5521)	0.020728710196192376
  (0, 41540)	0.10756078878017733
  (0, 53233)	0.20728710196192376
  (0, 42050)	0.020728710196192376
  (0, 55628)	0.020728710196192376
  (0, 31908)	0.018196685414216394
  (0, 67077)	0.020728710196192376
  :	:
  (0, 40712)	0.020728710196192376
  (0, 38863)	0.018876747056360114
  (0, 49017)	0.020728710196192376
  (0, 24662)	0.020728710196192376
  (0, 66369)	0.0207287101961

In [240]:
X_tfidf = tf_idf
y = new_lyrics['moods']

Let's use the multilabel binarizer

In [241]:
from sklearn.preprocessing import MultiLabelBinarizer

In [242]:
mlb = MultiLabelBinarizer()
y_bina = mlb.fit_transform(y)
y_bina

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [446]:
pickle.dump(mlb, open('mlb.pickle', 'wb'))

In [243]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_bina, test_size=0.1, random_state=101)

### RFC1 TFIDF with Classifier chain

In [135]:
from sklearn.multioutput import ClassifierChain
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [136]:
chain_rfc1 = ClassifierChain(RandomForestClassifier())
chain_rfc1.fit(X_train, y_train)

ClassifierChain(base_estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
        cv=None, order=None, random_state=None)

In [137]:
predictions_rfc1 = chain_rfc1.predict(X_test)
print(classification_report(y_test, predictions_rfc1))

             precision    recall  f1-score   support

          0       0.62      0.03      0.05       195
          1       0.00      0.00      0.00       118
          2       0.00      0.00      0.00       118
          3       0.00      0.00      0.00        63
          4       0.38      0.03      0.06       151
          5       0.50      0.07      0.12        45
          6       0.33      0.01      0.02       162
          7       0.17      0.01      0.02        88
          8       0.00      0.00      0.00        91
          9       0.37      0.05      0.10       201
         10       0.36      0.04      0.08       187
         11       0.00      0.00      0.00        75
         12       0.09      0.01      0.01       196
         13       0.00      0.00      0.00        20
         14       0.00      0.00      0.00       122
         15       0.17      0.01      0.01       144
         16       0.35      0.03      0.05       267
         17       0.00      0.00      0.00   

  'precision', 'predicted', average, warn_for)


Oooooh what a really bad score, let's try something else


### RFC2 TFIDF without Classifier chain

In [110]:
rfc2 = RandomForestClassifier()
parameters_rfc2 = {'n_estimators':[5, 10, 100], 'min_samples_split':[2, 5, 10], 'max_features':['sqrt', 'log2', 'auto']}
grid_rfc2 = GridSearchCV(rfc2, parameters_rfc2)
grid_rfc2.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [5, 10, 100], 'min_samples_split': [2, 5, 10], 'max_features': ['sqrt', 'log2', 'auto']},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [111]:
print(grid_rfc2.best_estimator_)
predictions_rfc2 = grid_rfc2.predict(X_test)
print(classification_report(y_test, predictions_rfc2))

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
             precision    recall  f1-score   support

          0       0.28      0.05      0.09       195
          1       0.00      0.00      0.00       118
          2       0.00      0.00      0.00       118
          3       0.00      0.00      0.00        63
          4       0.20      0.03      0.05       151
          5       0.33      0.07      0.11        45
          6       0.24      0.03      0.05       162
          7       0.17      0.01      0.02        88
          8       0.00      0.00      0.00        91
          9       0.32      0.06      0.10       

  'precision', 'predicted', average, warn_for)


The score is even worst

Let's try the 2 same things but with kNN instead

### kNN1 TFIDF without classifier chain

In [113]:
from sklearn.neighbors import KNeighborsClassifier

In [114]:
knn1 = KNeighborsClassifier()
parameters_knn1 = {'n_neighbors':[3,6,10]}
grid_knn1 = GridSearchCV(knn1, parameters_knn1, verbose=3)
grid_knn1.fit(X_train, y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] n_neighbors=3 ...................................................
[CV] .......................... n_neighbors=3, score=0.019271 -   6.8s
[CV] n_neighbors=3 ...................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.8s remaining:    0.0s


[CV] .......................... n_neighbors=3, score=0.022774 -   4.8s
[CV] n_neighbors=3 ...................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   11.6s remaining:    0.0s


[CV] .......................... n_neighbors=3, score=0.022615 -   4.7s
[CV] n_neighbors=6 ...................................................
[CV] .......................... n_neighbors=6, score=0.007645 -   5.3s
[CV] n_neighbors=6 ...................................................
[CV] .......................... n_neighbors=6, score=0.007804 -   5.4s
[CV] n_neighbors=6 ...................................................
[CV] .......................... n_neighbors=6, score=0.008759 -   5.3s
[CV] n_neighbors=10 ..................................................
[CV] ......................... n_neighbors=10, score=0.007963 -   5.3s
[CV] n_neighbors=10 ..................................................
[CV] ......................... n_neighbors=10, score=0.004778 -   5.3s
[CV] n_neighbors=10 ..................................................
[CV] ......................... n_neighbors=10, score=0.006848 -   5.4s


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   48.5s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [3, 6, 10]}, pre_dispatch='2*n_jobs',
       refit=True, scoring=None, verbose=3)

In [115]:
print(grid_knn1.best_estimator_)
predictions_knn1 = grid_knn1.predict(X_test)
print(classification_report(y_test, predictions_knn1))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')
             precision    recall  f1-score   support

          0       0.20      0.23      0.21       195
          1       0.00      0.00      0.00       118
          2       0.00      0.00      0.00       118
          3       0.25      0.02      0.03        63
          4       0.24      0.05      0.09       151
          5       0.67      0.09      0.16        45
          6       0.40      0.07      0.12       162
          7       0.20      0.02      0.04        88
          8       0.00      0.00      0.00        91
          9       0.22      0.06      0.09       201
         10       0.25      0.10      0.15       187
         11       0.00      0.00      0.00        75
         12       0.17      0.03      0.05       196
         13       0.00      0.00      0.00        20
         14       0.27      0.03      0.

### kNN2 TFIDF with classifier chain

In [244]:
chain_knn2 = ClassifierChain(KNeighborsClassifier())
chain_knn2.fit(X_train, y_train)

ClassifierChain(base_estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
        cv=None, order=None, random_state=None)

In [245]:
predictions_knn2 = chain_knn2.predict(X_test)
print(classification_report(y_test, predictions_knn2))

             precision    recall  f1-score   support

          0       0.22      0.09      0.13       195
          1       0.00      0.00      0.00       118
          2       0.25      0.01      0.02       118
          3       0.33      0.02      0.03        63
          4       0.22      0.05      0.08       151
          5       0.56      0.11      0.19        45
          6       0.37      0.06      0.11       162
          7       0.25      0.01      0.02        88
          8       0.14      0.01      0.02        91
          9       0.35      0.07      0.12       201
         10       0.20      0.14      0.17       187
         11       0.50      0.01      0.03        75
         12       0.10      0.02      0.03       196
         13       0.00      0.00      0.00        20
         14       0.06      0.01      0.01       122
         15       0.10      0.01      0.02       144
         16       0.15      0.06      0.08       267
         17       0.12      0.03      0.05   

  'precision', 'predicted', average, warn_for)


Still a very low score, let's first do a cross-validation on the one with the best score

In [119]:
from sklearn.model_selection import cross_val_score
scores_rfc1 = cross_val_score(chain_rfc1, X_train, y_train, cv=3)

In [120]:
print(scores_rfc1)
print(scores_rfc1.mean())

[0.0065297  0.00589266 0.00844083]
0.006954398258746085


Not helpful :(

Let's look at different data from TF-IDF:
- remove words with frequency lower than 10
- use bigrams

# TF-IDF2

In [281]:
vectorizer2 = TfidfVectorizer(ngram_range=(1,2), min_df=10)
tf_idf2 = vectorizer2.fit_transform(new_lyrics['clean_lyrics'])
print(tf_idf2[0])

  (0, 22624)	0.17555162985564912
  (0, 16653)	0.1396707381704529
  (0, 10216)	0.06283741917512049
  (0, 7993)	0.027503072721477898
  (0, 17515)	0.78313228579269
  (0, 6175)	0.4054680866662382
  (0, 20937)	0.07595136264876994
  (0, 12341)	0.05672612816713428
  (0, 17042)	0.11184257398463758
  (0, 8170)	0.03812431110832099
  (0, 1081)	0.0233343320070627
  (0, 11285)	0.03782574401868204
  (0, 11891)	0.00402407363332277
  (0, 20459)	0.00510855101094494
  (0, 20938)	0.10784996317178018
  (0, 6176)	0.3605558962184408
  (0, 1088)	0.022389745369421107
  (0, 12138)	0.012979907856845323


In [433]:
pickle.dump(vectorizer2, open('tfidf_vectorizer2.pickle', 'wb'))

In [282]:
X_tfidf2 = tf_idf2

We can now try this data on our best classifier so far

In [283]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf2, y_bina, test_size=0.2, random_state=101)

### RFC3 TFIDF2 with Classifier chain

In [160]:
chain_rfc3 = ClassifierChain(RandomForestClassifier())
chain_rfc3.fit(X_train, y_train)

ClassifierChain(base_estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
        cv=None, order=None, random_state=None)

In [161]:
predictions_rfc3 = chain_rfc3.predict(X_test)
print(classification_report(y_test, predictions_rfc3))

             precision    recall  f1-score   support

          0       0.41      0.02      0.04       376
          1       0.00      0.00      0.00       243
          2       0.07      0.00      0.01       222
          3       0.25      0.01      0.02       124
          4       0.28      0.03      0.05       286
          5       0.62      0.05      0.09       102
          6       0.40      0.03      0.05       290
          7       0.35      0.04      0.08       163
          8       0.00      0.00      0.00       177
          9       0.30      0.03      0.06       433
         10       0.49      0.05      0.10       422
         11       0.12      0.01      0.01       140
         12       0.29      0.02      0.04       357
         13       0.33      0.02      0.04        49
         14       0.00      0.00      0.00       266
         15       0.14      0.01      0.01       276
         16       0.45      0.04      0.07       561
         17       0.11      0.01      0.01   

  'precision', 'predicted', average, warn_for)


In [162]:
import pickle

pickle.dump(chain_rfc3, open('moods_chain_rfc3.pickle', 'wb'))

### RFC4 TFIDF2 without Classifier chain

In [164]:
rfc4 = RandomForestClassifier()
parameters_rfc4 = {'n_estimators':[5, 10, 100], 'min_samples_split':[2, 5, 10], 'max_features':['sqrt', 'log2', 'auto']}
grid_rfc4 = GridSearchCV(rfc4, parameters_rfc4, verbose=3)
grid_rfc4.fit(X_train, y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] max_features=sqrt, min_samples_split=2, n_estimators=5 ..........
[CV]  max_features=sqrt, min_samples_split=2, n_estimators=5, score=0.016302 -   7.0s
[CV] max_features=sqrt, min_samples_split=2, n_estimators=5 ..........


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.0s remaining:    0.0s


[CV]  max_features=sqrt, min_samples_split=2, n_estimators=5, score=0.017739 -   6.5s
[CV] max_features=sqrt, min_samples_split=2, n_estimators=5 ..........


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   13.6s remaining:    0.0s


[CV]  max_features=sqrt, min_samples_split=2, n_estimators=5, score=0.018455 -   7.1s
[CV] max_features=sqrt, min_samples_split=2, n_estimators=10 .........
[CV]  max_features=sqrt, min_samples_split=2, n_estimators=10, score=0.008957 -  11.3s
[CV] max_features=sqrt, min_samples_split=2, n_estimators=10 .........
[CV]  max_features=sqrt, min_samples_split=2, n_estimators=10, score=0.008421 -  11.1s
[CV] max_features=sqrt, min_samples_split=2, n_estimators=10 .........
[CV]  max_features=sqrt, min_samples_split=2, n_estimators=10, score=0.008959 -  10.8s
[CV] max_features=sqrt, min_samples_split=2, n_estimators=100 ........
[CV]  max_features=sqrt, min_samples_split=2, n_estimators=100, score=0.005554 - 1.7min
[CV] max_features=sqrt, min_samples_split=2, n_estimators=100 ........
[CV]  max_features=sqrt, min_samples_split=2, n_estimators=100, score=0.005196 - 1.7min
[CV] max_features=sqrt, min_samples_split=2, n_estimators=100 ........
[CV]  max_features=sqrt, min_samples_split=2, n_est

[CV]  max_features=auto, min_samples_split=2, n_estimators=5, score=0.018631 -   5.9s
[CV] max_features=auto, min_samples_split=2, n_estimators=5 ..........
[CV]  max_features=auto, min_samples_split=2, n_estimators=5, score=0.017022 -   5.8s
[CV] max_features=auto, min_samples_split=2, n_estimators=5 ..........
[CV]  max_features=auto, min_samples_split=2, n_estimators=5, score=0.025085 -   5.2s
[CV] max_features=auto, min_samples_split=2, n_estimators=10 .........
[CV]  max_features=auto, min_samples_split=2, n_estimators=10, score=0.009674 -  10.5s
[CV] max_features=auto, min_samples_split=2, n_estimators=10 .........
[CV]  max_features=auto, min_samples_split=2, n_estimators=10, score=0.010034 -  10.9s
[CV] max_features=auto, min_samples_split=2, n_estimators=10 .........
[CV]  max_features=auto, min_samples_split=2, n_estimators=10, score=0.008601 -  13.3s
[CV] max_features=auto, min_samples_split=2, n_estimators=100 ........
[CV]  max_features=auto, min_samples_split=2, n_estimat

[Parallel(n_jobs=1)]: Done  81 out of  81 | elapsed: 36.3min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [5, 10, 100], 'min_samples_split': [2, 5, 10], 'max_features': ['sqrt', 'log2', 'auto']},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=3)

In [165]:
print(grid_rfc4.best_estimator_)
predictions_rfc4 = grid_rfc4.predict(X_test)
print(classification_report(y_test, predictions_rfc4))

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
             precision    recall  f1-score   support

          0       0.49      0.09      0.15       376
          1       0.16      0.01      0.02       243
          2       0.15      0.01      0.02       222
          3       0.00      0.00      0.00       124
          4       0.17      0.02      0.04       286
          5       0.58      0.07      0.12       102
          6       0.24      0.02      0.04       290
          7       0.36      0.03      0.06       163
          8       0.12      0.01      0.01       177
          9       0.19      0.03      0.06       

The score is similar to the one with classifier chain

What about bag of words?

# BOW

In [254]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(ngram_range=(1, 2), min_df=10)
bow = count_vect.fit_transform(new_lyrics['clean_lyrics'])
print(bow[0])

  (0, 12138)	1
  (0, 1088)	2
  (0, 6176)	21
  (0, 20938)	6
  (0, 20459)	1
  (0, 11891)	1
  (0, 11285)	2
  (0, 1081)	4
  (0, 8170)	2
  (0, 17042)	6
  (0, 12341)	6
  (0, 20937)	6
  (0, 6175)	30
  (0, 17515)	40
  (0, 7993)	2
  (0, 10216)	8
  (0, 16653)	8
  (0, 22624)	17


In [434]:
pickle.dump(count_vect, open('bow_count_vect.pickle', 'wb'))

In [255]:
X_bow = bow

In [256]:
X_train, X_test, y_train, y_test = train_test_split(X_bow, y_bina, test_size=0.1, random_state=101)

### RFC5 BOW with Classifier chain

In [257]:
chain_rfc5 = ClassifierChain(RandomForestClassifier())
chain_rfc5.fit(X_train, y_train)

ClassifierChain(base_estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
        cv=None, order=None, random_state=None)

In [258]:
predictions_rfc5 = chain_rfc5.predict(X_test)
print(classification_report(y_test, predictions_rfc5))

             precision    recall  f1-score   support

          0       0.33      0.01      0.02       195
          1       0.00      0.00      0.00       118
          2       0.00      0.00      0.00       118
          3       0.00      0.00      0.00        63
          4       0.25      0.02      0.04       151
          5       0.33      0.04      0.08        45
          6       0.42      0.03      0.06       162
          7       0.33      0.02      0.04        88
          8       0.20      0.01      0.02        91
          9       0.41      0.04      0.08       201
         10       0.39      0.05      0.09       187
         11       0.00      0.00      0.00        75
         12       0.08      0.01      0.01       196
         13       0.00      0.00      0.00        20
         14       0.12      0.01      0.02       122
         15       0.12      0.01      0.01       144
         16       0.23      0.01      0.02       267
         17       0.00      0.00      0.00   

  'precision', 'predicted', average, warn_for)


In [435]:
pickle.dump(chain_rfc5, open('moods_chain_rfc5.pickle', 'wb'))

The score is similar to what we got with TF-IDF and RFC

### kNN3 BOW with Classifier chain

In [259]:
chain_knn3 = ClassifierChain(KNeighborsClassifier())
chain_knn3.fit(X_train, y_train)

ClassifierChain(base_estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
        cv=None, order=None, random_state=None)

In [260]:
predictions_knn3 = chain_knn3.predict(X_test)
print(classification_report(y_test, predictions_knn3))

             precision    recall  f1-score   support

          0       0.33      0.03      0.06       195
          1       0.00      0.00      0.00       118
          2       0.10      0.05      0.07       118
          3       0.00      0.00      0.00        63
          4       0.25      0.01      0.01       151
          5       0.00      0.00      0.00        45
          6       0.67      0.04      0.07       162
          7       0.00      0.00      0.00        88
          8       0.00      0.00      0.00        91
          9       0.24      0.02      0.05       201
         10       0.38      0.03      0.06       187
         11       0.00      0.00      0.00        75
         12       0.20      0.01      0.02       196
         13       0.00      0.00      0.00        20
         14       0.00      0.00      0.00       122
         15       0.00      0.00      0.00       144
         16       0.23      0.06      0.10       267
         17       1.00      0.01      0.02   

  'precision', 'predicted', average, warn_for)


Similar to first scores

Let's try to filter out the non-English data

# Remove foreign languages

In [261]:
from langdetect import detect

In [262]:
en_lyrics = lyrics.copy()

for index, row in en_lyrics.iterrows():
    if detect(row['lyrics_features']) != 'en':
        en_lyrics.drop(axis=0, index=index, inplace=True)
        
en_lyrics.shape

(19434, 3)

In [263]:
cols2 = ['clean_lyrics', 'moods']
en_lyrics = en_lyrics[cols2]
en_lyrics.head()

Unnamed: 0,clean_lyrics,moods
1,late ve ve lose sleep dream thing babi ve ve p...,[happy]
2,parti rock yeah woo let s parti rock hous toni...,"[happy, celebratory, rowdy]"
4,j lo s new generat mr worldwid parti peopl flo...,[energetic]
5,today don t feel like do just wanna lay bed do...,"[happy, sprightly]"
6,s start heart reach fever pitch s bring dark f...,[warm]


Yay seems like it worked :)

Let's look at the y data

In [264]:
y = en_lyrics['moods']

In [265]:
mlb_en = MultiLabelBinarizer()
y_bina_en = mlb_en.fit_transform(y)
y_bina_en

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

Let's try this dataset on the RFC + chain classifier

## TF-IDF-EN with EN lyrics

In [288]:
vectorizer_en = TfidfVectorizer(ngram_range=(1,2), min_df=10)
tf_idf_en = vectorizer_en.fit_transform(en_lyrics['clean_lyrics'])
print(tf_idf_en[0])

  (0, 11362)	0.10513618837496067
  (0, 22935)	0.23442870962670864
  (0, 13315)	0.10016971771099573
  (0, 19491)	0.0958732908391669
  (0, 5327)	0.08106395649830213
  (0, 21302)	0.09452304237166369
  (0, 938)	0.06591460900993013
  (0, 16924)	0.11908067959139214
  (0, 8821)	0.08359707172173109
  (0, 18374)	0.07301417160224398
  (0, 3883)	0.26870204851664475
  (0, 5001)	0.12352866499092034
  (0, 12761)	0.08115109606283256
  (0, 20127)	0.11269191693996185
  (0, 24806)	0.00937740337197313
  (0, 11887)	0.009822064274593907
  (0, 12079)	0.006784947007571454
  (0, 20849)	0.038236528139323975
  (0, 23118)	0.025349425937520324
  (0, 9106)	0.009889898882385392
  (0, 12529)	0.013675339203137295
  (0, 6037)	0.011518034461276734
  (0, 6643)	0.020639093942934686
  (0, 19290)	0.017036516044442562
  (0, 18869)	0.02179677278162771
  :	:
  (0, 11946)	0.023432806319843767
  (0, 12536)	0.03092843131264083
  (0, 15987)	0.058150235519397284
  (0, 5185)	0.03739762350008375
  (0, 21528)	0.06095764374890914
  (0

In [436]:
pickle.dump(vectorizer_en, open('tfidf_vectorizer_en.pickle', 'wb'))

In [289]:
X_tfidf_en = tf_idf_en

In [290]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_en, y_bina_en, test_size=0.2, random_state=101)

### RFC_EN1 TFIDF with Classifier chain

In [291]:
chain_rfc_en1 = ClassifierChain(RandomForestClassifier())
chain_rfc_en1.fit(X_train, y_train)

ClassifierChain(base_estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
        cv=None, order=None, random_state=None)

In [292]:
predictions_rfc_en1 = chain_rfc_en1.predict(X_test)
print(classification_report(y_test, predictions_rfc_en1))

             precision    recall  f1-score   support

          0       0.50      0.03      0.05       360
          1       0.00      0.00      0.00       232
          2       0.21      0.01      0.03       214
          3       0.00      0.00      0.00       132
          4       0.21      0.02      0.04       255
          5       0.40      0.05      0.08        85
          6       0.29      0.02      0.04       271
          7       0.20      0.01      0.02       174
          8       0.12      0.01      0.01       152
          9       0.33      0.03      0.06       391
         10       0.39      0.05      0.09       393
         11       0.25      0.01      0.01       177
         12       0.22      0.02      0.04       315
         13       0.50      0.02      0.04        48
         14       0.31      0.02      0.03       284
         15       0.18      0.01      0.02       239
         16       0.26      0.02      0.04       522
         17       0.25      0.01      0.02   

In [437]:
pickle.dump(chain_rfc_en1, open('moods_chain_rfc_en1.pickle', 'wb'))

Similar score to before :(

### kNN-EN1 TFIDF with Classifier chain

In [269]:
chain_knn_en1 = ClassifierChain(KNeighborsClassifier())
chain_knn_en1.fit(X_train, y_train)

ClassifierChain(base_estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
        cv=None, order=None, random_state=None)

In [270]:
predictions_knn_en1 = chain_knn_en1.predict(X_test)
print(classification_report(y_test, predictions_knn_en1))

             precision    recall  f1-score   support

          0       0.29      0.03      0.05       360
          1       0.00      0.00      0.00       232
          2       0.00      0.00      0.00       214
          3       0.00      0.00      0.00       132
          4       0.57      0.02      0.03       255
          5       1.00      0.04      0.07        85
          6       0.38      0.02      0.04       271
          7       0.00      0.00      0.00       174
          8       0.00      0.00      0.00       152
          9       0.29      0.02      0.03       391
         10       0.22      0.32      0.26       393
         11       0.00      0.00      0.00       177
         12       0.16      0.03      0.06       315
         13       0.00      0.00      0.00        48
         14       0.24      0.03      0.06       284
         15       0.18      0.02      0.04       239
         16       0.24      0.08      0.12       522
         17       0.18      0.02      0.03   

  'precision', 'predicted', average, warn_for)


Even lower :(((((

## BOW-EN with EN lyrics

In [271]:
count_vect_en = CountVectorizer(ngram_range=(1, 2), min_df=10)
bow_en = count_vect_en.fit_transform(en_lyrics['clean_lyrics'])
print(bow_en[0])

  (0, 23368)	1
  (0, 14046)	1
  (0, 23699)	9
  (0, 13969)	1
  (0, 24437)	1
  (0, 6317)	1
  (0, 13503)	1
  (0, 6383)	1
  (0, 12891)	3
  (0, 6302)	2
  (0, 13912)	2
  (0, 10755)	2
  (0, 17951)	2
  (0, 4932)	2
  (0, 6455)	2
  (0, 21330)	1
  (0, 24688)	1
  (0, 4936)	1
  (0, 6409)	1
  (0, 10427)	2
  (0, 21528)	2
  (0, 5185)	2
  (0, 15987)	2
  (0, 12536)	1
  (0, 11946)	1
  :	:
  (0, 18869)	1
  (0, 19290)	1
  (0, 6643)	1
  (0, 6037)	1
  (0, 12529)	1
  (0, 9106)	1
  (0, 23118)	1
  (0, 20849)	2
  (0, 12079)	1
  (0, 11887)	1
  (0, 24806)	1
  (0, 20127)	8
  (0, 12761)	11
  (0, 5001)	7
  (0, 3883)	15
  (0, 18374)	7
  (0, 8821)	7
  (0, 16924)	7
  (0, 938)	7
  (0, 21302)	10
  (0, 5327)	7
  (0, 19491)	7
  (0, 13315)	7
  (0, 22935)	28
  (0, 11362)	7


In [272]:
X_bow_en = bow_en

In [273]:
X_train, X_test, y_train, y_test = train_test_split(X_bow_en, y_bina_en, test_size=0.2, random_state=101)

### RFC_EN2 BOW with Classifier chain

In [208]:
chain_rfc_en2 = ClassifierChain(RandomForestClassifier())
chain_rfc_en2.fit(X_train, y_train)

ClassifierChain(base_estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
        cv=None, order=None, random_state=None)

In [209]:
predictions_rfc_en2 = chain_rfc_en2.predict(X_test)
print(classification_report(y_test, predictions_rfc_en2))

             precision    recall  f1-score   support

          0       0.42      0.02      0.03       316
          1       0.20      0.00      0.01       253
          2       0.07      0.00      0.01       205
          3       0.11      0.01      0.01       141
          4       0.07      0.01      0.01       248
          5       0.38      0.06      0.11        82
          6       0.40      0.02      0.04       270
          7       0.23      0.02      0.04       153
          8       0.33      0.01      0.02       173
          9       0.48      0.06      0.10       416
         10       0.23      0.02      0.04       414
         11       0.07      0.01      0.01       152
         12       0.21      0.02      0.03       297
         13       0.00      0.00      0.00        56
         14       0.14      0.01      0.01       260
         15       0.15      0.01      0.01       257
         16       0.35      0.02      0.05       534
         17       0.08      0.01      0.01   

Same

### kNN_EN2 BOW with Classifier chain

In [276]:
chain_knn_en2 = ClassifierChain(KNeighborsClassifier(n_neighbors=10))
chain_knn_en2.fit(X_train, y_train)

ClassifierChain(base_estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform'),
        cv=None, order=None, random_state=None)

In [277]:
predictions_knn_en2 = chain_knn_en2.predict(X_test)
print(classification_report(y_test, predictions_knn_en2))

             precision    recall  f1-score   support

          0       1.00      0.01      0.02       360
          1       0.00      0.00      0.00       232
          2       0.00      0.00      0.00       214
          3       0.00      0.00      0.00       132
          4       1.00      0.00      0.01       255
          5       0.00      0.00      0.00        85
          6       1.00      0.01      0.01       271
          7       0.00      0.00      0.00       174
          8       0.00      0.00      0.00       152
          9       0.00      0.00      0.00       391
         10       0.50      0.01      0.02       393
         11       0.00      0.00      0.00       177
         12       0.00      0.00      0.00       315
         13       0.00      0.00      0.00        48
         14       0.00      0.00      0.00       284
         15       0.00      0.00      0.00       239
         16       0.06      0.00      0.00       522
         17       0.00      0.00      0.00   

  'precision', 'predicted', average, warn_for)


# Testing

In [439]:
def model(lyrics):
#     Clean text
    clean_lyrics = clean_text(lyrics)
    lyrics_list = []
    lyrics_list.append(clean_lyrics)
    
#     First classifier: tf-idf chain_rf3
    format_rfc3 = vectorizer2.transform(lyrics_list)
    moods_rfc3 = chain_rfc3.predict_proba(format_rfc3)
    
#    Second classifier: bow chain_rfc5
    format_rfc5 = count_vect.transform(lyrics_list)
    moods_rfc5 = chain_rfc5.predict_proba(format_rfc5)
    
#     Third classifier: 
    format_rfc_en1 = vectorizer_en.transform(lyrics_list)
    moods_rfc_en1 = chain_rfc_en1.predict_proba(format_rfc_en1)
    
#     Put all in a mood list
    all_moods_list = pd.DataFrame(
    {'mood': mlb.classes_.tolist(),
     'rfc3': moods_rfc3.tolist()[0],
     'rfc5': moods_rfc5.tolist()[0],
     'rfc_en1': moods_rfc_en1.tolist()[0]
    })
    
#     Calculate the max score
    all_moods_list['max'] = all_moods_list.max(axis=1)
    all_moods_list.sort_values('max', axis=0, ascending=False, inplace=True)
    all_moods_list.reset_index(drop=True, inplace=True)
#     print(all_moods_list)
    
#     Select relevant moods
    if all_moods_list.loc[0,('max')] >= 0.3:
        mood_1 = all_moods_list['mood'][0]
        if all_moods_list.loc[1,('max')] >= 0.3:
            mood_2 = all_moods_list['mood'][1]
            if all_moods_list.loc[1,('max')] >= 0.3:
                mood_3 = all_moods_list['mood'][2]
            else:
                mood_3 = ""
        else:
            mood_2 = ""
            mood_3 = ""
    else:
        mood_1 = all_moods_list['mood'][0]
        mood_2 = ""
        mood_3 = ""
        
#     print(mood_1)
#     print(mood_2)
#     print(mood_3)

    final_moods = []
    final_moods.append(mood_1)
    if mood_2 != "":
        final_moods.append(mood_2)
    if mood_3 != "":
        final_moods.append(mood_3)
            
    return final_moods


In [440]:
n = 12345
print(lyrics['lyrics_features'][n])
print(lyrics['moods'][n])

da da da da da da da da da da da da da da you re sweet like chocolate boy sweet like chocolate you bring me so much joy you re sweet like chocolate boy finding a way in the dark ain t so hard when you re close to my heart you are there when i m feeling alone all i need is for you to come home chorus you re sweet like chocolate boy sweet like chocolate you bring me so much joy you re sweet like chocolate boy trust is the lock is the key there s no doubt that your love s all for me you are sweet on the tip of my tongue you are warm like the rays of the sun chorus you re sweet like chocolate boy sweet like chocolate you bring me so much joy you re sweet like chocolate boy you re sweet like da da da da da da knowing you re there every day makes me high in my own special way i am caught in the face of your love holding you is a gift from above you re sweet like chocolate boy sweet like chocolate you bring me so much joy you re sweet like chocolate boy you re sweet like da da da sweet like c

In [441]:
testlolo = """Oh, oh, oh, little China girl
Oh, oh, oh, little China girl
I could escape this feeling, with my China girl
I feel a wreck without my, little China girl
I hear her heart beating, loud as thunder
Saw they stars crashing
I'm a mess without my, little China girl
Wake up mornings where's my, little China girl
I hear her heart's beating, loud as thunder
Saw they stars crashing down
I feel a-tragic like I'm Marlon Brando
When I look at my China girl
I could pretend that nothing really meant too much
When I look at my China girl
I stumble into town just like a sacred cow
Visions of swastikas in my head
Plans for everyone
It's in the whites of my eyes
My little China girl
You shouldn't mess with me
I'll ruin everything you are
You know, I'll give you television
I'll give you eyes of blue
I'll give you men's who want to rule the world
And when I get excited
My little China girl says
Oh baby, just you shut your mouth
She says, sh-sh-shhh
She says, sh-sh-shhh
She says
She says
And when I get excited
My little China girl says
Oh baby, just you shut your mouth
And when I get excited
My little China girl says
Oh baby, just you shut your mouth
She says, sh-sh-shhh
She says
Oh, oh, oh, little China girl
Oh, oh, oh, little China girl
Oh, oh, oh, little China girl
Oh, oh, oh, little China girl
"""

In [442]:
moods_proba = model(testlolo)
mood_classes = mlb.classes_.tolist()
moods_proba

['sprightly', 'celebratory', 'cocky']

In [447]:
def moods_model(lyrics):
#     Clean text
    clean_lyrics = clean_text(lyrics)
    lyrics_list = []
    lyrics_list.append(clean_lyrics)
    
#     First classifier: tf-idf chain_rf3
    lyrics1 = moods_vect1.transform(lyrics_list)
    moods_proba1 = moods_cl1.predict_proba(lyrics1)
    
#    Second classifier: bow chain_rfc5
    lyrics2 = moods_vect2.transform(lyrics_list)
    moods_proba2 = moods_cl2.predict_proba(lyrics2)
    
#     Third classifier: 
    lyrics3 = moods_vect3.transform(lyrics_list)
    moods_proba3 = moods_cl3.predict_proba(lyrics3)
    
#     Put all in a mood list
    all_moods_list = pd.DataFrame(
    {'mood': mlb.classes_.tolist(),
     'cl1': moods_proba1.tolist()[0],
     'cl2': moods_proba2.tolist()[0],
     'cl3': moods_proba3.tolist()[0]
    })
    
#     Calculate the max score
    all_moods_list['max'] = all_moods_list.max(axis=1)
    all_moods_list.sort_values('max', axis=0, ascending=False, inplace=True)
    all_moods_list.reset_index(drop=True, inplace=True)
#     print(all_moods_list)
    
#     Select relevant moods
    if all_moods_list.loc[0,('max')] >= 0.3:
        mood_1 = all_moods_list['mood'][0]
        if all_moods_list.loc[1,('max')] >= 0.3:
            mood_2 = all_moods_list['mood'][1]
            if all_moods_list.loc[1,('max')] >= 0.3:
                mood_3 = all_moods_list['mood'][2]
            else:
                mood_3 = ""
        else:
            mood_2 = ""
            mood_3 = ""
    else:
        mood_1 = all_moods_list['mood'][0]
        mood_2 = ""
        mood_3 = ""
        
#     print(mood_1)
#     print(mood_2)
#     print(mood_3)

    final_moods = []
    final_moods.append(mood_1)
    if mood_2 != "":
        final_moods.append(mood_2)
    if mood_3 != "":
        final_moods.append(mood_3)
            
    return final_moods

In [448]:
pickle.dump(moods_model, open('moods_model_function.pickle', 'wb'))