# CoderSchool Final Project Moods Function
## Music Recommendation System

In [131]:
import pandas as pd
import numpy as np

# Part 1 - Data cleaning

In [132]:
full_df = pd.read_json('MasterSongList.json')

In [133]:
cols = ['lyrics_features', 'moods']
lyrics = full_df.copy()
lyrics = lyrics[cols]

Remove the wrong format

In [134]:
lyrics['lyrics_features'] = lyrics['lyrics_features'].apply(' '.join)
# lyrics['moods'] = lyrics['moods'].apply(', '.join)

Replace empty lyrics with NaN and drop them

In [135]:
lyrics['lyrics_features'].replace('', np.nan, inplace=True)

In [136]:
lyrics.dropna(subset=['lyrics_features'], inplace=True)

In [137]:
lyrics.reset_index(drop=True, inplace=True)

# Part 2 - Lyrics cleaning

In [138]:
from string import punctuation
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk.stem.snowball import SnowballStemmer

In [139]:
def clean_text(raw_text):
    # Create empty list to receive result
    clean_words = []
    
    # 1. Convert to lower case
    raw_text = raw_text.lower()
    
    # 2. Remove punctuation
    translator = str.maketrans('', '', punctuation)
    raw_text = raw_text.translate(translator)
    split_words = raw_text.split()
    
    # 3 & 4. Remove common words and stem words
    stemmer = SnowballStemmer('english')
    for word in split_words:
        if word not in ENGLISH_STOP_WORDS:
            stemmed_word = stemmer.stem(word)
            clean_words.append(stemmed_word)
            
    return ' '.join(clean_words)

In [140]:
lyrics['clean_lyrics'] = lyrics['lyrics_features'].apply(clean_text)

In [141]:
cols2 = ['clean_lyrics', 'moods']
new_lyrics = lyrics.copy()
new_lyrics = new_lyrics[cols2]

# Part 3 - Try classifiers

Let's start with the following:
- TF-IDF with MultiLabelBinarizer and a Classifier Chain

### TF-IDF RFC Multilabel Classifier Chain

In [142]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

In [143]:
tf_idf = vectorizer.fit_transform(new_lyrics['clean_lyrics'])

In [144]:
X = tf_idf
y = new_lyrics['moods']

Let's use the multilabel binarizer

In [151]:
from sklearn.preprocessing import MultiLabelBinarizer

In [152]:
mlb = MultiLabelBinarizer()
y_bina = mlb.fit_transform(y)

In [153]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y_bina, test_size=0.1, random_state=101)

RFC with Classifier chain

In [154]:
from sklearn.multioutput import ClassifierChain
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [155]:
chain = ClassifierChain(RandomForestClassifier())

In [156]:
chain.fit(X_train, y_train)
predictions = chain.predict(X_test)

In [157]:
print(classification_report(y_test, predictions))

             precision    recall  f1-score   support

          0       0.21      0.03      0.05       195
          1       0.00      0.00      0.00       118
          2       0.00      0.00      0.00       118
          3       0.00      0.00      0.00        63
          4       0.31      0.03      0.05       151
          5       0.40      0.04      0.08        45
          6       0.30      0.02      0.03       162
          7       0.20      0.01      0.02        88
          8       0.00      0.00      0.00        91
          9       0.38      0.04      0.08       201
         10       0.45      0.05      0.09       187
         11       0.00      0.00      0.00        75
         12       0.00      0.00      0.00       196
         13       0.00      0.00      0.00        20
         14       0.00      0.00      0.00       122
         15       0.67      0.01      0.03       144
         16       0.24      0.01      0.03       267
         17       0.00      0.00      0.00   

  'precision', 'predicted', average, warn_for)


# Functions

### Function 1: transform text

In [172]:
def clean_lyrics(raw_list):
    from string import punctuation
    from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
    from nltk.stem.snowball import SnowballStemmer
    
    raw_text = ' '.join(raw_list)
    
    # Create empty list to receive result
    clean_words = []
    
    # 1. Convert to lower case
    raw_text = raw_text.lower()
    
    # 2. Remove punctuation
    translator = str.maketrans('', '', punctuation)
    raw_text = raw_text.translate(translator)
    split_words = raw_text.split()
    
    # 3 & 4. Remove common words and stem words
    stemmer = SnowballStemmer('english')
    for word in split_words:
        if word not in ENGLISH_STOP_WORDS:
            stemmed_word = stemmer.stem(word)
            clean_words.append(stemmed_word)
            
    return ' '.join(clean_words)

In [243]:
def tfidf_rfc(clean_lyrics):
    lyrics_list = []
    lyrics_list.append(clean_lyrics)
    tfidf_lyrics = vectorizer.transform(lyrics_list)
    moods = chain.predict(tfidf_lyrics)
    prediction = mlb.inverse_transform(moods)
    return prediction

### Test

In [259]:
# print(full_df['lyrics_features'][10])
# print(full_df['moods'][10])

In [257]:
# clean_text = clean_lyrics(full_df['lyrics_features'][10])

In [258]:
# predicted_moods = tfidf_rfc(clean_text)
# predicted_moods