# NLP Methods on Music Reviews: Baseline Model


In [296]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from textblob import TextBlob
from numpy import dot
from numpy.linalg import norm

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LinearRegression

## Baseline Implementation

In [2]:
df = pd.read_parquet("datasets/processed_reviews.parquet")

#### Importing Lexicons/Counting Sentiments

In [3]:
 
def sentiment_counter(token_list):
    """Function to make list of each sentiment type"""
    pos_word_list = []
    neg_word_list = []
    neu_word_list = []

    for word in token_list:
        testimonial = TextBlob(word)
        if testimonial.sentiment.polarity >= 0.5:
            pos_word_list.append(word)
        elif testimonial.sentiment.polarity <= -0.5:
            neg_word_list.append(word)
        else:
            neu_word_list.append(word)
    return pos_word_list, neg_word_list, neu_word_list
        

In [4]:
# Adding the sentiments to the df

df["Sentiment List (Pos/Neg/Neu)"] = df['preprocessed_review'].apply(lambda x:sentiment_counter(x.split()))
df.head()

Unnamed: 0,id,review,track,artist,url,score,best_new_music,author,author_type,date,weekday,day,month,year,preprocessed_review,Sentiment List (Pos/Neg/Neu)
0,22703,"“Trip-hop” eventually became a ’90s punchline,...",mezzanine,massive attack,http://pitchfork.com/reviews/albums/22703-mezz...,9.3,0,nate patrin,contributor,2017-01-08,6,8,1,2017,trip-hop eventually became 90 punchline music-...,"([best, famous, love, beautiful, top, sure, cr..."
1,22721,"Eight years, five albums, and two EPs in, the ...",prelapsarian,krallice,http://pitchfork.com/reviews/albums/22721-prel...,7.9,0,zoe camp,contributor,2017-01-07,5,7,1,2017,eight year five album two eps new york-based o...,"([great, best, spontaneously, good], [grim, ha..."
2,22659,Minneapolis’ Uranium Club seem to revel in bei...,all of them naturals,uranium club,http://pitchfork.com/reviews/albums/22659-all-...,7.3,0,david glickman,contributor,2017-01-07,5,7,1,2017,minneapolis uranium club seem revel aggressive...,"([many, love, talented, perfect], [vaguely, ca..."
3,22661,Kleenex began with a crash. It transpired one ...,first songs,"kleenex, liliput",http://pitchfork.com/reviews/albums/22661-firs...,9.0,1,jenn pelly,associate reviews editor,2017-01-06,4,6,1,2017,kleenex began crash transpired one night long ...,"([nice, delighted, ok, greatest, perfect, amus..."
4,22725,It is impossible to consider a given release b...,new start,taso,http://pitchfork.com/reviews/albums/22725-new-...,8.1,0,kevin lozano,tracks coordinator,2017-01-06,4,6,1,2017,impossible consider given release footwork art...,"([remarkable, best, warm, perfect, perfect, ab..."


In [5]:
# record review length and drop empty reviews
df["review_length"] = df["preprocessed_review"].apply(lambda x: len(x.split()))
df = df[df["review_length"]>0]

# make columns for percentage of each review that is pos/neg/nue
df["%Positive Sentiment"] = df['Sentiment List (Pos/Neg/Neu)'].apply(lambda x:len(x[0]))/df["review_length"]
df["%Negative Sentiment"] = df['Sentiment List (Pos/Neg/Neu)'].apply(lambda x:len(x[1]))/df["review_length"]
df["%Neutral Sentiment"] = df['Sentiment List (Pos/Neg/Neu)'].apply(lambda x:len(x[2]))/df["review_length"]
df.head()

Unnamed: 0,id,review,track,artist,url,score,best_new_music,author,author_type,date,weekday,day,month,year,preprocessed_review,Sentiment List (Pos/Neg/Neu),review_length,%Positive Sentiment,%Negative Sentiment,%Neutral Sentiment
0,22703,"“Trip-hop” eventually became a ’90s punchline,...",mezzanine,massive attack,http://pitchfork.com/reviews/albums/22703-mezz...,9.3,0,nate patrin,contributor,2017-01-08,6,8,1,2017,trip-hop eventually became 90 punchline music-...,"([best, famous, love, beautiful, top, sure, cr...",860,0.012791,0.003488,0.983721
1,22721,"Eight years, five albums, and two EPs in, the ...",prelapsarian,krallice,http://pitchfork.com/reviews/albums/22721-prel...,7.9,0,zoe camp,contributor,2017-01-07,5,7,1,2017,eight year five album two eps new york-based o...,"([great, best, spontaneously, good], [grim, ha...",276,0.014493,0.01087,0.974638
2,22659,Minneapolis’ Uranium Club seem to revel in bei...,all of them naturals,uranium club,http://pitchfork.com/reviews/albums/22659-all-...,7.3,0,david glickman,contributor,2017-01-07,5,7,1,2017,minneapolis uranium club seem revel aggressive...,"([many, love, talented, perfect], [vaguely, ca...",326,0.01227,0.009202,0.978528
3,22661,Kleenex began with a crash. It transpired one ...,first songs,"kleenex, liliput",http://pitchfork.com/reviews/albums/22661-firs...,9.0,1,jenn pelly,associate reviews editor,2017-01-06,4,6,1,2017,kleenex began crash transpired one night long ...,"([nice, delighted, ok, greatest, perfect, amus...",745,0.021477,0.002685,0.975839
4,22725,It is impossible to consider a given release b...,new start,taso,http://pitchfork.com/reviews/albums/22725-new-...,8.1,0,kevin lozano,tracks coordinator,2017-01-06,4,6,1,2017,impossible consider given release footwork art...,"([remarkable, best, warm, perfect, perfect, ab...",309,0.035599,0.003236,0.961165


In [6]:
# Making Labels
df['label'] = df['score'].apply(lambda sc: 1 if sc>=7 else 0)
df.head()
df

Unnamed: 0,id,review,track,artist,url,score,best_new_music,author,author_type,date,...,day,month,year,preprocessed_review,Sentiment List (Pos/Neg/Neu),review_length,%Positive Sentiment,%Negative Sentiment,%Neutral Sentiment,label
0,22703,"“Trip-hop” eventually became a ’90s punchline,...",mezzanine,massive attack,http://pitchfork.com/reviews/albums/22703-mezz...,9.3,0,nate patrin,contributor,2017-01-08,...,8,1,2017,trip-hop eventually became 90 punchline music-...,"([best, famous, love, beautiful, top, sure, cr...",860,0.012791,0.003488,0.983721,1
1,22721,"Eight years, five albums, and two EPs in, the ...",prelapsarian,krallice,http://pitchfork.com/reviews/albums/22721-prel...,7.9,0,zoe camp,contributor,2017-01-07,...,7,1,2017,eight year five album two eps new york-based o...,"([great, best, spontaneously, good], [grim, ha...",276,0.014493,0.010870,0.974638,1
2,22659,Minneapolis’ Uranium Club seem to revel in bei...,all of them naturals,uranium club,http://pitchfork.com/reviews/albums/22659-all-...,7.3,0,david glickman,contributor,2017-01-07,...,7,1,2017,minneapolis uranium club seem revel aggressive...,"([many, love, talented, perfect], [vaguely, ca...",326,0.012270,0.009202,0.978528,1
3,22661,Kleenex began with a crash. It transpired one ...,first songs,"kleenex, liliput",http://pitchfork.com/reviews/albums/22661-firs...,9.0,1,jenn pelly,associate reviews editor,2017-01-06,...,6,1,2017,kleenex began crash transpired one night long ...,"([nice, delighted, ok, greatest, perfect, amus...",745,0.021477,0.002685,0.975839,1
4,22725,It is impossible to consider a given release b...,new start,taso,http://pitchfork.com/reviews/albums/22725-new-...,8.1,0,kevin lozano,tracks coordinator,2017-01-06,...,6,1,2017,impossible consider given release footwork art...,"([remarkable, best, warm, perfect, perfect, ab...",309,0.035599,0.003236,0.961165,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18396,1535,The marketing guys of yer average modern megac...,let us replay!,coldcut,http://pitchfork.com/reviews/albums/1535-let-u...,8.9,0,james p. wisdom,,1999-01-26,...,26,1,1999,marketing guy yer average modern megaconglomer...,"([love, interesting, good, good, pleased], [va...",280,0.017857,0.007143,0.975000,1
18397,1341,"Well, it's been two weeks now, and I guess it'...",1999,cassius,http://pitchfork.com/reviews/albums/1341-1999/,4.8,0,james p. wisdom,,1999-01-26,...,26,1,1999,well 's two week guess 's time tell guy someth...,"([plausible, sure, sure, sure, beloved, better...",134,0.044776,0.014925,0.940299,0
18398,5376,"Out of Tune is a Steve Martin album. Yes, I'l...",out of tune,mojave 3,http://pitchfork.com/reviews/albums/5376-out-o...,6.3,0,jason josephes,contributor,1999-01-12,...,12,1,1999,tune steve martin album yes 'll explain upon t...,"([notably, catching, sure, good, good, happy, ...",457,0.019694,0.019694,0.960613,0
18399,2413,"Well, kids, I just went back and re-read my re...","singles breaking up, vol. 1",don caballero,http://pitchfork.com/reviews/albums/2413-singl...,7.2,0,james p. wisdom,,1999-01-12,...,12,1,1999,well kid went back re-read review guy last alb...,"([love, sure, sure, love, perfect, enjoying], ...",243,0.024691,0.008230,0.967078,1


### Word Length Feature Count

In [7]:
def word_length(token_list):
    count = 0
    for token in token_list:
        if len(token) >= 6:
            count+=1
    return count


In [8]:
df["% Long Word Length"] = df['Sentiment List (Pos/Neg/Neu)'].apply(lambda x:(word_length(x[0]) + word_length(x[1]) + word_length(x[2])))/df['review_length']
df.head()

Unnamed: 0,id,review,track,artist,url,score,best_new_music,author,author_type,date,...,month,year,preprocessed_review,Sentiment List (Pos/Neg/Neu),review_length,%Positive Sentiment,%Negative Sentiment,%Neutral Sentiment,label,% Long Word Length
0,22703,"“Trip-hop” eventually became a ’90s punchline,...",mezzanine,massive attack,http://pitchfork.com/reviews/albums/22703-mezz...,9.3,0,nate patrin,contributor,2017-01-08,...,1,2017,trip-hop eventually became 90 punchline music-...,"([best, famous, love, beautiful, top, sure, cr...",860,0.012791,0.003488,0.983721,1,0.560465
1,22721,"Eight years, five albums, and two EPs in, the ...",prelapsarian,krallice,http://pitchfork.com/reviews/albums/22721-prel...,7.9,0,zoe camp,contributor,2017-01-07,...,1,2017,eight year five album two eps new york-based o...,"([great, best, spontaneously, good], [grim, ha...",276,0.014493,0.01087,0.974638,1,0.576087
2,22659,Minneapolis’ Uranium Club seem to revel in bei...,all of them naturals,uranium club,http://pitchfork.com/reviews/albums/22659-all-...,7.3,0,david glickman,contributor,2017-01-07,...,1,2017,minneapolis uranium club seem revel aggressive...,"([many, love, talented, perfect], [vaguely, ca...",326,0.01227,0.009202,0.978528,1,0.56135
3,22661,Kleenex began with a crash. It transpired one ...,first songs,"kleenex, liliput",http://pitchfork.com/reviews/albums/22661-firs...,9.0,1,jenn pelly,associate reviews editor,2017-01-06,...,1,2017,kleenex began crash transpired one night long ...,"([nice, delighted, ok, greatest, perfect, amus...",745,0.021477,0.002685,0.975839,1,0.514094
4,22725,It is impossible to consider a given release b...,new start,taso,http://pitchfork.com/reviews/albums/22725-new-...,8.1,0,kevin lozano,tracks coordinator,2017-01-06,...,1,2017,impossible consider given release footwork art...,"([remarkable, best, warm, perfect, perfect, ab...",309,0.035599,0.003236,0.961165,1,0.543689


In [9]:
from nltk.tag import pos_tag

def POS_count(token_list):
    most_common_POS = ['NN', 'NNP', 'DT', 'IN', 'JJ', 'NNS','CC','PRP','VB','VBG']
    count = 0
    pos_tags = [x[1] for x in pos_tag(token_list)]
    for tag in pos_tags:
        if tag in most_common_POS:
            count +=1
    return count
    


### Percent Common POS Tag

In [10]:
df['% Common POS Tag'] = df['Sentiment List (Pos/Neg/Neu)'].apply(lambda x: POS_count(x[0]) + POS_count(x[1]) + POS_count(x[2]))/df['review_length']
df.head()

Unnamed: 0,id,review,track,artist,url,score,best_new_music,author,author_type,date,...,year,preprocessed_review,Sentiment List (Pos/Neg/Neu),review_length,%Positive Sentiment,%Negative Sentiment,%Neutral Sentiment,label,% Long Word Length,% Common POS Tag
0,22703,"“Trip-hop” eventually became a ’90s punchline,...",mezzanine,massive attack,http://pitchfork.com/reviews/albums/22703-mezz...,9.3,0,nate patrin,contributor,2017-01-08,...,2017,trip-hop eventually became 90 punchline music-...,"([best, famous, love, beautiful, top, sure, cr...",860,0.012791,0.003488,0.983721,1,0.560465,0.740698
1,22721,"Eight years, five albums, and two EPs in, the ...",prelapsarian,krallice,http://pitchfork.com/reviews/albums/22721-prel...,7.9,0,zoe camp,contributor,2017-01-07,...,2017,eight year five album two eps new york-based o...,"([great, best, spontaneously, good], [grim, ha...",276,0.014493,0.01087,0.974638,1,0.576087,0.811594
2,22659,Minneapolis’ Uranium Club seem to revel in bei...,all of them naturals,uranium club,http://pitchfork.com/reviews/albums/22659-all-...,7.3,0,david glickman,contributor,2017-01-07,...,2017,minneapolis uranium club seem revel aggressive...,"([many, love, talented, perfect], [vaguely, ca...",326,0.01227,0.009202,0.978528,1,0.56135,0.797546
3,22661,Kleenex began with a crash. It transpired one ...,first songs,"kleenex, liliput",http://pitchfork.com/reviews/albums/22661-firs...,9.0,1,jenn pelly,associate reviews editor,2017-01-06,...,2017,kleenex began crash transpired one night long ...,"([nice, delighted, ok, greatest, perfect, amus...",745,0.021477,0.002685,0.975839,1,0.514094,0.783893
4,22725,It is impossible to consider a given release b...,new start,taso,http://pitchfork.com/reviews/albums/22725-new-...,8.1,0,kevin lozano,tracks coordinator,2017-01-06,...,2017,impossible consider given release footwork art...,"([remarkable, best, warm, perfect, perfect, ab...",309,0.035599,0.003236,0.961165,1,0.543689,0.796117


In [11]:
df

Unnamed: 0,id,review,track,artist,url,score,best_new_music,author,author_type,date,...,year,preprocessed_review,Sentiment List (Pos/Neg/Neu),review_length,%Positive Sentiment,%Negative Sentiment,%Neutral Sentiment,label,% Long Word Length,% Common POS Tag
0,22703,"“Trip-hop” eventually became a ’90s punchline,...",mezzanine,massive attack,http://pitchfork.com/reviews/albums/22703-mezz...,9.3,0,nate patrin,contributor,2017-01-08,...,2017,trip-hop eventually became 90 punchline music-...,"([best, famous, love, beautiful, top, sure, cr...",860,0.012791,0.003488,0.983721,1,0.560465,0.740698
1,22721,"Eight years, five albums, and two EPs in, the ...",prelapsarian,krallice,http://pitchfork.com/reviews/albums/22721-prel...,7.9,0,zoe camp,contributor,2017-01-07,...,2017,eight year five album two eps new york-based o...,"([great, best, spontaneously, good], [grim, ha...",276,0.014493,0.010870,0.974638,1,0.576087,0.811594
2,22659,Minneapolis’ Uranium Club seem to revel in bei...,all of them naturals,uranium club,http://pitchfork.com/reviews/albums/22659-all-...,7.3,0,david glickman,contributor,2017-01-07,...,2017,minneapolis uranium club seem revel aggressive...,"([many, love, talented, perfect], [vaguely, ca...",326,0.012270,0.009202,0.978528,1,0.561350,0.797546
3,22661,Kleenex began with a crash. It transpired one ...,first songs,"kleenex, liliput",http://pitchfork.com/reviews/albums/22661-firs...,9.0,1,jenn pelly,associate reviews editor,2017-01-06,...,2017,kleenex began crash transpired one night long ...,"([nice, delighted, ok, greatest, perfect, amus...",745,0.021477,0.002685,0.975839,1,0.514094,0.783893
4,22725,It is impossible to consider a given release b...,new start,taso,http://pitchfork.com/reviews/albums/22725-new-...,8.1,0,kevin lozano,tracks coordinator,2017-01-06,...,2017,impossible consider given release footwork art...,"([remarkable, best, warm, perfect, perfect, ab...",309,0.035599,0.003236,0.961165,1,0.543689,0.796117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18396,1535,The marketing guys of yer average modern megac...,let us replay!,coldcut,http://pitchfork.com/reviews/albums/1535-let-u...,8.9,0,james p. wisdom,,1999-01-26,...,1999,marketing guy yer average modern megaconglomer...,"([love, interesting, good, good, pleased], [va...",280,0.017857,0.007143,0.975000,1,0.439286,0.778571
18397,1341,"Well, it's been two weeks now, and I guess it'...",1999,cassius,http://pitchfork.com/reviews/albums/1341-1999/,4.8,0,james p. wisdom,,1999-01-26,...,1999,well 's two week guess 's time tell guy someth...,"([plausible, sure, sure, sure, beloved, better...",134,0.044776,0.014925,0.940299,0,0.432836,0.649254
18398,5376,"Out of Tune is a Steve Martin album. Yes, I'l...",out of tune,mojave 3,http://pitchfork.com/reviews/albums/5376-out-o...,6.3,0,jason josephes,contributor,1999-01-12,...,1999,tune steve martin album yes 'll explain upon t...,"([notably, catching, sure, good, good, happy, ...",457,0.019694,0.019694,0.960613,0,0.382932,0.684902
18399,2413,"Well, kids, I just went back and re-read my re...","singles breaking up, vol. 1",don caballero,http://pitchfork.com/reviews/albums/2413-singl...,7.2,0,james p. wisdom,,1999-01-12,...,1999,well kid went back re-read review guy last alb...,"([love, sure, sure, love, perfect, enjoying], ...",243,0.024691,0.008230,0.967078,1,0.456790,0.699588


### NNP/NNPS tag

In [12]:
def proper_noun_count(token_list):
    count = 0
    pos_tags = [x[1] for x in pos_tag(token_list)]
    for tag in pos_tags:
        if tag == 'NNP' or tag == 'NNPS':
            count +=1
    return count
    

In [13]:
import nltk

from nltk.tokenize import word_tokenize

# print(proper_noun_count(nltk.word_tokenize((df['review'][0]))))

In [14]:
df['Proper Noun Count'] = df['review'].apply(lambda x: proper_noun_count(nltk.word_tokenize(x)))
df

Unnamed: 0,id,review,track,artist,url,score,best_new_music,author,author_type,date,...,preprocessed_review,Sentiment List (Pos/Neg/Neu),review_length,%Positive Sentiment,%Negative Sentiment,%Neutral Sentiment,label,% Long Word Length,% Common POS Tag,Proper Noun Count
0,22703,"“Trip-hop” eventually became a ’90s punchline,...",mezzanine,massive attack,http://pitchfork.com/reviews/albums/22703-mezz...,9.3,0,nate patrin,contributor,2017-01-08,...,trip-hop eventually became 90 punchline music-...,"([best, famous, love, beautiful, top, sure, cr...",860,0.012791,0.003488,0.983721,1,0.560465,0.740698,244
1,22721,"Eight years, five albums, and two EPs in, the ...",prelapsarian,krallice,http://pitchfork.com/reviews/albums/22721-prel...,7.9,0,zoe camp,contributor,2017-01-07,...,eight year five album two eps new york-based o...,"([great, best, spontaneously, good], [grim, ha...",276,0.014493,0.010870,0.974638,1,0.576087,0.811594,59
2,22659,Minneapolis’ Uranium Club seem to revel in bei...,all of them naturals,uranium club,http://pitchfork.com/reviews/albums/22659-all-...,7.3,0,david glickman,contributor,2017-01-07,...,minneapolis uranium club seem revel aggressive...,"([many, love, talented, perfect], [vaguely, ca...",326,0.012270,0.009202,0.978528,1,0.561350,0.797546,64
3,22661,Kleenex began with a crash. It transpired one ...,first songs,"kleenex, liliput",http://pitchfork.com/reviews/albums/22661-firs...,9.0,1,jenn pelly,associate reviews editor,2017-01-06,...,kleenex began crash transpired one night long ...,"([nice, delighted, ok, greatest, perfect, amus...",745,0.021477,0.002685,0.975839,1,0.514094,0.783893,217
4,22725,It is impossible to consider a given release b...,new start,taso,http://pitchfork.com/reviews/albums/22725-new-...,8.1,0,kevin lozano,tracks coordinator,2017-01-06,...,impossible consider given release footwork art...,"([remarkable, best, warm, perfect, perfect, ab...",309,0.035599,0.003236,0.961165,1,0.543689,0.796117,91
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18396,1535,The marketing guys of yer average modern megac...,let us replay!,coldcut,http://pitchfork.com/reviews/albums/1535-let-u...,8.9,0,james p. wisdom,,1999-01-26,...,marketing guy yer average modern megaconglomer...,"([love, interesting, good, good, pleased], [va...",280,0.017857,0.007143,0.975000,1,0.439286,0.778571,64
18397,1341,"Well, it's been two weeks now, and I guess it'...",1999,cassius,http://pitchfork.com/reviews/albums/1341-1999/,4.8,0,james p. wisdom,,1999-01-26,...,well 's two week guess 's time tell guy someth...,"([plausible, sure, sure, sure, beloved, better...",134,0.044776,0.014925,0.940299,0,0.432836,0.649254,11
18398,5376,"Out of Tune is a Steve Martin album. Yes, I'l...",out of tune,mojave 3,http://pitchfork.com/reviews/albums/5376-out-o...,6.3,0,jason josephes,contributor,1999-01-12,...,tune steve martin album yes 'll explain upon t...,"([notably, catching, sure, good, good, happy, ...",457,0.019694,0.019694,0.960613,0,0.382932,0.684902,101
18399,2413,"Well, kids, I just went back and re-read my re...","singles breaking up, vol. 1",don caballero,http://pitchfork.com/reviews/albums/2413-singl...,7.2,0,james p. wisdom,,1999-01-12,...,well kid went back re-read review guy last alb...,"([love, sure, sure, love, perfect, enjoying], ...",243,0.024691,0.008230,0.967078,1,0.456790,0.699588,25


### Artist/Album Mention

In [57]:
import math
import re
from nltk.stem import WordNetLemmatizer

def count_albarts_mention(token_list, track, artist):
    try:
        count1 = 0
        count2 = 0
        div1 = len(track.split())
        div2 = len(artist.split())
        for token in token_list:
            if token in track:
                count1+=1
        for token in token_list:
            if token in artist:
                count2+=1
        return math.ceil(count1/div1), math.ceil(count2/div2)
    except:
        return (0,0)
    
#test checks out!

print(count_albarts_mention(df['Sentiment List (Pos/Neg/Neu)'][0][2], df['track'][0], df['artist'][0]))



(13, 7)


In [58]:
album_c = []
artist_c = []
for i, row in df.iterrows():
    al_c, ar_c = count_albarts_mention(row['Sentiment List (Pos/Neg/Neu)'][2],row['track'],row['artist'])
    album_c.append(al_c)
    artist_c.append(ar_c)

df['Album mention counts'] = album_c
df['Artist mention counts'] = artist_c



### Most similar from the top cosine similarities of each genre

In [208]:
import sqlite3

connection = sqlite3.connect('datasets/database.sqlite')
cursor = connection.cursor()

query = "select * from genres;"
cursor.execute(query)
df_genres = pd.DataFrame(cursor.fetchall(), columns=["reviewid","genre"])
df_genres.head()

Unnamed: 0,reviewid,genre
0,22703,electronic
1,22721,metal
2,22659,rock
3,22661,rock
4,22725,electronic


In [209]:
df_genres['genre'].value_counts()

rock            9436
electronic      3874
experimental    1815
rap             1559
pop/r&b         1432
metal            860
folk/country     685
jazz             435
global           217
Name: genre, dtype: int64

In [210]:
df_genres['id'] = df_genres['reviewid']
df_genres = df_genres.drop(['reviewid'], axis = 1)

In [214]:
b_dict = dict(zip(df_genres['id'], df_genres['genre']))
df['genre'] = df['id'].map(b_dict)

In [218]:
df.shape

(18376, 26)

In [219]:
df.isnull().sum()

id                                 0
review                             0
track                              0
artist                             0
url                                0
score                              0
best_new_music                     0
author                             0
date                               0
weekday                            0
day                                0
month                              0
year                               0
preprocessed_review                0
Sentiment List (Pos/Neg/Neu)       0
review_length                      0
%Positive Sentiment                0
%Negative Sentiment                0
%Neutral Sentiment                 0
label                              0
% Long Word Length                 0
% Common POS Tag                   0
Proper Noun Count                  0
Album mention counts               0
Artist mention counts              0
genre                           2364
dtype: int64

In [220]:
df.head()

Unnamed: 0,id,review,track,artist,url,score,best_new_music,author,date,weekday,...,%Positive Sentiment,%Negative Sentiment,%Neutral Sentiment,label,% Long Word Length,% Common POS Tag,Proper Noun Count,Album mention counts,Artist mention counts,genre
0,22703,"“Trip-hop” eventually became a ’90s punchline,...",mezzanine,massive attack,http://pitchfork.com/reviews/albums/22703-mezz...,9.3,0,nate patrin,2017-01-08,6,...,0.012791,0.003488,0.983721,1,0.560465,0.740698,244,13,7,electronic
1,22721,"Eight years, five albums, and two EPs in, the ...",prelapsarian,krallice,http://pitchfork.com/reviews/albums/22721-prel...,7.9,0,zoe camp,2017-01-07,5,...,0.014493,0.01087,0.974638,1,0.576087,0.811594,59,3,4,metal
2,22659,Minneapolis’ Uranium Club seem to revel in bei...,all of them naturals,uranium club,http://pitchfork.com/reviews/albums/22659-all-...,7.3,0,david glickman,2017-01-07,5,...,0.01227,0.009202,0.978528,1,0.56135,0.797546,64,1,9,rock
3,22661,Kleenex began with a crash. It transpired one ...,first songs,"kleenex, liliput",http://pitchfork.com/reviews/albums/22661-firs...,9.0,1,jenn pelly,2017-01-06,4,...,0.021477,0.002685,0.975839,1,0.514094,0.783893,217,8,13,rock
4,22725,It is impossible to consider a given release b...,new start,taso,http://pitchfork.com/reviews/albums/22725-new-...,8.1,0,kevin lozano,2017-01-06,4,...,0.035599,0.003236,0.961165,1,0.543689,0.796117,91,4,8,electronic


In [221]:
df.dropna(inplace = True)

In [222]:
df.shape

(16012, 26)

In [275]:
# getting the first review from those with 10/10 score of each genre.
genres = list(df['genre'].value_counts().index)

def first_of_best(genres):
    ids = []
    for g in genres:
        print(g)
        if g == 'metal':
            ids.append(df.loc[(df['score'] == 9.7) & (df['genre'] == g)].loc[:,'id'].iloc[0])
        elif g == 'global':
            ids.append(df.loc[(df['score'] == 9.4) & (df['genre'] == g)].loc[:,'id'].iloc[0])
        else:
            ids.append(df.loc[(df['score'] == 10) & (df['genre'] == g)].loc[:,'id'].iloc[0])
        print(ids)
    return ids

best_ids = first_of_best(genres)
    

rock
[22374]
electronic
[22374, 2377]
experimental
[22374, 2377, 22061]
rap
[22374, 2377, 22061, 21218]
pop/r&b
[22374, 2377, 22061, 21218, 22174]
metal
[22374, 2377, 22061, 21218, 22174, 8383]
folk/country
[22374, 2377, 22061, 21218, 22174, 8383, 699]
jazz
[22374, 2377, 22061, 21218, 22174, 8383, 699, 21158]
global
[22374, 2377, 22061, 21218, 22174, 8383, 699, 21158, 22255]


In [276]:
bests = df[df['id'].isin(best_ids)]

In [290]:
best_reviews = list(bests.loc[:,'review'])

In [300]:
def cossims(text):
    sims = []
    vectorizer = TfidfVectorizer()
    for rev in best_reviews:
        vectors = vectorizer.fit_transform([text,rev])
        sims.append(cosine_similarity(vectors)[0][1])
    return round(max(sims),3)

print(cossims(df['review'][0]))

0.845


In [301]:
df['best_sim_with_top_revs'] = df['review'].apply(cossims)

In [302]:
df.head()

Unnamed: 0,id,review,track,artist,url,score,best_new_music,author,date,weekday,...,%Negative Sentiment,%Neutral Sentiment,label,% Long Word Length,% Common POS Tag,Proper Noun Count,Album mention counts,Artist mention counts,genre,best_sim_with_top_revs
0,22703,"“Trip-hop” eventually became a ’90s punchline,...",mezzanine,massive attack,http://pitchfork.com/reviews/albums/22703-mezz...,9.3,0,nate patrin,2017-01-08,6,...,0.003488,0.983721,1,0.560465,0.740698,244,13,7,electronic,0.845
1,22721,"Eight years, five albums, and two EPs in, the ...",prelapsarian,krallice,http://pitchfork.com/reviews/albums/22721-prel...,7.9,0,zoe camp,2017-01-07,5,...,0.01087,0.974638,1,0.576087,0.811594,59,3,4,metal,0.745
2,22659,Minneapolis’ Uranium Club seem to revel in bei...,all of them naturals,uranium club,http://pitchfork.com/reviews/albums/22659-all-...,7.3,0,david glickman,2017-01-07,5,...,0.009202,0.978528,1,0.56135,0.797546,64,1,9,rock,0.777
3,22661,Kleenex began with a crash. It transpired one ...,first songs,"kleenex, liliput",http://pitchfork.com/reviews/albums/22661-firs...,9.0,1,jenn pelly,2017-01-06,4,...,0.002685,0.975839,1,0.514094,0.783893,217,8,13,rock,0.791
4,22725,It is impossible to consider a given release b...,new start,taso,http://pitchfork.com/reviews/albums/22725-new-...,8.1,0,kevin lozano,2017-01-06,4,...,0.003236,0.961165,1,0.543689,0.796117,91,4,8,electronic,0.71


## Modeling

In [318]:
df.to_csv("datasets/updated_features_data.csv", index = False)

In [319]:
df.columns

Index(['id', 'review', 'track', 'artist', 'url', 'score', 'best_new_music',
       'author', 'date', 'weekday', 'day', 'month', 'year',
       'preprocessed_review', 'Sentiment List (Pos/Neg/Neu)', 'review_length',
       '%Positive Sentiment', '%Negative Sentiment', '%Neutral Sentiment',
       'label', '% Long Word Length', '% Common POS Tag', 'Proper Noun Count',
       'Album mention counts', 'Artist mention counts', 'genre',
       'best_sim_with_top_revs'],
      dtype='object')

In [361]:
# Isolate the features of interest

df_model = df[["score","label", "review_length", "%Positive Sentiment", "%Negative Sentiment", "%Neutral Sentiment",
"% Long Word Length", "% Common POS Tag", "Proper Noun Count",
 "Album mention counts", "Artist mention counts", "best_sim_with_top_revs"]]
df_model

Unnamed: 0,score,label,review_length,%Positive Sentiment,%Negative Sentiment,%Neutral Sentiment,% Long Word Length,% Common POS Tag,Proper Noun Count,Album mention counts,Artist mention counts,best_sim_with_top_revs
0,9.3,1,860,0.012791,0.003488,0.983721,0.560465,0.740698,244,13,7,0.845
1,7.9,1,276,0.014493,0.010870,0.974638,0.576087,0.811594,59,3,4,0.745
2,7.3,1,326,0.012270,0.009202,0.978528,0.561350,0.797546,64,1,9,0.777
3,9.0,1,745,0.021477,0.002685,0.975839,0.514094,0.783893,217,8,13,0.791
4,8.1,1,309,0.035599,0.003236,0.961165,0.543689,0.796117,91,4,8,0.710
...,...,...,...,...,...,...,...,...,...,...,...,...
18395,7.2,1,155,0.032258,0.012903,0.954839,0.477419,0.722581,9,4,4,0.635
18396,8.9,1,280,0.017857,0.007143,0.975000,0.439286,0.778571,64,7,10,0.657
18397,4.8,0,134,0.044776,0.014925,0.940299,0.432836,0.649254,11,1,1,0.571
18398,6.3,0,457,0.019694,0.019694,0.960613,0.382932,0.684902,101,1,4,0.731


In [362]:
X = df_model.drop(['label','score'],axis = 1)
y = df_model['score']

In [363]:
scaler = StandardScaler()
scaler.fit(X)
x = scaler.transform(X)

In [364]:
# Split data 80/20
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

In [365]:
from sklearn.svm import SVR

# instantiate linear regression model, fit and predict
model_svr = SVR().fit(x_train, y_train)

preds_train_svr = model_svr.predict(x_train)
preds_test_svr = model_svr.predict(x_test)

In [366]:
# instantiate linear regression model, fit and predict
model = LinearRegression().fit(x_train, y_train)

preds_train = model.predict(x_train)
preds_test = model.predict(x_test)

In [367]:
# Round preds to first decimal point
preds_train = [round(num,1) for num in preds_train]
preds_test = [round(num,1) for num in preds_test]

# Round preds to first decimal point
preds_train_svr = [round(num,1) for num in preds_train_svr]
preds_test_svr = [round(num,1) for num in preds_test_svr]


### Evaluation w/ Mean Squared Error

In [18]:
# # Mean squared error (OLD ONE)
# mse_train = mean_squared_error(df_model.loc[x_train.index]['score'],preds_train)
# mse_test = mean_squared_error(df_model.loc[x_test.index]['score'],preds_test)

# print("Training MSE:",mse_train)
# print("Testing MSE:",mse_test)

Training MSE: 2.7162755102040816
Testing MSE: 2.6008269858541895


In [368]:
# Mean squared error
mse_train = mean_squared_error(y_train,preds_train)
mse_test = mean_squared_error(y_test,preds_test)

print("Training MSE:",mse_train)
print("Testing MSE:",mse_test)

Training MSE: 1.4655687407291749
Testing MSE: 1.5570746175460504


In [369]:
# Mean squared error
mse_train_svr = mean_squared_error(y_train,preds_train_svr)
mse_test_svr = mean_squared_error(y_test,preds_test_svr)

print("Training MSE:",mse_train_svr)
print("Testing MSE:",mse_test_svr)

Training MSE: 1.423371067218362
Testing MSE: 1.6365719637839526
