# NLP Methods on Music Reviews: Baseline Model


In [1]:
# !pip install textblob

In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.linear_model import LinearRegression

## Baseline Implementation

In [3]:
df = pd.read_parquet("datasets/processed_reviews.parquet")

#### Importing Lexicons/Counting Sentiments

In [4]:
 
def sentiment_counter(token_list):
    """Function to make list of each sentiment type"""
    pos_word_list = []
    neg_word_list = []
    neu_word_list = []

    for word in token_list:
        testimonial = TextBlob(word)
        if testimonial.sentiment.polarity >= 0.5:
            pos_word_list.append(word)
        elif testimonial.sentiment.polarity <= -0.5:
            neg_word_list.append(word)
        else:
            neu_word_list.append(word)
    return pos_word_list, neg_word_list, neu_word_list
        

In [5]:
# Adding the sentiments to the df - TEXTBLOB

df["Sentiment List (Pos/Neg/Neu)"] = df['preprocessed_review'].apply(lambda x:textblob_sentiment_counter(x.split()))
# df.to_parquet("datasets/reviews_w_sentiment.parquet")
df.head()

Unnamed: 0,id,review,track,artist,url,score,best_new_music,author,author_type,date,weekday,day,month,year,preprocessed_review,Sentiment List (Pos/Neg/Neu)
0,22703,"“Trip-hop” eventually became a ’90s punchline,...",mezzanine,massive attack,http://pitchfork.com/reviews/albums/22703-mezz...,9.3,0,nate patrin,contributor,2017-01-08,6,8,1,2017,trip-hop eventually became 90 punchline music-...,"([best, famous, love, beautiful, top, sure, cr..."
1,22721,"Eight years, five albums, and two EPs in, the ...",prelapsarian,krallice,http://pitchfork.com/reviews/albums/22721-prel...,7.9,0,zoe camp,contributor,2017-01-07,5,7,1,2017,eight year five album two eps new york-based o...,"([great, best, spontaneously, good], [grim, ha..."
2,22659,Minneapolis’ Uranium Club seem to revel in bei...,all of them naturals,uranium club,http://pitchfork.com/reviews/albums/22659-all-...,7.3,0,david glickman,contributor,2017-01-07,5,7,1,2017,minneapolis uranium club seem revel aggressive...,"([many, love, talented, perfect], [vaguely, ca..."
3,22661,Kleenex began with a crash. It transpired one ...,first songs,"kleenex, liliput",http://pitchfork.com/reviews/albums/22661-firs...,9.0,1,jenn pelly,associate reviews editor,2017-01-06,4,6,1,2017,kleenex began crash transpired one night long ...,"([nice, delighted, ok, greatest, perfect, amus..."
4,22725,It is impossible to consider a given release b...,new start,taso,http://pitchfork.com/reviews/albums/22725-new-...,8.1,0,kevin lozano,tracks coordinator,2017-01-06,4,6,1,2017,impossible consider given release footwork art...,"([remarkable, best, warm, perfect, perfect, ab..."


In [12]:
# record review length and drop empty reviews
df["review_length"] = df["preprocessed_review"].apply(lambda x: len(x.split()))
df = df[df["review_length"]>0]

# make columns for percentage of each review that is pos/neg/nue
df["%Positive Sentiment"] = df['Sentiment List (Pos/Neg/Neu)'].apply(lambda x:len(x[0]))/df["review_length"]
df["%Negative Sentiment"] = df['Sentiment List (Pos/Neg/Neu)'].apply(lambda x:len(x[1]))/df["review_length"]
df["%Neutral Sentiment"] = df['Sentiment List (Pos/Neg/Neu)'].apply(lambda x:len(x[2]))/df["review_length"]
df.head()

Unnamed: 0,id,review,track,artist,url,score,best_new_music,author,author_type,date,weekday,day,month,year,preprocessed_review,Sentiment List (Pos/Neg/Neu),review_length,%Positive Sentiment,%Negative Sentiment,%Neutral Sentiment
0,22703,"“Trip-hop” eventually became a ’90s punchline,...",mezzanine,massive attack,http://pitchfork.com/reviews/albums/22703-mezz...,9.3,0,nate patrin,contributor,2017-01-08,6,8,1,2017,trip-hop eventually became 90 punchline music-...,"([best, famous, love, beautiful, top, sure, cr...",860,0.012791,0.003488,0.983721
1,22721,"Eight years, five albums, and two EPs in, the ...",prelapsarian,krallice,http://pitchfork.com/reviews/albums/22721-prel...,7.9,0,zoe camp,contributor,2017-01-07,5,7,1,2017,eight year five album two eps new york-based o...,"([great, best, spontaneously, good], [grim, ha...",276,0.014493,0.01087,0.974638
2,22659,Minneapolis’ Uranium Club seem to revel in bei...,all of them naturals,uranium club,http://pitchfork.com/reviews/albums/22659-all-...,7.3,0,david glickman,contributor,2017-01-07,5,7,1,2017,minneapolis uranium club seem revel aggressive...,"([many, love, talented, perfect], [vaguely, ca...",326,0.01227,0.009202,0.978528
3,22661,Kleenex began with a crash. It transpired one ...,first songs,"kleenex, liliput",http://pitchfork.com/reviews/albums/22661-firs...,9.0,1,jenn pelly,associate reviews editor,2017-01-06,4,6,1,2017,kleenex began crash transpired one night long ...,"([nice, delighted, ok, greatest, perfect, amus...",745,0.021477,0.002685,0.975839
4,22725,It is impossible to consider a given release b...,new start,taso,http://pitchfork.com/reviews/albums/22725-new-...,8.1,0,kevin lozano,tracks coordinator,2017-01-06,4,6,1,2017,impossible consider given release footwork art...,"([remarkable, best, warm, perfect, perfect, ab...",309,0.035599,0.003236,0.961165


In [13]:
# Making Labels
df['label'] = df['score'].apply(lambda sc: 1 if sc>=7 else 0)
df.head()

Unnamed: 0,id,review,track,artist,url,score,best_new_music,author,author_type,date,...,day,month,year,preprocessed_review,Sentiment List (Pos/Neg/Neu),review_length,%Positive Sentiment,%Negative Sentiment,%Neutral Sentiment,label
0,22703,"“Trip-hop” eventually became a ’90s punchline,...",mezzanine,massive attack,http://pitchfork.com/reviews/albums/22703-mezz...,9.3,0,nate patrin,contributor,2017-01-08,...,8,1,2017,trip-hop eventually became 90 punchline music-...,"([best, famous, love, beautiful, top, sure, cr...",860,0.012791,0.003488,0.983721,1
1,22721,"Eight years, five albums, and two EPs in, the ...",prelapsarian,krallice,http://pitchfork.com/reviews/albums/22721-prel...,7.9,0,zoe camp,contributor,2017-01-07,...,7,1,2017,eight year five album two eps new york-based o...,"([great, best, spontaneously, good], [grim, ha...",276,0.014493,0.01087,0.974638,1
2,22659,Minneapolis’ Uranium Club seem to revel in bei...,all of them naturals,uranium club,http://pitchfork.com/reviews/albums/22659-all-...,7.3,0,david glickman,contributor,2017-01-07,...,7,1,2017,minneapolis uranium club seem revel aggressive...,"([many, love, talented, perfect], [vaguely, ca...",326,0.01227,0.009202,0.978528,1
3,22661,Kleenex began with a crash. It transpired one ...,first songs,"kleenex, liliput",http://pitchfork.com/reviews/albums/22661-firs...,9.0,1,jenn pelly,associate reviews editor,2017-01-06,...,6,1,2017,kleenex began crash transpired one night long ...,"([nice, delighted, ok, greatest, perfect, amus...",745,0.021477,0.002685,0.975839,1
4,22725,It is impossible to consider a given release b...,new start,taso,http://pitchfork.com/reviews/albums/22725-new-...,8.1,0,kevin lozano,tracks coordinator,2017-01-06,...,6,1,2017,impossible consider given release footwork art...,"([remarkable, best, warm, perfect, perfect, ab...",309,0.035599,0.003236,0.961165,1


## Modeling

In [14]:
# Isolate the features of interest

df_model = df[["score","label", "%Positive Sentiment", "%Negative Sentiment", "%Neutral Sentiment"]]
df_model

Unnamed: 0,score,label,%Positive Sentiment,%Negative Sentiment,%Neutral Sentiment
0,9.3,1,0.012791,0.003488,0.983721
1,7.9,1,0.014493,0.010870,0.974638
2,7.3,1,0.012270,0.009202,0.978528
3,9.0,1,0.021477,0.002685,0.975839
4,8.1,1,0.035599,0.003236,0.961165
...,...,...,...,...,...
18396,8.9,1,0.017857,0.007143,0.975000
18397,4.8,0,0.044776,0.014925,0.940299
18398,6.3,0,0.019694,0.019694,0.960613
18399,7.2,1,0.024691,0.008230,0.967078


In [15]:
df_model.to_csv("datasets/sentiments_extracted_data.csv",index=False)

In [16]:
X = df_model.drop(['label','score'],axis = 1)
y = df_model['score']

In [17]:
# Split data 80/20
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [19]:
# instantiate linear regression model, fit and predict
model = LinearRegression().fit(x_train, y_train)

preds_train = model.predict(x_train)
preds_test = model.predict(x_test)

In [20]:
# Round preds to first decimal point
preds_train = [round(num,1) for num in preds_train]
preds_test = [round(num,1) for num in preds_test]


### Evaluation w/ Mean Squared Error

In [22]:
# Mean squared error
mse_train = mean_squared_error(y_train,preds_train)
mse_test = mean_squared_error(y_test,preds_test)

print("Training MSE:",mse_train)
print("Testing MSE:",mse_test)

Training MSE: 1.5999904761904762
Testing MSE: 1.9738711195981623e+22
