In [1]:
import pandas as pd 
import sklearn
import numpy as np
import matplotlib.pyplot as plt
import statistics 
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
import re

In [16]:
#Loads the reviews and builds X -- every column left after dropping the id, year and score --
#and y -- the score associated to each review. Rows with any missing value are discarded.
df_reviews = pd.read_csv("reviews.csv").dropna()

X = df_reviews.drop(columns=["reviewid", "pub_year", "score"]).to_numpy(copy=True)
y = df_reviews["score"].to_numpy(copy=True)

#nonempties = []
#for i in range(X.shape[0]):
#    if isinstance(X[i][0], str) and len(X[i][0]) > 300:
#        nonempties.append(i)

#X = np.array([X[i] for i in nonempties])
#y = np.array([y[i] for i in nonempties])

print(X.shape)

#Spreads out the scores between 5 and 8.5 (stretched by a factor of 4 around 6); scores
#above that band are shifted by +6; scores below 5 are left untouched.
#NOTE(review): the two original conditions overlap on (8, 8.5], where the stretch wins, so
#8.5 maps to 16 while anything just above 8.5 maps to ~14.5 -- confirm the intended bounds.
stretch_band = (y >= 5) & (y <= 8.5)
shift_band = (y > 8) & ~stretch_band

y_scaled = np.copy(y)
y_scaled[stretch_band] = 6 + (4 * (y[stretch_band] - 6))
y_scaled[shift_band] = 14 + (y[shift_band] - 8)


(18391, 2)


In [7]:
#Tallies how many reviews each reviewer (column 0 of X) has written.
review_counts = {}
for row in X:
    reviewer = row[0]
    review_counts[reviewer] = review_counts.get(reviewer, 0) + 1

#Lists every reviewer with more than 400 reviews, in first-seen order.
for reviewer, n_reviews in review_counts.items():
    if n_reviews > 400:
        print(reviewer)

#Reviewed more than 400 albums
#ian cohen
#brian howe
#marc hogan
#stuart berman
#mark richardson
#stephen m. deusner
#joe tangari

ian cohen
brian howe
marc hogan
stuart berman
mark richardson
stephen m. deusner
joe tangari


In [4]:
#Binary search helper for keeping an alphabetized list of the unique words
#showing up in the reviews.

def binary_search(arr, low, high, x):
    """Search arr[low..high] (inclusive, ascending order) for x.

    Returns a pair (index, found):
      * (position of x, True) when x is present in the range;
      * (position where the search stopped, False) otherwise -- for a sorted
        range this is the index at which x would have to be inserted.
    """
    left, right = low, high

    # Shrink the inclusive window [left, right] until it is empty or x is hit.
    while left <= right:
        middle = (left + right) // 2
        candidate = arr[middle]

        if candidate == x:
            return middle, True

        if candidate > x:
            # x can only be in the left half
            right = middle - 1
        else:
            # x can only be in the right half
            left = middle + 1

    # Window is empty: x is not present
    return left, False

In [12]:
#Generates the alphabetized array of unique words and the dictionary of word counts for
#one reviewer's reviews.
#
#Counting is done with the dictionary alone and the words are sorted once at the end. This
#yields the same `words`, `wordcounts` and `count` as inserting each new word into a sorted
#array one at a time, without shifting the array on every insertion.

wordcounts = {}
for row in X:
    # column 0 is the reviewer name, column 1 the review text
    if row[0] == 'stephen m. deusner':
        # turn runs of common punctuation (and trailing spaces) into a single space
        clean = re.sub(r"[,.;@#?!&$]+\ *", " ", row[1])
        for token in clean.split():
            # only purely alphabetic tokens are counted, case-insensitively
            if token.isalpha():
                token = token.lower()
                wordcounts[token] = wordcounts.get(token, 0) + 1

#Number of distinct words found.
count = len(wordcounts)

#Fixed-size object array, padded with '' after the first `count` sorted entries
#(grown only if the vocabulary exceeds the usual 200000 slots).
words = np.array(max(200000, count)*[''], dtype=object)
if count:
    words[:count] = sorted(wordcounts)


In [24]:
#Produces a csv file of words appearing more than 300 times, most frequent first.

#[word, count] pairs for every word above the threshold
common_words = [[word, n_seen] for word, n_seen in wordcounts.items() if n_seen > 300]

def takeSecond(elem):
    """Sort key: the count stored in the second slot of a [word, count] pair."""
    return elem[1]

sorted_words = sorted(common_words, key=takeSecond, reverse=True)

#Build the frame straight from the list of pairs: going through np.array would coerce the
#counts to strings and fail outright when no word passes the threshold.
common_frame = pd.DataFrame(sorted_words, columns=['word', 'frequency'])
common_frame.to_csv('stephen m. deusner.csv',index=False)

In [51]:
#For each review by one reviewer, computes the fraction of tokens that fall in each of four
#hand-picked word lists (columns: good, great, contractions, bad).
#Not finding any significant correlations

#NOTE(review): tokens are only counted when word.isalpha() is true, so the entries containing
#an apostrophe can never match -- effectively only "no" and "not" are counted from this list.
contractions = ["no", "not", "aren't", "can't", "couldn't", "didn't", "hasn't", "isn't", "mustn't", "shouldn't", "wasn't", "weren't", "won't", "wouldn't"]

#good = ["good", "nice", "solid", "warm", "catchy", "appeal", "interesting", "aesthetic", "cool", "bright", "rich", "creative", "complex", "dynamic", "fresh"]
#great = ["great", "better", "impressive", "compelling", "powerful", "memorable"]
#best = ["best", "perfect", "beautiful", "perfectly", "greatest", "excellent", "epic", "favorite", "gorgeous"]
#bad = ["bad", "strange", "lack", "difficult", "weird", "drone", "odd", "unfortunately", "drones"]

good = ["like"]
great = ["more"]
best = ["not", "no"]  #NOTE(review): defined but not used by the loop below
bad = ["i", "my", "me"]

#Review texts (column 1 of X) and scores for the selected reviewer (column 0 of X).
X_steph = np.array([[X[i][1]] for i in range(X.shape[0]) if X[i][0] == 'stephen m. deusner'])
y_steph = np.array([y[i] for i in range(X.shape[0]) if X[i][0] == 'stephen m. deusner'])

#One feature row per selected review; sized from the data so it always lines up with y_steph.
goodnesses = np.zeros((X_steph.shape[0],4))

for i in range(X_steph.shape[0]):
    # turn runs of common punctuation (and trailing spaces) into a single space
    clean = re.sub(r"[,.;@#?!&$]+\ *", " ", X_steph[i][0])
    review = clean.split()
    if not review:
        # no tokens at all: keep the all-zero row instead of dividing by zero
        continue
    for word in review:
        if word.isalpha():
            word = word.lower()
            # a word is credited to the first list that contains it
            if word in good:
                goodnesses[i][0] += 1
            elif word in great:
                goodnesses[i][1] += 1
            elif word in contractions:
                goodnesses[i][2] += 1
            elif word in bad:
                goodnesses[i][3] += 1
    # convert raw counts into frequencies relative to the review's token count
    goodnesses[i, :] /= len(review)
 

In [52]:
#Least-squares fit of the reviewer's scores on the four word-frequency features.
reg = LinearRegression()
reg.fit(goodnesses, y_steph)

r_squared = reg.score(goodnesses, y_steph)
print("The r-squared value is", r_squared)
reg.coef_

The r-squared value is 0.045497196497228165


array([-52.48602822, -21.65621467, -49.56484165,  26.13559128])

In [None]:
#Determine whether there is a correlation between review length and the score given.

#Single feature column: the character length of each review's text. The text lives in
#column 1 of X (column 0 is the reviewer name), and the raw text itself cannot be handed
#to LinearRegression, so only the numeric length is kept.
X_len = np.array([len(row[1]) for row in X], dtype=float).reshape(-1, 1)

#performs and plots a linear regression
reg = LinearRegression().fit(X_len,y)

plt.rcParams["figure.figsize"] = [9.50, 7.50]
plt.rcParams["figure.autolayout"] = True
print("The r-squared value is", reg.score(X_len,y))

#scatter of the data plus the fitted line
plt.plot(X_len, y, 'r*')
plt.plot(X_len, reg.predict(X_len), linestyle='solid')
plt.xlabel("review length (characters)")
plt.ylabel("score")
plt.show()