In [2]:
from collections import defaultdict
import numpy as np
import pandas as pd
import json
import torch
import torch.nn as nn
import torch.optim as optim


def load_data(path):
    data = []
    cols = {'verified':0,'reviewTime':1,'reviewerID':2,'asin':3,"reviewText":4,"summary":5,"unixReviewTime":6,"sentiment":7,"id":8}
    for line in open(path):
        review_data = json.loads(line)
        tmp = [None]*len(cols)
        for key in review_data:
            if key in cols:
                if key == "sentiment":
                    tmp[cols[key]] = 1 if review_data[key] == "positive" else 0
                else:
                    tmp[cols[key]] = review_data[key]
        data.append(tmp)
    X = pd.DataFrame(data, columns=cols)
    # set empty reviews to '' (instead of None)
    X.loc[X['reviewText'].isna(), 'reviewText'] = ''
    X.loc[X['summary'].isna(), 'summary'] = ''
    y = X['sentiment']
    X.drop(columns='sentiment', inplace=True)
    return X, y

df, target = load_data('Data/music_reviews_train.json')
df_dev, y_dev = load_data('Data/music_reviews_dev.json')
df_test, y_test = load_data('Data/music_reviews_test_masked.json')

## Reason why we don't have max document frequency

Words that appear in more than 5% of the reviews: ['album', 'best', 'better', 'buy', 'cd', 'don', 'good', 'great', 'heard', 'just', 'know', 'like', 'listen', 'love', 'music', 'new', 'really', 'song', 'songs', 'sound', 'time', 'version', 'voice', 'way']

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df = 40)
sh = vectorizer.fit_transform(df["reviewText"]+" "+df["summary"])
words2idx = vectorizer.vocabulary_
print(sh.shape)


(100000, 5618)


In [4]:
def oneHot(sentence):
    output = [0]*len(words2idx)
    for word in sentence:
        word = word.lower()
        if word in words2idx:
            output[words2idx[word]] = 1
    return output

X = []

for line in df["reviewText"]+" "+df["summary"]:
    X.append(oneHot(line.split()))

In [5]:
# DEV
X_dev = []
for line in df_dev["reviewText"]+" "+df_dev["summary"]:
    X_dev.append(oneHot(line.split()))

# TEST
X_test = []
for line in df_test["reviewText"]+" "+df_test["summary"]:
    X_test.append(oneHot(line.split())) # can't split None values

In [9]:
sum(df["sentiment"])/len(df["sentiment"])

0.6076681407960298

In [11]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X, target)
lr.score(X_dev, y_dev)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9199

In [None]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X, target)
clf.score(X_dev, y_dev)

In [35]:
predictions = lr.predict(X_test)
import csv
with open('submission.csv', mode='w',newline='') as file:
    writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    writer.writerow(['id', 'prediction'])
    for idx, pred in enumerate(predictions):
        writer.writerow([str(idx),str(pred)])

In [6]:
## Graveyard

def oneHot(sentence):
    output = [0]*len(vocab)
    for word in sentence:
        if word in vocab:
            output[words2idx[word]] = 1
    return output

freqs = defaultdict(int)

for line in df["reviewText"]:
    for word in line.split():
        freqs[word] +=1

words = Counter(freqs)
w = words.most_common(1000)
vocab = set([x[0] for x in w])
words2idx = {x:idx for idx,x in enumerate(vocab)}
X = []

for line in df["reviewText"]:
    X.append(oneHot(line.split()))