In [1]:
from collections import defaultdict
import numpy as np
import pandas as pd
import json
import torch
import torch.nn as nn
import torch.optim as optim
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

def load_data(path):
    data = []
    cols = {'verified':0,'reviewTime':1,'reviewerID':2,'asin':3,"reviewText":4,"summary":5,"unixReviewTime":6,"sentiment":7,"id":8}
    for line in open(path):
        review_data = json.loads(line)
        tmp = [None]*len(cols)
        for key in review_data:
            if key in cols:
                if key == "sentiment":
                    tmp[cols[key]] = 1 if review_data[key] == "positive" else 0
                else:
                    tmp[cols[key]] = review_data[key]
        data.append(tmp)
    X = pd.DataFrame(data, columns=cols)
    # set empty reviews to '' (instead of None)
    X.loc[X['reviewText'].isna(), 'reviewText'] = ''
    X.loc[X['summary'].isna(), 'summary'] = ''
    y = X['sentiment']
    X.drop(columns='sentiment', inplace=True)
    return X, y

df, target = load_data('Data/music_reviews_train.json')
df_dev, y_dev = load_data('Data/music_reviews_dev.json')
df_test, y_test = load_data('Data/music_reviews_test_masked.json')

## Reason why we don't have max document frequency

Words that appear in more than 5% of the reviews: ['album', 'best', 'better', 'buy', 'cd', 'don', 'good', 'great', 'heard', 'just', 'know', 'like', 'listen', 'love', 'music', 'new', 'really', 'song', 'songs', 'sound', 'time', 'version', 'voice', 'way']

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df = 25)
sh = vectorizer.fit_transform(df["reviewText"]+" "+df["summary"])
words2idx = vectorizer.vocabulary_
print(sh.shape)


(100000, 7623)


In [8]:
sum(target)/len(target)

0.60783

In [35]:
predictions = lr.predict(X_test)

with open('submission.csv', mode='w',newline='') as file:
    writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    writer.writerow(['id', 'prediction'])
    for idx, pred in enumerate(predictions):
        writer.writerow([str(idx),str(pred)])

In [14]:
X_train = (df["reviewText"]+" "+df["summary"]).to_list()
len(X_train), len(target)

(100000, 100000)

In [16]:
class OnehotTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def convert(self, sentence):# [[w1, w2, w3], [w1, w2, w3]]
        output = [0]*len(self.vocab)
        for word in sentence.split():
            word = word.lower()
            if word in self.vocab:
                output[self.vocab[word]] = 1
        return output

    def fit(self, X, y=None):
        vectorizer = TfidfVectorizer(min_df = 25)
        vectorizer.fit(X)
        self.vocab = vectorizer.vocabulary_
        return self
    
    def transform(self, X, y=None):
        X_ = [self.convert(row) for row in X]
        return X_

    
pipe = Pipeline([
    ('onehot', OnehotTransformer()),
    ('clf', LogisticRegression())
])
pipe.fit(X_train, target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(steps=[('onehot', OnehotTransformer()), ('clf', LogisticRegression())])

In [19]:
pipe.score((df_dev["reviewText"]+" "+df_dev["summary"]).to_list(), y_dev)

0.9207

In [17]:
import pickle
pickle.dump(pipe, open("model.obj", 'wb'), protocol=pickle.HIGHEST_PROTOCOL)

In [21]:
with open('Data/tests_n500', encoding='UTF-8') as f:
    X_test = f.readlines()

predictions = pipe.predict(X_test)
predictions_percentage = pipe.predict_proba(X_test)
print(predictions_percentage[0])



[0.38559888 0.61440112]


In [24]:
with open('Data/checklist.pred', mode='w') as file:
    for pred, perc in zip(predictions, predictions_percentage):
        neg, pos = perc
        if pred == 1:
            pred += 1
        
        file.write(f'{pred} {neg} 0.0 {pos}\n')

In [22]:
predictions[0], predictions_percentage[0]

(1, array([0.38559888, 0.61440112]))

In [3]:
import checklist
from checklist.test_suite import TestSuite
suite_path = 'Data/sentiment_suite.pkl'
suite = TestSuite.from_file(suite_path)

pred_path = 'Data/checklist.pred'
suite.run_from_file(pred_path, overwrite=True)
suite.summary() # or suite.visual_summary_table()

Vocabulary

single positive words
Test cases:      34
Fails (rate):    1 (2.9%)

Example fails:
0.7 0.0 0.3 admired
----


single negative words
Test cases:      35
Fails (rate):    13 (37.1%)

Example fails:
0.4 0.0 0.6 dreaded
----
0.2 0.0 0.8 rough
----
0.4 0.0 0.6 abhorred
----


single neutral words
Test cases:      13
Fails (rate):    13 (100.0%)

Example fails:
0.4 0.0 0.6 saw
----
0.3 0.0 0.7 private
----
0.4 0.0 0.6 international
----


Sentiment-laden words in context
Test cases:      8658
Test cases run:  500
Fails (rate):    170 (34.0%)

Example fails:
0.4 0.0 0.6 I despise this food.
----
0.4 0.0 0.6 This food was sad.
----
0.4 0.0 0.6 I despise this cabin crew.
----


neutral words in context
Test cases:      1716
Test cases run:  500
Fails (rate):    500 (100.0%)

Example fails:
0.6 0.0 0.4 This customer service was Indian.
----
0.4 0.0 0.6 The flight is British.
----
0.6 0.0 0.4 The company was commercial.
----


intensifiers
Test cases:      2000
Test cases run:  500
F

In [None]:
with open('s', mode='w') as file:
    suite.summary()