In [38]:
from collections import defaultdict
import numpy as np
import pandas as pd
import json
import torch
import torch.nn as nn
import torch.optim as optim
import csv
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

def load_data(path):
    data = []
    cols = {'verified':0,'reviewTime':1,'reviewerID':2,'asin':3,"reviewText":4,"summary":5,"unixReviewTime":6,"sentiment":7,"id":8}
    for line in open(path):
        review_data = json.loads(line)
        tmp = [None]*len(cols)
        for key in review_data:
            if key in cols:
                if key == "sentiment":
                    tmp[cols[key]] = 1 if review_data[key] == "positive" else 0
                else:
                    tmp[cols[key]] = str(review_data[key])
        data.append(tmp)
    X = pd.DataFrame(data, columns=cols)
    # set empty reviews to '' (instead of None)
    X.loc[X['reviewText'].isna(), 'reviewText'] = ''
    X.loc[X['summary'].isna(), 'summary'] = ''
    y = X['sentiment']
    X.drop(columns='sentiment', inplace=True)
    return X, y

df, target = load_data('Data/music_reviews_train.json')
df_dev, y_dev = load_data('Data/music_reviews_dev.json')
df_test, y_test = load_data('Data/music_reviews_test.json')
#df_hard, y_hard = load_data('Data/phase_2_masked.json')
sum(target)/len(target)

0.60783

## Reason why we don't have max document frequency

Words that appear in more than 5% of the reviews: ['album', 'best', 'better', 'buy', 'cd', 'don', 'good', 'great', 'heard', 'just', 'know', 'like', 'listen', 'love', 'music', 'new', 'really', 'song', 'songs', 'sound', 'time', 'version', 'voice', 'way']

In [28]:
X_train = (df["reviewText"]).to_list()
len(X_train), len(target)

(100000, 100000)

In [36]:
class OnehotTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def convert(self, sentence):# [[w1, w2, w3], [w1, w2, w3]]
        output = [0]*len(self.vocab)
        for word in sentence.split():
            word = word.lower()
            if word in self.vocab:
                output[self.vocab[word]] = 1
        return output

    def fit(self, X, y=None):
        vectorizer = CountVectorizer(min_df = 25)
        vectorizer.fit(X)
        self.vocab = vectorizer.vocabulary_
        return self
    
    def transform(self, X, y=None):
        X_ = [self.convert(row) for row in X]
        return X_

    
pipe = Pipeline([
    ('onehot', OnehotTransformer()),
    ('clf', LogisticRegression())
])
pipe.fit(X_train, target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(steps=[('onehot', OnehotTransformer()), ('clf', LogisticRegression())])

In [39]:
pipe.score((df_test["reviewText"]).to_list(), y_test)

0.8791

In [33]:
TfidfVectorizer(min_df = 25).fit(X_train).vocabulary_ == CountVectorizer(min_df = 25).fit(X_train).vocabulary_

True

{'so': 6060,
 'creative': 1640,
 'love': 3975,
 'his': 3184,
 'music': 4391,
 'the': 6611,
 'words': 7328,
 'message': 4210,
 'some': 6083,
 'of': 4579,
 'my': 4403,
 'favorite': 2537,
 'songs': 6098,
 'on': 4609,
 'this': 6643,
 'cd': 1149,
 'should': 5914,
 'have': 3085,
 'bought': 905,
 'it': 3524,
 'years': 7386,
 'ago': 320,
 'tape': 6522,
 'can': 1064,
 'hardly': 3057,
 'be': 700,
 'understood': 6940,
 'and': 416,
 'was': 7175,
 'listed': 3900,
 'for': 2687,
 'sale': 5667,
 'as': 541,
 'very': 7066,
 'good': 2900,
 'bad': 647,
 'buy': 1035,
 'do': 2016,
 'not': 4523,
 'mp3': 4372,
 'album': 345,
 'download': 2059,
 'is': 3514,
 'no': 4501,
 'longer': 3944,
 'available': 609,
 'but': 1028,
 'you': 7397,
 'don': 2036,
 'find': 2599,
 'that': 6609,
 'out': 4672,
 'until': 6985,
 'after': 308,
 'purchased': 5178,
 'dallas': 1721,
 'voice': 7116,
 'thank': 6603,
 'will': 7274,
 'all': 359,
 'concerts': 1486,
 'in': 3360,
 'heaven': 3119,
 'forever': 2698,
 'great': 2950,
 'memories': 

In [19]:
X_test = (df_hard["reviewText"]+" "+df_hard["summary"]).to_list()
predictions = pipe.predict(X_test)

with open('submission.csv', mode='w',newline='') as file:
    writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    writer.writerow(['id', 'prediction'])
    for idx, pred in enumerate(predictions):
        writer.writerow([str(idx),str(pred)])

In [17]:
import pickle
pickle.dump(pipe, open("model.obj", 'wb'), protocol=pickle.HIGHEST_PROTOCOL)

In [21]:
with open('Data/tests_n500', encoding='UTF-8') as f:
    X_test = f.readlines()

predictions = pipe.predict(X_test)
predictions_percentage = pipe.predict_proba(X_test)
print(predictions_percentage[0])



[0.38559888 0.61440112]


In [24]:
with open('Data/checklist.pred', mode='w') as file:
    for pred, perc in zip(predictions, predictions_percentage):
        neg, pos = perc
        if pred == 1:
            pred += 1
        
        file.write(f'{pred} {neg} 0.0 {pos}\n')

In [22]:
predictions[0], predictions_percentage[0]

(1, array([0.38559888, 0.61440112]))

In [3]:
import checklist
from checklist.test_suite import TestSuite
suite_path = 'Data/sentiment_suite.pkl'
suite = TestSuite.from_file(suite_path)

pred_path = 'Data/checklist.pred'
suite.run_from_file(pred_path, overwrite=True)
suite.summary() # or suite.visual_summary_table()

Vocabulary

single positive words
Test cases:      34
Fails (rate):    1 (2.9%)

Example fails:
0.7 0.0 0.3 admired
----


single negative words
Test cases:      35
Fails (rate):    13 (37.1%)

Example fails:
0.4 0.0 0.6 dreaded
----
0.2 0.0 0.8 rough
----
0.4 0.0 0.6 abhorred
----


single neutral words
Test cases:      13
Fails (rate):    13 (100.0%)

Example fails:
0.4 0.0 0.6 saw
----
0.3 0.0 0.7 private
----
0.4 0.0 0.6 international
----


Sentiment-laden words in context
Test cases:      8658
Test cases run:  500
Fails (rate):    170 (34.0%)

Example fails:
0.4 0.0 0.6 I despise this food.
----
0.4 0.0 0.6 This food was sad.
----
0.4 0.0 0.6 I despise this cabin crew.
----


neutral words in context
Test cases:      1716
Test cases run:  500
Fails (rate):    500 (100.0%)

Example fails:
0.6 0.0 0.4 This customer service was Indian.
----
0.4 0.0 0.6 The flight is British.
----
0.6 0.0 0.4 The company was commercial.
----


intensifiers
Test cases:      2000
Test cases run:  500
F

In [None]:
with open('s', mode='w') as file:
    suite.summary()

In [4]:
cols = {'verified':None,'reviewTime':None,'reviewerID':None,'asin':None,"reviewText":None,"summary":None,"unixReviewTime":None,"sentiment":None,"id":None}

with open("Data/clean_cases.csv") as f:
    with open("Data/clean_cases.json", 'w') as o:
        for line in f:
            text, sentiment, *_ = line.split('\t')
            c = cols.copy()
            c["text"] = text
            c["sentiment"] = sentiment
            o.write(str(c))
            o.write('\n')

In [17]:
from transformers import pipeline
generator = pipeline('text-generation', model='EleutherAI/gpt-neo-1.3B')
generator("EleutherAI has", do_sample=True, min_length=50)

Downloading:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

KeyError: 'gpt_neo'

# LSTM

In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

class LSTMTagger(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size=2):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

model = LSTMTagger(embedding_dim=50, hidden_dim=10, vocab_size=len(vocab))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

for epoch in range(10): 
    for sentence, label in zip(df.reviewText, target):
        model.zero_grad()

        pred = model(sentence)

        loss = loss_function(pred, label)
        loss.backward()
        optimizer.step()

with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)

    print(tag_scores)

In [7]:
vectorizer = TfidfVectorizer(min_df = 25)
vectorizer.fit(df.reviewText)
vocab = vectorizer.vocabulary_

In [9]:
len(vocab)

7422

In [2]:
import nltk
nltk.download('punkt')
sentence = """At eight o'clock on Thursday morning
Arthur didn't feel very good."""
tokens = nltk.word_tokenize(sentence)
tokens

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Christoffer\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


['At',
 'eight',
 "o'clock",
 'on',
 'Thursday',
 'morning',
 'Arthur',
 'did',
 "n't",
 'feel',
 'very',
 'good',
 '.']

In [5]:
from nltk.stem import WordNetLemmatizer
  
lemmatizer = WordNetLemmatizer()
  
print("rocks :", lemmatizer.lemmatize("rocks are the greatest"))

rocks : rocks are the greatest
