# Imports

In [None]:
import gzip
import json
import pickle

from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline

# Reading in the data

In [None]:
def reading_in(path):
    """Load a gzip-compressed JSON-lines file into a list.

    Parameters
    ----------
    path : str
        Location of a gzip file holding one JSON document per line.

    Returns
    -------
    list
        The decoded JSON document of every non-blank line, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    records = []
    # The context manager guarantees the gzip handle is closed, even when a
    # line fails to parse half-way through the file.
    with gzip.open(path) as handle:
        for line in handle:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            # json.loads already returns a fresh object per line, so no
            # key-by-key copy is needed.
            records.append(json.loads(line))
    return records

In [None]:
# Load the three splits in order: train, dev, and the masked test set.
train, dev, test = map(reading_in, (
    '../data/classification/music_reviews_train.json.gz',
    '../data/classification/music_reviews_dev.json.gz',
    '../data/classification/music_reviews_test_masked.json.gz',
))

Reference: scikit-learn tutorial "Working With Text Data" — https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [None]:
# How many test records actually carry a review text
sum(1 for record in test if "reviewText" in record)

9993

In [None]:
# Checking sentiments: collect the distinct labels among training records
# that have both a review text and a sentiment.
# A set comprehension builds the set directly instead of running a list
# comprehension purely for its side effects.
sents = {rec['sentiment'] for rec in train if "reviewText" in rec and "sentiment" in rec}
print(sents)

{'negative', 'positive'}


In [None]:
# Keep only records that carry both fields; the same filter is applied to the
# text list and the label list so the two stay aligned index-for-index.
train_x = [rec['reviewText'] for rec in train if rec.keys() >= {"reviewText", "sentiment"}]
train_y = [rec['sentiment'] for rec in train if rec.keys() >= {"reviewText", "sentiment"}]

dev_x = [rec['reviewText'] for rec in dev if rec.keys() >= {"reviewText", "sentiment"}]
dev_y = [rec['sentiment'] for rec in dev if rec.keys() >= {"reviewText", "sentiment"}]

In [None]:
# Number of training records that have both a review text and a sentiment label
len(train_x)

99946

In [None]:
#Label encoding

def sent_encoder(y_values):
    """Encode sentiment strings with a ``LabelEncoder``.

    A fresh encoder is fitted on the fixed label set
    ``['negative', 'positive']`` on every call, so every split passed
    through this function is encoded with the same mapping.

    Parameters
    ----------
    y_values : sequence of str
        Sentiment labels to encode.

    Returns
    -------
    The result of ``LabelEncoder.transform`` applied to ``y_values``.
    """
    encoder = preprocessing.LabelEncoder().fit(['negative', 'positive'])
    return encoder.transform(y_values)

In [None]:
# NOTE: this rebinds train_y / dev_y from the raw string labels to their
# encoded form, so the cell is not idempotent. Re-running it feeds the
# already-encoded values back into sent_encoder, whose encoder is fitted only
# on 'negative'/'positive' — re-run the cell that builds train_y / dev_y first.
train_y = sent_encoder(train_y)
dev_y = sent_encoder(dev_y)

In [None]:
# Stage 1: unigram bag-of-words counts learned from the training reviews.
count_vect = CountVectorizer(ngram_range=(1, 1))
X_train_counts = count_vect.fit_transform(train_x)

# Stage 2: re-weight the raw counts with TF-IDF.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [None]:
#vocab_dict = count_vect.vocabulary_
#vocab_list = count_vect.get_feature_names()

#X_array = X_train_counts.toarray()
#X_array.sum()

# Pipeline

In [None]:
# Baseline model: raw counts -> TF-IDF weighting -> multinomial Naive Bayes.
# The step names double as prefixes for grid-search keys ('<step>__<param>').
pipeline = Pipeline(steps=[
    ('CountVectorizer', CountVectorizer()),
    ('Tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [None]:
# Fit all three pipeline steps on the training reviews and their encoded labels
pipeline.fit(train_x, train_y)

Pipeline(steps=[('CountVectorizer', CountVectorizer()),
                ('Tfidf', TfidfTransformer()), ('clf', MultinomialNB())])

In [None]:
# Score the baseline on the development split using macro-averaged F1.
pred = pipeline.predict(dev_x)
f1 = f1_score(y_true=dev_y, y_pred=pred, average='macro')
print('F1 score for development is:', f1)

F1 score for development is: 0.879278129121847


In [None]:
# Hyper-parameter search for the Naive Bayes pipeline.
#
# alpha=0 is deliberately not searched: MultinomialNB treats it as "too small",
# warns on every single fit and silently substitutes a tiny value, which
# flooded the cell output. A small positive smoothing value is used instead.
parameters = {'clf__alpha': [0.1, 1],
              'clf__fit_prior': [True, False],
              'CountVectorizer__ngram_range': [(1,1),(1,2),(1,3),(1,4),(2,3),(2,4)]}

# Select candidates by macro-F1 so model selection uses the same metric that
# is reported on the development set (the default scorer would be accuracy).
grid = GridSearchCV(pipeline, parameters, scoring='f1_macro')
grid.fit(train_x, train_y)

  % _ALPHA_MIN
  % _ALPHA_MIN
  % _ALPHA_MIN
  % _ALPHA_MIN
  % _ALPHA_MIN
  % _ALPHA_MIN
  % _ALPHA_MIN
  % _ALPHA_MIN
  % _ALPHA_MIN
  % _ALPHA_MIN
  % _ALPHA_MIN
  % _ALPHA_MIN
  % _ALPHA_MIN
  % _ALPHA_MIN
  % _ALPHA_MIN
  % _ALPHA_MIN
  % _ALPHA_MIN
  % _ALPHA_MIN
  % _ALPHA_MIN
  % _ALPHA_MIN
  % _ALPHA_MIN
  % _ALPHA_MIN
  % _ALPHA_MIN
  % _ALPHA_MIN
  % _ALPHA_MIN
  % _ALPHA_MIN
  % _ALPHA_MIN
  % _ALPHA_MIN
  % _ALPHA_MIN
  % _ALPHA_MIN
  % _ALPHA_MIN
  % _ALPHA_MIN
  % _ALPHA_MIN
  % _ALPHA_MIN
  % _ALPHA_MIN
  % _ALPHA_MIN
  % _ALPHA_MIN
  % _ALPHA_MIN
  % _ALPHA_MIN
  % _ALPHA_MIN


KeyboardInterrupt: 

In [None]:
# Only available once grid.fit(...) has run to completion; if the search was
# interrupted this attribute does not exist and the lookup raises AttributeError.
grid.best_estimator_

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

## Loading a saved model

In [None]:
def load_saved(filename):
    """Unpickle a model stored under ``../models/``.

    Parameters
    ----------
    filename : str
        File name appended to the ``'../models/'`` prefix.

    Returns
    -------
    object
        Whatever object was pickled into the file.

    Raises
    ------
    FileNotFoundError
        If ``'../models/' + filename`` does not exist.
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file;
    # only load model files that come from a trusted source.
    # The context manager closes the file handle once unpickling is done.
    with open('../models/' + filename, 'rb') as model_file:
        loaded_model = pickle.load(model_file)
    return loaded_model

In [None]:
# Load the previously pickled model stored at ../models/model1.pkl
model1 = load_saved('model1.pkl')

In [None]:
## Trying a neural network (MLPClassifier)

In [None]:
# Same feature extraction as the baseline, with a multi-layer perceptron as
# the classifier; random_state is fixed so repeated runs are comparable.
pipeline2 = Pipeline(steps=[
    ('CountVectorizer', CountVectorizer()),
    ('Tfidf', TfidfTransformer()),
    ('clf', MLPClassifier(random_state=1, max_iter=300)),
])

In [None]:
# Train the MLP pipeline, then score it on the development split with
# macro-averaged F1.
pipeline2.fit(train_x, train_y)
pred = pipeline2.predict(dev_x)
f1 = f1_score(y_true=dev_y, y_pred=pred, average='macro')
print('F1 score for development is:', f1)

F1 score for development is: 0.8857984310314382


In [None]:
# Hyper-parameter search for the MLP pipeline.
parameters = {'clf__alpha': [0.0001,],
              'CountVectorizer__ngram_range': [(1,1),(1,2)],
              'clf__activation': ['relu', 'logistic'],
              'clf__hidden_layer_sizes': [(30,20),]}

# Select candidates by macro-F1 so model selection uses the same metric that
# is reported on the development set (the default scorer would be accuracy).
grid = GridSearchCV(pipeline2, parameters, scoring='f1_macro')
grid.fit(train_x, train_y)
# Hyper-parameters noted for tuning:
#activation
#layers
#learning rate



KernelInterrupted: Execution interrupted by the Jupyter kernel.

In [None]:
# List every parameter of the pipeline and its steps; the '<step>__<param>'
# keys are the names used in the grid-search parameter dictionaries.
pipeline2.get_params()

{'memory': None,
 'steps': [('CountVectorizer', CountVectorizer()),
  ('Tfidf', TfidfTransformer()),
  ('clf', MLPClassifier(max_iter=300, random_state=1))],
 'verbose': False,
 'CountVectorizer': CountVectorizer(),
 'Tfidf': TfidfTransformer(),
 'clf': MLPClassifier(max_iter=300, random_state=1),
 'CountVectorizer__analyzer': 'word',
 'CountVectorizer__binary': False,
 'CountVectorizer__decode_error': 'strict',
 'CountVectorizer__dtype': numpy.int64,
 'CountVectorizer__encoding': 'utf-8',
 'CountVectorizer__input': 'content',
 'CountVectorizer__lowercase': True,
 'CountVectorizer__max_df': 1.0,
 'CountVectorizer__max_features': None,
 'CountVectorizer__min_df': 1,
 'CountVectorizer__ngram_range': (1, 1),
 'CountVectorizer__preprocessor': None,
 'CountVectorizer__stop_words': None,
 'CountVectorizer__strip_accents': None,
 'CountVectorizer__token_pattern': '(?u)\\b\\w\\w+\\b',
 'CountVectorizer__tokenizer': None,
 'CountVectorizer__vocabulary': None,
 'Tfidf__norm': 'l2',
 'Tfidf__smoo

In [None]:
# Reference: https://towardsdatascience.com/the-simplest-way-to-train-a-neural-network-in-python-17613fa97958

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=b88dfe01-c7e1-473c-bcfd-798313fc6522' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>