In [1]:
import pandas as pd

In [2]:
# Sources - https://towardsdatascience.com/a-comprehensive-introduction-to-different-types-of-convolutions-in-deep-learning-669281e58215

In [2]:
df = pd.read_json('../MyAnimeList Scraper/anime_reviews.json')

In [3]:
df = df.transpose()

In [4]:
# Regex który usuwa ep_XX i eps_XX i dziwny śmietnik

df = df.replace(regex=r'eps [0-9]+', value='')
df = df.replace(regex=r'ep [0-9]+', value='')
df = df.replace(regex=r'[0-9]+[a-zA-z]+', value='')

In [5]:
df['text'] = df['text'].replace(regex=r'[0-9]+', value='')

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

'''
    sublinear_df => is set to True to use a logarithmic form for frequency.
    min_df => is the minimum numbers of documents a word must be present in to be kept.
    
'''

tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')

features = tfidf.fit_transform(df.text).toarray()
labels = df.score

In [7]:
features.shape
# For 16275 reviews we extract 119386 features.

(16275, 118288)

In [8]:
from sklearn.feature_selection import chi2
import numpy as np
N = 2
for category_id in ['1', '2','3','4','5','6','7','8','9','10']:
    features_chi2 = chi2(features, labels == category_id)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidf.get_feature_names())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    
    print(f'For category {category_id}')
    
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

For category 1
  . Most correlated unigrams:
. nomizu
. okazaki
  . Most correlated bigrams:
. shouta aoi
. overall pathetic
For category 2
  . Most correlated unigrams:
. terrible
. worst
  . Most correlated bigrams:
. animation pleasing
. worst anime
For category 3
  . Most correlated unigrams:
. anos
. worst
  . Most correlated bigrams:
. overall poor
. self insertion
For category 4
  . Most correlated unigrams:
. boring
. dorei
  . Most correlated bigrams:
. plastic little
. life won
For category 5
  . Most correlated unigrams:
. mediocrity
. mediocre
  . Most correlated bigrams:
. just brief
. just mediocre
For category 6
  . Most correlated unigrams:
. dantalian
. neptunia
  . Most correlated bigrams:
. fan cgi
. overall fair
For category 7
  . Most correlated unigrams:
. omake
. preliminary
  . Most correlated bigrams:
. enjoyed fan
. watch special
For category 8
  . Most correlated unigrams:
. shigofumi
. terrible
  . Most correlated bigrams:
. enjoyed series
. bit dry
For cate

In [15]:
from sklearn.model_selection import train_test_split

train_features, test_features = train_test_split(features, test_size=0.2, shuffle=False)
valid_features, test_features = train_test_split(test_features, test_size=0.5, shuffle=False)

train_labels, test_labels = train_test_split(labels, test_size=0.2, shuffle=False)
valid_labels, test_labels = train_test_split(test_labels, test_size=0.5, shuffle=False)

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lgr = LogisticRegression()
lgr.fit(train_features, train_labels)
predictions = lgr.predict(valid_features)
print(predictions)
print(accuracy_score(y_true=valid_labels, y_pred=predictions))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


['8' '8' '9' ... '6' '9' '8']
0.21819299323909036


In [30]:
predictions = predictions.astype(int)
predictions

array([8, 8, 9, ..., 6, 9, 8])

In [29]:
valid_labels = valid_labels.astype(int)
valid_labels

array([10,  9,  6, ...,  3,  7,  7])

In [35]:
exact_acc = 0
one_error_acc = 0
two_error_acc = 0
three_error_acc = 0
size = len(predictions)

for index, p in enumerate(predictions):
    true_label = valid_labels[index]
    if p == true_label:
        exact_acc += 1
        one_error_acc += 1
        two_error_acc += 1
        three_error_acc += 1
    elif p - true_label == 1 or p - true_label == -1:
        one_error_acc += 1
        two_error_acc += 1
        three_error_acc += 1
    elif p - true_label == 2 or p - true_label == -2:
        two_error_acc += 1
        three_error_acc += 1
    elif p - true_label == 3 or p - true_label == -3:
        three_error_acc += 1
                
print(f'Exact accuracy: {exact_acc / size}')
print(f'Accuracy missed by one: {one_error_acc / size}')
print(f'Accuracy missed by two: {two_error_acc / size}')
print(f'Accuracy missed by three: {three_error_acc / size}')



Exact accuracy: 0.21819299323909036
Accuracy missed by one: 0.5519360786724032
Accuracy missed by two: 0.7473878303626306
Accuracy missed by three: 0.8518746158574063
