# TfidfVectorizer (text features extraction) + Linear Regression for Anime Reviews Classification

In [3]:
import pandas as pd

In [4]:
df = pd.read_json('anime_reviews.json')

In [5]:
df = df.transpose()

In [6]:
from sklearn.utils import shuffle
df = shuffle(df)

### Modules to install

In [7]:
# !pip install spacy
# !python -m spacy download en_core_web_sm

## Data pre-processign

### Tokenization

In [8]:
import spacy
import string
import re

tok = spacy.load('en_core_web_sm')
def tokenize (text):
    # remove punctuation and numbers and set to lowercase
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
    nopunct = regex.sub(" ", text.lower())
    return [token.text for token in tok.tokenizer(nopunct)]

### Deletion of infrequent words

In [9]:
from collections import Counter

vocab = Counter()
for _, record in df.iterrows():
    vocab.update(tokenize(record['text']))

for word in list(vocab):
    if vocab[word] < 50:
        del vocab[word]

### Remove Stop Words

In [10]:
from spacy.lang.en.stop_words import STOP_WORDS

for word in list(vocab):
    if word in STOP_WORDS:
        del vocab[word]
        
del vocab['ep']
del vocab['eps']

In [11]:
def clean_text(text):
    tokenized = tokenize(text)
    for token in tokenized[:]:
        if token not in vocab:
            tokenized.remove(token)
    return ' '.join(tokenized)

## Data Cleaning

In [12]:
df['clean_text'] = df['text'].apply(lambda x: clean_text(x))

## Features extraction

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

'''
    sublinear_df => is set to True to use a logarithmic form for frequency.
    min_df => is the minimum numbers of documents a word must be present in to be kept.
    
'''

tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')

features = tfidf.fit_transform(df.clean_text).toarray()
labels = df.score.to_numpy()

## The most popular unigrams and bigrams

In [14]:
from sklearn.feature_selection import chi2
import numpy as np
N = 2
for category_id in ['1', '2','3','4','5','6','7','8','9','10']:
    features_chi2 = chi2(features, labels == category_id)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidf.get_feature_names())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    
    print(f'For category {category_id}')
    
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

For category 1
  . Most correlated unigrams:
. cu
. pathetic
  . Most correlated bigrams:
. green green
. overall pathetic
For category 2
  . Most correlated unigrams:
. terrible
. worst
  . Most correlated bigrams:
. animation pleasing
. worst anime
For category 3
  . Most correlated unigrams:
. poorly
. worst
  . Most correlated bigrams:
. disappointed watched
. overall poor
For category 4
  . Most correlated unigrams:
. bland
. boring
  . Most correlated bigrams:
. life won
. section analysis
For category 5
  . Most correlated unigrams:
. mediocrity
. mediocre
  . Most correlated bigrams:
. overall disappointing
. second commercial
For category 6
  . Most correlated unigrams:
. fair
. chirico
  . Most correlated bigrams:
. episode min
. overall fair
For category 7
  . Most correlated unigrams:
. omake
. preliminary
  . Most correlated bigrams:
. watch special
. enjoyed fan
For category 8
  . Most correlated unigrams:
. mediocre
. terrible
  . Most correlated bigrams:
. enjoyed serie

## Logistic Regression

In [15]:
from sklearn.model_selection import train_test_split

train_features, test_features = train_test_split(features, test_size=0.2, shuffle=False)
valid_features, test_features = train_test_split(test_features, test_size=0.5, shuffle=False)

train_labels, test_labels = train_test_split(labels, test_size=0.2, shuffle=False)
valid_labels, test_labels = train_test_split(test_labels, test_size=0.5, shuffle=False)

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lgr = LogisticRegression()
lgr.fit(train_features, train_labels)
predictions = lgr.predict(valid_features)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [20]:
predictions = predictions.astype(int)
valid_labels = valid_labels.astype(int)

## Accuracy

In [22]:
exact_acc = 0
one_error_acc = 0
two_error_acc = 0
three_error_acc = 0
size = len(predictions)

for index, p in enumerate(predictions):
    true_label = valid_labels[index]
    if p == true_label:
        exact_acc += 1
        one_error_acc += 1
        two_error_acc += 1
        three_error_acc += 1
    elif p - true_label == 1 or p - true_label == -1:
        one_error_acc += 1
        two_error_acc += 1
        three_error_acc += 1
    elif p - true_label == 2 or p - true_label == -2:
        two_error_acc += 1
        three_error_acc += 1
    elif p - true_label == 3 or p - true_label == -3:
        three_error_acc += 1
                
print(f'Exact accuracy: {exact_acc / size}')
print(f'Accuracy missed by one: {one_error_acc / size}')
print(f'Accuracy missed by two: {two_error_acc / size}')
print(f'Accuracy missed by three: {three_error_acc / size}')

Exact accuracy: 0.27473878303626303
Accuracy missed by one: 0.6724031960663799
Accuracy missed by two: 0.858020897357099
Accuracy missed by three: 0.9280885064535955


## Sources

https://towardsdatascience.com/a-comprehensive-introduction-to-different-types-of-convolutions-in-deep-learning-669281e58215