In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, hamming_loss, accuracy_score
import re
import nltk
from nltk.stem import WordNetLemmatizer

In [10]:
img_train = pd.read_pickle('train_images_valid.pickle')
img_test = pd.read_pickle('test_images_valid.pickle')

In [11]:
books_train = pd.read_pickle('all_books_train.pickle')
books_test = pd.read_pickle('all_books_test.pickle')

In [129]:
img_train.loc[2]

KeyError: 'the label [2] is not in the [index]'

In [12]:
print(books_train.shape)
print(books_test.shape)

(36389, 14)
(12131, 14)


In [13]:
books_train = books_train[books_train.index.isin(img_train.index)]
books_test = books_test[books_test.index.isin(img_test.index)]

In [14]:
print(books_train.shape)
print(books_test.shape)

(36298, 14)
(12096, 14)


In [15]:
def transform_genres(genre):
    genres = genre.split('|')
    return list(set(genres))

In [16]:
len(set([item for sublist in books_train['genres_cut'] for item in sublist]))

23

In [17]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "can not ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    pos = nltk.pos_tag(tokens)
    leave_tokens = [token[0] for token in pos if token[1].startswith('VB') or token[1].startswith('JJ') or 
                   token[1] =='NN' or token[1] =='NNS']
    return ' '.join(leave_tokens)
def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    tokens = text.split(' ')
    lemm_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemm_tokens)

In [18]:
def preprocess(text):
    clean = clean_text(text)
    token = tokenize(clean)
    lemm = lemmatize(token)
    return lemm

In [19]:
train = books_train[['book_desc', 'genres_cut']]
test = books_test[['book_desc', 'genres_cut']]

In [20]:
books_train['desc_proc'] = books_train['book_desc'].apply(preprocess)
books_test['desc_proc'] = books_test['book_desc'].apply(preprocess)

In [21]:
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(books_train['genres_cut'])
Y_train = multilabel_binarizer.transform(books_train['genres_cut'])
Y_test = multilabel_binarizer.transform(books_test['genres_cut'])

In [215]:
tfidf_vect = TfidfVectorizer(stop_words=stop_words, max_features=10000)
tfidf_vect.fit(books_train['desc_proc'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words={'i', 'me', 'than', 'whom', 'in', 'other', 'off', 'our', "it's", 'itself', 'on', 'isn', 'but', "don't", 'nor', 'through', "needn't", "you'd", 'with', 'will', "haven't", 'between', 'll', "mustn't", 'wouldn', 'during', "you'll", 'all', 'didn', 'm', 'myself', 'under', 'it', 'you', 'for', 'do...lves', 'this', 'of', 'here', 'then', "shouldn't", 'been', 'at', 'be', 'did', "wasn't", 'themselves'},
        strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [216]:
X_train = tfidf_vect.transform(books_train['desc_proc'])
X_test = tfidf_vect.transform(books_test['desc_proc'])

In [217]:
X_train.shape

(36298, 10000)

In [24]:
clf = OneVsRestClassifier(LogisticRegression())
clf.fit(X_train, Y_train)
Y_pred = clf.predict(X_test)
print(f1_score(y_true=Y_test, y_pred=Y_pred, average='micro'))
print(hamming_loss(y_true=Y_test, y_pred=Y_pred))

0.5936832886310143
0.08004442719116632


In [25]:
clf = OneVsRestClassifier(MultinomialNB())
clf.fit(X_train, Y_train)
Y_pred = clf.predict(X_test)
print(f1_score(y_true=Y_test, y_pred=Y_pred, average='micro'))
print(hamming_loss(y_true=Y_test, y_pred=Y_pred))

0.29523918255508613
0.11094217276282493


In [218]:
clf = OneVsRestClassifier(LinearSVC(class_weight='balanced'))
clf.fit(X_train, Y_train)
Y_pred = clf.predict(X_test)
print(f1_score(y_true=Y_test, y_pred=Y_pred, average='micro'))
print(hamming_loss(y_true=Y_test, y_pred=Y_pred))

0.6213280671439151
0.11027720266850702


In [27]:
clf = OneVsRestClassifier(RandomForestClassifier())
clf.fit(X_train, Y_train)
Y_pred = clf.predict(X_test)
print(f1_score(y_true=Y_test, y_pred=Y_pred, average='micro'))
print(hamming_loss(y_true=Y_test, y_pred=Y_pred))

0.40604204077321215
0.1010574821716126


In [28]:
from xgboost import XGBClassifier

In [29]:
clf = OneVsRestClassifier(XGBClassifier())
clf.fit(X_train, Y_train)
Y_pred = clf.predict(X_test)
print(f1_score(y_true=Y_test, y_pred=Y_pred, average='micro'))
print(hamming_loss(y_true=Y_test, y_pred=Y_pred))

0.481467030823254
0.09554002760524499


In [30]:
from sklearn.multioutput import ClassifierChain

In [31]:
classifier = ClassifierChain(LinearSVC(class_weight='balanced'), cv=11)
classifier.fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)
print(f1_score(y_true=Y_test, y_pred=Y_pred, average='micro'))
print(hamming_loss(y_true=Y_test, y_pred=Y_pred))

0.692465105776423
0.0783262882447665


In [32]:
multilabel_binarizer.classes_

array(['Adventure', 'Biography', 'Childrens', 'Classics', 'Contemporary',
       'Crime', 'Cultural', 'European Literature', 'Fantasy', 'Fiction',
       'Historical', 'History', 'Horror', 'Humor', 'Literature',
       'Mystery', 'Nonfiction', 'Novels', 'Paranormal', 'Romance',
       'Science Fiction', 'Thriller', 'Young Adult'], dtype=object)

In [33]:
def get_genres(y):
    indices = np.where(y == 1)[0]
    return multilabel_binarizer.classes_[indices]

In [34]:
get_genres(Y_pred[1])

array(['Fiction', 'History', 'Nonfiction'], dtype=object)

In [35]:
get_genres(Y_test[1])

array(['Biography', 'History', 'Nonfiction'], dtype=object)

In [36]:
test.iloc[1]

book_desc     On February 1, 1978, the first group of space ...
genres_cut                     [History, Biography, Nonfiction]
Name: 34116, dtype: object

In [41]:
from sklearn.model_selection import KFold

In [42]:
kf = KFold(n_splits=5)

In [48]:
import time

In [137]:
predictions = []
for train_index, test_index in kf.split(books_train, books_train['genres_cut']):
    print("TRAIN:", train_index, "TEST:", test_index)
    x_train, x_test = books_train.iloc[train_index], books_train.iloc[test_index]
    y_train, y_test = books_train['genres_cut'].iloc[train_index], books_train['genres_cut'].iloc[test_index]
    y_train_transformed = multilabel_binarizer.transform(y_train)
    y_test_transformed = multilabel_binarizer.transform(y_test)
    x_train_transformed = tfidf_vect.transform(x_train['desc_proc'])
    x_test_transformed = tfidf_vect.transform(x_test['desc_proc'])
    #print(x_train_transformed.shape)
    #print(y_train_transformed.shape)
    clf = OneVsRestClassifier(LinearSVC(class_weight='balanced'))
    start = time.time()
    clf.fit(x_train_transformed, y_train_transformed)
    end = time.time()
    print('Training time: ', (end-start)/60, 'minutes')
    y_pred = clf.predict(x_test_transformed)
    pred = pd.Series(list(y_pred))
    y_pred_idx = pd.DataFrame({'index': y_test.index, 'prediction': pred})
    y_pred_idx.set_index('index', inplace=True)
    print(y_pred_idx.shape)
    print('F1: ', f1_score(y_true=y_test_transformed, y_pred=y_pred, average='micro'))
    print('Hamming loss: ', hamming_loss(y_true=y_test_transformed, y_pred=y_pred))
    predictions.append(y_pred_idx)
    print(len(predictions))

TRAIN: [ 7260  7261  7262 ... 36295 36296 36297] TEST: [   0    1    2 ... 7257 7258 7259]
Training time:  0.6261144518852234 minutes
(7260, 1)
F1:  0.6790123456790124
Hamming loss:  0.07847646424721523
1
TRAIN: [    0     1     2 ... 36295 36296 36297] TEST: [ 7260  7261  7262 ... 14517 14518 14519]
Training time:  0.5848576823870341 minutes
(7260, 1)
F1:  0.6835745261414022
Hamming loss:  0.07778177027188884
2
TRAIN: [    0     1     2 ... 36295 36296 36297] TEST: [14520 14521 14522 ... 21777 21778 21779]
Training time:  0.47241026560465493 minutes
(7260, 1)
F1:  0.6827920172177656
Hamming loss:  0.07767397293088993
3
TRAIN: [    0     1     2 ... 36295 36296 36297] TEST: [21780 21781 21782 ... 29036 29037 29038]
Training time:  0.4216134707132975 minutes
(7259, 1)
F1:  0.6855982780549847
Hamming loss:  0.07698988362272921
4
TRAIN: [    0     1     2 ... 29036 29037 29038] TEST: [29039 29040 29041 ... 36295 36296 36297]
Training time:  0.41213842630386355 minutes
(7259, 1)
F1:  0.683

In [138]:
train_pred = pd.concat(predictions)

In [139]:
X_train.shape

(36298, 168622)

In [140]:
train_pred.shape

(36298, 1)

In [141]:
train_pred.head()

Unnamed: 0_level_0,prediction
index,Unnamed: 1_level_1
6377,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
19880,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ..."
23352,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
16021,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
49981,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."


In [142]:
pred = pd.Series(list(Y_pred))

In [143]:
test_pred = pd.DataFrame({'index': test.index, 'prediction': pred})

In [144]:
test['prediction'] = pred

In [145]:
test_pred.set_index('index', inplace=True)

In [149]:
train_pred.to_pickle('train_text_predictions.pickle')
test_pred.to_pickle('test_text_predictions.pickle')