In [179]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, hamming_loss
from sklearn.multioutput import ClassifierChain

In [53]:
# Text-model outputs for the train and test splits (later cells read their
# `prediction` column).
# NOTE(review): unpickling runs arbitrary code - only load artifacts you trust.
train_text, test_text = map(
    pd.read_pickle,
    ('train_text_predictions.pickle', 'test_text_predictions.pickle'),
)

In [54]:
# Image-model frames for both splits; later cells read their `genres_cut`
# (ground-truth labels) and `keras_pred` (predicted labels) columns.
train_img, test_img = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('train_with_new_feature.pickle', 'test_with_new_feature.pickle')
)

In [191]:
# Full book tables; only `book_rating` and `book_review_count` are used from
# them further down.
train_orig, test_orig = [
    pd.read_pickle(pickle_path)
    for pickle_path in ('all_books_train.pickle', 'all_books_test.pickle')
]

In [230]:
# Book metadata for exactly the rows that have image predictions.
#
# Selecting with `.loc[<image index>]` (instead of a boolean `isin` mask) returns
# the rows in the same order as `train_img` / `test_img`. That matters because the
# features derived from these frames are later passed positionally next to
# `y_train` / `y_test`, which are built from the image frames. A label missing
# from the book table now raises a KeyError instead of silently dropping the row.
#
# `.copy()` makes the result an independent frame, so adding the `predictions`
# column later does not write into a slice of the original table.
train_add = train_orig.loc[train_img.index, ['book_rating', 'book_review_count']].copy()
test_add = test_orig.loc[test_img.index, ['book_rating', 'book_review_count']].copy()

In [193]:
# Sanity check: (rows, columns) of the metadata subset; the row count should
# match train_img.shape, displayed in the next cell.
train_add.shape

(36298, 2)

In [194]:
# Sanity check: (rows, columns) of the image-prediction frame, for comparison
# with the metadata subset's row count.
train_img.shape

(36298, 3)

In [56]:
# Learn the genre vocabulary from the training labels only, then encode the
# ground-truth genres of both splits as binary indicator matrices.
multilabel_binarizer = MultiLabelBinarizer().fit(train_img['genres_cut'])
y_train, y_test = (
    multilabel_binarizer.transform(split_frame['genres_cut'])
    for split_frame in (train_img, test_img)
)

In [57]:
# Encode the image model's predicted genres with the same fitted binarizer, so
# their columns line up with y_train / y_test.
img_train_pred, img_test_pred = map(
    multilabel_binarizer.transform,
    (train_img['keras_pred'], test_img['keras_pred']),
)

In [58]:
# One row per test book: its original index label plus the binarized image
# prediction stored as a single array-valued cell.
test_img_final = pd.DataFrame(
    {
        'index': test_img.index,
        'keras_pred': [label_row for label_row in img_test_pred],
    }
)

In [59]:
# Promote the saved labels to the row index so the frame can be aligned with the
# text predictions. Not re-runnable: the 'index' column is consumed here.
test_img_final = test_img_final.set_index('index')

In [60]:
# Training counterpart of test_img_final: binarized image predictions keyed by
# the original row labels, built and indexed in a single expression.
train_img_final = pd.DataFrame(
    {'index': train_img.index, 'keras_pred': list(img_train_pred)}
).set_index('index')

In [82]:
# Put the image predictions and the text predictions side by side, matching rows
# on their index labels.
train = pd.concat([train_img_final, train_text], axis=1)
test = pd.concat([test_img_final, test_text], axis=1)

# The stacked features are later passed to the classifiers positionally next to
# y_train / y_test, which follow the row order of train_img / test_img. The
# concat is an outer join, so fail loudly if it added or reordered rows.
assert train.index.equals(train_img.index), 'train rows are not aligned with y_train'
assert test.index.equals(test_img.index), 'test rows are not aligned with y_test'

In [96]:
# Preview of the combined frame.
# NOTE(review): the recorded output includes `concat_pred`, which is created by
# the cell that follows this one in the file (In [95]) - the cells were executed
# out of order, so a fresh top-to-bottom run will not show that column here.
train.head()

Unnamed: 0_level_0,keras_pred,prediction,concat_pred
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6377,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
19880,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
23352,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
16021,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
49981,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."


In [95]:
def _stack_model_outputs(row):
    """Return one row's image and text prediction vectors joined end to end."""
    return np.concatenate((row.keras_pred, row.prediction))


# Feature vector for the stacking classifiers: [image prediction | text prediction].
train['concat_pred'] = train.apply(_stack_model_outputs, axis=1)
test['concat_pred'] = test.apply(_stack_model_outputs, axis=1)

In [97]:
# Stacking baseline: one logistic regression per genre, trained on the
# concatenated image + text prediction vectors.
clf = OneVsRestClassifier(LogisticRegression()).fit(train.concat_pred.tolist(), y_train)
y_pred = clf.predict(test.concat_pred.tolist())

# Micro-averaged F1 first, Hamming loss second - one value per line.
print(
    f1_score(y_test, y_pred, average='micro'),
    hamming_loss(y_test, y_pred),
    sep='\n',
)

0.6790762338765737
0.07467434437543133


In [167]:
# Rule-based combination: a genre is predicted when either the image model or
# the text model predicts it (element-wise sum, then clipped back to 0/1).
test['sum_pred'] = (test['keras_pred'] + test['prediction']).apply(
    lambda votes: np.array([int(vote > 0) for vote in votes])
)

In [150]:
# Plain Python list of the per-book union vectors, in the row order of `test`.
sum_list = list(test['sum_pred'])

In [153]:
# Stack the per-book vectors into one 2-D indicator matrix and map every row
# back to its tuple of genre names.
pred_genres = multilabel_binarizer.inverse_transform(
    np.asarray(sum_list)
)

In [154]:
# Re-encode the genre-name tuples as an indicator matrix with the fitted
# binarizer (round trip of the inverse_transform above), ready for scoring
# against y_test.
binary_pred_genres = multilabel_binarizer.transform(pred_genres)

In [171]:
# Store the re-encoded union prediction next to the other per-book columns.
# The Series must carry `test`'s own index: column assignment aligns on index
# labels, and a default 0..n-1 RangeIndex would not match the book ids, leaving
# the column NaN or attached to the wrong rows.
test['sum_pred_label'] = pd.Series(list(binary_pred_genres), index=test.index)

In [172]:
# Score the rule-based union of both models: micro-averaged F1, then Hamming
# loss, one value per line.
print(
    f1_score(y_test, binary_pred_genres, average='micro'),
    hamming_loss(y_test, binary_pred_genres),
    sep='\n',
)

0.6831722812449949
0.08532464918334484


In [173]:
# Same stacking setup with a class-balanced linear SVM per genre.
clf = OneVsRestClassifier(LinearSVC(class_weight='balanced'))
clf.fit(train['concat_pred'].tolist(), y_train)
y_pred = clf.predict(test['concat_pred'].tolist())

# Micro-F1 on the first line, Hamming loss on the second.
print('\n'.join(map(str, (
    f1_score(y_test, y_pred, average='micro'),
    hamming_loss(y_test, y_pred),
))))

0.5954288120007534
0.13125790775247298


In [174]:
# Same stacking setup with a multinomial naive Bayes model per genre.
clf = OneVsRestClassifier(MultinomialNB())
clf.fit([features for features in train.concat_pred], y_train)
y_pred = clf.predict([features for features in test.concat_pred])

# Micro-F1, then Hamming loss.
print(f1_score(y_test, y_pred, average='micro'), hamming_loss(y_test, y_pred), sep='\n')

0.6407013185013694
0.09148550724637682


In [175]:
# Same stacking setup with a random forest per genre.
# The forest is stochastic (bootstrap samples and feature sub-sampling), so a
# fixed seed is needed for this score to be reproducible across runs. Re-run the
# cell to refresh the recorded numbers, which came from an unseeded run.
clf = OneVsRestClassifier(RandomForestClassifier(random_state=42))
clf.fit(list(train.concat_pred), y_train)
y_pred = clf.predict(list(test.concat_pred))

# Micro-averaged F1, then Hamming loss.
print(f1_score(y_true=y_test, y_pred=y_pred, average='micro'))
print(hamming_loss(y_true=y_test, y_pred=y_pred))

0.6525535038244611
0.08082082470669427


In [182]:
# Classifier chain: each genre's logistic regression also receives the earlier
# genres in the chain as inputs; cv=11 makes those chained inputs
# cross-validated predictions during fitting.
# NOTE(review): the choice of 11 folds is not explained anywhere visible.
clf = ClassifierChain(LogisticRegression(), cv=11).fit(train.concat_pred.tolist(), y_train)
y_pred = clf.predict(test.concat_pred.tolist())

# Micro-F1 first, Hamming loss second.
print(
    f1_score(y_test, y_pred, average='micro'),
    hamming_loss(y_test, y_pred),
    sep='\n',
)

0.6792132402890138
0.07468512767425811


In [231]:
# Attach the stacked prediction vectors to the metadata frames; rows are matched
# on index labels. `assign` returns a new frame rather than mutating in place.
train_add = train_add.assign(predictions=train['concat_pred'])
test_add = test_add.assign(predictions=test['concat_pred'])

In [200]:
# Preview: metadata columns plus the stacked prediction vector per training book.
train_add.head()

Unnamed: 0,book_rating,book_review_count,predictions
6377,4.02,1159,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
19880,3.74,161,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
23352,4.04,41,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
16021,3.35,18,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
49981,3.89,154,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."


In [216]:
# Preview of the test-side metadata frame.
# NOTE(review): the recorded output lists a `book_rating_count` column and no
# `book_rating` / `predictions`, which does not match what the visible cells
# build - it appears to come from an earlier run. Re-run to refresh it.
test_add.head()

Unnamed: 0,book_rating_count,book_review_count
9102,5061,402
34116,3358,299
53879,10689,1513
30327,87,8
36487,1578,70


In [232]:
# Extend every test book's stacked prediction vector with one extra feature,
# its rating, appended at the end.
test_transformed = test_add.apply(
    lambda book: np.append(book['predictions'], [book['book_rating']]),
    axis=1,
)

In [233]:
# Extend every training book's stacked prediction vector with one extra
# feature, its rating, appended at the end.
train_transformed = train_add.apply(
    lambda book: np.append(book['predictions'], [book['book_rating']]),
    axis=1,
)

In [235]:
# Stacking with the rating-augmented features: one logistic regression per
# genre on [image prediction | text prediction | book_rating].
clf = OneVsRestClassifier(LogisticRegression()).fit(train_transformed.tolist(), y_train)
y_pred = clf.predict(test_transformed.tolist())

# Micro-averaged F1 first, Hamming loss second - one value per line.
print(
    f1_score(y_test, y_pred, average='micro'),
    hamming_loss(y_test, y_pred),
    sep='\n',
)

0.6787418655097613
0.07452697262479871
