# Anime Recommendation System
### Predicting which animes you'll like
---

## 1.0 Imports

In [66]:
#Data extraction and manipulation
import pandas as pd 
import pymysql as pml
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
import numpy as np

#Machine Learning Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

#Evaluation Metrics
from sklearn.metrics import roc_auc_score, average_precision_score

#Data Visualization
from sklearn.tree import plot_tree
import matplotlib.pylab as plt
import seaborn as sns

# 2.0 Helper Functions

In [67]:
# show up to 70 columns when a DataFrame is displayed
pd.set_option('display.max_columns', 70)

# show up to 70 characters per cell before long values are cropped
pd.set_option("display.max_colwidth", 70)

# sets seaborn default style
sns.set_style('white')

# 3.0 Check the Dataset

## 3.1 Data Dictionary

The dataset I am using comes from MyAnimeList: https://myanimelist.net/topanime.php

It was extracted via the API, filtered, and stored in a database in these notebooks:


 |  **Feature** 	|                  **Description**                 	|
 |:------------:	|:------------------------------------------------:	|
 |   anime_id   	|          An id that represents an anime          	|
 |     title    	|                  The anime title                 	|
 |     genre    	|              The main genre of anime             	|
 |  mean_score  	|          Average anime viewership score          	|
 |  media_type  	| Anime type (movie, tv, special, ova, ona, music) 	|
 | num_episodes 	|              The number of episodes              	|
 |    picture   	|       A link to view the anime front cover       	|
 |    rating    	|        Anime rating (pg_13, r, g, r+, pg)        	|
 |  start_date  	|                   Release date                   	|
 |   end_date   	|     Completion date (may be an expected date)    	|
 |       y      	|      My interest in anime (0 - no, 1 - yes)      	| 


## 3.2 Loading the dataset

In [68]:
# Load the hand-labelled animes (first CSV column is the index) and keep only
# the rows that actually carry a label in `y`.
df_animes_with_labels = (
    pd.read_csv('animes_with_label.csv', index_col=0)
      .dropna(subset=['y'])
)
df_animes_with_labels.shape

(605, 11)

In [69]:
# Load the extra batch labelled during active learning, again keeping only the
# rows whose `y` label is present.
df_active_learning = (
    pd.read_csv("active_label_done.csv", index_col=0)
      .dropna(subset=['y'])
)
df_active_learning.shape

(200, 12)

In [70]:
def data_split(X, y, test_size, random_state=42):
    """Split features and labels into reproducible train / test partitions.

    Thin wrapper around ``train_test_split`` with a fixed default seed.

    Parameters
    ----------
    X : features to split.
    y : labels, split alongside ``X``.
    test_size : forwarded unchanged to ``train_test_split``.
    random_state : seed forwarded to ``train_test_split``. Defaults to 42,
        the value that was previously hard-coded, so existing calls behave
        exactly as before.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

def X_split(X, test_size, random_state=42):
    """Split a single array-like into reproducible train / test partitions.

    Used for columns (e.g. the titles) that are split separately from the
    numeric features.

    Parameters
    ----------
    X : data to split.
    test_size : forwarded unchanged to ``train_test_split``.
    random_state : seed forwarded to ``train_test_split``. Defaults to 42,
        the value that was previously hard-coded.

    Returns
    -------
    tuple
        ``(X_train, X_test)``.

    Notes
    -----
    The result is only expected to line up row-for-row with the output of
    ``data_split`` when both are called on data with the same number of rows,
    the same ``test_size`` and the same ``random_state``.
    """
    X_train, X_test = train_test_split(
        X, test_size=test_size, random_state=random_state
    )
    return X_train, X_test

In [71]:
def word_vectorizer(word_train, word_test, min_df, ngram_range=(1, 1)):
    """Turn raw text into TF-IDF features without leaking test vocabulary.

    The vectorizer is fit on ``word_train`` only and then applied to
    ``word_test``, so both outputs share the training vocabulary.

    Parameters
    ----------
    word_train : iterable of str used to fit the vectorizer.
    word_test : iterable of str transformed with the fitted vectorizer.
    min_df : forwarded to ``TfidfVectorizer(min_df=...)``.
    ngram_range : forwarded to ``TfidfVectorizer(ngram_range=...)``. Defaults
        to ``(1, 1)`` (unigrams, the library default), so existing
        three-argument calls are unchanged.

    Returns
    -------
    tuple
        ``(word_bow_train, word_bow_test)``.
    """
    word_vec = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)
    word_bow_train = word_vec.fit_transform(word_train)
    word_bow_test = word_vec.transform(word_test)

    return word_bow_train, word_bow_test

In [72]:
#Split the data with labels
# Only the two numeric columns go into X here; the title text is vectorised
# separately in the following cells.
X = df_animes_with_labels[['mean_score', 'num_episodes']]
y = df_animes_with_labels['y'].copy()

# Hold out half of the labelled rows for evaluation.
X_train, X_test, y_train, y_test = data_split(X, y, 0.5)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((302, 2), (303, 2), (302,), (303,))

In [73]:
#Split the titles in training and test
# Same test_size (0.5) and the same fixed seed as data_split above, so the
# titles are expected to line up row-for-row with X_train / X_test.
title_train, title_test = X_split(df_animes_with_labels['title'], 0.5)

In [74]:
# TF-IDF features for the titles; the vocabulary is learned from the training
# titles only (min_df=1).
title_bow_train, title_bow_test = word_vectorizer(title_train, title_test, 1)

In [75]:
# Final design matrices: the numeric columns followed by the title TF-IDF
# columns, stacked side by side into one sparse matrix.
Xtrain = hstack([X_train, title_bow_train])
Xtest = hstack([X_test, title_bow_test])

In [76]:
#Concat the new data to test set
# The active-learning rows are appended to the held-out half only; the
# training partition is left untouched.
X_test_plus = pd.concat([X_test, df_active_learning[['mean_score', 'num_episodes']]])
y_test_plus = pd.concat([y_test, df_active_learning['y']])
title_test_plus = pd.concat([title_test, df_active_learning['title']])

In [77]:
# Re-fits the vectorizer on the same training titles (so title_bow_train is
# recomputed) and transforms the enlarged test set with that vocabulary.
title_bow_train, title_bow_test_plus = word_vectorizer(title_train, title_test_plus, 1)

In [78]:
# Enlarged test matrix: numeric columns + title TF-IDF columns.
Xtest_plus = hstack([X_test_plus, title_bow_test_plus])

In [79]:
# Baseline random forest, trained on the training half and scored on the
# enlarged test set; column 1 of predict_proba is the probability of y == 1.
mdl = RandomForestClassifier(n_estimators = 1000, random_state= 0, class_weight="balanced")
mdl.fit(Xtrain, y_train)
probability_score_test_plus = mdl.predict_proba(Xtest_plus)[:,1]

In [80]:
# Prints average precision first, then ROC AUC.
print(average_precision_score(y_test_plus, probability_score_test_plus), 
      roc_auc_score(y_test_plus, probability_score_test_plus))

0.6149893004819301 0.7838457336818696


In [81]:
#Testing min_samples_leaf = 2
# Same forest as the baseline except for min_samples_leaf; note that `mdl` and
# `probability_score_test_plus` are overwritten by this experiment.
mdl = RandomForestClassifier(n_estimators = 1000, random_state= 0, min_samples_leaf=2, class_weight="balanced")
mdl.fit(Xtrain, y_train)
probability_score_test_plus = mdl.predict_proba(Xtest_plus)[:,1]

# average precision, then ROC AUC
print(average_precision_score(y_test_plus, probability_score_test_plus), 
      roc_auc_score(y_test_plus, probability_score_test_plus))

0.4808257084083387 0.6906577971987048


In [82]:
#Testing min_samples_leaf = 2 and n_estimators = 500
# Overwrites `mdl` and `probability_score_test_plus` once more.
mdl = RandomForestClassifier(n_estimators = 500, random_state= 0, min_samples_leaf=2, class_weight="balanced")
mdl.fit(Xtrain, y_train)
probability_score_test_plus = mdl.predict_proba(Xtest_plus)[:,1]

# average precision, then ROC AUC
print(average_precision_score(y_test_plus, probability_score_test_plus), 
      roc_auc_score(y_test_plus, probability_score_test_plus))

0.47144530034091076 0.6842300339432719


In [83]:
#Testing TfidfVectorizer ngram_range = (1,3)

# NOTE(review): this redefinition shadows the word_vectorizer declared earlier
# in the notebook; from this cell on, every call uses 1- to 3-grams.
def word_vectorizer(word_train, word_test, min_df):
    """Fit a 1-3 gram TF-IDF vectorizer on `word_train` and transform both inputs.

    Returns the tuple (word_bow_train, word_bow_test).
    """
    word_vec = TfidfVectorizer(min_df= min_df, ngram_range=(1,3))
    word_bow_train = word_vec.fit_transform(word_train)
    word_bow_test = word_vec.transform(word_test)
    
    return word_bow_train, word_bow_test

#Split the titles in training and test
title_train, title_test = X_split(df_animes_with_labels['title'], 0.5)

title_bow_train, title_bow_test = word_vectorizer(title_train, title_test, 1)

# numeric columns + title n-gram columns
Xtrain = hstack([X_train, title_bow_train])
Xtest = hstack([X_test, title_bow_test])

#Concat the new data to test set
X_test_plus = pd.concat([X_test, df_active_learning[['mean_score', 'num_episodes']]])
y_test_plus = pd.concat([y_test, df_active_learning['y']])
title_test_plus = pd.concat([title_test, df_active_learning['title']])

# Second fit on the same training titles; only the transformed test part
# (now including the active-learning titles) is new.
title_bow_train, title_bow_test_plus = word_vectorizer(title_train, title_test_plus, 1)

Xtest_plus = hstack([X_test_plus, title_bow_test_plus])

# Same baseline forest as before, now on the n-gram features. The globals
# Xtrain, Xtest_plus and y_test_plus set here are what the later cells consume.
mdl = RandomForestClassifier(n_estimators = 1000, random_state= 0, class_weight="balanced")
mdl.fit(Xtrain, y_train)
probability_score_test_plus = mdl.predict_proba(Xtest_plus)[:,1]

# average precision, then ROC AUC
print(average_precision_score(y_test_plus, probability_score_test_plus), 
      roc_auc_score(y_test_plus, probability_score_test_plus))

0.6876159471901129 0.8117611486091061


## LightGBM

In [84]:
from lightgbm import LGBMClassifier

In [85]:
# LightGBM with default hyper-parameters (apart from class weighting) on the
# current Xtrain / Xtest_plus; overwrites `mdl` and the test probabilities.
mdl = LGBMClassifier(random_state=0, class_weight= 'balanced', n_jobs=6)
mdl.fit(Xtrain, y_train)
probability_score_test_plus = mdl.predict_proba(Xtest_plus)[:,1]



In [86]:
# average precision, then ROC AUC, for the untuned LightGBM model
print(average_precision_score(y_test_plus, probability_score_test_plus), 
      roc_auc_score(y_test_plus, probability_score_test_plus))

0.4263935604056547 0.6575533533611642


## Bayesian Optimization

In [87]:
from skopt import forest_minimize

In [146]:
def tune_lgbm(params):
    """Objective function for the hyper-parameter search over LightGBM + TF-IDF.

    Builds title TF-IDF features with the requested ``min_df`` / n-gram
    settings, trains an ``LGBMClassifier`` on the training half and scores it
    on the held-out half extended with the active-learning rows.

    Parameters
    ----------
    params : sequence
        Positional values, in this order: learning rate, max_depth,
        min_child_samples, subsample, colsample_bytree, n_estimators,
        min_df, maximum n-gram length.

    Returns
    -------
    float
        The negated average precision on the extended test set (negated
        because the optimiser minimises).

    Notes
    -----
    Reads the notebook globals ``df_animes_with_labels``,
    ``df_active_learning``, ``X_train``, ``X_test``, ``y_train`` and
    ``y_test``. Prints ``params`` and the ROC AUC as a progress log.
    NOTE(review): the score being optimised is computed on the same data that
    is later used to report the final metrics, so those metrics are likely
    optimistic.
    """
    print(params)
    # Slice so that extra trailing entries are tolerated, as with plain indexing.
    (lr, max_depth, min_child_samples, subsample,
     colsample_bytree, n_estimators, min_df, max_ngram) = params[:8]
    ngram_range = (1, max_ngram)

    title_train, title_test = X_split(df_animes_with_labels['title'], 0.5)

    #Concat the new data to test set
    X_test_plus = pd.concat([X_test, df_active_learning[['mean_score', 'num_episodes']]])
    y_test_plus = pd.concat([y_test, df_active_learning['y']])
    title_test_plus = pd.concat([title_test, df_active_learning['title']])

    # Fit the vectorizer once on the training titles; the same vocabulary is
    # then applied to the extended test titles.
    word_vec = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)
    word_bow_train = word_vec.fit_transform(title_train)
    word_bow_test_plus = word_vec.transform(title_test_plus)

    Xtrain = hstack([X_train, word_bow_train])
    Xtest_plus = hstack([X_test_plus, word_bow_test_plus])

    # num_leaves is tied to max_depth (2 ** depth is the most leaves a tree of
    # that depth can have); bagging_freq=1 is set so that `subsample` is
    # actually applied (LightGBM only bags rows when bagging_freq > 0).
    mdl = LGBMClassifier(learning_rate=lr, num_leaves=2 ** max_depth, max_depth=max_depth,
                         min_child_samples=min_child_samples, subsample=subsample,
                         colsample_bytree=colsample_bytree, bagging_freq=1,
                         n_estimators=n_estimators, random_state=0,
                         class_weight="balanced", n_jobs=6)

    mdl.fit(Xtrain, y_train)

    probability_score_test_plus = mdl.predict_proba(Xtest_plus)[:,1]

    print(roc_auc_score(y_test_plus, probability_score_test_plus))

    return -average_precision_score(y_test_plus, probability_score_test_plus)


# Search space, one entry per positional value that tune_lgbm reads from
# `params` (same order).
space = [(1e-3, 1e-1, 'log-uniform'), # lr
          (1, 10), # max_depth
          (1, 20), # min_child_samples
          (0.05, 1.), # subsample
          (0.05, 1.), # colsample_bytree
          (100,1000), # n_estimators
          (1,5), # min_df
          (1,5)] # upper bound of ngram_range, i.e. (1, value)

# 50 evaluations in total, the first 20 at random points (see the log below).
# NOTE(review): newer scikit-optimize releases call this argument
# n_initial_points — confirm against the installed version.
res = forest_minimize(tune_lgbm, space, random_state=160745, n_random_starts=20, n_calls=50, verbose=1)

Iteration No: 1 started. Evaluating function at random point.
[0.009944912110647982, 5, 1, 0.4677107511929402, 0.49263223036174764, 272, 3, 1]




0.7203484062268347
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.3101
Function value obtained: -0.5002
Current minimum: -0.5002
Iteration No: 2 started. Evaluating function at random point.
[0.053887464791860025, 1, 15, 0.7437489153990157, 0.8675167974293533, 549, 3, 4]
0.6912137645819516
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.0908
Function value obtained: -0.4697
Current minimum: -0.5002
Iteration No: 3 started. Evaluating function at random point.
[0.004151454520895999, 6, 20, 0.8682075103820793, 0.9491436163200662, 411, 4, 3]




0.672164566345441
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.1027
Function value obtained: -0.4806
Current minimum: -0.5002
Iteration No: 4 started. Evaluating function at random point.
[0.0014099928811969545, 9, 9, 0.6502182010234373, 0.6866210554187129, 828, 5, 2]




0.6751589871639812
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 0.2803
Function value obtained: -0.4497
Current minimum: -0.5002
Iteration No: 5 started. Evaluating function at random point.
[0.08530558241838007, 8, 19, 0.2137736299768322, 0.1313765544201984, 961, 4, 1]
0.6484140298856853
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 0.1007
Function value obtained: -0.4079
Current minimum: -0.5002
Iteration No: 6 started. Evaluating function at random point.
[0.003567949451535685, 10, 19, 0.7232951768944309, 0.7298538828427115, 939, 4, 3]




0.6789142054543327
Iteration No: 6 ended. Evaluation done at random point.
Time taken: 0.2802
Function value obtained: -0.4869
Current minimum: -0.5002
Iteration No: 7 started. Evaluating function at random point.
[0.014828577273549474, 7, 1, 0.18428087097824575, 0.3261556557915816, 274, 1, 2]




0.7243084546057508
Iteration No: 7 ended. Evaluation done at random point.
Time taken: 0.3102
Function value obtained: -0.5784
Current minimum: -0.5784
Iteration No: 8 started. Evaluating function at random point.
[0.0015212976972079912, 3, 12, 0.44234694306528044, 0.399351303640462, 272, 3, 5]
0.6778510397565448
Iteration No: 8 ended. Evaluation done at random point.
Time taken: 0.0688
Function value obtained: -0.4689
Current minimum: -0.5784
Iteration No: 9 started. Evaluating function at random point.
[0.01946212855369041, 9, 18, 0.5235636153223084, 0.6728679300083596, 747, 4, 5]




0.672047520580547
Iteration No: 9 ended. Evaluation done at random point.
Time taken: 0.1606
Function value obtained: -0.4825
Current minimum: -0.5784
Iteration No: 10 started. Evaluating function at random point.
[0.0012116790683302117, 3, 2, 0.06616307483844217, 0.23025600705315752, 677, 2, 5]




0.6857808903281183
Iteration No: 10 ended. Evaluation done at random point.
Time taken: 0.1945
Function value obtained: -0.4754
Current minimum: -0.5784
Iteration No: 11 started. Evaluating function at random point.
[0.0053139776214487944, 6, 9, 0.14251441334450304, 0.8175761405215897, 297, 1, 5]
0.6619815847996566
Iteration No: 11 ended. Evaluation done at random point.
Time taken: 0.0878
Function value obtained: -0.4442
Current minimum: -0.5784
Iteration No: 12 started. Evaluating function at random point.
[0.0068572961982704935, 10, 5, 0.2390386584472456, 0.49053406102209746, 176, 2, 4]
0.6854297530334361
Iteration No: 12 ended. Evaluation done at random point.
Time taken: 0.0908
Function value obtained: -0.4600
Current minimum: -0.5784
Iteration No: 13 started. Evaluating function at random point.
[0.00781968225875022, 3, 4, 0.7078936710077383, 0.31818755505678337, 275, 4, 4]




0.6810405368499084
Iteration No: 13 ended. Evaluation done at random point.
Time taken: 0.0768
Function value obtained: -0.4638
Current minimum: -0.5784
Iteration No: 14 started. Evaluating function at random point.
[0.017293945600511968, 2, 15, 0.9007557574888567, 0.41026441194439994, 316, 5, 1]
0.6884826967344231
Iteration No: 14 ended. Evaluation done at random point.
Time taken: 0.0668
Function value obtained: -0.4623
Current minimum: -0.5784
Iteration No: 15 started. Evaluating function at random point.
[0.012250750764764855, 8, 6, 0.5976582413192033, 0.2474882432951916, 516, 4, 4]




0.6473801256291211
Iteration No: 15 ended. Evaluation done at random point.
Time taken: 0.1805
Function value obtained: -0.4069
Current minimum: -0.5784
Iteration No: 16 started. Evaluating function at random point.
[0.018353598126553926, 4, 3, 0.47305622526323254, 0.1404164811277527, 133, 4, 1]
0.6726327494050173
Iteration No: 16 ended. Evaluation done at random point.
Time taken: 0.0588
Function value obtained: -0.4743
Current minimum: -0.5784
Iteration No: 17 started. Evaluating function at random point.
[0.0010383234748454694, 9, 19, 0.9256771571832196, 0.9321438677645206, 312, 4, 3]
0.6504818383988139
Iteration No: 17 ended. Evaluation done at random point.
Time taken: 0.1067
Function value obtained: -0.4417
Current minimum: -0.5784
Iteration No: 18 started. Evaluating function at random point.
[0.004955229758078229, 5, 5, 0.06939551310802591, 0.4193273080472823, 725, 4, 1]




0.6574460614100114
Iteration No: 18 ended. Evaluation done at random point.
Time taken: 0.1047
Function value obtained: -0.4520
Current minimum: -0.5784
Iteration No: 19 started. Evaluating function at random point.
[0.0699516121742407, 9, 10, 0.6477856515609233, 0.8594430701440198, 616, 1, 1]




0.6402208263431001
Iteration No: 19 ended. Evaluation done at random point.
Time taken: 0.2134
Function value obtained: -0.3851
Current minimum: -0.5784
Iteration No: 20 started. Evaluating function at random point.
[0.0014752743467850462, 5, 4, 0.9747950537021096, 0.982207187458162, 909, 2, 4]




0.6419179899340642
Iteration No: 20 ended. Evaluation done at random point.
Time taken: 0.5136
Function value obtained: -0.4169
Current minimum: -0.5784
Iteration No: 21 started. Searching for the next optimal point.
[0.014630845775527177, 3, 1, 0.4439621581317118, 0.16536888986349557, 240, 2, 2]




0.7322090437361009
Iteration No: 21 ended. Search finished for the next optimal point.
Time taken: 0.4328
Function value obtained: -0.5054
Current minimum: -0.5784
Iteration No: 22 started. Searching for the next optimal point.
[0.09367977466191704, 10, 1, 0.3747625359026133, 0.3632568406893375, 162, 2, 1]




0.7425675939292263
Iteration No: 22 ended. Search finished for the next optimal point.
Time taken: 0.6074
Function value obtained: -0.5542
Current minimum: -0.5784
Iteration No: 23 started. Searching for the next optimal point.
[0.056045872522138745, 10, 1, 0.3544846355258283, 0.7872021923747273, 390, 2, 1]




0.6991338613397838
Iteration No: 23 ended. Search finished for the next optimal point.
Time taken: 1.0412
Function value obtained: -0.4789
Current minimum: -0.5784
Iteration No: 24 started. Searching for the next optimal point.
[0.03169711632063249, 9, 1, 0.29258986373177887, 0.18479415219761008, 544, 2, 1]




0.7624751277749601
Iteration No: 24 ended. Search finished for the next optimal point.
Time taken: 0.9141
Function value obtained: -0.6250
Current minimum: -0.6250
Iteration No: 25 started. Searching for the next optimal point.
[0.04220593701752512, 9, 1, 0.05777899118046498, 0.3451859028481066, 730, 2, 1]




0.7325601810307829
Iteration No: 25 ended. Search finished for the next optimal point.
Time taken: 0.8165
Function value obtained: -0.5755
Current minimum: -0.6250
Iteration No: 26 started. Searching for the next optimal point.
[0.08360760623245106, 10, 1, 0.094686171960567, 0.16976906951670082, 751, 2, 1]




0.7502926144122353
Iteration No: 26 ended. Search finished for the next optimal point.
Time taken: 0.9547
Function value obtained: -0.5790
Current minimum: -0.6250
Iteration No: 27 started. Searching for the next optimal point.
[0.09512642699731642, 8, 3, 0.2175840449302332, 0.06385779506370344, 706, 1, 1]
0.6728278256798408




Iteration No: 27 ended. Search finished for the next optimal point.
Time taken: 0.4222
Function value obtained: -0.4368
Current minimum: -0.6250
Iteration No: 28 started. Searching for the next optimal point.
[0.07886687563185459, 8, 1, 0.05332338504830386, 0.19959977040464572, 944, 3, 1]




0.7399730794740743
Iteration No: 28 ended. Search finished for the next optimal point.
Time taken: 0.7852
Function value obtained: -0.5743
Current minimum: -0.6250
Iteration No: 29 started. Searching for the next optimal point.
[0.030697157994527144, 8, 1, 0.22491316267334754, 0.25231269955818936, 538, 2, 1]




0.7447329405797667
Iteration No: 29 ended. Search finished for the next optimal point.
Time taken: 0.7775
Function value obtained: -0.6081
Current minimum: -0.6250
Iteration No: 30 started. Searching for the next optimal point.
[0.04023975317126381, 7, 1, 0.20726009788784033, 0.9068128067837882, 773, 1, 2]




0.6927938824080216
Iteration No: 30 ended. Search finished for the next optimal point.
Time taken: 1.0396
Function value obtained: -0.4946
Current minimum: -0.6250
Iteration No: 31 started. Searching for the next optimal point.
[0.0013763369501059444, 9, 1, 0.15556074244318452, 0.26500532973317575, 706, 1, 1]




0.7443915570988258
Iteration No: 31 ended. Search finished for the next optimal point.
Time taken: 1.0176
Function value obtained: -0.5769
Current minimum: -0.6250
Iteration No: 32 started. Searching for the next optimal point.
[0.0018667577225655752, 9, 1, 0.31253568370797274, 0.18846314533375413, 334, 1, 1]




0.759539229838867
Iteration No: 32 ended. Search finished for the next optimal point.
Time taken: 0.7807
Function value obtained: -0.6191
Current minimum: -0.6250
Iteration No: 33 started. Searching for the next optimal point.
[0.00397568506427144, 9, 1, 0.933250245070084, 0.24606019404825202, 607, 1, 1]




0.7261714330303151
Iteration No: 33 ended. Search finished for the next optimal point.
Time taken: 0.8783
Function value obtained: -0.5353
Current minimum: -0.6250
Iteration No: 34 started. Searching for the next optimal point.
[0.026981405870830887, 9, 3, 0.2854969115593755, 0.1778067678790748, 549, 2, 1]




0.651271897311849
Iteration No: 34 ended. Search finished for the next optimal point.
Time taken: 0.4360
Function value obtained: -0.4516
Current minimum: -0.6250
Iteration No: 35 started. Searching for the next optimal point.
[0.06490061958007427, 9, 1, 0.7734106642364434, 0.1509232517332273, 349, 1, 1]




0.782967890445164
Iteration No: 35 ended. Search finished for the next optimal point.
Time taken: 0.5718
Function value obtained: -0.6125
Current minimum: -0.6250
Iteration No: 36 started. Searching for the next optimal point.
[0.011374813810910588, 9, 1, 0.8597550421933585, 0.11037301644825068, 224, 2, 1]




0.7321310132261715
Iteration No: 36 ended. Search finished for the next optimal point.
Time taken: 0.5147
Function value obtained: -0.5464
Current minimum: -0.6250
Iteration No: 37 started. Searching for the next optimal point.
[0.001162454682124478, 8, 1, 0.28805374087366936, 0.22706766606639645, 538, 5, 1]




0.7557352424798096
Iteration No: 37 ended. Search finished for the next optimal point.
Time taken: 0.8358
Function value obtained: -0.6085
Current minimum: -0.6250
Iteration No: 38 started. Searching for the next optimal point.
[0.026731709830311874, 10, 1, 0.11960906129605038, 0.06296434665533734, 314, 1, 1]




0.7270492762670205
Iteration No: 38 ended. Search finished for the next optimal point.
Time taken: 0.6720
Function value obtained: -0.5664
Current minimum: -0.6250
Iteration No: 39 started. Searching for the next optimal point.
[0.03694030310870213, 8, 1, 0.6854917493644457, 0.1709621564218445, 353, 3, 1]




0.7799442081854004
Iteration No: 39 ended. Search finished for the next optimal point.
Time taken: 0.4867
Function value obtained: -0.5988
Current minimum: -0.6250
Iteration No: 40 started. Searching for the next optimal point.
[0.03150120510533658, 8, 1, 0.4706176993329644, 0.1635953701709405, 476, 2, 1]




0.7814170340603175
Iteration No: 40 ended. Search finished for the next optimal point.
Time taken: 0.7148
Function value obtained: -0.6197
Current minimum: -0.6250
Iteration No: 41 started. Searching for the next optimal point.
[0.04340553830908002, 8, 2, 0.2494793984085939, 0.2175853075134747, 426, 2, 1]




0.6683605789863837
Iteration No: 41 ended. Search finished for the next optimal point.
Time taken: 0.4590
Function value obtained: -0.4497
Current minimum: -0.6250
Iteration No: 42 started. Searching for the next optimal point.
[0.03721378650814084, 8, 1, 0.7964705729491824, 0.20236642433156526, 509, 4, 1]




0.7772424017790956
Iteration No: 42 ended. Search finished for the next optimal point.
Time taken: 0.6064
Function value obtained: -0.5802
Current minimum: -0.6250
Iteration No: 43 started. Searching for the next optimal point.
[0.02273874344391386, 7, 1, 0.30048290824472407, 0.17788929733848896, 789, 4, 1]




0.7630896180406539
Iteration No: 43 ended. Search finished for the next optimal point.
Time taken: 0.9766
Function value obtained: -0.6203
Current minimum: -0.6250
Iteration No: 44 started. Searching for the next optimal point.
[0.03624233745052156, 1, 1, 0.6373993924228265, 0.1051699718471863, 534, 2, 1]
0.711784557762085




Iteration No: 44 ended. Search finished for the next optimal point.
Time taken: 0.3680
Function value obtained: -0.4824
Current minimum: -0.6250
Iteration No: 45 started. Searching for the next optimal point.
[0.07615773360210461, 9, 1, 0.29021334224322437, 0.17713551955963808, 126, 3, 1]




0.7642405680621123
Iteration No: 45 ended. Search finished for the next optimal point.
Time taken: 0.4408
Function value obtained: -0.6315
Current minimum: -0.6315
Iteration No: 46 started. Searching for the next optimal point.
[0.018659851666110786, 8, 2, 0.5641663128406718, 0.17026065575328703, 409, 4, 1]
0.6811283211735788




Iteration No: 46 ended. Search finished for the next optimal point.
Time taken: 0.4069
Function value obtained: -0.4731
Current minimum: -0.6315
Iteration No: 47 started. Searching for the next optimal point.
[0.0033016096364467317, 6, 1, 0.3018476035328086, 0.1635295101409761, 838, 3, 1]




0.7562619484218329
Iteration No: 47 ended. Search finished for the next optimal point.
Time taken: 0.8912
Function value obtained: -0.6083
Current minimum: -0.6315
Iteration No: 48 started. Searching for the next optimal point.
[0.03397633790553571, 8, 1, 0.2782708974343007, 0.2583747641824502, 497, 5, 1]




0.7452596465217901
Iteration No: 48 ended. Search finished for the next optimal point.
Time taken: 0.7511
Function value obtained: -0.5878
Current minimum: -0.6315
Iteration No: 49 started. Searching for the next optimal point.
[0.0010499941434957431, 6, 1, 0.33718308544705317, 0.17381253387200854, 559, 3, 1]




0.7467714876516717
Iteration No: 49 ended. Search finished for the next optimal point.
Time taken: 0.7252
Function value obtained: -0.5895
Current minimum: -0.6315
Iteration No: 50 started. Searching for the next optimal point.
[0.0014981253289572797, 7, 1, 0.28904027807001825, 0.06063151338681717, 824, 5, 1]




0.8035972065077446
Iteration No: 50 ended. Search finished for the next optimal point.
Time taken: 0.8923
Function value obtained: -0.7105
Current minimum: -0.7105


In [148]:
# best parameter vector found by the search, in the same order as `space`
res.x

[0.0014981253289572797,
 7,
 1,
 0.28904027807001825,
 0.06063151338681717,
 824,
 5,
 1]

In [147]:
# objective value at res.x, i.e. the negated average precision
res.fun

-0.7105097462366898

## Logistic Regression

In [91]:
from sklearn.preprocessing import MaxAbsScaler
from scipy.sparse import csr_matrix
from sklearn.linear_model import LogisticRegression

In [167]:
# Convert the stacked feature matrices to CSR and rescale every column by its
# maximum absolute value as seen in the training data.
scaler = MaxAbsScaler()

Xtrain2 = scaler.fit_transform(csr_matrix(Xtrain))
Xtest2 = scaler.transform(csr_matrix(Xtest_plus))

In [174]:
# Logistic regression on the scaled features. C was edited by hand between
# runs (see the three result cells below); the value left here is C=10.
mdl = LogisticRegression(C=10,n_jobs=6, random_state=0)
mdl.fit(Xtrain2, y_train)

LogisticRegression(C=10, n_jobs=6, random_state=0)

In [175]:
# probability of the positive class for every row of the extended test set
p = mdl.predict_proba(Xtest2)[:, 1]

In [170]:
# C = 500
# Fit a dedicated model for this value of C so that the score printed here is
# reproducible on a fresh top-to-bottom run, instead of depending on which C
# the shared `mdl` / `p` cells were last executed with.
p_c500 = (
    LogisticRegression(C=500, n_jobs=6, random_state=0)
    .fit(Xtrain2, y_train)
    .predict_proba(Xtest2)[:, 1]
)
# average precision, then ROC AUC
print(average_precision_score(y_test_plus, p_c500),
      roc_auc_score(y_test_plus, p_c500))

0.644108514437606 0.7950723732979595


In [173]:
#C = 200
# Fit a dedicated model for this value of C so that the score printed here is
# reproducible on a fresh top-to-bottom run, instead of depending on which C
# the shared `mdl` / `p` cells were last executed with.
p_c200 = (
    LogisticRegression(C=200, n_jobs=6, random_state=0)
    .fit(Xtrain2, y_train)
    .predict_proba(Xtest2)[:, 1]
)
# average precision, then ROC AUC
print(average_precision_score(y_test_plus, p_c200),
      roc_auc_score(y_test_plus, p_c200))

0.6439463286580266 0.7957941555148064


In [176]:
# C = 10
# Fit a dedicated model for this value of C so that the score printed here is
# reproducible on a fresh top-to-bottom run, instead of depending on which C
# the shared `mdl` / `p` cells were last executed with.
p_c10 = (
    LogisticRegression(C=10, n_jobs=6, random_state=0)
    .fit(Xtrain2, y_train)
    .predict_proba(Xtest2)[:, 1]
)
# average precision, then ROC AUC
print(average_precision_score(y_test_plus, p_c10),
      roc_auc_score(y_test_plus, p_c10))

0.6352147464960527 0.7898833443876556


# Final Ensemble

## Random Forest

In [155]:
# Final random forest: same settings as the baseline, kept under its own name
# so its probabilities can be blended in the ensemble section.
mdl_random = RandomForestClassifier(n_estimators = 1000, random_state= 0, class_weight="balanced")
mdl_random.fit(Xtrain, y_train)
probability_score_random_forest = mdl_random.predict_proba(Xtest_plus)[:,1]

# average precision, then ROC AUC
print(average_precision_score(y_test_plus, probability_score_random_forest), 
      roc_auc_score(y_test_plus, probability_score_random_forest))

0.6876159471901129 0.8117611486091061


## LightGBM

In [157]:
# Hyper-parameters for the final LightGBM model, in the order used by
# tune_lgbm: lr, max_depth, min_child_samples, subsample, colsample_bytree,
# n_estimators, min_df, maximum n-gram length.
# NOTE(review): the first six values match the displayed res.x, but res.x
# ends with min_df=5 and max n-gram=1, whereas this list ends with min_df=1
# and max n-gram=3 — confirm which text settings are intended before relying
# on the saved artifacts.
params = [0.0014981253289572797, 7, 1, 0.28904027807001825, 0.06063151338681717, 824, 1, 3]

(lr, max_depth, min_child_samples, subsample,
 colsample_bytree, n_estimators, min_df) = params[:7]
ngram_range = (1, params[7])

title_train, title_test = X_split(df_animes_with_labels['title'], 0.5)

#Concat the new data to test set
X_test_plus = pd.concat([X_test, df_active_learning[['mean_score', 'num_episodes']]])
y_test_plus = pd.concat([y_test, df_active_learning['y']])
title_test_plus = pd.concat([title_test, df_active_learning['title']])

# Fit the title vectorizer once on the training titles and reuse it for both
# test variants. `word_vec` stays a notebook global because it is saved to
# disk together with the models.
word_vec = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)
word_bow_train = word_vec.fit_transform(title_train)
word_bow_test = word_vec.transform(title_test)
word_bow_test_plus = word_vec.transform(title_test_plus)

# numeric columns + title TF-IDF columns; these globals are also the inputs of
# the logistic-regression pipeline further down
Xtrain = hstack([X_train, word_bow_train])
Xtest = hstack([X_test, word_bow_test])
Xtest_plus = hstack([X_test_plus, word_bow_test_plus])

# num_leaves is tied to max_depth; bagging_freq=1 is set so that `subsample`
# is actually applied (LightGBM only bags rows when bagging_freq > 0).
mdl_lgbm = LGBMClassifier(learning_rate=lr, num_leaves=2 ** max_depth, max_depth=max_depth,
                          min_child_samples=min_child_samples, subsample=subsample,
                          colsample_bytree=colsample_bytree, bagging_freq=1,
                          n_estimators=n_estimators, random_state=0,
                          class_weight="balanced", n_jobs=6)

mdl_lgbm.fit(Xtrain, y_train)

probability_score_light = mdl_lgbm.predict_proba(Xtest_plus)[:,1]

# average precision, then ROC AUC
print(average_precision_score(y_test_plus, probability_score_light), roc_auc_score(y_test_plus, probability_score_light))

0.7105097462366898 0.8035972065077446




## Logistic Regression

In [177]:
from sklearn.pipeline import make_pipeline

In [178]:
# CSR versions of the current feature matrices; scaling is left to the
# pipeline so the saved object can be applied to unscaled features directly.
Xtrain2, Xtest2 = csr_matrix(Xtrain), csr_matrix(Xtest_plus)

lr_pipeline = make_pipeline(
    MaxAbsScaler(),
    LogisticRegression(C=500, n_jobs=6, random_state=0),
)
lr_pipeline.fit(Xtrain2, y_train)

# probability of the positive class on the extended test set
probability_score_logistic = lr_pipeline.predict_proba(Xtest2)[:, 1]

# average precision, then ROC AUC
print(average_precision_score(y_test_plus, probability_score_logistic),
      roc_auc_score(y_test_plus, probability_score_logistic))

0.644108514437606 0.7950723732979595


## Ensemble

0.6876159471901129 0.8117611486091061 RF

0.7105097462366898 0.8035972065077446 LGBM

0.644108514437606 0.7950723732979595 LR

In [102]:
# Equal-weight average of the three models' probabilities;
# prints average precision, then ROC AUC.
probability_score = (probability_score_light + probability_score_random_forest + probability_score_logistic) / 3
print(average_precision_score(y_test_plus, probability_score), roc_auc_score(y_test_plus, probability_score))

0.7132421902362809 0.8227927119503726


In [103]:
# Pairwise correlation (pandas default method) between the three models'
# predicted probabilities.
pd.DataFrame({"LR" : probability_score_logistic, "RF": probability_score_random_forest, "LGBM": probability_score_light}).corr()

Unnamed: 0,LR,RF,LGBM
LR,1.0,0.717407,0.556873
RF,0.717407,1.0,0.80585
LGBM,0.556873,0.80585,1.0


In [104]:
# Blend: 50% logistic regression + 50% LightGBM (average precision, ROC AUC).
probability_score = 0.5 * probability_score_logistic + 0.5 * probability_score_light
print(average_precision_score(y_test_plus, probability_score), roc_auc_score(y_test_plus, probability_score))

0.7088473966317359 0.8183059576294331


In [105]:
# Blend: 40% logistic regression + 60% LightGBM (average precision, ROC AUC).
probability_score = 0.4 * probability_score_logistic + 0.6 * probability_score_light
print(average_precision_score(y_test_plus, probability_score), roc_auc_score(y_test_plus, probability_score))

0.7174079743934428 0.8215442237915025


In [106]:
# Blend: 30% logistic regression + 70% LightGBM (average precision, ROC AUC).
probability_score = 0.3 * probability_score_logistic + 0.7 * probability_score_light
print(average_precision_score(y_test_plus, probability_score), roc_auc_score(y_test_plus, probability_score))

0.7231399030576191 0.8236705551870782


In [107]:
# Blend: 20% logistic regression + 80% LightGBM (average precision, ROC AUC).
probability_score = 0.2 * probability_score_logistic + 0.8 * probability_score_light
print(average_precision_score(y_test_plus, probability_score), roc_auc_score(y_test_plus, probability_score))

0.7306053772507798 0.8252701806406305


In [108]:
# Blend: 10% logistic regression + 90% LightGBM (average precision, ROC AUC).
probability_score = 0.1 * probability_score_logistic + 0.9 * probability_score_light
print(average_precision_score(y_test_plus, probability_score), roc_auc_score(y_test_plus, probability_score))

0.7405124135918438 0.8269283289766298


In [109]:
# Blend: 20% random forest + 80% LightGBM (average precision, ROC AUC).
# This is the last assignment to `probability_score` in the notebook.
probability_score = 0.2 * probability_score_random_forest + 0.8 * probability_score_light
print(average_precision_score(y_test_plus, probability_score), roc_auc_score(y_test_plus, probability_score))

0.7248197765306668 0.8210760407319262


## Save Models

In [179]:
import joblib as jb

# Persist the LightGBM model, the scaler + logistic-regression pipeline and
# the title vectorizer to compressed joblib files in the working directory.
# `word_vec` is the vectorizer fitted in the final LightGBM cell; the random
# forest is not saved.
jb.dump(mdl_lgbm, "lgbm_animes.pkl.z")
jb.dump(lr_pipeline, "Logistic_reg_animes.pkl.z")
jb.dump(word_vec, "word_vectorizer_animes.pkl.z")

['word_vectorizer_animes.pkl.z']