In [None]:
# Mount Google Drive (Colab only) so the files under /content/drive used below are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import joblib

In [None]:
# Load the pre-transformed feature matrices (NumPy arrays saved to Drive).
X_train = np.load('/content/drive/My Drive/CSVs/X_train_transformed.npy')
X_test = np.load('/content/drive/My Drive/CSVs/X_test_transformed.npy')

# Load the label tables and keep only their underlying arrays, so that
# label column i can be sliced as y[:, i] further down.
y_train = pd.read_parquet('/content/drive/My Drive/CSVs/y_train.parquet').values
y_test = pd.read_parquet('/content/drive/My Drive/CSVs/y_test.parquet').values

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Base estimator handed to the randomized searches; the fixed seed keeps forest construction repeatable.
rf = RandomForestClassifier(random_state=42)


In [None]:
# Set up the sample space
# Discrete candidate values for each RandomForestClassifier hyperparameter;
# the randomized search draws combinations from these lists.
param_dist = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [None, 10, 20, 30, 40],  # None leaves the tree depth uncapped
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]}

In [None]:
# Randomized search over param_dist: 10 sampled settings x 5-fold CV = 50 fits per call.
# No `scoring` is passed, so candidates are ranked with the estimator's own
# score method (mean accuracy for a classifier).
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=2,
)


In [None]:
# Collects one tuned random forest per label column, in label order.
best_rf_classifiers = []

In [None]:
# Conduct a random search to tune hyperparameters for each genre.
# The result list is rebuilt inside this cell so that re-running it cannot
# append a second set of models, and the number of labels is read from
# y_train instead of being hard-coded.
best_rf_classifiers = []
n_labels = y_train.shape[1]
for i in tqdm(range(n_labels)):
    # Tune against label column i only (one binary problem per genre).
    random_search.fit(X_train, y_train[:, i])
    # best_estimator_ is the winning configuration from this fit() call.
    best_rf_classifiers.append(random_search.best_estimator_)


  0%|          | 0/18 [00:00<?, ?it/s]

Fitting 5 folds for each of 10 candidates, totalling 50 fits


  6%|▌         | 1/18 [02:33<43:35, 153.84s/it]

Fitting 5 folds for each of 10 candidates, totalling 50 fits


 11%|█         | 2/18 [05:04<40:28, 151.75s/it]

Fitting 5 folds for each of 10 candidates, totalling 50 fits


 17%|█▋        | 3/18 [07:33<37:39, 150.61s/it]

Fitting 5 folds for each of 10 candidates, totalling 50 fits


 22%|██▏       | 4/18 [10:05<35:18, 151.32s/it]

Fitting 5 folds for each of 10 candidates, totalling 50 fits


 28%|██▊       | 5/18 [12:28<32:07, 148.28s/it]

Fitting 5 folds for each of 10 candidates, totalling 50 fits


 33%|███▎      | 6/18 [14:52<29:19, 146.62s/it]

Fitting 5 folds for each of 10 candidates, totalling 50 fits


 39%|███▉      | 7/18 [17:21<27:01, 147.41s/it]

Fitting 5 folds for each of 10 candidates, totalling 50 fits


 44%|████▍     | 8/18 [19:40<24:08, 144.89s/it]

Fitting 5 folds for each of 10 candidates, totalling 50 fits


 50%|█████     | 9/18 [22:04<21:41, 144.61s/it]

Fitting 5 folds for each of 10 candidates, totalling 50 fits


 56%|█████▌    | 10/18 [24:31<19:22, 145.26s/it]

Fitting 5 folds for each of 10 candidates, totalling 50 fits


 61%|██████    | 11/18 [27:00<17:04, 146.31s/it]

Fitting 5 folds for each of 10 candidates, totalling 50 fits


 67%|██████▋   | 12/18 [29:19<14:26, 144.34s/it]

Fitting 5 folds for each of 10 candidates, totalling 50 fits


 72%|███████▏  | 13/18 [31:37<11:52, 142.42s/it]

Fitting 5 folds for each of 10 candidates, totalling 50 fits


 78%|███████▊  | 14/18 [33:46<09:13, 138.28s/it]

Fitting 5 folds for each of 10 candidates, totalling 50 fits


 83%|████████▎ | 15/18 [35:58<06:48, 136.23s/it]

Fitting 5 folds for each of 10 candidates, totalling 50 fits


 89%|████████▉ | 16/18 [37:49<04:17, 128.78s/it]

Fitting 5 folds for each of 10 candidates, totalling 50 fits


 94%|█████████▍| 17/18 [40:08<02:11, 131.91s/it]

Fitting 5 folds for each of 10 candidates, totalling 50 fits


100%|██████████| 18/18 [42:02<00:00, 140.12s/it]


In [None]:
# Check best parameters for each genre
# Every search call uses random_state=42, so presumably the same 10 candidate
# settings are sampled for each genre, which may be why the winners look alike.
best_rf_classifiers


[RandomForestClassifier(bootstrap=False, max_depth=30, min_samples_leaf=2,
                        n_estimators=150, random_state=42),
 RandomForestClassifier(bootstrap=False, max_depth=30, min_samples_leaf=2,
                        n_estimators=150, random_state=42),
 RandomForestClassifier(bootstrap=False, max_depth=30, min_samples_leaf=2,
                        n_estimators=150, random_state=42),
 RandomForestClassifier(bootstrap=False, max_depth=30, min_samples_leaf=2,
                        n_estimators=150, random_state=42),
 RandomForestClassifier(bootstrap=False, max_depth=30, min_samples_leaf=2,
                        n_estimators=150, random_state=42),
 RandomForestClassifier(bootstrap=False, max_depth=30, min_samples_leaf=2,
                        n_estimators=150, random_state=42),
 RandomForestClassifier(bootstrap=False, max_depth=30, min_samples_leaf=2,
                        n_estimators=150, random_state=42),
 RandomForestClassifier(bootstrap=False, max_depth=30, 

In [None]:
# Fit each tuned forest on the full training set against its own label column.
# NOTE(review): RandomizedSearchCV refits best_estimator_ on the whole training
# data by default (refit=True), so this pass looks redundant — confirm before removing.
for label_idx, forest in enumerate(tqdm(best_rf_classifiers)):
    forest.fit(X_train, y_train[:, label_idx])

100%|██████████| 18/18 [04:00<00:00, 13.39s/it]


In [None]:
# Persist every tuned forest to Drive, one pickle file per label index.
for label_idx, forest in enumerate(best_rf_classifiers):
    model_path = f'/content/drive/My Drive/Models/tuned_rf_classifier_label_{label_idx}.pkl'
    joblib.dump(forest, model_path)

In [None]:
# Recover models upon rerunning the notebook (skips the long search above).
# joblib.load unpickles the file, so only load model files you wrote yourself.
best_rf_classifiers = [
    joblib.load(f'/content/drive/My Drive/Models/tuned_rf_classifier_label_{label_idx}.pkl')
    for label_idx in range(18)
]

In [None]:
# One prediction vector per genre model; transpose so that rows are test
# samples and columns are genres, matching the layout of y_test.
y_pred = np.array(
    [forest.predict(X_test) for forest in best_rf_classifiers]
).T

In [None]:
from sklearn.metrics import accuracy_score, hamming_loss, f1_score, classification_report

In [None]:
# Evaluate model: multilabel metrics computed over all genre columns at once.
hamming_loss_score = hamming_loss(y_test, y_pred)
print(f'Hamming Loss: {hamming_loss_score:.4f}')

# Micro-F1 pools every (sample, genre) decision; macro-F1 averages the
# per-genre F1 scores with equal weight.
f1_score_micro = f1_score(y_test, y_pred, average='micro')
print(f'F1 Score (Micro): {f1_score_micro:.4f}')

f1_score_macro = f1_score(y_test, y_pred, average='macro')
print(f'F1 Score (Macro): {f1_score_macro:.4f}')

Hamming Loss: 0.0943
F1 Score (Micro): 0.5942
F1 Score (Macro): 0.4975


In [None]:
# Reload the label table as a DataFrame: y_train was reduced to a bare array
# when the data was loaded, and the genre names (column labels) are needed for reporting.
y_train_labeled = pd.read_parquet('/content/drive/My Drive/CSVs/y_train.parquet')

In [None]:
# Per-genre precision / recall / F1 on the test set.
# zero_division=0 leaves the reported numbers unchanged (undefined ratios are
# already reported as 0 by default) but silences the UndefinedMetricWarning.
print(classification_report(y_test, y_pred, target_names=y_train_labeled.columns, zero_division=0))


                  precision    recall  f1-score   support

         Mystery       0.77      0.74      0.76      1585
        Thriller       0.74      0.70      0.72      1323
         Fantasy       0.82      0.66      0.73      1181
 Science Fiction       0.74      0.42      0.54       726
           Crime       0.68      0.42      0.52       729
    Contemporary       0.72      0.35      0.47       702
         Romance       0.76      0.38      0.51       642
        Suspense       0.77      0.23      0.35       610
     Young Adult       0.80      0.30      0.43       502
      Historical       0.79      0.38      0.51       599
          Horror       0.84      0.07      0.13       298
       Adventure       0.89      0.21      0.34       255
      Paranormal       0.81      0.08      0.14       273
         History       0.91      0.73      0.81       250
Literary Fiction       0.75      0.08      0.14       267
       Biography       0.77      0.43      0.55       210
        Class

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Try a random search with twice as many iterations, just in case.
# 20 sampled settings x 5-fold CV = 100 fits per call; unlike the first search,
# candidates are ranked by F1 on the positive class rather than by accuracy.
random_search_more = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=3,
    scoring='f1',
)


In [None]:
# Collects one forest per label column from the larger (20-iteration, F1-scored) search.
better_rf_classifiers = []

In [None]:
# Run the larger randomized search once per genre label.
# The result list is rebuilt inside this cell so that re-running it cannot
# append a second set of models, and the number of labels is read from
# y_train instead of being hard-coded.
better_rf_classifiers = []
n_labels = y_train.shape[1]
for i in tqdm(range(n_labels)):
    # Tune against label column i only (one binary problem per genre).
    random_search_more.fit(X_train, y_train[:, i])
    # best_estimator_ is the winning configuration from this fit() call.
    better_rf_classifiers.append(random_search_more.best_estimator_)


  0%|          | 0/18 [00:00<?, ?it/s]

Fitting 5 folds for each of 20 candidates, totalling 100 fits


  6%|▌         | 1/18 [04:44<1:20:34, 284.39s/it]

Fitting 5 folds for each of 20 candidates, totalling 100 fits


 11%|█         | 2/18 [09:18<1:14:14, 278.43s/it]

Fitting 5 folds for each of 20 candidates, totalling 100 fits


 17%|█▋        | 3/18 [13:47<1:08:27, 273.86s/it]

Fitting 5 folds for each of 20 candidates, totalling 100 fits


 22%|██▏       | 4/18 [18:15<1:03:24, 271.73s/it]

Fitting 5 folds for each of 20 candidates, totalling 100 fits


 28%|██▊       | 5/18 [22:28<57:23, 264.89s/it]  

Fitting 5 folds for each of 20 candidates, totalling 100 fits


 33%|███▎      | 6/18 [26:47<52:36, 263.01s/it]

Fitting 5 folds for each of 20 candidates, totalling 100 fits


 39%|███▉      | 7/18 [31:19<48:44, 265.85s/it]

Fitting 5 folds for each of 20 candidates, totalling 100 fits


 44%|████▍     | 8/18 [35:17<42:51, 257.10s/it]

Fitting 5 folds for each of 20 candidates, totalling 100 fits


 50%|█████     | 9/18 [39:24<38:03, 253.76s/it]

Fitting 5 folds for each of 20 candidates, totalling 100 fits


 56%|█████▌    | 10/18 [43:45<34:10, 256.25s/it]

Fitting 5 folds for each of 20 candidates, totalling 100 fits


 61%|██████    | 11/18 [48:13<30:17, 259.63s/it]

Fitting 5 folds for each of 20 candidates, totalling 100 fits


 67%|██████▋   | 12/18 [52:11<25:19, 253.22s/it]

Fitting 5 folds for each of 20 candidates, totalling 100 fits


 72%|███████▏  | 13/18 [56:20<20:59, 251.85s/it]

Fitting 5 folds for each of 20 candidates, totalling 100 fits


 78%|███████▊  | 14/18 [1:00:13<16:24, 246.14s/it]

Fitting 5 folds for each of 20 candidates, totalling 100 fits


 83%|████████▎ | 15/18 [1:03:54<11:56, 238.68s/it]

Fitting 5 folds for each of 20 candidates, totalling 100 fits


 89%|████████▉ | 16/18 [1:07:20<07:37, 228.88s/it]

Fitting 5 folds for each of 20 candidates, totalling 100 fits


 94%|█████████▍| 17/18 [1:11:25<03:53, 233.44s/it]

Fitting 5 folds for each of 20 candidates, totalling 100 fits


100%|██████████| 18/18 [1:14:34<00:00, 248.57s/it]


In [None]:
# Fit each forest from the larger search on the full training set.
# NOTE(review): RandomizedSearchCV refits best_estimator_ on the whole training
# data by default (refit=True), so this pass looks redundant — confirm before removing.
for label_idx, forest in enumerate(tqdm(better_rf_classifiers)):
    forest.fit(X_train, y_train[:, label_idx])

100%|██████████| 18/18 [03:58<00:00, 13.26s/it]


In [None]:
# One prediction vector per genre model; transpose so that rows are test
# samples and columns are genres, matching the layout of y_test.
y_pred = np.array(
    [forest.predict(X_test) for forest in better_rf_classifiers]
).T

In [None]:
# Score the larger-search forests with the same multilabel metrics as before.
hamming_loss_score = hamming_loss(y_test, y_pred)
print(f'Hamming Loss: {hamming_loss_score:.4f}')

# Micro-F1 pools every (sample, genre) decision; macro-F1 averages the
# per-genre F1 scores with equal weight.
f1_score_micro = f1_score(y_test, y_pred, average='micro')
print(f'F1 Score (Micro): {f1_score_micro:.4f}')

f1_score_macro = f1_score(y_test, y_pred, average='macro')
print(f'F1 Score (Macro): {f1_score_macro:.4f}')

Hamming Loss: 0.0944
F1 Score (Micro): 0.5941
F1 Score (Macro): 0.4971


In [None]:
# This actually ended up doing almost exactly the same.
# zero_division=0 leaves the reported numbers unchanged (undefined ratios are
# already reported as 0 by default) but silences the UndefinedMetricWarning.
print(classification_report(y_test, y_pred, target_names=y_train_labeled.columns, zero_division=0))


                  precision    recall  f1-score   support

         Mystery       0.77      0.74      0.76      1585
        Thriller       0.74      0.69      0.72      1323
         Fantasy       0.82      0.66      0.73      1181
 Science Fiction       0.74      0.42      0.54       726
           Crime       0.68      0.42      0.52       729
    Contemporary       0.72      0.35      0.47       702
         Romance       0.77      0.38      0.51       642
        Suspense       0.76      0.23      0.35       610
     Young Adult       0.80      0.30      0.43       502
      Historical       0.79      0.38      0.51       599
          Horror       0.84      0.07      0.13       298
       Adventure       0.90      0.20      0.33       255
      Paranormal       0.84      0.08      0.14       273
         History       0.92      0.74      0.82       250
Literary Fiction       0.78      0.07      0.12       267
       Biography       0.74      0.45      0.56       210
        Class

  _warn_prf(average, modifier, msg_start, len(result))


Let's try an XGBoost model.

In [None]:
# Set up the sample space
# Discrete candidate values for each XGBoost hyperparameter; the randomized
# search draws combinations from these lists.
xgb_param_dist = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5, 6],
    'min_child_weight': [1, 2, 3],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.6, 0.7, 0.8],
    'colsample_bytree': [0.6, 0.7, 0.8],
    # NOTE(review): 'lambda' / 'alpha' look like XGBoost's native names for the
    # L2 / L1 regularisation weights — confirm the sklearn wrapper accepts them.
    'lambda': [0, 1, 2],
    'alpha': [0, 1, 2],
    'scale_pos_weight': [1, 2, 3],
    # NOTE(review): presumably eval_metric only matters when an eval_set is
    # supplied at fit time — confirm it is worth a search dimension.
    'eval_metric': ['logloss', 'auc'],
    'tree_method': ['auto', 'exact', 'approx', 'hist'],
    # NOTE(review): check that every tree_method above honours grow_policy.
    'grow_policy': ['depthwise', 'lossguide'],
}

In [None]:
from xgboost import XGBClassifier


In [None]:
# Base XGBoost estimator handed to the randomized search; seeded for repeatability.
xgb = XGBClassifier(random_state=42)


In [None]:
# Randomized search over xgb_param_dist: 50 sampled settings x 5-fold CV = 250
# fits per call, ranked by F1 on the positive class, using all available cores.
xgb_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=xgb_param_dist,
    n_iter=50,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
    scoring='f1',
)

In [None]:
# Collects one tuned XGBoost model per label column, in label order.
best_xgb_classifiers = []

In [None]:
# Find best xgb model for each label.
# The result list is rebuilt inside this cell so that re-running it cannot
# append a second set of models, and the number of labels is read from
# y_train instead of being hard-coded.
best_xgb_classifiers = []
n_labels = y_train.shape[1]
for i in tqdm(range(n_labels)):
    # Tune against label column i only (one binary problem per genre).
    xgb_search.fit(X_train, y_train[:, i])
    # best_estimator_ is the winning configuration from this fit() call.
    best_xgb_classifiers.append(xgb_search.best_estimator_)

  0%|          | 0/18 [00:00<?, ?it/s]

Fitting 5 folds for each of 50 candidates, totalling 250 fits


  6%|▌         | 1/18 [02:36<44:15, 156.18s/it]

Fitting 5 folds for each of 50 candidates, totalling 250 fits


 11%|█         | 2/18 [05:11<41:26, 155.40s/it]

Fitting 5 folds for each of 50 candidates, totalling 250 fits


 17%|█▋        | 3/18 [07:49<39:14, 156.99s/it]

Fitting 5 folds for each of 50 candidates, totalling 250 fits


 22%|██▏       | 4/18 [10:22<36:15, 155.39s/it]

Fitting 5 folds for each of 50 candidates, totalling 250 fits


 28%|██▊       | 5/18 [12:54<33:23, 154.11s/it]

Fitting 5 folds for each of 50 candidates, totalling 250 fits


 33%|███▎      | 6/18 [15:26<30:41, 153.44s/it]

Fitting 5 folds for each of 50 candidates, totalling 250 fits


 39%|███▉      | 7/18 [17:57<27:58, 152.60s/it]

Fitting 5 folds for each of 50 candidates, totalling 250 fits


 44%|████▍     | 8/18 [20:31<25:29, 152.90s/it]

Fitting 5 folds for each of 50 candidates, totalling 250 fits


 50%|█████     | 9/18 [23:01<22:48, 152.10s/it]

Fitting 5 folds for each of 50 candidates, totalling 250 fits


 56%|█████▌    | 10/18 [25:35<20:20, 152.58s/it]

Fitting 5 folds for each of 50 candidates, totalling 250 fits


 61%|██████    | 11/18 [28:05<17:43, 151.96s/it]

Fitting 5 folds for each of 50 candidates, totalling 250 fits


 67%|██████▋   | 12/18 [30:39<15:14, 152.39s/it]

Fitting 5 folds for each of 50 candidates, totalling 250 fits


 72%|███████▏  | 13/18 [33:08<12:36, 151.40s/it]

Fitting 5 folds for each of 50 candidates, totalling 250 fits


 78%|███████▊  | 14/18 [35:26<09:50, 147.52s/it]

Fitting 5 folds for each of 50 candidates, totalling 250 fits


 83%|████████▎ | 15/18 [37:55<07:23, 147.71s/it]

Fitting 5 folds for each of 50 candidates, totalling 250 fits


 89%|████████▉ | 16/18 [40:18<04:52, 146.48s/it]

Fitting 5 folds for each of 50 candidates, totalling 250 fits


 94%|█████████▍| 17/18 [42:40<02:25, 145.00s/it]

Fitting 5 folds for each of 50 candidates, totalling 250 fits


100%|██████████| 18/18 [45:02<00:00, 150.13s/it]


In [None]:
# Persist every tuned XGBoost model to Drive, one pickle file per label index.
for label_idx, booster in enumerate(best_xgb_classifiers):
    model_path = f'/content/drive/My Drive/Models/tuned_xgb_classifier_label_{label_idx}.pkl'
    joblib.dump(booster, model_path)

In [None]:
# Recover classifiers upon rerunning notebook (skips the long search above).
# joblib.load unpickles the file, so only load model files you wrote yourself.
best_xgb_classifiers = [
    joblib.load(f'/content/drive/My Drive/Models/tuned_xgb_classifier_label_{label_idx}.pkl')
    for label_idx in range(18)
]

In [None]:
# One prediction vector per genre model; transpose so that rows are test
# samples and columns are genres, matching the layout of y_test.
y_pred = np.array(
    [booster.predict(X_test) for booster in best_xgb_classifiers]
).T


In [None]:
# Evaluate xgb predictions with the same multilabel metrics used for the forests.
hamming_loss_score = hamming_loss(y_test, y_pred)
print(f'Hamming Loss: {hamming_loss_score:.4f}')

# Micro-F1 pools every (sample, genre) decision; macro-F1 averages the
# per-genre F1 scores with equal weight.
f1_score_micro = f1_score(y_test, y_pred, average='micro')
print(f'F1 Score (Micro): {f1_score_micro:.4f}')

f1_score_macro = f1_score(y_test, y_pred, average='macro')
print(f'F1 Score (Macro): {f1_score_macro:.4f}')

Hamming Loss: 0.1064
F1 Score (Micro): 0.6523
F1 Score (Macro): 0.6047


In [None]:
# Per-genre precision / recall / F1 of the XGBoost models on the test set.
# zero_division=0 leaves the reported numbers unchanged (undefined ratios are
# already reported as 0 by default) but silences the UndefinedMetricWarning.
print(classification_report(y_test, y_pred, target_names=y_train_labeled.columns, zero_division=0))


                  precision    recall  f1-score   support

         Mystery       0.70      0.88      0.78      1585
        Thriller       0.66      0.85      0.74      1323
         Fantasy       0.72      0.76      0.74      1181
 Science Fiction       0.56      0.68      0.61       726
           Crime       0.52      0.73      0.61       729
    Contemporary       0.56      0.65      0.60       702
         Romance       0.59      0.66      0.62       642
        Suspense       0.48      0.66      0.56       610
     Young Adult       0.61      0.55      0.58       502
      Historical       0.56      0.61      0.58       599
          Horror       0.36      0.32      0.34       298
       Adventure       0.46      0.36      0.40       255
      Paranormal       0.45      0.27      0.34       273
         History       0.84      0.81      0.83       250
Literary Fiction       0.48      0.40      0.43       267
       Biography       0.60      0.68      0.64       210
        Class

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Load the untransformed test table; the cells below use only its column names, to label model features.
X_test_labels = pd.read_parquet('/content/drive/My Drive/CSVs/X_test.parquet')



In [None]:
# Find feature names
# Drop the identifier / genre columns so that only candidate feature columns remain.
# Reassigning (instead of inplace=True) together with errors='ignore' lets the
# cell be re-run without a KeyError once the columns are already gone.
X_test_labels = X_test_labels.drop(
    columns=['level_0', 'book title', 'book author', 'book genres', 'num genres'],
    errors='ignore',
)
X_test_labels.columns



Index(['total words', 'vividness score', 'passive voice', 'all adverbs',
       'ly-adverbs', 'non-ly-adverbs', 'publication year', '0', '00', '000',
       ...
       'raven-black', 'shirt-sleeves', 'shit-ton', 'ther', 'tightbeam',
       'timepieces', 'vendettas', 'volatiles', 'warbands', 'whimsically'],
      dtype='object', length=66194)

In [None]:
# Feature names for the fitted models: the first 7 columns of X_test_labels
# followed by all 10 features from the Sparse PCA.
# Presumably this matches the column order of the transformed matrices — TODO confirm.
clf_labels = [
    *X_test_labels.columns[:7],
    *(f'pca_feature_{i}' for i in range(10)),
]

In [None]:
# Create dictionary of feature importance: genre name -> [(feature name, importance), ...]
xgb_features = {}
for i, clf in enumerate(best_xgb_classifiers):
    importances = clf.feature_importances_
    # zip() silently truncates to the shorter input, which would drop or
    # mislabel features without any error, so check the lengths explicitly.
    if len(importances) != len(clf_labels):
        raise ValueError(
            f'{len(clf_labels)} feature names but model {i} reports '
            f'{len(importances)} importances'
        )
    xgb_features[y_train_labeled.columns[i]] = list(zip(clf_labels, importances))




In [None]:
# Sort features by importance to each genre, most important first.
xgb_features = {
    genre: sorted(pairs, key=lambda pair: pair[1], reverse=True)
    for genre, pairs in xgb_features.items()
}

In [None]:
# View top 4 features for each genre (the first four entries of each list).
for genre, ranked in xgb_features.items():
    print(genre)
    print(ranked[:4])

Mystery
[('pca_feature_7', 0.18848614), ('pca_feature_4', 0.08687109), ('pca_feature_9', 0.067120425), ('passive voice', 0.06335263)]
Thriller
[('pca_feature_7', 0.23364581), ('total words', 0.076511696), ('pca_feature_9', 0.061683957), ('passive voice', 0.060265798)]
Fantasy
[('pca_feature_5', 0.15867022), ('pca_feature_7', 0.14312406), ('pca_feature_0', 0.06924042), ('pca_feature_9', 0.068620086)]
Science Fiction
[('pca_feature_4', 0.15463465), ('pca_feature_9', 0.09518076), ('pca_feature_5', 0.08730194), ('pca_feature_7', 0.07835112)]
Crime
[('pca_feature_7', 0.22122678), ('pca_feature_5', 0.0793049), ('pca_feature_9', 0.072958335), ('pca_feature_4', 0.071404755)]
Contemporary
[('pca_feature_4', 0.13903148), ('pca_feature_9', 0.12173863), ('pca_feature_5', 0.07401706), ('pca_feature_7', 0.06909184)]
Romance
[('pca_feature_0', 0.13201624), ('pca_feature_5', 0.0956254), ('pca_feature_7', 0.08982267), ('pca_feature_9', 0.07264135)]
Suspense
[('pca_feature_7', 0.22614163), ('pca_feature

In [None]:
# Display significant non-pca features: importance above 0.04 and not a PCA component.
for genre, ranked in xgb_features.items():
    print(genre)
    print([
        pair for pair in ranked
        if 'pca_feature' not in pair[0] and pair[1] > 0.04
    ])

Mystery
[('passive voice', 0.06335263), ('total words', 0.0588639), ('publication year', 0.04775864)]
Thriller
[('total words', 0.076511696), ('passive voice', 0.060265798), ('publication year', 0.04578491), ('vividness score', 0.042795822)]
Fantasy
[('total words', 0.04780043), ('vividness score', 0.04647772), ('publication year', 0.041930426), ('passive voice', 0.040479884)]
Science Fiction
[('publication year', 0.055750623), ('total words', 0.047980454), ('passive voice', 0.04487671), ('vividness score', 0.040183436)]
Crime
[('passive voice', 0.060581055), ('total words', 0.051159676), ('publication year', 0.049579803), ('vividness score', 0.04702891)]
Contemporary
[('publication year', 0.06122225), ('total words', 0.04342679), ('vividness score', 0.042195603)]
Romance
[('publication year', 0.056198068), ('passive voice', 0.055783037), ('total words', 0.04696046)]
Suspense
[('passive voice', 0.06521782), ('total words', 0.057953335), ('publication year', 0.05448545), ('vividness sco

In [None]:
# Create dictionary of random forest features: genre name -> [(feature name, importance), ...]
rf_features = {}
for i, clf in enumerate(best_rf_classifiers):
    importances = clf.feature_importances_
    # zip() silently truncates to the shorter input, which would drop or
    # mislabel features without any error, so check the lengths explicitly.
    if len(importances) != len(clf_labels):
        raise ValueError(
            f'{len(clf_labels)} feature names but model {i} reports '
            f'{len(importances)} importances'
        )
    rf_features[y_train_labeled.columns[i]] = list(zip(clf_labels, importances))



In [None]:
# Sort features for each genre by importance, most important first.
rf_features = {
    genre: sorted(pairs, key=lambda pair: pair[1], reverse=True)
    for genre, pairs in rf_features.items()
}


In [None]:
# View the 4 most important features for each genre (the first four entries of each list).
for genre, ranked in rf_features.items():
    print(genre)
    print(ranked[:4])


Mystery
[('pca_feature_7', 0.2252672562628819), ('pca_feature_9', 0.0740765412054647), ('pca_feature_4', 0.06861775491574379), ('pca_feature_5', 0.06480080811547818)]
Thriller
[('pca_feature_7', 0.27467305753462573), ('pca_feature_9', 0.06340381564611969), ('pca_feature_2', 0.0621583366380735), ('passive voice', 0.05554134281722892)]
Fantasy
[('pca_feature_7', 0.16352102654867606), ('pca_feature_5', 0.140665249121287), ('pca_feature_6', 0.08727037216413368), ('pca_feature_9', 0.06746429772665036)]
Science Fiction
[('pca_feature_4', 0.1283620578911938), ('pca_feature_9', 0.10328627478880027), ('pca_feature_7', 0.08378203636599886), ('pca_feature_5', 0.07707449603980471)]
Crime
[('pca_feature_7', 0.20065229886575736), ('pca_feature_9', 0.07274523395062774), ('pca_feature_4', 0.06887366228273432), ('pca_feature_5', 0.0641496165754133)]
Contemporary
[('pca_feature_9', 0.12282304936872032), ('pca_feature_4', 0.10569257037355202), ('pca_feature_5', 0.07862165033687975), ('pca_feature_8', 0.0

In [None]:
# View significant non-PCA features for each genre: importance above 0.04 and not a PCA component.
for genre, ranked in rf_features.items():
    print(genre)
    print([
        pair for pair in ranked
        if 'pca_feature' not in pair[0] and pair[1] > 0.04
    ])


Mystery
[('passive voice', 0.06242602816378059), ('total words', 0.05269244731088281), ('vividness score', 0.04010153286083108)]
Thriller
[('passive voice', 0.05554134281722892), ('total words', 0.05098745243895519), ('vividness score', 0.04179879139448148)]
Fantasy
[('vividness score', 0.046517993467687674), ('total words', 0.043335977327791514)]
Science Fiction
[('total words', 0.04884242066806887), ('publication year', 0.0447790176360953), ('passive voice', 0.04394901315666022), ('vividness score', 0.04106312387915402)]
Crime
[('passive voice', 0.05963816219514937), ('vividness score', 0.05038681086303353), ('total words', 0.049447266484862556)]
Contemporary
[('total words', 0.04894787385245639), ('vividness score', 0.04620504259312333), ('passive voice', 0.044824170378826744), ('publication year', 0.04122134055432999), ('ly-adverbs', 0.04062077391247297)]
Romance
[('total words', 0.048591349400430354), ('passive voice', 0.04832629963712024), ('vividness score', 0.043161030645143116

In [None]:
# Per genre: names of the non-PCA features whose random-forest importance exceeds 0.04.
rf_significant = {
    genre: [
        name for name, importance in ranked
        if 'pca_feature' not in name and importance > 0.04
    ]
    for genre, ranked in rf_features.items()
}

In [None]:
# Per genre: names of the non-PCA features whose XGBoost importance exceeds 0.04.
xgb_significant = {
    genre: [
        name for name, importance in ranked
        if 'pca_feature' not in name and importance > 0.04
    ]
    for genre, ranked in xgb_features.items()
}

In [None]:
# Compare significant features between models, genre by genre.
for genre, rf_names in rf_significant.items():
    print(genre)
    print("Random forest significant features: ", rf_names)
    print("XGB significant features: ", xgb_significant[genre])
    print()



Mystery
Random forest significant features:  ['passive voice', 'total words', 'vividness score']
XGB significant features:  ['passive voice', 'total words', 'publication year']

Thriller
Random forest significant features:  ['passive voice', 'total words', 'vividness score']
XGB significant features:  ['total words', 'passive voice', 'publication year', 'vividness score']

Fantasy
Random forest significant features:  ['vividness score', 'total words']
XGB significant features:  ['total words', 'vividness score', 'publication year', 'passive voice']

Science Fiction
Random forest significant features:  ['total words', 'publication year', 'passive voice', 'vividness score']
XGB significant features:  ['publication year', 'total words', 'passive voice', 'vividness score']

Crime
Random forest significant features:  ['passive voice', 'vividness score', 'total words']
XGB significant features:  ['passive voice', 'total words', 'publication year', 'vividness score']

Contemporary
Random fore

In [None]:
# Load the untransformed training table, used below to compare raw feature values across genres.
X_train_original = pd.read_parquet('/content/drive/My Drive/CSVs/X_train.parquet')


In [None]:
# Preview the columns at positions 1-11 of the untransformed training table.
[X_train_original.columns[1:12]]

[Index(['book title', 'book author', 'total words', 'vividness score',
        'passive voice', 'all adverbs', 'ly-adverbs', 'non-ly-adverbs',
        'publication year', 'book genres', 'num genres'],
       dtype='object')]

In [None]:
# Keep only the book metadata / style columns used below. Selecting them by
# name rather than by position (columns[1:12]) makes the cell safe to re-run:
# a positional slice of the already-trimmed frame would drop another column.
METADATA_COLUMNS = [
    'book title', 'book author', 'total words', 'vividness score',
    'passive voice', 'all adverbs', 'ly-adverbs', 'non-ly-adverbs',
    'publication year', 'book genres', 'num genres',
]
X_train_original = X_train_original[METADATA_COLUMNS]

In [None]:
# Overall mean and standard deviation of each column in columns[2:-2]
# (the slice skips the two leading and the two trailing columns).
averages = {
    name: X_train_original[name].mean()
    for name in X_train_original.columns[2:-2]
}
stds = {
    name: X_train_original[name].std()
    for name in X_train_original.columns[2:-2]
}



In [None]:
# Check the xgb model's significant features for each genre, to see how far from average they are.
for genre in xgb_significant.keys():
    # Use every book tagged with this genre; keeping only .head() would
    # average just the first five matching books.
    # NOTE(review): `in` is a substring test if 'book genres' holds strings — confirm the column type.
    in_genre = X_train_original['book genres'].apply(lambda genres: genre in genres)
    genre_df = X_train_original[in_genre]
    for feature in xgb_significant[genre]:
        mean = genre_df[feature].mean()
        # z = distance of the genre mean from the overall mean, in overall standard deviations.
        z = round((mean - averages[feature]) / stds[feature], 2)
        print(
            f"Average {feature} for {genre}: {round(mean,2)} vs "
            f"{round(averages[feature],2)} for all books (z = {z})"
        )
    print()




Average passive voice for Mystery: 8.07 vs 8.03 for all books (z = 0.03)
Average total words for Mystery: 84896.4 vs 92713.29 for all books (z = -0.18)
Average publication year for Mystery: 2008.6 vs 2010.13 for all books (z = -0.04)

Average total words for Thriller: 70799.0 vs 92713.29 for all books (z = -0.5)
Average passive voice for Thriller: 8.07 vs 8.03 for all books (z = 0.03)
Average publication year for Thriller: 2002.6 vs 2010.13 for all books (z = -0.19)
Average vividness score for Thriller: 55.11 vs 47.7 for all books (z = 0.61)

Average total words for Fantasy: 135011.6 vs 92713.29 for all books (z = 0.96)
Average vividness score for Fantasy: 58.22 vs 47.7 for all books (z = 0.87)
Average publication year for Fantasy: 2014.8 vs 2010.13 for all books (z = 0.12)
Average passive voice for Fantasy: 7.17 vs 8.03 for all books (z = -0.68)

Average publication year for Science Fiction: 2018.2 vs 2010.13 for all books (z = 0.2)
Average total words for Science Fiction: 111997.0 vs