# Final Models

In this notebook we train our final models on the combination of training and validation data and then evaluate them on the testing data.

In [1]:
import os
import random
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import auc, roc_auc_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

random.seed(42)
np.random.seed(42)

### Loading Data

In [3]:
OUTPUT_DATA_DIR = "./output_data/"

train_df_processed = pd.read_csv(OUTPUT_DATA_DIR+"text_processed_training.csv")
val_df_processed = pd.read_csv(OUTPUT_DATA_DIR+"text_processed_validation.csv")
test_df = pd.read_csv(OUTPUT_DATA_DIR+"text_processed_testing.csv")

train_df = pd.concat([train_df_processed, val_df_processed], axis=0)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


### Baseline Models

We first fit two baseline models: Logistic Regression and Random Forest. These are fit only to the user and book level features, without using any text features or the features from chainRec

In [9]:
columns_to_keep = ['text_reviews_count', 'is_ebook', 'average_rating', 'num_pages',
                   'ratings_count', 'is_translated', 'is_in_series', 'series_length',
                   'is_paperback', 'is_hardcover', 'is_audio', 'from_penguin',
                   'from_harpercollins', 'from_university_press', 'from_vintage',
                   'from_createspace', 'author_a', 'author_b', 'author_c', 'publication_year',
                   'author_d', 'author_e', 'author_f', 'book_shelved_count', 'title_len',
                   'shelved_count', 'read_count', 'rated_count', 'recommended_count']

y_train = train_df['recommended']
y_test = test_df['recommended']

X_train = train_df[columns_to_keep]
X_test = test_df[columns_to_keep]

##### Logistic Regression

In [10]:
min_max_scaler = MinMaxScaler()

X_train_reg = min_max_scaler.fit_transform(X_train)
X_test_reg = min_max_scaler.transform(X_test)

In [6]:
def print_classification_stats(train_preds, train_actual, test_preds, test_actual):
    """Prints the classification statistics for train and test data.
    
    Parameters
    ----------
    train_preds: np.array
        An array of predictions on the training set.
    train_actual: np.array
        An array of the target values for the training set.
    test_preds: np.array
        An array of predictions on the test set.
    test_actual: np.array
        An array of the target values for the test set.
    
    Returns
    -------
    None
    
    """
    print("Training")
    print("--------")
    print(classification_report(train_actual, train_preds))
    print("AUC: {}".format(roc_auc_score(train_actual, train_preds)))
    print()
    
    print("Testing")
    print("--------")
    print(classification_report(test_actual, test_preds))
    print("AUC: {}".format(roc_auc_score(test_actual, test_preds)))
    print()

In [11]:
reg_model = LogisticRegression(max_iter=10000)
reg_model.fit(X_train_reg, y_train)

reg_train_preds = reg_model.predict(X_train_reg)
reg_test_preds = reg_model.predict(X_test_reg)

print_classification_stats(reg_train_preds, y_train, reg_test_preds, y_test)

Training
--------
              precision    recall  f1-score   support

           0       0.70      0.60      0.64    128756
           1       0.71      0.80      0.75    163519

    accuracy                           0.71    292275
   macro avg       0.71      0.70      0.70    292275
weighted avg       0.71      0.71      0.70    292275

AUC: 0.6957666147229499

Testing
--------
              precision    recall  f1-score   support

           0       0.68      0.47      0.56     17725
           1       0.58      0.77      0.66     16905

    accuracy                           0.62     34630
   macro avg       0.63      0.62      0.61     34630
weighted avg       0.63      0.62      0.61     34630

AUC: 0.6216057041569311



In [13]:
reg_df = pd.DataFrame({'feature': columns_to_keep,
                       'regression_coefficient': reg_model.coef_[0]})

reg_df.head(len(columns_to_keep))

Unnamed: 0,feature,regression_coefficient
0,text_reviews_count,2.293108
1,is_ebook,0.19224
2,average_rating,1.368347
3,num_pages,0.183483
4,ratings_count,-0.588185
5,is_translated,-0.090949
6,is_in_series,-0.033236
7,series_length,0.210957
8,is_paperback,-0.094825
9,is_hardcover,-0.069245


##### Random Forest

In [12]:
rf_model = RandomForestClassifier(n_estimators=1000, max_depth=15)
rf_model.fit(X_train, y_train)

rf_train_preds = rf_model.predict(X_train)
rf_test_preds = rf_model.predict(X_test)

print_classification_stats(rf_train_preds, y_train, rf_test_preds, y_test)

Training
--------
              precision    recall  f1-score   support

           0       0.75      0.78      0.76    128756
           1       0.82      0.80      0.81    163519

    accuracy                           0.79    292275
   macro avg       0.79      0.79      0.79    292275
weighted avg       0.79      0.79      0.79    292275

AUC: 0.7876575727462174

Testing
--------
              precision    recall  f1-score   support

           0       0.66      0.70      0.68     17725
           1       0.66      0.62      0.64     16905

    accuracy                           0.66     34630
   macro avg       0.66      0.66      0.66     34630
weighted avg       0.66      0.66      0.66     34630

AUC: 0.6593787651811812



In [14]:
MAPPING_DIR = './mappings/'
cols_to_keep = ['user_number', 'item_number', 's1', 's2', 's3', 's4']

s_values_df = pd.read_csv(MAPPING_DIR+"goodreads_s_values_uniform.csv")[cols_to_keep]

In [15]:
s_values_df['user_number'] = s_values_df['user_number'].apply(lambda x: str(x))
s_values_df['item_number'] = s_values_df['item_number'].apply(lambda x: str(x))
s_values_df['user_item_id'] = s_values_df['user_number'] + "-" + s_values_df['item_number']

In [16]:
user_map = pd.read_csv(MAPPING_DIR+"user_map.csv")
book_map = pd.read_csv(MAPPING_DIR+"book_map.csv")

In [17]:
book_map['book_id'] = book_map['book_id'].apply(lambda x: str(x))

In [18]:
def create_user_item_id(data_df, u_map, i_map):
    """Creates a user-item ID for the records in `data_df`.

    The user-item ID is created from `u_map` and `i_map`.
    Both mappings, map text user IDs to numeric user IDs
    and these numeric user IDs are combined to form the
    user-item ID.

    Parameters
    ----------
    data_df: pd.DataFrame
        The DataFrame for which the user-item ID is created.
    u_map: pd.DataFrame
        A DataFrame containing a mapping from a text user ID to
        a numeric user ID.
    i_map: pd.DataFrame
        A DataFrame containing a mapping from a text item ID to
        a numeric item ID.

    Returns
    -------
    pd.DataFrame
        The DataFrame obtained from `data_df` after adding a
        user-item ID field based on `u_map` and `i_map`.

    """
    data_df['book_id'] = data_df['book_id'].apply(lambda x: str(x))
    data_df = pd.merge(data_df, u_map, how="left", on=["user_id"])
    data_df = pd.merge(data_df, i_map, how="left", on=["book_id"])
    data_df['user_number'] = data_df['user_number'].apply(lambda x: str(x))
    data_df['book_number'] = data_df['book_number'].apply(lambda x: str(x))
    data_df['user_item_id'] = data_df['user_number'] + "-" + data_df['book_number']
    return data_df.drop(columns=['user_number', 'book_number'])

In [20]:
train_df = create_user_item_id(train_df, user_map, book_map)
test_df = create_user_item_id(test_df, user_map, book_map)

In [21]:
s_values_df.drop(columns=['user_number', 'item_number'], inplace=True)

train_df_s = pd.merge(train_df, s_values_df, how='left', on=['user_item_id'])
test_df_s = pd.merge(test_df, s_values_df, how='left', on=['user_item_id'])

train_df_s.drop(columns=['user_item_id'], inplace=True)
test_df_s.drop(columns=['user_item_id'], inplace=True)

In [22]:
feature_columns = ['text_reviews_count', 'is_ebook', 'average_rating', 'num_pages',
                   'ratings_count', 'is_translated', 'is_in_series', 'series_length',
                   'is_paperback', 'is_hardcover', 'is_audio', 'from_penguin',
                   'from_harpercollins', 'from_university_press', 'from_vintage',
                   'from_createspace', 'publication_year', 'author_a', 'author_b',
                   'author_c', 'author_d', 'author_e', 'author_f',
                   'book_shelved_count', 'shelved_count', 'read_count', 'rated_count',
                   'recommended_count', 'title_len', 's1', 's2', 's3', 's4']
X_train = train_df_s[feature_columns]
X_test = test_df_s[feature_columns]

In [23]:
def log_transform_columns(data_df, cols):
    """Applies a log transform to `cols` in `data_df`.

    Parameters
    ----------
    data_df: pd.DataFrame
        The DataFrame in which the columns will be transformed.
    cols: collection
        The columns in `data_df` to be log scaled.

    Returns
    -------
    pd.DataFrame
        The DataFrame obtained from `data_df` after log scaling
        the columns `cols`.

    """
    for col in cols:
        data_df[col] = data_df[col].apply(lambda x: np.log(x) if x > 0 else 0)
    return data_df

In [24]:
log_transform_cols = ['text_reviews_count', 'ratings_count', 'shelved_count', 
                      'read_count', 'rated_count', 'recommended_count', 'book_shelved_count']
X_train = log_transform_columns(X_train, log_transform_cols)
X_test = log_transform_columns(X_test, log_transform_cols)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_df[col] = data_df[col].apply(lambda x: np.log(x) if x > 0 else 0)


In [25]:
def sigmoid(val):
    """Applies the sigmoid function to `val`.

    The sigmoid function has the form
    f(x) = 1 / (1 + exp(-x))

    Parameters
    ----------
    val: float
        The operand to the sigmoid function.

    Returns
    -------
    float
        The result of applying the sigmoid
        function to `val`.

    """
    return 1 / (1 + np.exp(-val))

In [26]:
def scale_s_values(data_df):
    """Applies the sigmoid function to the s values in `data_df`.

    Parameters
    ---------
    data_df: pd.DataFrame
        The DataFrame for which the operation is performed.

    Returns
    -------
    pd.DataFrame
        The DataFrame that results from scaling the s values
        in `data_df`.

    """
    for s_col in ["s1", "s2", "s3", "s4"]:
        data_df[s_col] = data_df[s_col].apply(lambda x: sigmoid(x))
    return data_df

In [27]:
X_train = scale_s_values(X_train)
X_test = scale_s_values(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_df[s_col] = data_df[s_col].apply(lambda x: sigmoid(x))


In [28]:
min_max_scaler = MinMaxScaler()

X_train_reg = min_max_scaler.fit_transform(X_train)
X_test_reg = min_max_scaler.transform(X_test)

In [29]:
book_df = pd.concat([train_df_s, test_df_s], axis=0)[['book_id', 'cleaned_text']]
book_df = book_df.drop_duplicates(subset=['book_id'])

book_df['cleaned_text'] = book_df['cleaned_text'].apply(lambda x: "" if pd.isnull(x) else x)

w2v = Word2Vec(list(book_df['cleaned_text']), size=200, window=10, min_count=1)

In [30]:
def create_book_vector(book_text, vec_length):
    """Creates a vector for the book given by `book_text`.

    The word vectors for each word in `book_text` are
    averaged to build a vector for the book.

    Parameters
    ----------
    book_text: str
        The book text for which the vector is generated.

    Returns
    -------
    vector
        A vector for the book.

    """
    text_vecs = [word for word in str(book_text) if word in w2v.wv.vocab]
    if len(text_vecs) > 0:
        return np.mean(w2v[text_vecs], axis=0)
    return np.zeros(vec_length)

In [31]:
book_df['book_vector'] = book_df['cleaned_text'].apply(lambda x: create_book_vector(x, 200))

  return np.mean(w2v[text_vecs], axis=0)


In [32]:
train_df_s = pd.merge(train_df_s, book_df, how='left', on=['book_id'])
test_df_s = pd.merge(test_df_s, book_df, how='left', on=['book_id'])

In [35]:
def create_book_vec_df(book_vecs, indices):
    """Creates a dataframe from `book_vecs`.

    Each numpy array in `book_vecs` is converted to a
    row in the resulting dataframe.

    Parameters
    ----------
    book_vecs: list
        A list of numpy arrays where each array corresponds
        to the book vector for a book.
    indicies: np.array
        A numpy array of indices for the DataFrame

    Returns
    -------
    pd.DataFrame
        The DataFrame obtained from converting `review_vecs`
        to a dataframe.

    """
    book_vec_df = pd.DataFrame(np.vstack(book_vecs))
    book_vec_df.columns = ["word" + str(col) for col in book_vec_df.columns]
    book_vec_df.index = indices
    return book_vec_df

In [36]:
train_wv = create_book_vec_df(train_df_s['book_vector'], train_df_s.index)
test_wv = create_book_vec_df(test_df_s['book_vector'], test_df_s.index)

In [37]:
X_train_wv_reg = pd.concat([train_wv, pd.DataFrame(np.vstack(X_train_reg))], axis=1)
X_test_wv_reg = pd.concat([test_wv, pd.DataFrame(np.vstack(X_test_reg))], axis=1)

In [38]:
reg_model = LogisticRegression(max_iter=10000)
reg_model.fit(X_train_wv_reg, y_train)

reg_train_preds = reg_model.predict(X_train_wv_reg)
reg_test_preds = reg_model.predict(X_test_wv_reg)

print_classification_stats(reg_train_preds, y_train, reg_test_preds, y_test)

Training
--------
              precision    recall  f1-score   support

           0       0.76      0.63      0.69    128756
           1       0.74      0.84      0.79    163519

    accuracy                           0.75    292275
   macro avg       0.75      0.74      0.74    292275
weighted avg       0.75      0.75      0.74    292275

AUC: 0.7359693904578151

Testing
--------
              precision    recall  f1-score   support

           0       0.71      0.45      0.55     17725
           1       0.58      0.81      0.68     16905

    accuracy                           0.62     34630
   macro avg       0.65      0.63      0.61     34630
weighted avg       0.65      0.62      0.61     34630

AUC: 0.6277156131355467



In [45]:
print(recall_score(y_test, reg_test_preds))

0.8091688849452825


In [40]:
X_train_wv = pd.concat([train_wv, X_train], axis=1)
X_test_wv = pd.concat([test_wv, X_test], axis=1)

In [41]:
xgb_model = XGBClassifier(
    objective='binary:logistic', learning_rate=0.1,
    n_estimators=2000, max_depth=2)
xgb_model.fit(X_train_wv, y_train)

xgb_train_preds = xgb_model.predict(X_train_wv)
xgb_test_preds = xgb_model.predict(X_test_wv)

print_classification_stats(xgb_train_preds, y_train, xgb_test_preds, y_test)

Training
--------
              precision    recall  f1-score   support

           0       0.77      0.67      0.72    128756
           1       0.77      0.84      0.80    163519

    accuracy                           0.77    292275
   macro avg       0.77      0.76      0.76    292275
weighted avg       0.77      0.77      0.76    292275

AUC: 0.7564293658845201

Testing
--------
              precision    recall  f1-score   support

           0       0.70      0.50      0.59     17725
           1       0.60      0.78      0.68     16905

    accuracy                           0.64     34630
   macro avg       0.65      0.64      0.63     34630
weighted avg       0.65      0.64      0.63     34630

AUC: 0.6400839003658128



In [46]:
print(recall_score(y_test, xgb_test_preds))

0.7792369121561669


In [47]:
print(recall_score(y_test, rf_test_preds))

0.6207039337474121


In [55]:
s_cols = ['s1', 's2', 's3', 's4']

X_train_xgb = X_train_wv.drop(columns=s_cols)
X_test_xgb = X_test_wv.drop(columns=s_cols)

X_train_s = X_train_wv[s_cols]
X_test_s = X_test_wv[s_cols]

In [57]:
xgb_model = XGBClassifier(
    objective='binary:logistic', learning_rate=0.1,
    n_estimators=2000, max_depth=2)
xgb_model.fit(X_train_xgb, y_train)

xgb_train_preds = xgb_model.predict(X_train_xgb)
xgb_test_preds = xgb_model.predict(X_test_xgb)

In [58]:
X_train_meta = pd.concat([X_train_s, pd.DataFrame(xgb_train_preds, columns=['xgb_preds'])], axis=1)
X_test_meta = pd.concat([X_test_s, pd.DataFrame(xgb_test_preds, columns=['xgb_preds'])], axis=1)

In [60]:
min_max_scaler = MinMaxScaler()

X_train_meta = min_max_scaler.fit_transform(X_train_meta)
X_test_meta = min_max_scaler.fit_transform(X_test_meta)

In [61]:
reg_model = LogisticRegression(max_iter=10000)
reg_model.fit(X_train_meta, y_train)

reg_train_preds = reg_model.predict(X_train_meta)
reg_test_preds = reg_model.predict(X_test_meta)

print_classification_stats(reg_train_preds, y_train, reg_test_preds, y_test)

Training
--------
              precision    recall  f1-score   support

           0       0.76      0.67      0.71    128756
           1       0.76      0.83      0.79    163519

    accuracy                           0.76    292275
   macro avg       0.76      0.75      0.75    292275
weighted avg       0.76      0.76      0.76    292275

AUC: 0.7497048746534598

Testing
--------
              precision    recall  f1-score   support

           0       0.69      0.57      0.62     17725
           1       0.62      0.74      0.67     16905

    accuracy                           0.65     34630
   macro avg       0.66      0.65      0.65     34630
weighted avg       0.66      0.65      0.65     34630

AUC: 0.6519841193360891



In [63]:
feature_cols = ['s1', 's2', 's3', 's4', 'xgb_preds']

reg_df = pd.DataFrame({'feature': feature_cols,
                       'regression_coefficient': reg_model.coef_[0]})

reg_df.head(len(feature_cols))

Unnamed: 0,feature,regression_coefficient
0,s1,7.055931
1,s2,-7.423238
2,s3,-9.975437
3,s4,9.692845
4,xgb_preds,2.134022


In [64]:
print(recall_score(y_test, reg_test_preds))

0.7351671103223898
