In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC,LinearSVC
from sklearn.metrics import auc, roc_auc_score, mean_squared_error, r2_score, classification_report
from sklearn.ensemble import RandomForestClassifier

import os
import json

In [2]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)

Step 1: Loading the data from files




In [3]:
def construct_data_path(dataset_name):
    """Build the relative path of the CSV file that holds `dataset_name`.

    Parameters
    ----------
    dataset_name : str
        Name of the dataset, without the ``.csv`` extension.

    Returns
    -------
    str
        ``<dataset_name>.csv`` joined onto the ``../output_data`` directory.

    """
    csv_filename = f'{dataset_name}.csv'
    return os.path.join('../output_data', csv_filename)

In [4]:
# Read the pre-processed training and validation splits, in that order.
# NOTE(review): the saved output of this cell contains a truncated warning
# fragment -- possibly a pandas dtype warning; confirm on a fresh run.
train_data_1, validation_data_1 = (
    pd.read_csv(construct_data_path(dataset_name))
    for dataset_name in ('text_processed_training', 'text_processed_validation')
)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Step 2: Feature Selection

In [5]:
# Work on copies so the frames loaded from disk stay untouched.
train_data, validation_data = train_data_1.copy(), validation_data_1.copy()

In [7]:
# Columns used as model inputs, one per line; the order here is the column
# order of the feature matrices built from this list.
feature_columns = [
    'text_reviews_count',
    'is_ebook',
    'average_rating',
    'num_pages',
    'ratings_count',
    'is_translated',
    'is_in_series',
    'series_length',
    'is_paperback',
    'is_hardcover',
    'is_audio',
    'from_penguin',
    'from_harpercollins',
    'from_university_press',
    'from_vintage',
    'from_createspace',
    'publication_year',
    'author_a',
    'author_b',
    'author_c',
    'author_d',
    'author_e',
    'author_f',
    'author_other',
    'book_shelved_count',
    'shelved_count',
    'read_count',
    'rated_count',
    'recommended_count',
    'title_len',
]

In [8]:
# Feature matrix and 'recommended' target for each split.
X_train, y_train = train_data[feature_columns], train_data['recommended']
X_validation, y_validation = validation_data[feature_columns], validation_data['recommended']

Step 3: Prediction

In [10]:
def predict_logistic_regression(X_test, X_train, y_train):
    """Fit a logistic regression on the training data and label `X_test`.

    A new ``LogisticRegression(max_iter=10000)`` is created and fitted on
    every call; the features are passed to it as given.

    Parameters
    ----------
    X_test : DataFrame
        Rows to classify.

    X_train : DataFrame
        Training features.

    y_train : array-like
        Class labels, one per row of `X_train`.

    Returns
    -------
    numpy.ndarray
        Predicted class label for each row of `X_test`.

    """
    return LogisticRegression(max_iter=10000).fit(X_train, y_train).predict(X_test)

In [11]:
def predict_svm(X_test, X_train, y_train):
    """Fit a standardised linear SVM on the training data and label `X_test`.

    The model is a two-step pipeline: ``StandardScaler`` followed by
    ``LinearSVC(random_state=0, max_iter=10000)``. It is rebuilt and
    refitted on every call.

    Parameters
    ----------
    X_test : DataFrame
        Rows to classify.

    X_train : DataFrame
        Training features.

    y_train : array-like
        Class labels, one per row of `X_train`.

    Returns
    -------
    numpy.ndarray
        Predicted class label for each row of `X_test`.

    """
    scaler = StandardScaler()
    linear_svc = LinearSVC(random_state=0, max_iter=10000)

    # The pipeline fits the scaler on X_train only and reuses it for X_test.
    scaled_svc = make_pipeline(scaler, linear_svc)
    scaled_svc.fit(X_train, y_train)

    return scaled_svc.predict(X_test)

In [19]:
def predict_random_forest(X_test, X_train, y_train):
    """Fit a random forest on the training data and label `X_test`.

    The forest uses 1000 trees of depth at most 15, a fixed
    ``random_state=0`` and ``n_jobs=4``. It is rebuilt and refitted on
    every call.

    Parameters
    ----------
    X_test : DataFrame
        Rows to classify.

    X_train : DataFrame
        Training features.

    y_train : array-like
        Class labels, one per row of `X_train`.

    Returns
    -------
    numpy.ndarray
        Predicted class label for each row of `X_test`.

    """
    forest = RandomForestClassifier(
        n_estimators=1000,
        max_depth=15,
        n_jobs=4,
        random_state=0,
    )
    return forest.fit(X_train, y_train).predict(X_test)

In [13]:
def test_model(model, X_test, y_test, X_train, y_train):
    """Print classification metrics for `model` on the held-out and training data.

    `model` is invoked twice with the same training data: once to label
    `X_test` and once to label `X_train` itself. For each of the two
    results a classification report and a ROC AUC value are printed. The
    ROC AUC is computed from the predicted labels returned by `model`.

    Parameters
    ----------
    model : callable
        Called as ``model(X, X_train, y_train)``; returns predicted labels
        for ``X``.

    X_test, y_test
        Held-out features and true labels (printed under ``validation:``).

    X_train, y_train
        Training features and true labels (printed under ``train:``).

    Returns
    -------
    None
        Results are only printed.

    """
    # Both prediction passes are completed before anything is printed.
    results = [
        ("validation:", y_test, model(X_test, X_train, y_train)),
        ("train:", y_train, model(X_train, X_train, y_train)),
    ]

    for heading, y_true, y_pred in results:
        print(heading)
        print(classification_report(y_true, y_pred))
        print(roc_auc_score(y_true, y_pred))
    

In [14]:
# Logistic regression, trained on the full training set and scored on the full validation set.
test_model(predict_logistic_regression, X_validation, y_validation, X_train, y_train)

validation:
              precision    recall  f1-score   support

           0       0.75      0.23      0.35     17586
           1       0.54      0.92      0.68     17044

    accuracy                           0.57     34630
   macro avg       0.64      0.58      0.52     34630
weighted avg       0.65      0.57      0.51     34630

0.576169627447619
train:
              precision    recall  f1-score   support

           0       0.81      0.46      0.59    111170
           1       0.69      0.92      0.79    146475

    accuracy                           0.72    257645
   macro avg       0.75      0.69      0.69    257645
weighted avg       0.74      0.72      0.70    257645

0.6908926830991556


In [18]:
# Random forest, trained on the full training set and scored on the full validation set.
# NOTE(review): this cell's execution count (18) is lower than that of the cell
# defining predict_random_forest (19); re-run top to bottom to confirm the output.
test_model(predict_random_forest, X_validation, y_validation, X_train, y_train)

validation:
              precision    recall  f1-score   support

           0       0.65      0.70      0.67     17586
           1       0.66      0.61      0.64     17044

    accuracy                           0.66     34630
   macro avg       0.66      0.66      0.66     34630
weighted avg       0.66      0.66      0.66     34630

0.6569771662632047
train:
              precision    recall  f1-score   support

           0       0.76      0.79      0.78    111170
           1       0.84      0.81      0.83    146475

    accuracy                           0.80    257645
   macro avg       0.80      0.80      0.80    257645
weighted avg       0.81      0.80      0.80    257645

0.8028121594786715


In [16]:
# Linear SVM on a subset only: the first 50,000 training rows and the first
# 10,000 validation rows, so these scores come from less data than the
# full-data runs of the other two models.
test_model(predict_svm, X_validation[:10000], y_validation[:10000], X_train[:50000], y_train[:50000])



validation:
              precision    recall  f1-score   support

           0       0.68      0.42      0.52      5087
           1       0.57      0.79      0.66      4913

    accuracy                           0.60     10000
   macro avg       0.62      0.61      0.59     10000
weighted avg       0.62      0.60      0.59     10000

0.6062111364836817
train:
              precision    recall  f1-score   support

           0       0.70      0.58      0.64     21513
           1       0.72      0.82      0.76     28487

    accuracy                           0.71     50000
   macro avg       0.71      0.70      0.70     50000
weighted avg       0.71      0.71      0.71     50000

0.6978785369149139


