In [33]:
import pathlib
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import (AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor, 
                              GradientBoostingRegressor, RandomForestRegressor)
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor

import model_util

In [5]:
# Root folder of the serialized inputs, resolved against the current working directory.
data = pathlib.Path.cwd().joinpath('data')
# Sub-folders read by the feature-merging code: .npy arrays and .pkl frames.
npys, pkls = data.joinpath('npys'), data.joinpath('pkls')

In [41]:
def merge_features(name):
    """Merge all serialized features into a single dataframe.

    The feature sources are combined column-wise **by row position**: each
    one is given a fresh 0..n-1 index before concatenation. Several sources
    are built from bare NumPy arrays (which carry no index), so label-based
    alignment against the pickled frames would silently produce a
    NaN-padded outer join whenever a pickled frame has a non-default index.

    Parameters
    ----------
    name : str
        'train' or 'valid'

    Returns
    -------
    df : DataFrame
        DataFrame with all the features combined.

    Raises
    ------
    ValueError
        If the feature sources do not all have the same number of rows.
    """

    # NOTE: pd.read_pickle executes arbitrary code embedded in the file;
    # only load pickles produced by this project.
    essay_df = pd.read_pickle(pkls / f'{name}.pkl')
    descr_df = pd.read_pickle(pkls / 'descr.pkl')[['essay_set', 'grade_level']]
    essay_to_grade_level = descr_df.set_index('essay_set').to_dict()['grade_level']

    # Look up the grade level of every essay through its essay_set id.
    grade_level_arr = essay_df['essay_set'].map(essay_to_grade_level).values
    grade_level_df = pd.DataFrame(grade_level_arr, columns=['grade_level'])

    prompt_count_arr = np.load(npys / f'{name}_prompt_count.npy')
    prompt_count_df = pd.DataFrame(prompt_count_arr, columns=['prompt_count'])

    prompt_tfidf_arr = np.load(npys / f'{name}_prompt_tfidf.npy')
    prompt_tfidf_df = pd.DataFrame(prompt_tfidf_arr, columns=['prompt_tfidf'])

    percent_df = essay_df[['domain1_percent', 'domain2_percent']]
    pos_df = pd.read_pickle(pkls / f'{name}_pos.pkl')
    sentiment_df = pd.read_pickle(pkls / f'{name}_sentiment.pkl')
    diff_level_df = pd.read_pickle(pkls / f'{name}_grade_level.pkl')

    # (label, frame) pairs, listed in the column order of the merged result.
    sources = [
        ('grade_level', grade_level_df),
        ('percent', percent_df),
        ('prompt_count', prompt_count_df),
        ('prompt_tfidf', prompt_tfidf_df),
        ('diff_level', diff_level_df),
        ('sentiment', sentiment_df),
        ('pos', pos_df),
    ]

    # Positional alignment is only meaningful when every source describes
    # the same number of essays; fail loudly instead of padding with NaN.
    row_counts = {label: len(frame) for label, frame in sources}
    if len(set(row_counts.values())) > 1:
        raise ValueError(
            f'Feature sources for {name!r} have different row counts: {row_counts}'
        )

    # Drop each source's own index so concat pairs rows by position.
    dfs = [frame.reset_index(drop=True) for _, frame in sources]
    return pd.concat(dfs, axis=1)

In [42]:
# Build the combined training feature table and preview its first five rows.
train_df = merge_features(name='train')
train_df.head(n=5)

Unnamed: 0,grade_level,domain1_percent,domain2_percent,prompt_count,prompt_tfidf,flesch_reading_ease,smog_index,flesch_kincaid_grade,coleman_liau_index,automated_readability_index,...,VB,VBD,VBG,VBN,VBP,VBZ,WDT,WP,WP$,WRB
0,8,60.0,,0.415474,0.297531,74.02,10.4,8.5,8.48,11.3,...,22,1,14,4,12,14,0,1,0,2
1,8,70.0,,0.339572,0.235998,67.08,11.6,9.1,7.78,9.9,...,29,8,20,4,16,6,2,0,0,9
2,8,50.0,,0.638644,0.529368,68.2,12.0,8.7,8.24,9.8,...,13,1,9,0,23,7,0,5,0,3
3,8,80.0,,0.418874,0.286866,53.34,13.9,12.3,11.2,15.4,...,30,18,5,9,24,12,2,3,0,4
4,8,60.0,,0.411188,0.297567,72.66,11.3,7.0,7.83,7.5,...,42,2,6,2,17,18,3,1,0,5


In [40]:
# Build the combined validation feature table and preview its first five rows.
# NOTE(review): the recorded output of this cell lists its columns in a different
# order than the training preview, and the cell's execution count is lower than
# that of the merge_features definition -- the saved output looks stale; re-run
# the notebook top to bottom to refresh it.
valid_df = merge_features(name='valid')
valid_df.head(n=5)

Unnamed: 0,grade_level,domain1_percent,domain2_percent,CC,CD,DT,EX,FW,IN,JJ,...,flesch_reading_ease,smog_index,flesch_kincaid_grade,coleman_liau_index,automated_readability_index,dale_chall_readability_score,linsear_write_formula,gunning_fog,prompt_count,prompt_tfidf
0,8,50.0,,11,0,18,0,0,22,22,...,56.59,12.7,11.1,10.39,13.3,6.7,10.833333,14.07,0.565212,0.410547
1,8,60.0,,14,1,24,1,0,36,12,...,80.31,10.2,6.1,5.86,6.4,5.51,10.5,9.23,0.370156,0.274027
2,8,70.0,,17,2,35,1,0,55,18,...,64.88,11.9,12.0,7.85,14.6,6.2,12.2,15.14,0.401037,0.297338
3,8,70.0,,15,0,25,0,0,42,17,...,73.17,9.9,6.8,7.3,7.1,5.47,15.25,8.77,0.325612,0.227408
4,8,70.0,,18,1,30,4,0,61,36,...,74.9,10.2,6.1,7.58,6.6,5.48,6.0,8.32,0.54926,0.404606


# Serialize Models with joblib