In [47]:
import pandas as pd
import numpy as np
import pprint
from ml_utils import Utils

from nltk.stem.snowball import SnowballStemmer
from sklearn import tree
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.grid_search import GridSearchCV
from sklearn.learning_curve import learning_curve, validation_curve

from pylab import figure, axes, title, savefig
import matplotlib.pyplot as plt

MIN_VAL = 1.0
MAX_VAL = 3.0

**Note about files being read:** These inputs are produced by running the `str_stem` function. They are essentially the files provided in the competition, but the strings have been pre-processed to eliminate irrelevant characters and other interferences, as well as to stem the strings, i.e., minimize the differences from inflexions found in the language (e.g., "cat" and "cats" are stemmed to the same word to maximize matches once the numerical features are calculated).

In [3]:
df_train = pd.read_csv("../input/_final_train.csv", encoding="ISO-8859-1")
df_test = pd.read_csv('../input/_final_test.csv', encoding="ISO-8859-1")
df_attr = pd.read_csv('../input/attributes.csv')
df_prod_desc = pd.read_csv('../input/_final_desc.csv', index_col="product_uid")
ytrain = df_train["relevance"]

In [4]:
# Make sure it worked
print df_prod_desc.head()

             Unnamed: 0                                product_description
product_uid                                                               
100001                0  not on do angl make joint stronger they also p...
100002                1  behr premium textur deckov is an innov solid c...
100003                2  classic architectur meet contemporari design i...
100004                3  the grape solar 265watt. polycrystallin pv sol...
100005                4  updat your bathroom with the delta vero singl ...


## Utilities

In [5]:
def print_scores(clf, verbose=False):
    """
    Prints the best scores and best parameters found in cross-validation.
    Takes:
    - clf, a classifier that has been cross-validated by grid search.
    Returns:
    """
    pp = pprint.PrettyPrinter(indent=4)
    print "Best score and parameters:"
    pp.pprint(clf.best_score_)
    pp.pprint(clf.best_params_)
    
    if verbose:
        print "~~~Verbose output~~~"
        print "All scores"
        pp.pprint(clf.grid_scores_)
    
def fmean_squared_error(ground_truth, predictions):
    """
    Returns the difference between the actual value and the predictions
    output by a classifier
    Takes:
    - ground_truth, the actual value Y of the inputs X being predicted.
    - predictions, the predicted value Yhat of the inputs X.
    Returns:
    - 
    """
    fmean_squared_error_ = mean_squared_error(ground_truth, predictions)**0.5
    return fmean_squared_error_

def is_prediction_in_range(yhat, min_val, max_val, verbose=False):
    """Returns whether the values of a prediction are in the expected range"""
    return min(yhat) >= min_val and max(yhat) <= max_val

def export_data(yhat, is_test, predictor_name):
        output_folder = "../output/"

        if not is_test:
            # Most common words in data with the worst errors
            df_word_cloud = pd.DataFrame(0, index=np.arange(len(df_train)),\
                                    columns=["product_title", "search_term",\
                                         "Y", "Yhat", "diff"])

            # Exports Y, Yhat and difference between them.
            # Can be helpful for tuning or seeing patterns
            df_num = pd.DataFrame(0, index=np.arange(len(df_train)),\
                                    columns=["Y", "Yhat", "diff"])

            # Word cloud
            df_word_cloud["product_title"] = df_train["product_title"]
            df_word_cloud["search_term"] = df_train["search_term"]
            df_word_cloud["Y"] = df_train["relevance"]
            df_word_cloud["Yhat"] = yhat
            df_word_cloud["diff"] = (df_word_cloud["Y"] - df_word_cloud["Yhat"]) ** 2

            # Diff
            df_num["Y"] = df_train["relevance"]
            df_num["Yhat"] = yhat
            df_num["diff"] = df_word_cloud["diff"]

            output_file_words = output_folder + predictor_name + "_word_cloud.csv"
            output_file_num = output_folder + predictor_name + "_num.csv"

            df_word_cloud.to_csv(output_file_words, encoding="utf-8")
            df_num.to_csv(output_file_num, encoding="utf-8")

        else:
            # Exports Yhat(testing data). This is the submission file
            # df_result = pd.DataFrame(0, index=np.arange(len(df_test)),
              #                      columns=["id", "relevance"])

            df_result = pd.DataFrame(yhat, columns=["relevance"],\
                                    index=df_test["id"])

            output_file_yhat = output_folder + predictor_name + "_yhat.csv"

            df_result.to_csv(output_file_yhat, encoding="utf-8")


RMSE = make_scorer(fmean_squared_error, greater_is_better=False)

## Create numerical features
### Simple feature extraction

In [8]:
train_num_feat = pd.DataFrame(0, index=np.arange(len(df_train)), columns=["search_in_title",\
                                     "search_in_desc"])
test_num_feat = pd.DataFrame(0, index=np.arange(len(df_test)), columns=["search_in_title",\
                                     "search_in_desc"])

# Count the training data occurrences
for i in range(0, df_train.shape[0]):
    search = df_train["search_term"][i]  # Search term used by user
    title = df_train["product_title"][i] # Product Title
    product_id = df_train.iloc[[i]]["product_uid"][i]  # Get product_uid of current index in df_train
    description = df_prod_desc.loc[product_id]["product_description"] # Description of product id
    
    for word in search.split():

        if word in title:
            train_num_feat["search_in_title"][i] += 1
            
        if word in description:
            train_num_feat["search_in_desc"][i] += 1
            
# Count the testing data occurrences
for i in range(0, df_test.shape[0]):
    search = df_test["search_term"][i]  # Search term used by user
    title = df_test["product_title"][i] # Product Title
    product_id = df_test.iloc[[i]]["product_uid"][i]  # Get product_uid of current index in df_train
    description = df_prod_desc.loc[product_id]["product_description"] # Description of product id

    for word in search.split():
        
        if word in title:
            test_num_feat["search_in_title"][i] += 1
        
        if word in description:
            test_num_feat["search_in_desc"][i] += 1

#### Export them

In [9]:
print train_num_feat.head(10)
#train_num_feat.to_csv("../input/simple_train_num_feat.csv")
#test_num_feat.to_csv("../input/simple_test_num_feat.csv")

   search_in_title  search_in_desc
0                1               1
1                1               1
2                1               1
3                1               1
4                3               2
5                1               2
6                2               2
7                1               1
8                2               2
9                2               2


### Or import them 
#### Simple

In [6]:
train_num_feat = pd.read_csv("../input/simple_train_num_feat.csv", usecols=[1,2])
test_num_feat = pd.read_csv("../input/simple_test_num_feat.csv", usecols=[1,2])

train_num_feat.head()

Unnamed: 0,search_in_title,search_in_desc
0,1,1
1,1,1
2,1,2
3,1,1
4,3,3


#### Or more complex

In [7]:
train_num_feat = pd.read_csv("../input/more_features/lab_x_train.csv")
test_num_feat = pd.read_csv("../input/more_features/lab_x_test.csv")

train_num_feat.head()

Unnamed: 0,query_len,title_len,desc_len,brand_len,query_in_title,query_in_desc,query_last_word_in_title,query_last_word_in_desc,word_in_title,word_in_desc,ratio_title,ratio_description,word_in_brand,ratio_brand,brand_feature,search_term_feature
0,2,6,135,3,0,0,0,0,1,1,0.5,0.5,0,0.0,1000,12
1,2,6,135,3,0,0,0,0,1,1,0.5,0.5,0,0.0,1000,9
2,2,12,169,4,0,0,0,0,1,1,0.5,0.5,1,0.25,1000,9
3,3,14,109,1,0,0,0,0,1,1,0.333,0.333,0,0.0,1010,16
4,3,14,109,1,1,0,1,1,3,3,1.0,1.0,0,0.0,1010,18


## Fit the model and predict both testing and training values
### Hand-tuning parameters
At first, I played with various parameters to the predictor and used it to predict training data. If the decision tree is not stopped early enough, the model will clearly overfit, resulting in a training MSE of ~0.001 with the complex feature set. I thought it would be an interesting experiment to try to limit the overfitting by matching the _training_ errors of the predictors that we had created with the best public _testing_ error so far (training MSE ~0.22). Interestingly enough, so far this hand-tuned version of the decision tree is the version that has produced the best public testing score out of all decision tree regressors we have tried, scoring better than the cross-validated model that should select better values for the parameters (see next section). This is, of course, not necessarily representative of the final performance of the models, since before the deadline the submissions are only scored on a portion of the testing data, but I thought it was interesting enough to mention.

In [8]:
clf = tree.DecisionTreeRegressor(min_samples_split=3, max_depth=6)
# Train
clf.fit(train_num_feat, df_train["relevance"])

# Predict training and testing data
yhat_train_val = clf.predict(train_num_feat)
yhat_test_val = clf.predict(test_num_feat)

# Make sure results are within boundaries
print "Training Yhat in range: " + str(is_prediction_in_range(yhat_train_val, MIN_VAL, MAX_VAL))
print "Testing Yhat in range: " + str(is_prediction_in_range(yhat_test_val, MIN_VAL, MAX_VAL))
# MSE
err = (df_train["relevance"] - yhat_train_val) ** 2
mse_train = sum(err) / df_train.shape[0]
print "Training MSE: " + str(mse_train)

export_data(yhat_test_val, True, "decision_tree_1_depth_lim")

Training Yhat in range: True
Testing Yhat in range: True
Training MSE: 0.228578828314


### Cross-validation

In order to cross validate, I will use `GridSearchCV`, a built-in scikit module that tries the different combinations of parameters that it is passed on a predictor, and stores the best parameters.

In [9]:
# Create the predictor
tree_multi = tree.DecisionTreeRegressor()

# Create parameter grid: Combinations of all of these parameters will be tried
# with the predictor
splitter = ["best", "random"]
max_features = np.linspace(1, 16, 5).astype(int)
max_depth = np.linspace(2, 10, 5)
min_samples_split = np.linspace(1, 5, 5)
min_samples_leaf = np.linspace(1, 5, 5)
presort = [True, False]

param_grid = dict(splitter=splitter, max_features=max_features,\
                  max_depth=max_depth,\
                  min_samples_split=min_samples_split,\
                  min_samples_leaf=min_samples_leaf,\
                  presort=presort
                 )

# Cross validate!
reg_multi = GridSearchCV(tree_multi, param_grid=param_grid, scoring=RMSE)
# Fit the final model on the training data
reg_multi.fit(train_num_feat, ytrain)

GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'presort': [True, False], 'splitter': ['best', 'random'], 'min_samples_split': array([ 1.,  2.,  3.,  4.,  5.]), 'max_features': array([ 1,  4,  8, 12, 16]), 'max_depth': array([  2.,   4.,   6.,   8.,  10.]), 'min_samples_leaf': array([ 1.,  2.,  3.,  4.,  5.])},
       pre_dispatch='2*n_jobs', refit=True,
       scoring=make_scorer(fmean_squared_error, greater_is_better=False),
       verbose=0)

In [10]:
yhat_multi = reg_multi.predict(test_num_feat)

The parameters obtained from cross-validation follow. Run with `verbose=True` to see looong output.

In [11]:
print_scores(reg_multi)

Best score and parameters:
-0.4852239132780885
{   'max_depth': 8.0,
    'max_features': 8,
    'min_samples_leaf': 5.0,
    'min_samples_split': 4.0,
    'presort': True,
    'splitter': 'best'}


### Export output and  some data for plotting

In [13]:
export_data(yhat_multi, True, "decision_tree_multi")

In [66]:
# Learning curve
train_sizes, train_scores, valid_scores = \
            learning_curve(reg_multi, train_num_feat,\
            ytrain, train_sizes=[50, 80, 110, 150], cv=5, scoring=RMSE)            

In [71]:
# In order to plot, need to get the average of all
# cross validation iterations
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
valid_scores_mean = np.mean(valid_scores, axis=1)
test_scores_std = np.std(valid_scores, axis=1)

print "Values obtained from cross-validation"
print "Number of elements sampled: " + str(train_sizes)
print "\n~~Training scores~~\n " + str(train_scores)\
+ "\nMean: " + str(train_scores_mean)
print "\n~~Cross-validation scores~~\n " + str(valid_scores)\
+ "\nMean: " + str(valid_scores_mean)

figure()
plt.figure()
plt.title("Learning curve")
plt.xlabel("Sample size")
plt.ylabel("RMSE")

plt.plot(train_sizes, train_scores_mean, 'o--', color="r",
             label="Training score")

plt.plot(train_sizes, valid_scores_mean, 'o--', color="g",
             label="Cross-validation score")

plt.legend()

savefig('../figures/decision_tree_learning_curve.png')

Values obtained from cross-validation
Number of elements sampled: [ 50  80 110 150]

~~Training scores~~
 [[-0.45993752 -0.40637694 -0.37039307 -0.39338532 -0.4440584 ]
 [-0.45956567 -0.47565507 -0.44975344 -0.47471139 -0.45821725]
 [-0.4624149  -0.4846245  -0.47183498 -0.48559654 -0.44499943]
 [-0.43534955 -0.47724468 -0.44011993 -0.48532353 -0.47049449]]
Mean: [-0.41483025 -0.46358057 -0.46989407 -0.46170643]

~~Cross-validation scores~~
 [[-0.53024197 -0.56284812 -0.62138403 -0.61905781 -0.62466501]
 [-0.53981095 -0.54153844 -0.50838085 -0.5317069  -0.54471335]
 [-0.54572244 -0.51577763 -0.50228521 -0.55929289 -0.59515406]
 [-0.5204019  -0.50998108 -0.50682008 -0.5418097  -0.55069922]]
Mean: [-0.59163939 -0.5332301  -0.54364645 -0.5259424 ]


In [None]:
![Min samples split][fig2]

[fig2]: https://raw.githubusercontent.com/JuanCTorres/KaggleHomeDepot/master/figures/decision_tree_min_samples_split.png "Minimum samples required to split"

Create a couple of spaces for plotting the cross-validations of maximum depth and minimum split, which is the "minimum number of samples required to split an internal node", as expressed in the scikit documentation for `DecisionTreeRegressor`.

In [72]:
# Create a couple of spaces for plotting the cross-validations
# of maximum depth 
train_samples_split = np.linspace(1, 10, 5).astype(int)
train_tree_depth = np.linspace(1, 10, 5).astype(int)

In [73]:
new_train_scores, new_valid_scores = validation_curve(tree_multi, train_num_feat,\
                                                      ytrain, "min_samples_split",\
                                                      train_samples_split, cv=5)

depth_train_scores, depth_valid_scores = validation_curve(tree_multi, train_num_feat,\
                                                         ytrain, "max_depth",
                                                         train_tree_depth, cv=5)

In [109]:
# Minimum samples required to split
plt.figure()

new_train_scores_mean = np.mean(new_train_scores, axis=1)
new_valid_scores_mean = np.mean(new_valid_scores, axis=1)

plt.plot(arr, new_train_scores_mean, 'o--', color='r',
        label='Training score')
plt.plot(arr, new_valid_scores_mean, 'o--', color='g',
        label="Cross-validation scores")

#plt.legend(bbox_to_anchor=(1.05, 1), loc=3, borderaxespad=0., mode="expand")
plt.legend(bbox_to_anchor=(0., 0.75, 1., .102), loc=1,
           ncol=1,  borderaxespad=0.2)
plt.title("Validation curve")
plt.xlabel("Minimum samples required to split")
plt.ylabel("RMSE")

savefig("../figures/decision_tree_min_samples_split.png")

![Min samples split][fig2]

[fig2]: https://raw.githubusercontent.com/JuanCTorres/KaggleHomeDepot/master/figures/decision_tree_min_samples_split.png "Minimum samples required to split"

In [110]:
# Max depth
plt.figure()

depth_train_scores_mean = np.mean(depth_train_scores, axis=1)
depth_valid_scores_mean = np.mean(depth_valid_scores, axis=1)

plt.plot(arr, depth_train_scores_mean, 'o--', color='r',
        label='Training score')
plt.plot(arr, depth_valid_scores_mean, 'o--', color='g',
        label="Cross-validation scores")

plt.legend(bbox_to_anchor=(0., 0.75, 1., .102), loc=1,
           ncol=1,  borderaxespad=0.2)
plt.title("Validation curve")
plt.xlabel("Maximum tree depth")
plt.ylabel("RMSE")

savefig("../figures/decision_tree_max_depth.png")

![Max depth][fig3]

[fig3]: https://raw.githubusercontent.com/JuanCTorres/KaggleHomeDepot/master/figures/decision_tree_max_depth.png "Maximum tree depth"