In [39]:
import pandas as pd
import numpy as np
import pprint

from nltk.stem.snowball import SnowballStemmer
from sklearn import tree
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.grid_search import GridSearchCV

In [30]:
# Load the train/test sets, the product attributes and the product descriptions.
df_train = pd.read_csv("../input/_final_train.csv", encoding="ISO-8859-1")
df_test = pd.read_csv('../input/_final_test.csv', encoding="ISO-8859-1")
df_attr = pd.read_csv('../input/attributes.csv')
# Descriptions are indexed by product_uid so a description can be fetched with .loc[product_uid].
df_prod_desc = pd.read_csv('../input/_final_desc.csv', index_col="product_uid")
# Regression target: the relevance column of the training set.
ytrain = df_train["relevance"]

In [13]:
# Preview the loaded frames; only the product descriptions are currently printed,
# the train/test previews below are disabled.
# print df_test.head()
# print "\n\n"
# print df_train.head()
print df_prod_desc.head()

             Unnamed: 0                                product_description
product_uid                                                               
100001                0  not on do angl make joint stronger they also p...
100002                1  behr premium textur deckov is an innov solid c...
100003                2  classic architectur meet contemporari design i...
100004                3  the grape solar 265watt. polycrystallin pv sol...
100005                4  updat your bathroom with the delta vero singl ...


## Utilities

In [33]:
def print_scores(clf):
    """Pretty-print the outcome of a fitted grid search.

    clf -- fitted search object exposing ``best_score_`` and ``best_params_``,
           plus either ``grid_scores_`` (older scikit-learn) or
           ``cv_results_`` (newer scikit-learn).

    Prints the best score and parameters first, then the per-candidate results.
    Returns nothing.
    """
    pp = pprint.PrettyPrinter(indent=4)

    # A single-argument print(...) produces the same output as the Python 2
    # print statement and is also valid on Python 3.
    print("Best score and params:")
    pp.pprint(clf.best_score_)
    pp.pprint(clf.best_params_)

    print("All params and scores:")
    # grid_scores_ only exists on older scikit-learn releases; fall back to
    # cv_results_ instead of failing with AttributeError on newer ones.
    if hasattr(clf, "grid_scores_"):
        pp.pprint(clf.grid_scores_)
    else:
        pp.pprint(clf.cv_results_)

def fmean_squared_error(ground_truth, predictions):
    """Return the square root of the mean squared error of the predictions."""
    return mean_squared_error(ground_truth, predictions) ** 0.5

# Wrap the RMSE function as a scorer usable for model selection.
# greater_is_better=False declares it a loss (lower is better); per the
# scikit-learn make_scorer documentation the scorer then reports the negated
# value, so scores shown by the search are expected to be <= 0.
RMSE = make_scorer(fmean_squared_error, greater_is_better=False)

## Create numerical features
### Extract them

In [8]:
def count_search_hits(df_queries, descriptions):
    """Count, per row, how many search-term words occur in the title and description.

    df_queries   -- frame with "search_term", "product_title" and "product_uid" columns.
    descriptions -- mapping from product_uid to the product description text.

    Returns an integer DataFrame with columns "search_in_title" and
    "search_in_desc", indexed 0..len(df_queries)-1 in row order.
    Raises KeyError if a product_uid has no entry in `descriptions`.
    """
    title_hits = []
    desc_hits = []
    rows = zip(df_queries["search_term"],
               df_queries["product_title"],
               df_queries["product_uid"])
    for search, title, product_id in rows:
        description = descriptions[product_id]
        words = search.split()
        # `in` on strings is a substring test, so "saw" also matches "hacksaw".
        # Repeated words in the search term are counted once per occurrence.
        title_hits.append(sum(1 for word in words if word in title))
        desc_hits.append(sum(1 for word in words if word in description))

    # Build the frame once from plain lists: avoids chained-indexing writes
    # (frame[col][i] += 1), which are slow and may silently update a copy.
    return pd.DataFrame({"search_in_title": title_hits,
                         "search_in_desc": desc_hits},
                        index=np.arange(len(df_queries)),
                        columns=["search_in_title", "search_in_desc"],
                        dtype="int64")


# product_uid -> description text, built once instead of one .loc lookup per row.
desc_by_uid = df_prod_desc["product_description"].to_dict()

# Same feature extraction for the training and the testing data.
train_num_feat = count_search_hits(df_train, desc_by_uid)
test_num_feat = count_search_hits(df_test, desc_by_uid)

#### Export them

In [18]:
# Show the first rows as a sanity check, then cache both feature sets on disk.
print train_num_feat.head(10)
# to_csv is called with pandas' default index=True, so the row index is written
# as the first (unnamed) column of each file.
train_num_feat.to_csv("../input/simple_train_num_feat.csv")
test_num_feat.to_csv("../input/simple_test_num_feat.csv")

   search_in_title  search_in_desc
0                1               1
1                1               1
2                1               2
3                1               1
4                3               3
5                1               2
6                2               2
7                1               1
8                2               2
9                2               1


### Or import them 
#### Simple

In [14]:
# Reload the cached simple features instead of recomputing them.
# usecols=[1,2] keeps only the columns at positions 1 and 2 and skips column 0
# (presumably the row index written on export -- confirm against the CSV).
train_num_feat = pd.read_csv("../input/simple_train_num_feat.csv", usecols=[1,2])
test_num_feat = pd.read_csv("../input/simple_test_num_feat.csv", usecols=[1,2])

train_num_feat.head()

Unnamed: 0,search_in_title,search_in_desc
0,1,1
1,1,1
2,1,2
3,1,1
4,3,3


#### Or more complex

In [15]:
# Alternative feature set: rebinds train_num_feat / test_num_feat, replacing
# whatever feature frames were held under those names before this cell ran.
# NOTE(review): no usecols/index_col is given here; if these CSVs carry a
# leading index column it is loaded as a regular feature -- confirm.
train_num_feat = pd.read_csv("../input/more_features/lab_x_train.csv")
test_num_feat = pd.read_csv("../input/more_features/lab_x_test.csv")

train_num_feat.head()

Unnamed: 0,query_len,title_len,desc_len,brand_len,query_in_title,query_in_desc,query_last_word_in_title,query_last_word_in_desc,word_in_title,word_in_desc,ratio_title,ratio_description,word_in_brand,ratio_brand,brand_feature,search_term_feature
0,2,6,135,3,0,0,0,0,1,1,0.5,0.5,0,0.0,1000,12
1,2,6,135,3,0,0,0,0,1,1,0.5,0.5,0,0.0,1000,9
2,2,12,169,4,0,0,0,0,1,1,0.5,0.5,1,0.25,1000,9
3,3,14,109,1,0,0,0,0,1,1,0.333,0.333,0,0.0,1010,16
4,3,14,109,1,1,0,1,1,3,3,1.0,1.0,0,0.0,1010,18


## Fit the model and predict both testing and training values
### No cross-validation
Keeping this just in case cross-validation does not work terribly well. Not holding my breath, since the default parameters already give such a small training error.

In [10]:
# Depth-limited regression tree fitted on the full training set.
# (Use tree.DecisionTreeRegressor() instead to try the library defaults.)
clf = tree.DecisionTreeRegressor(max_depth=6, min_samples_split=3)
clf.fit(train_num_feat, df_train["relevance"])

# Predict the test set and store the predictions keyed by the test ids.
yhat_val = clf.predict(test_num_feat)
df_yhat = pd.DataFrame(yhat_val, index=df_test["id"], columns=["relevance"])

# Range of the predicted relevance values, as a quick sanity check.
test_pred = df_yhat.loc[:, "relevance"]
print("%s %s" % (min(test_pred), max(test_pred)))
df_yhat.to_csv('../output/decision_tree_yhat.csv')

# Training error: mean of the squared residuals on the training set.
yhat_train_val = clf.predict(train_num_feat)
err = (df_train["relevance"] - yhat_train_val) ** 2
mse_train = sum(err) / df_train.shape[0]
print(mse_train)

1.0 3.0
0.228578828314


### Cross-validation

In [None]:
# Scratch cell for inspecting candidate grid values. The first expression is
# evaluated but not shown, since only a cell's last expression is displayed.
np.linspace(1, 16, 5).astype(int)
# Five log-spaced values from 10**1 up to 10**100.
print np.logspace(1, 100, 5)

In [38]:
tree_multi = tree.DecisionTreeRegressor()

# Create parameter grid.
# Count-like hyperparameters must be integers: np.linspace yields floats, which
# scikit-learn either rejects or (for min_samples_split / min_samples_leaf)
# interprets as a fraction of the samples instead of an absolute count.
splitter = ["best", "random"]
# NOTE(review): max_features goes up to 16, which presumably assumes the wider
# feature set is loaded; with fewer feature columns those candidates cannot fit.
max_features = np.linspace(1, 16, 5).astype(int)   # 1, 4, 8, 12, 16
max_depth = np.arange(2, 11, 2)                    # 2, 4, 6, 8, 10
# A node needs at least 2 samples to be split, so the grid starts at 2
# (a value of 1 is refused by newer scikit-learn releases).
min_samples_split = np.arange(2, 6)                # 2, 3, 4, 5
min_samples_leaf = np.arange(1, 6)                 # 1, 2, 3, 4, 5
presort = [True, False]

param_grid = dict(
    splitter=splitter,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    presort=presort,
)

# Cross validate! Candidates are ranked with the RMSE scorer.
reg_multi = GridSearchCV(tree_multi, param_grid=param_grid, scoring=RMSE)
reg_multi.fit(train_num_feat, ytrain)

print_scores(reg_multi)

NameError: global name 'pprint' is not defined

### Export output and some data for plotting

In [None]:
pd_  # NOTE(review): stray, incomplete expression; raises NameError if the cell is run -- remove.

In [6]:
def export_data(yhat, predictor_name, df=None, output_dir="../output/"):
    """Write predictions next to the true relevance into two CSV files for plotting.

    yhat           -- predicted relevance, one value per row of `df`.
    predictor_name -- prefix used for the two output file names.
    df             -- frame with "product_title", "search_term" and "relevance"
                      columns; defaults to the module-level df_train.
    output_dir     -- directory receiving the files (default "../output/").

    Files written:
      <predictor_name>_word_cloud.csv -- title, search term, Y, Yhat, squared difference
      <predictor_name>_num.csv        -- Y, Yhat, squared difference
    Returns nothing.
    """
    import os

    if df is None:
        df = df_train

    # Columns are assigned onto a 0..n-1 index, exactly like the original
    # zero-filled frames, but without the throw-away zero initialisation.
    df_word_cloud = pd.DataFrame(index=np.arange(len(df)))
    df_word_cloud["product_title"] = df["product_title"]
    df_word_cloud["search_term"] = df["search_term"]
    df_word_cloud["Y"] = df["relevance"]
    df_word_cloud["Yhat"] = yhat
    # "diff" holds the squared error of each prediction.
    df_word_cloud["diff"] = (df_word_cloud["Y"] - df_word_cloud["Yhat"]) ** 2

    # The numeric export is the same data without the text columns.
    df_num = df_word_cloud[["Y", "Yhat", "diff"]]

    output_file_words = os.path.join(output_dir, predictor_name + "_word_cloud.csv")
    output_file_num = os.path.join(output_dir, predictor_name + "_num.csv")

    df_word_cloud.to_csv(output_file_words, encoding="utf-8")
    df_num.to_csv(output_file_num, encoding="utf-8")

In [8]:
# Export the training-set predictions; the name becomes the output file prefix.
export_data(yhat_train_val, "decision_tree_depth_lim")

In [28]:
# Spot-check: title of the test row labelled 0 and the first row of the test features.
print df_test.loc[0, "product_title"]
print test_num_feat.iloc[[0]]

 simpson strong tie 12 gaug angl
   search_in_title  search_in_desc
0                0               0


In [9]:
# Row whose index label equals n_rows - 1 (the last row when the index is the default 0..n-1).
df_test.loc[[df_test.shape[0] - 1]]

Unnamed: 0.1,Unnamed: 0,id,product_uid,product_title,search_term
166692,166692,240760,224428,bosch 4in. bi metal hole saw,4in. hole saw


In [49]:
# product_title of the row at position 1, returned as a one-element Series.
df_test.loc[:, "product_title"].iloc[[1]]

1    Simpson Strong-Tie 12-Gauge Angle
Name: product_title, dtype: object