In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from nltk.stem.snowball import SnowballStemmer

# English Snowball stemmer; str_stemmer() relies on this module-level instance.
stemmer = SnowballStemmer('english')

# Folder holding the competition CSV files. Kept in one place so the notebook
# can be pointed at another copy of the data by editing a single line.
DATA_DIR = '/Users/Hermione/MasterUCL/Info retrieval and data mining/coursework/dataset'

# train.csv and test.csv are read as ISO-8859-1; product_descriptions.csv uses
# the pandas default encoding.
df_train = pd.read_csv(DATA_DIR + '/train.csv', encoding="ISO-8859-1")
df_test = pd.read_csv(DATA_DIR + '/test.csv', encoding="ISO-8859-1")
# df_attr = pd.read_csv('../input/attributes.csv')
df_pro_desc = pd.read_csv(DATA_DIR + '/product_descriptions.csv')

# Number of training rows; used to split the combined train+test frame apart
# again after feature engineering.
num_train = df_train.shape[0]

def str_stemmer(s):
    """Return ``s`` lower-cased with every whitespace-separated token stemmed.

    Tokens are passed through the module-level ``stemmer`` and re-joined with
    single spaces, so any run of whitespace in the input (tabs and newlines
    included) collapses to one space in the result.
    """
    tokens = s.lower().split()
    return " ".join(map(stemmer.stem, tokens))

def str_common_word(str1, str2):
    """Count the whitespace-separated tokens of ``str1`` that occur in ``str2``.

    Matching is by substring rather than whole word (``"in"`` is found in
    ``"inside"``), and a token repeated in ``str1`` is counted once per
    repetition.
    """
    hits = 0
    for token in str1.split():
        if token in str2:
            hits += 1
    return hits


# NOTE: this cell rebinds df_train / df_test to the engineered frames and drops
# the text columns, so it can only be re-run after the raw CSVs are reloaded.

# Stack train and test so both receive identical feature engineering, then
# attach each product's description via its product_uid.
df_all = pd.concat([df_train, df_test], axis=0, ignore_index=True)
df_all = pd.merge(df_all, df_pro_desc, how='left', on='product_uid')

# Lower-case and stem every text field used below.
for text_col in ('search_term', 'product_title', 'product_description'):
    df_all[text_col] = df_all[text_col].map(str_stemmer)

# Number of tokens in the (stemmed) search query.
df_all['len_of_query'] = df_all['search_term'].map(lambda query: len(query.split())).astype(np.int64)

# Pack query, title and description into one tab-separated string per row.
# str_stemmer() re-joins tokens with single spaces, so the fields themselves
# cannot contain a tab and the split below recovers exactly three parts.
df_all['product_info'] = df_all['search_term'] + "\t" + df_all['product_title'] + "\t" + df_all['product_description']
info_parts = df_all['product_info'].map(lambda info: info.split('\t'))

# How many query tokens appear (as substrings) in the title / description.
df_all['word_in_title'] = info_parts.map(lambda parts: str_common_word(parts[0], parts[1]))
df_all['word_in_description'] = info_parts.map(lambda parts: str_common_word(parts[0], parts[2]))

# The raw text is no longer needed once the counts exist.
df_all = df_all.drop(['search_term', 'product_title', 'product_description', 'product_info'], axis=1)

# Undo the concat: the first num_train rows are the training set.
df_train, df_test = df_all.iloc[:num_train], df_all.iloc[num_train:]
id_test = df_test['id']

# Every column except the row id and the target is fed to the models
# (this keeps product_uid alongside the three engineered counts).
non_feature_cols = ['id', 'relevance']
y_train = df_train['relevance'].values
X_train = df_train.drop(non_feature_cols, axis=1).values
X_test = df_test.drop(non_feature_cols, axis=1).values






In [15]:
# Preview the first rows of the training frame as it stands after feature engineering.
df_train.head()

Unnamed: 0,id,product_uid,relevance,len_of_query,word_in_title,word_in_description
0,2,100001,3.0,2,1,1
1,3,100001,2.5,2,1,1
2,9,100002,3.0,2,1,1
3,16,100005,2.33,3,1,1
4,17,100005,2.67,3,3,2


In [8]:
from sklearn.ensemble import GradientBoostingRegressor
from time import time

In [13]:
# Fit a gradient-boosted regressor, write the test-set predictions as a
# submission file, and dump the feature importances. The whole cell is timed.
t0 = time()

# The loss is deliberately left at its default (least squares): the old 'ls'
# spelling was removed in scikit-learn 1.2, while the replacement
# 'squared_error' is not accepted by releases before 1.0. Omitting the key
# requests the same loss on every version.
# random_state is fixed so repeated runs produce the same model.
params = {'n_estimators': 500, 'max_depth': 6, 'min_samples_split': 2, 'min_samples_leaf': 15, 'learning_rate': 0.035, 'random_state': 0, 'verbose': 1}
clf = GradientBoostingRegressor(**params)

clf.fit(X_train, y_train)

# Clamp predictions to the interval [1, 3] (presumably the valid relevance
# range - confirm against the competition description).
y_pred = np.clip(clf.predict(X_test), 1., 3.)

pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv('GBRsubmission.csv', index=False)

# Feature names are the df_all columns minus id/relevance, i.e. the same
# columns (in the same order) that X_train was built from.
importances = clf.feature_importances_
sorted_idx = np.argsort(importances)  # ascending: least important first
feature_names = df_all.columns.drop(['id', 'relevance'])
pd.DataFrame({"name": feature_names[sorted_idx], "importance": importances[sorted_idx]}).to_csv('feature_importances_benchmark_without_dummies.csv', index=False)

print("file saved")
print('modelling time:', round((time() - t0) / 60, 1), 'minutes\n')

      Iter       Train Loss   Remaining Time 
         1           0.2815           23.65s
         2           0.2782           21.98s
         3           0.2751           21.88s
         4           0.2722           22.02s
         5           0.2695           22.76s
         6           0.2669           22.34s
         7           0.2646           22.37s
         8           0.2624           22.44s
         9           0.2603           22.92s
        10           0.2584           23.45s
        20           0.2452           21.89s
        30           0.2384           20.88s
        40           0.2346           20.27s
        50           0.2323           20.01s
        60           0.2309           19.84s
        70           0.2299           19.51s
        80           0.2293           19.69s
        90           0.2287           19.44s
       100           0.2283           18.97s
       200           0.2259           12.46s
       300           0.2244            8.12s
       40

In [14]:
# Display the importances of the model currently bound to `clf`, one value per
# column of X_train and in the same column order.
clf.feature_importances_

array([ 0.75216189,  0.08729953,  0.10431245,  0.05622614])

In [10]:
# Bagging ensemble of 45 shallow random forests (15 trees, depth <= 6 each);
# max_samples=0.1 draws a 10% sample of the training rows for every forest.
# Note that this rebinds `clf`, replacing any model previously stored there.
rf = RandomForestRegressor(random_state=0, max_depth=6, n_estimators=15)
clf = BaggingRegressor(rf, random_state=25, max_samples=0.1, n_estimators=45)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Write the predictions keyed by the test-set row ids.
rf_submission = pd.DataFrame({"id": id_test, "relevance": y_pred})
rf_submission.to_csv('RFsubmission.csv', index=False)

