In [21]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer('english')

df_train = pd.read_csv('/Users/Hermione/MasterUCL/Info retrieval and data mining/coursework/dataset/train.csv', encoding="ISO-8859-1")
df_test = pd.read_csv('/Users/Hermione/MasterUCL/Info retrieval and data mining/coursework/dataset/test.csv', encoding="ISO-8859-1")
# df_attr = pd.read_csv('../input/attributes.csv')
df_pro_desc = pd.read_csv('/Users/Hermione/MasterUCL/Info retrieval and data mining/coursework/dataset/product_descriptions.csv')

num_train = df_train.shape[0]

def str_stemmer(s):
	return " ".join([stemmer.stem(word) for word in s.lower().split()])

def str_common_word(str1, str2):
	return sum(int(str2.find(word)>=0) for word in str1.split())


df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)

df_all = pd.merge(df_all, df_pro_desc, how='left', on='product_uid')

df_all['search_term'] = df_all['search_term'].map(lambda x:str_stemmer(x))
df_all['product_title'] = df_all['product_title'].map(lambda x:str_stemmer(x))
df_all['product_description'] = df_all['product_description'].map(lambda x:str_stemmer(x))

df_all['len_of_query'] = df_all['search_term'].map(lambda x:len(x.split())).astype(np.int64)

df_all['product_info'] = df_all['search_term']+"\t"+df_all['product_title']+"\t"+df_all['product_description']

df_all['word_in_title'] = df_all['product_info'].map(lambda x:str_common_word(x.split('\t')[0],x.split('\t')[1]))
df_all['word_in_description'] = df_all['product_info'].map(lambda x:str_common_word(x.split('\t')[0],x.split('\t')[2]))

df_all = df_all.drop(['search_term','product_title','product_description','product_info'],axis=1)

df_train = df_all.iloc[:num_train]
df_test = df_all.iloc[num_train:]
id_test = df_test['id']

y_train = df_train['relevance'].values
X_train = df_train.drop(['id','relevance'],axis=1).values
X_test = df_test.drop(['id','relevance'],axis=1).values






In [15]:
df_train.head()

Unnamed: 0,id,product_uid,relevance,len_of_query,word_in_title,word_in_description
0,2,100001,3.0,2,1,1
1,3,100001,2.5,2,1,1
2,9,100002,3.0,2,1,1
3,16,100005,2.33,3,1,1
4,17,100005,2.67,3,3,2


In [8]:
from sklearn.ensemble import GradientBoostingRegressor
from time import time

In [13]:
t0 = time()
params = {'n_estimators': 500, 'max_depth': 6, 'min_samples_split': 2, 'min_samples_leaf':15, 'learning_rate': 0.035, 'loss': 'ls', 'verbose':1}
clf = GradientBoostingRegressor(**params)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
y_pred[y_pred<1.]=1.
y_pred[y_pred>3.]=3.


pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv('GBRsubmission.csv',index=False)
sorted_idx = np.argsort(clf.feature_importances_)
pd.DataFrame({"name":df_all.keys().drop(['id','relevance'])[sorted_idx], "importance": clf.feature_importances_[sorted_idx]}).to_csv('feature_importances_benchmark_without_dummies.csv',index=False)

print("file saved") 
print('modelling time:',round((time()-t0)/60,1) ,'minutes\n')

      Iter       Train Loss   Remaining Time 
         1           0.2815           23.65s
         2           0.2782           21.98s
         3           0.2751           21.88s
         4           0.2722           22.02s
         5           0.2695           22.76s
         6           0.2669           22.34s
         7           0.2646           22.37s
         8           0.2624           22.44s
         9           0.2603           22.92s
        10           0.2584           23.45s
        20           0.2452           21.89s
        30           0.2384           20.88s
        40           0.2346           20.27s
        50           0.2323           20.01s
        60           0.2309           19.84s
        70           0.2299           19.51s
        80           0.2293           19.69s
        90           0.2287           19.44s
       100           0.2283           18.97s
       200           0.2259           12.46s
       300           0.2244            8.12s
       40

In [14]:
clf.feature_importances_

array([ 0.75216189,  0.08729953,  0.10431245,  0.05622614])

In [10]:
rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)
clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv('RFsubmission.csv',index=False)



In [16]:
big_feat = pd.read_csv("/Users/Hermione/IRDM/Alice/big_feat.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [17]:
big_feat.head()

Unnamed: 0,id,len_of_query,len_of_title,len_of_description,len_of_brand,len_of_attr,len_of_attr_title,query_in_title,query_in_description,query_in_attr,...,title_BM25_score,attr_title_BM25_score,brand_BM25_score,brand_feature,search_term_feature,search_vs_pro_title,search_vs_brand,search_vs_attr,search_vs_attr_title,relevance
0,2,2,5,85,2,68,31,1,1,1,...,2.401365,0.0,0.0,1000,12,0.646181,1.0,0.900807,1.0,3.0
1,3,2,5,85,2,68,31,0,0,0,...,0.0,0.0,0.0,1000,9,1.0,1.0,1.0,1.0,2.5
2,9,1,12,113,4,147,70,0,1,1,...,0.0,0.091024,0.0,1003,4,1.0,1.0,0.848267,0.839295,3.0
3,16,3,11,64,1,105,65,1,1,2,...,0.498592,0.0,0.0,1006,16,0.883087,1.0,0.874585,1.0,2.33
4,17,2,11,64,1,105,65,2,2,2,...,1.019748,0.307303,0.0,1006,13,0.624534,1.0,0.878937,0.654852,2.67


In [35]:
big_feat.columns

Index(['id', 'len_of_query', 'len_of_title', 'len_of_description',
       'len_of_brand', 'len_of_attr', 'len_of_attr_title', 'query_in_title',
       'query_in_description', 'query_in_attr', 'query_in_attr_title',
       'query_last_word_in_title', 'query_last_word_in_description',
       'query_last_word_in_attr', 'query_last_word_in_attr_title',
       'word_in_title', 'word_in_description', 'word_in_attr',
       'word_in_attr_title', 'word_in_brand', 'desc_BM25_score',
       'attr_BM25_score', 'title_BM25_score', 'attr_title_BM25_score',
       'brand_BM25_score', 'brand_feature', 'search_term_feature',
       'search_vs_pro_title', 'search_vs_brand', 'search_vs_attr',
       'search_vs_attr_title', 'relevance'],
      dtype='object')

In [18]:
big_feat['word_in_title']=df_all['word_in_title']
big_feat['word_in_description'] =df_all['word_in_description'] 


In [19]:
big_train = big_feat.iloc[:num_train]
big_test = big_feat.iloc[num_train:]
id_test = big_test['id']

bigy_train = big_train['relevance'].values
bigX_train = big_train.drop(['id','relevance'],axis=1).values
bigX_test = big_test.drop(['id','relevance'],axis=1).values

t0 = time()
params = {'n_estimators': 500, 'max_depth': 6, 'min_samples_split': 2, 'min_samples_leaf':15, 'learning_rate': 0.035, 'loss': 'ls', 'verbose':1}
clf = GradientBoostingRegressor(**params)

clf.fit(bigX_train, bigy_train)

y_pred = clf.predict(bigX_test)
y_pred[y_pred<1.]=1.
y_pred[y_pred>3.]=3.


pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv('GBRsubmission_bigfeat.csv',index=False)

print('modelling time:',round((time()-t0)/60,1) ,'minutes\n')

      Iter       Train Loss   Remaining Time 
         1           0.2813            4.40m
         2           0.2778            4.05m
         3           0.2746            3.93m
         4           0.2716            3.87m
         5           0.2687            3.83m
         6           0.2661            3.82m
         7           0.2635            3.91m
         8           0.2612            4.12m
         9           0.2590            4.19m
        10           0.2568            4.36m
        20           0.2413            4.02m
        30           0.2320            4.16m
        40           0.2263            4.10m
        50           0.2222            3.92m
        60           0.2192            3.70m
        70           0.2166            3.53m
        80           0.2145            3.38m
        90           0.2131            3.22m
       100           0.2120            3.06m
       200           0.2062            2.05m
       300           0.2014            1.37m
       40

In [25]:
feats = pd.read_csv("/Users/Hermione/MasterUCL/Info retrieval and data mining/coursework/feats.csv")
feats.head()

Unnamed: 0.1,Unnamed: 0,id,product_uid,relevance,bullet_count,flag_commercial,flag_residential,flag_indoor,flag_outdoor,flag_estar,...,4th_word_in_bl,5th_word_in_bl,6th_word_in_bl,7th_word_in_bl,8th_word_in_bl,9th_word_in_bl,brand_encoded,flag_attr_has_material,flag_attr_has_color,flag_has_attr
0,0,2,100001,3.0,7.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0,0,0,0,0,0,1000,1.0,0.0,1.0
1,1,3,100001,2.5,7.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0,0,0,0,0,0,1000,1.0,0.0,1.0
2,2,9,100002,3.0,10.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0,0,0,0,0,0,1010,0.0,1.0,1.0
3,3,16,100005,2.33,4.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0,0,0,0,0,0,1020,0.0,1.0,1.0
4,4,17,100005,2.67,4.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0,0,0,0,0,0,1020,0.0,1.0,1.0


In [28]:
feats.drop(feats.columns[0], axis=1).head()

Unnamed: 0,id,product_uid,relevance,bullet_count,flag_commercial,flag_residential,flag_indoor,flag_outdoor,flag_estar,match_commercial,...,4th_word_in_bl,5th_word_in_bl,6th_word_in_bl,7th_word_in_bl,8th_word_in_bl,9th_word_in_bl,brand_encoded,flag_attr_has_material,flag_attr_has_color,flag_has_attr
0,2,100001,3.0,7.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,...,0,0,0,0,0,0,1000,1.0,0.0,1.0
1,3,100001,2.5,7.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,...,0,0,0,0,0,0,1000,1.0,0.0,1.0
2,9,100002,3.0,10.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,...,0,0,0,0,0,0,1010,0.0,1.0,1.0
3,16,100005,2.33,4.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,...,0,0,0,0,0,0,1020,0.0,1.0,1.0
4,17,100005,2.67,4.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,...,0,0,0,0,0,0,1020,0.0,1.0,1.0


In [29]:
feats_train = feats.iloc[:num_train]
feats_test = feats.iloc[num_train:]
id_test = feats_test['id']

featsy_train = feats_train['relevance'].values
featsX_train = feats_train.drop(['id','relevance','product_uid'],axis=1).values
featsX_test = feats_test.drop(['id','relevance','product_uid'],axis=1).values

t0 = time()
params = {'n_estimators': 500, 'max_depth': 6, 'min_samples_split': 2, 'min_samples_leaf':15, 'learning_rate': 0.035, 'loss': 'ls', 'verbose':1}
clf = GradientBoostingRegressor(**params)

clf.fit(featsX_train, featsy_train)

y_pred = clf.predict(featsX_test)
y_pred[y_pred<1.]=1.
y_pred[y_pred>3.]=3.


pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv('GBRsubmission_feats.csv',index=False)

print('modelling time:',round((time()-t0)/60,1) ,'minutes\n')

      Iter       Train Loss   Remaining Time 
         1           0.2812            4.07m
         2           0.2775            4.05m
         3           0.2741            4.15m
         4           0.2709            4.39m
         5           0.2679            4.58m
         6           0.2651            4.56m
         7           0.2625            4.49m
         8           0.2601            4.42m
         9           0.2579            4.46m
        10           0.2557            4.48m
        20           0.2408            4.46m
        30           0.2328            4.39m
        40           0.2279            4.19m
        50           0.2248            4.04m
        60           0.2224            3.97m
        70           0.2204            3.92m
        80           0.2188            3.80m
        90           0.2175            3.71m
       100           0.2163            3.59m
       200           0.2098            2.42m
       300           0.2068            1.45m
       40

In [33]:
index = [1,2,3,4,5,6,7,8,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,29,30,31,32,33,34,35,36,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52]
print("The number of important feature is:",len(index))
#generate new train data and validation data with only important features
f_train = feats.iloc[:num_train].drop(['id','relevance','product_uid'],axis=1)
f_test = feats.iloc[num_train:].drop(['id','relevance','product_uid'],axis=1)
f_train = np.array(f_train)
f_test = np.array(f_test)
new_train = f_train[:, index]
new_test = f_test[:, index]

The number of important feature is: 49


In [34]:
t0 = time()
params = {'n_estimators': 500, 'max_depth': 6, 'min_samples_split': 2, 'min_samples_leaf':15, 'learning_rate': 0.035, 'loss': 'ls', 'verbose':1}
clf = GradientBoostingRegressor(**params)

clf.fit(new_train, featsy_train)

y_pred = clf.predict(new_test)
y_pred[y_pred<1.]=1.
y_pred[y_pred>3.]=3.


pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv('GBRsubmission_selectfeats.csv',index=False)

print('modelling time:',round((time()-t0)/60,1) ,'minutes\n')

      Iter       Train Loss   Remaining Time 
         1           0.2813            4.60m
         2           0.2778            4.31m
         3           0.2745            4.18m
         4           0.2714            4.12m
         5           0.2685            4.20m
         6           0.2658            4.33m
         7           0.2633            4.53m
         8           0.2610            4.57m
         9           0.2588            4.56m
        10           0.2568            4.53m
        20           0.2425            4.53m
        30           0.2347            4.43m
        40           0.2302            4.31m
        50           0.2273            4.17m
        60           0.2250            4.05m
        70           0.2232            3.99m
        80           0.2217            3.92m
        90           0.2204            3.80m
       100           0.2193            3.71m
       200           0.2129            2.54m
       300           0.2095            1.51m
       40