In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn import ensemble
from scipy.stats.stats import pearsonr
from sklearn.svm import LinearSVR

%matplotlib inline

In [2]:
# Load the input tables: the train/test CSVs from data/ and the
# sel_X_* feature matrices (presumably the pre-selected features) from the
# working directory. Files are read in the same order as before.
train, test = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))
X_train, X_test = map(pd.read_csv, ('sel_X_train.csv', 'sel_X_test.csv'))

In [3]:
# Delete the unused id column 'Unnamed: 0' from each frame (presumably a row
# index that was saved along with the CSV -- confirm against the files).
# `axis` is passed by keyword: passing it positionally was deprecated in
# pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
# NOTE: this cell mutates the frames in place, so re-running it without
# re-running the loading cell raises a KeyError.
train.drop('Unnamed: 0', axis=1, inplace=True)
X_train.drop('Unnamed: 0', axis=1, inplace=True)
X_test.drop('Unnamed: 0', axis=1, inplace=True)

In [4]:
# Regression target: the 'relevance' column of the training table,
# converted to a plain Python list.
y_train = train.loc[:, 'relevance'].tolist()

# Model testing

To evaluate each model, we apply `cross_val_score` with 10-fold cross-validation.

In [6]:
# Linear model with L2 regularization (Ridge).
# The printed value is the mean of the negated mean absolute error over the
# 10 cross-validation folds, so values closer to zero are better.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(cross_val_score(linear_model.Ridge(), X_train, y_train, cv=10,
                      scoring='neg_mean_absolute_error').mean())

-0.409188530707


In [7]:
# Linear model with L1 regularization (Lasso).
# The printed value is the mean negated MAE over 10 cross-validation folds.
print(cross_val_score(linear_model.Lasso(), X_train, y_train, cv=10,
                      scoring='neg_mean_absolute_error').mean())

# Refit on the whole training set to inspect the coefficients.
# With the default regularization strength every coefficient comes out as
# zero, so the fitted model ignores all features and therefore tells us
# nothing useful about them.
model = linear_model.Lasso()
model.fit(X_train, y_train)
print(model.coef_)

-0.439455082754
[-0.  0.  0.  0.  0.  0. -0.]


In [8]:
# Linear support vector regression.
# random_state is fixed so that the cross-validation score is reproducible
# between runs, as is already done for the random forest models in this
# notebook (the recorded output may change slightly after re-running).
print(cross_val_score(LinearSVR(random_state=0), X_train, y_train, cv=10,
                      scoring='neg_mean_absolute_error').mean())

-0.405874490697


In [12]:
# Ensemble of decision trees: manual grid search over the number of trees
# (30, 40, 50, 60) and the maximum tree depth (4..8).
# Each line of output is "<n_estimators> <max_depth> <mean negated MAE>",
# where the score is averaged over 10 cross-validation folds.
for est_num in range(30, 70, 10):
    for dep in range(4, 9):
        forest = ensemble.RandomForestRegressor(n_estimators=est_num, max_depth=dep,
                                                random_state=0)
        score = cross_val_score(forest, X_train, y_train, cv=10,
                                scoring='neg_mean_absolute_error').mean()
        # A single formatted string prints identically on Python 2 and 3.
        print('%d %d %s' % (est_num, dep, score))

30 4

 -0.403166858653
30 5

 -0.401711823048
30 6

 -0.40061236694
30 7

 -0.399968773154
30 8

 -0.399521811613
40 4

 -0.403188429154
40 5

 -0.401694958359
40 6

 -0.400562672731
40 7

 -0.399924677617
40 8

 -0.399457489107
50 4

 -0.403145173885
50 5

 -0.401661622795
50 6

 -0.40051506983
50 7

 -0.399895546761
50 8

 -0.399422614276
60 4

 -0.403126672843
60 5

 -0.401651069082
60 6

 -0.400495219567
60 7

 -0.399881373864
60 8

 -0.399388928011


Increasing the number of trees reduces the standard deviation of the predictions, but using too many trees slows the model down.
Increasing the maximum depth of the trees leads to overfitting.

Let's use 30 trees with a maximum depth of 6.

In [13]:
# Ensemble of decision trees with the chosen parameters
# (30 trees, maximum depth 6); prints the mean negated MAE over 10 folds.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(cross_val_score(ensemble.RandomForestRegressor(n_estimators=30, max_depth=6,
                                                     random_state=0),
                      X_train, y_train, cv=10,
                      scoring='neg_mean_absolute_error').mean())

-0.40061236694


However, on the test set the model with parameters (50, 7) obtains a lower error score.

In [15]:
# Ensemble of random forests with bagging: 50 bagged copies of a
# 50-tree, depth-7 random forest; prints the mean negated MAE over 10 folds.
# The base estimator stays positional (its keyword name differs between
# scikit-learn versions); n_estimators is named for readability.
print(cross_val_score(
    ensemble.BaggingRegressor(
        ensemble.RandomForestRegressor(n_estimators=50, max_depth=7, random_state=0),
        n_estimators=50, random_state=0),
    X_train, y_train, cv=10, scoring='neg_mean_absolute_error').mean())

-0.399839824176


In [5]:
# Final calculation: fit the bagged random forest on the whole training set,
# report the mean absolute error on the training set, and write the test-set
# predictions to output.csv.
random_forest = ensemble.RandomForestRegressor(n_estimators=50, max_depth=7, random_state=0)
model = ensemble.BaggingRegressor(random_forest, n_estimators=50, random_state=0)

model.fit(X_train, y_train)

# Predictions are clamped to [1.0, 3.0] (presumably the valid range of
# 'relevance' -- confirm against the data). The clamp was written with linear
# models in mind; it is kept here so that any model can be swapped in.
train_pred = model.predict(X_train).clip(1.0, 3.0)

# Mean absolute error on the training set. This is measured on data the model
# was fitted on, so it is an optimistic estimate compared with the CV scores.
print(abs(train_pred - y_train).mean())

test_pred = model.predict(X_test).clip(1.0, 3.0)

# One row per test id, with 'id' as the index so it becomes the first CSV column.
output = pd.DataFrame({'id': test['id'], 'relevance': test_pred}).set_index('id')

output.to_csv('output.csv', sep=',')

0.383221308881
