In [119]:
import numpy as np
from urllib.request import urlopen

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from collections import namedtuple

import subprocess
from time import time

from math import sqrt, factorial
import re

In [4]:
import os
import sys

import pdfcrowd

# Credentials are read from the environment so that no secret is stored in the
# notebook. The key that used to be hardcoded here should be treated as leaked
# and rotated. A missing variable fails fast with a KeyError naming it.
PDFCROWD_USERNAME = os.environ['PDFCROWD_USERNAME']
PDFCROWD_API_KEY = os.environ['PDFCROWD_API_KEY']

try:
    # create the API client instance
    client = pdfcrowd.HtmlToPdfClient(PDFCROWD_USERNAME, PDFCROWD_API_KEY)

    # configure the conversion
    client.setPageSize(u'A4')
    client.setContentAreaY(u'-0.94in')
    client.setViewportWidth(3840)
    client.setViewportHeight(2160)
    client.setRenderingMode(u'viewport')

    # run the conversion and write the result to a file
    client.convertUrlToFile('https://resume.io/r/yAzB8QANw', 'result.pdf')
except pdfcrowd.Error as why:
    # report the error
    sys.stderr.write('Pdfcrowd Error: {}\n'.format(why))

    # re-raise so the failure stays visible in the notebook
    raise

In [120]:
# Log file to which oracle() appends every queried point and the raw reply.
POINTS_PATH = 'feature_points.txt'

In [121]:
def oracle(array):
    """Query the remote oracle at one point and return the function value.

    The first ten entries of ``array`` are sent as the query parameters
    ``x1`` .. ``x10``. They are formatted with ``{:f}``, so the server sees
    each coordinate rounded to six decimal places. Every call appends
    ``str(array)``, a tab, the raw reply and a newline to ``POINTS_PATH``.

    Parameters
    ----------
    array : sequence of numbers
        The point to evaluate; must provide at least ten values.

    Returns
    -------
    numpy.float64 or float
        The value parsed from the ``Function value = ...`` line, or
        ``np.inf`` when the reply is the sentinel ``UNDIFINED``.

    Raises
    ------
    ValueError
        If the reply is neither the sentinel nor starts with a parsable
        ``Function value = <number>``.
    """
    url = 'http://185.86.144.8/oracle?x1={:f}&x2={:f}&x3={:f}&x4={:f}&x5={:f}&x6={:f}&x7={:f}&x8={:f}&x9={:f}&x10={:f}'.format(
        *array)

    with urlopen(url) as response:
        data = response.read().decode('utf-8')

    # Log the raw reply before interpreting it, so unexpected replies are
    # still recorded.
    with open(POINTS_PATH, 'a') as out_file:
        out_file.write(str(array) + '\t' + data + '\n')

    # 'UNDIFINED' (sic) is compared exactly as the original code spelled it;
    # presumably this is the server's literal reply -- verify against the service.
    # Surrounding whitespace is ignored so a trailing newline does not turn
    # the sentinel into a parse error.
    if data.strip() == 'UNDIFINED':
        return np.inf

    match = re.match(r'Function value = (\S+)', data)
    if match is None:
        raise ValueError('Unexpected oracle response: {!r}'.format(data))

    return np.float64(match.group(1))

In [153]:
# Probe the oracle at a single hand-picked point: x1 = 0 and x2..x10 = 10.
oracle(np.array([ 0., 10., 10., 10., 10., 10., 10., 10., 10., 10.]))

-95.4126097877

In [2]:
import multiprocessing
# Number of CPUs in the system, as reported by multiprocessing.
multiprocessing.cpu_count()

In [3]:
# Same query as the previous cell: number of CPUs in the system.
multiprocessing.cpu_count()

4

In [139]:
# Sizes are named once so that the sample matrix and the label buffer cannot
# drift apart.
N_SAMPLES = 10000  # number of random points sent to the oracle
N_INPUTS = 10      # oracle() sends x1..x10

# Draw points uniformly from [0, 10) in every coordinate and label each one by
# querying the oracle. Each call is a network request that oracle() also logs.
a = 10 * np.random.random_sample((N_SAMPLES, N_INPUTS))
labels = np.empty(len(a))
for i in range(len(a)):
    labels[i] = oracle(a[i])

In [140]:
# Range of the oracle values collected above.
print(np.min(labels), np.max(labels))

-4904.75994586 9011.95767574


In [142]:
# Expanded design matrix: the raw inputs followed by four derived columns
# (product, sum and both ratios) for every unordered pair of inputs.
# All sizes are derived from `a` instead of being hard-coded.
n_samples, n_inputs = a.shape
n_pairs = n_inputs * (n_inputs - 1) // 2

b = np.empty((n_samples, n_inputs + 4 * n_pairs))
b[:, :n_inputs] = a

k = n_inputs  # next free column in b
for i in range(n_inputs):
    for j in range(i + 1, n_inputs):
        b[:, k] = a[:, i] * a[:, j]
        b[:, k + 1] = a[:, i] + a[:, j]
        # NOTE(review): an exact zero in `a` would make a ratio inf/nan --
        # confirm the inputs are strictly positive before relying on these.
        b[:, k + 2] = a[:, i] / a[:, j]
        b[:, k + 3] = a[:, j] / a[:, i]
        k += 4

In [143]:
# Persist the sampled points; np.save appends '.npy' to the given name.
np.save('10000_samples',a)

In [144]:
# Persist the oracle values for the sampled points; np.save appends '.npy'.
np.save('10000_labels',labels)

In [145]:
# Fit a 1000-tree forest on every column of the expanded matrix `b`.
forest = RandomForestRegressor(n_estimators=1000, max_features=1.)
ans = forest.fit(b, labels).predict(b)

# NOTE: this is the error on the training data itself, not a held-out score.
rmse = sqrt(mean_squared_error(labels, ans))
print(rmse)

58.68218964931596


In [146]:
# Column indices of `b` ordered from most to least important
# (argsort of the negated importances).
args = np.argsort(-forest.feature_importances_)

In [147]:
# Show the ten most important columns together with their importance.
importances = forest.feature_importances_
for feature in args[:10]:
    print(feature, importances[feature])

62 0.10489878784169951
184 0.07642882741601377
188 0.0739580613212587
49 0.05226737528944467
12 0.05055516528653012
44 0.04181714060745929
63 0.04093864510476451
22 0.03896592431683401
164 0.03668684862565145
48 0.0344595453599513


In [149]:
# Selection = the ten top-ranked columns followed by the ten raw inputs (0..9).
ultimate_features = np.concatenate((args[:10], np.arange(10)))
ultimate_features

array([ 62, 184, 188,  49,  12,  44,  63,  22, 164,  48,   0,   1,   2,
         3,   4,   5,   6,   7,   8,   9], dtype=int64)

In [63]:
# Same experiment restricted to the ten raw inputs (first ten columns of `b`).
forest1 = RandomForestRegressor(n_estimators=1000, max_features=0.8)
ans = forest1.fit(b[:, :10], labels).predict(b[:, :10])

# NOTE: error measured on the training data itself.
rmse = sqrt(mean_squared_error(labels, ans))
print(rmse)

1.4562243900828872e+46


In [47]:
# Indices of the raw inputs ordered from most to least important in forest1.
args1 = np.argsort(-forest1.feature_importances_)

In [48]:
# List every raw input with its importance, most important first.
importances1 = forest1.feature_importances_
for feature in args1:
    print(feature, importances1[feature])

1 0.3573114690539652
0 0.14361866249588695
2 0.13100794772310098
4 0.11353069043446275
9 0.06399184501803466
5 0.0497695690404563
7 0.04012250633403388
8 0.03616730067854955
3 0.03384800952007879
6 0.030631999701430823


In [89]:
# Prepend the ten top-ranked indices from `args` to the current selection.
# Duplicates are possible here, and the cell is not idempotent: re-running it
# keeps growing the array.
ultimate_features = np.concatenate((args[:10], ultimate_features))

In [90]:
# In-place ascending sort of the selected indices.
ultimate_features.sort()

In [94]:
# Drop duplicate indices (np.unique also returns them sorted), then display.
ultimate_features = np.unique(ultimate_features)
ultimate_features

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  28,
        29,  30,  34,  35,  45,  46,  47,  51,  58,  59,  78,  79, 144,
       145, 147], dtype=int64)

In [75]:
# Refit using only the selected columns of `b`.
forest2 = RandomForestRegressor(n_estimators=1000, max_features=1.)
ans = forest2.fit(b[:, ultimate_features], labels).predict(b[:, ultimate_features])

# NOTE: error measured on the training data itself.
rmse = sqrt(mean_squared_error(labels, ans))
print(rmse)

1.1122846502076133e+46


In [68]:
# Paths of the two data files, bundled in a small record type.
# NOTE: this rebinds `args`, which earlier cells use for the argsort of the
# feature importances.
Arguments = namedtuple('Arguments', ['test', 'train'])
args = Arguments(test='test.txt', train='train.txt')

In [69]:
# Load both whitespace-delimited files, then stack them row-wise
# (training rows first).
train_data, test_data = (np.loadtxt(path) for path in (args.train, args.test))

total_data = np.concatenate((train_data, test_data), axis=0)

In [100]:
# Expanded matrix for the file data: the original columns followed by four
# derived columns (product, sum and both ratios) for every unordered pair of
# columns 1..end. Column 0 is skipped by the pairing; later cells use it as
# the regression target. All sizes are derived from `total_data` instead of
# being hard-coded.
n_rows, n_cols = total_data.shape
n_inputs = n_cols - 1
n_pairs = n_inputs * (n_inputs - 1) // 2

total_data_e = np.empty((n_rows, n_cols + 4 * n_pairs))
total_data_e[:, :n_cols] = total_data

k = n_cols  # next free column in total_data_e
for i in range(1, n_cols):
    for j in range(i + 1, n_cols):
        total_data_e[:, k] = total_data[:, i] * total_data[:, j]
        total_data_e[:, k + 1] = total_data[:, i] + total_data[:, j]
        # NOTE(review): an exact zero in an input column would make a ratio
        # inf/nan -- confirm the inputs are strictly positive.
        total_data_e[:, k + 2] = total_data[:, i] / total_data[:, j]
        total_data_e[:, k + 3] = total_data[:, j] / total_data[:, i]
        k += 4

In [1]:
# Scratch check: print 0, -1000, ..., -4000.
for i in range(5):
    value = i * -1000
    print(value)

0
-1000
-2000
-3000
-4000


In [101]:
# Display the selected feature indices.
ultimate_features

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  28,
        29,  30,  34,  35,  45,  46,  47,  51,  58,  59,  78,  79, 144,
       145, 147], dtype=int64)

In [102]:
# Refit `forest` on the file data. The selected indices are shifted by one
# because column 0 of total_data_e is used as the target here.
forest = RandomForestRegressor(n_estimators=1000, max_features=1.)
ans = (
    forest
    .fit(total_data_e[:, ultimate_features + 1], total_data_e[:, 0])
    .predict(total_data_e[:, ultimate_features + 1])
)

# NOTE: error measured on the same rows the forest was fitted on.
rmse = sqrt(mean_squared_error(total_data_e[:, 0], ans))
print(rmse)

10181854064.037718


In [105]:
# Number of importance values, i.e. of columns the current `forest` was fitted on.
len(forest.feature_importances_)

28

In [107]:
# Positions within the fitted columns, ordered from most to least important.
idxs = np.argsort(-forest.feature_importances_)

In [108]:
# Importances in decreasing order.
forest.feature_importances_[idxs]

array([0.50782594, 0.37034712, 0.02247834, 0.01564369, 0.01554232,
       0.01311995, 0.0108624 , 0.00661802, 0.00506222, 0.00404302,
       0.00290253, 0.00260435, 0.00224044, 0.00189829, 0.00176072,
       0.00173039, 0.0017028 , 0.00156494, 0.0015056 , 0.00138606,
       0.0013561 , 0.00129185, 0.00118147, 0.00116608, 0.00113303,
       0.00107501, 0.00098464, 0.00097268])

In [109]:
# Map the ranked positions back to the feature indices in `ultimate_features`.
ultimate_features[idxs]

array([ 10,  11,  47,   2,  46,   0,  78,   1,  79,   8,   3,  51, 147,
        28,   6,   9,   4,  30,  45, 144, 145,  29,  35,  58,   5,   7,
        34,  59], dtype=int64)

In [None]:
# Scratch note (not code): 18383855314.000107 1000

In [None]:
# Re-query the oracle for the first few points of the first half of the
# training file and compare the fresh values with the stored ones (column 0).
N_CHECK = 5
half = len(train_data) // 2
X = train_data[:half, 1:]
y_old = train_data[:half, 0]

start = time()
# oracle() takes the whole point as ONE array argument; unpacking it with
# `*X[i]` would pass ten positional arguments and raise TypeError.
y_new = [float(oracle(point)) for point in X[:N_CHECK]]
stop = time()
print('time: ', stop - start)

for i, j in zip(y_old[:N_CHECK], y_new):
    print(i, j)

In [None]:
# `forest` was last fitted on total_data_e[:, ultimate_features + 1], so it
# must be given those same columns for prediction. Passing the raw inputs
# (total_data[:, 1:]) has a different number of columns and is rejected by
# scikit-learn.
ans = forest.predict(total_data_e[:, ultimate_features + 1])

rmse = sqrt(mean_squared_error(total_data[:, 0], ans))
print(rmse)

In [None]:
# Per-feature importances of the current `forest`, in fitted column order.
forest.feature_importances_