In [1]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os
np.random.seed(42)

In [2]:
# сейвим картинки
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "exitimg"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# игнорим минорные ворнинги SciPy
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [3]:
def area(box):
    return (box[2] - box[0]) * (box[3] - box[1])


def intersection_over_union(boxes):
    assert(len(boxes) == 8)
    boxA = boxes[:4].values
    boxB = boxes[4:].values
    
    boxAArea = area(boxA)
    boxBArea = area(boxB)
    
    if (boxAArea == 0 or boxBArea == 0):
        return 0
        
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])

    interArea = max(0, xB - xA) * max(0, yB - yA)

    
    iou = interArea / float(boxAArea + boxBArea - interArea)
    return iou

In [4]:
votes = pd.read_csv("final_data/train_data.csv")

In [5]:
answers = pd.read_csv("final_data/train_answers.csv")

In [6]:
quorum = votes.groupby("itemId")[['Xmin','Ymin', 'Xmax', 'Ymax']].mean().reset_index()

In [7]:
data = quorum.merge(answers, on=["itemId"])

In [8]:
data["iou"] = data[['Xmin','Ymin', 'Xmax', 'Ymax', 'Xmin_true',\
      'Ymin_true', 'Xmax_true','Ymax_true']].apply(intersection_over_union, axis=1)

In [9]:
data["iou"].mean()

0.5058725791429202

In [10]:
votes['Xstart'] = votes[['Xmax','Xmin']].apply(max, axis=1)
votes['Xstop'] = votes[['Xmax','Xmin']].apply(min, axis=1)
votes['width'] = votes['Xstart'] - votes['Xstop']
votes['Ystart'] = votes[['Ymax','Ymin']].apply(max, axis=1)
votes['Ystop'] = votes[['Ymax','Ymin']].apply(min, axis=1)
votes['height'] = votes['Ystart'] - votes['Ystop']

In [11]:
answers['Xstart_true'] = answers[['Xmax_true','Xmin_true']].apply(max, axis=1)
answers['Xstop_true'] = answers[['Xmax_true','Xmin_true']].apply(min, axis=1)
answers['width_true'] = answers['Xstart_true'] - answers['Xstop_true']
answers['Ystart_true'] = answers[['Ymax_true','Ymin_true']].apply(max, axis=1)
answers['Ystop_true'] = answers[['Ymax_true','Ymin_true']].apply(min, axis=1)
answers['height_true'] = answers['Ystart_true'] - answers['Ystop_true']

In [12]:
synt_quorum = votes.groupby("itemId")[['Xmin','Ymin', 'Xmax', 'Ymax', 'Xstart',
                                       'Xstop', 'width', 'Ystart', 'Ystop', 'height']].mean().reset_index()
synt_data = synt_quorum.merge(answers, on=["itemId"])

In [13]:
data_workflow = synt_data.drop(['itemId', 'Xmin_true','Ymin_true', 'Xmax_true', 'Ymax_true', 'Xstart_true',
                                       'Xstop_true', 'width_true', 'Ystart_true', 'Ystop_true', 'height_true'], axis=1)
data_labels = synt_data.drop(['itemId', 'Xmin','Ymin', 'Xmax', 'Ymax', 'Xstart',
                                       'Xstop', 'width', 'Ystart', 'Ystop', 'height'], axis=1)

In [14]:
# pd -> np
np_votes = votes.values
np_answers = answers.values
np_data_workflow = answers.values
np_data_labels = data_labels.values
np_synt_data = synt_data.values

In [15]:
# приведение к стандартизованному масштабу
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

transform_pipeline = Pipeline([
        ('std_scaler', StandardScaler()),
    ])

np_data_workflow = transform_pipeline.fit_transform(np_data_workflow)
np_data_labels = transform_pipeline.fit_transform(np_data_labels)



In [16]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

In [17]:
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(np_data_workflow, np_data_labels)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=42, splitter='best')

In [18]:
scores = cross_val_score(tree_reg, np_data_workflow, np_data_labels,
                         scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)

In [19]:
display_scores(tree_rmse_scores)
scores = cross_val_score(tree_reg, np_data_workflow, np_data_labels, scoring="neg_mean_squared_error", cv=10)
pd.Series(np.sqrt(-scores)).describe()

Scores: [0.31249113 0.39716758 0.30439563 0.29125041 0.29438986 0.29975548
 0.30332429 0.27645226 0.30227102 0.30172513]
Mean: 0.30832227916791466
Standard deviation: 0.03099363372419432


count    10.000000
mean      0.308322
std       0.032670
min       0.276452
25%       0.295731
50%       0.301998
75%       0.304128
max       0.397168
dtype: float64

In [20]:
lin_reg = LinearRegression()
lin_reg.fit(np_data_workflow, np_data_labels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [21]:
lin_scores = cross_val_score(lin_reg, np_data_workflow, np_data_labels,
                             scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)
scores = cross_val_score(lin_reg, np_data_workflow, np_data_labels, scoring="neg_mean_squared_error", cv=10)
pd.Series(np.sqrt(-scores)).describe()

Scores: [5.36238231e-16 9.90683099e-16 9.03394591e-16 5.93301360e-16
 4.36454751e-16 1.91168379e-15 7.22472012e-16 9.63909109e-16
 8.54572760e-16 8.63558997e-16]
Mean: 8.776268702525325e-16
Standard deviation: 3.881992393932244e-16


count    1.000000e+01
mean     8.776269e-16
std      4.091979e-16
min      4.364548e-16
25%      6.255940e-16
50%      8.590659e-16
75%      9.487805e-16
max      1.911684e-15
dtype: float64

In [22]:
forest_reg = RandomForestRegressor(n_estimators=10, random_state=42)
forest_reg.fit(np_data_workflow, np_data_labels)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [23]:
forest_scores = cross_val_score(forest_reg, np_data_workflow, np_data_labels,
                                scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)
scores = cross_val_score(forest_reg, np_data_workflow, np_data_labels, scoring="neg_mean_squared_error", cv=10)
pd.Series(np.sqrt(-scores)).describe()

Scores: [0.22741542 0.23901797 0.1728618  0.17853601 0.19696049 0.17986918
 0.17484865 0.16244017 0.18372022 0.17865108]
Mean: 0.18943209948418233
Standard deviation: 0.023522359128673037


count    10.000000
mean      0.189432
std       0.024795
min       0.162440
25%       0.175770
50%       0.179260
75%       0.193650
max       0.239018
dtype: float64

In [24]:
# протестим сеты из 18 гиперпараметрах. 6 прогонов, итого 90 раундов. Решающие деревья
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(np_data_workflow, np_data_labels)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]}, {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)

In [25]:
grid_search.best_params_

{'max_features': 6, 'n_estimators': 30}

In [26]:
grid_search.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=6, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=30, n_jobs=None, oob_score=False, random_state=42,
           verbose=0, warm_start=False)

In [27]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

0.2725327195342152 {'max_features': 2, 'n_estimators': 3}
0.21330606935780277 {'max_features': 2, 'n_estimators': 10}
0.18847461467642915 {'max_features': 2, 'n_estimators': 30}
0.24494111508382824 {'max_features': 4, 'n_estimators': 3}
0.19263131092495342 {'max_features': 4, 'n_estimators': 10}
0.17674136650762634 {'max_features': 4, 'n_estimators': 30}
0.24504499206435007 {'max_features': 6, 'n_estimators': 3}
0.1907238742621561 {'max_features': 6, 'n_estimators': 10}
0.17632781442699672 {'max_features': 6, 'n_estimators': 30}
0.23637429694715914 {'max_features': 8, 'n_estimators': 3}
0.1904379869686594 {'max_features': 8, 'n_estimators': 10}
0.17759130380346638 {'max_features': 8, 'n_estimators': 30}
0.2478773910445847 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
0.19120966532468558 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
0.23706995525347557 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
0.18188400397425505 {'bootstrap': False, 'max_feat

In [28]:
# относительная значимость
importance_for_Rforest = grid_search.best_estimator_.feature_importances_
importance_for_Rforest

array([0.0025039 , 0.08724773, 0.10103802, 0.20918652, 0.09343469,
       0.12753617, 0.0921263 , 0.07141955, 0.09092969, 0.08647129,
       0.03810615])