In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score

In [8]:
df = pd.read_csv("features/train_second.csv")

In [3]:
df

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Cabin_deck,Cabin_num,Cabin_side,Transported
0,Europa,False,1,39.0,0,0.0,0.0,0.0,0.0,0.0,2,0,1,0
1,Earth,False,1,24.0,0,109.0,9.0,25.0,549.0,44.0,6,0,2,1
2,Europa,False,1,58.0,1,43.0,3576.0,0.0,6715.0,49.0,1,0,2,0
3,Europa,False,1,33.0,0,0.0,1283.0,371.0,3329.0,193.0,1,0,2,0
4,Earth,False,1,16.0,0,303.0,70.0,151.0,565.0,2.0,6,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,False,3,41.0,1,0.0,6819.0,0.0,1643.0,74.0,1,98,1,0
8689,Earth,True,2,18.0,0,0.0,0.0,0.0,0.0,0.0,7,1499,2,0
8690,Earth,False,1,26.0,0,0.0,0.0,1872.0,1.0,0.0,7,1500,2,1
8691,Europa,False,3,32.0,0,0.0,1049.0,0.0,353.0,3235.0,5,608,2,0


In [9]:
x_train, x_test, y_train, y_test = train_test_split(df.drop("Transported", axis=1), df['Transported'])

#### Функция для оценки модели

In [5]:
def estimate_model(my_model):
    pred = my_model.predict(x_test)
    rmse = (np.sqrt(mean_squared_error(y_test, pred)))
    r2 = r2_score(y_test, pred)
    score = my_model.score(x_test, y_test)
    local_score = my_model.score(x_train, y_train)
    print("Testing performance")
    print("RMSE: {:.2f}".format(rmse))
    print("R2: {:.2f}".format(r2))
    print("Score: {:.4f}".format(score))
    print("Local Score: {:.4f}".format(local_score))

    print("Best params: ", my_model.get_params())

#### Просто RandomForestClassifier

In [6]:
model1 = RandomForestClassifier()

In [10]:
model1.fit(x_train, y_train)

In [20]:
estimate_model(model1)

Testing performance
RMSE: 0.46
R2: 0.16
Score: 0.7902
Local Score: 0.9997
Best params:  {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


#### Добавим CrosValidator

In [32]:
model2 = RandomForestClassifier()

In [33]:
cv_scores = cross_val_score(model2, x_train, y_train, cv=10)

#### Делаем GridSearchCV

In [17]:
model3 = RandomForestClassifier()

In [16]:
param_grid = {
    'n_estimators': [400],
    'max_depth': [15, 20],
    'min_samples_split': [12, 11],
    "min_samples_leaf": [2, 3],
    "criterion": ['gini']
}

In [18]:
grid_search = GridSearchCV(model3, param_grid, cv=5)
grid_search.fit(x_train, y_train)
model3 = grid_search.best_estimator_

In [19]:
estimate_model(model3)

Testing performance
RMSE: 0.46
R2: 0.16
Score: 0.7902
Local Score: 0.8889
Best params:  {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 15, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 2, 'min_samples_split': 12, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 400, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


#### Делаем предсказания на тесте

In [21]:
test = pd.read_csv("features/test_second.csv")

In [23]:
pred = model1.predict(test.drop(["PassengerId"], axis=1))

In [24]:
test["Transported"] = pred.astype(bool)

In [25]:
test[['PassengerId', "Transported"]].to_csv("results/second_data_model1rf.csv", index=False)