In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
df = pd.read_csv("features/train_third.csv")

In [4]:
df = df.drop("PassengerId", axis=1)

In [5]:
x_train, x_test, y_train, y_test = train_test_split(df.drop("Transported", axis=1), df['Transported'].astype(int))

In [29]:
x_train.loc[x_train['Sex'].isna(),'Sex']

Series([], Name: Sex, dtype: float64)

#### Функция для оценки модели

In [7]:
def estimate_model(my_model):
    pred = my_model.predict(x_test)
    rmse = (np.sqrt(mean_squared_error(y_test, pred)))
    r2 = r2_score(y_test, pred)
    score = my_model.score(x_test, y_test)
    local_score = my_model.score(x_train, y_train)
    print("Testing performance")
    print("RMSE: {:.2f}".format(rmse))
    print("R2: {:.2f}".format(r2))
    print("Score: {:.4f}".format(score))
    print("Local Score: {:.4f}".format(local_score))

    print("Best params: ", my_model.get_params())

#### Просто RandomForestClassifier

In [8]:
model1 = RandomForestClassifier()

In [9]:
model1.fit(x_train, y_train)

ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [16]:
estimate_model(model1)

Testing performance
RMSE: 0.46
R2: 0.15
Score: 0.7866
Local Score: 0.9475
Best params:  {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


#### Добавим CrosValidator

In [32]:
model2 = RandomForestClassifier()

In [33]:
cv_scores = cross_val_score(model2, x_train, y_train, cv=10)

#### Делаем GridSearchCV

In [40]:
model3 = RandomForestClassifier()

In [63]:
param_grid = {
    'n_estimators': [800],
    'max_depth': [14],
    'min_samples_split': [14],
    "min_samples_leaf": [2, 3],
    "criterion": ['gini']
}

In [66]:
grid_search = GridSearchCV(model3, param_grid, cv=5)
grid_search.fit(x_train, y_train)
model3 = grid_search.best_estimator_

In [67]:
estimate_model(model3)

Testing performance
RMSE: 0.45
R2: 0.20
Score: 0.8008
Local Score: 0.8463
Best params:  {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 14, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 3, 'min_samples_split': 14, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 800, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


#### Делаем предсказания на тесте

In [50]:
test = pd.read_csv("features/Test_data_gpt3predcited.csv")

In [55]:
pred = model3.predict_proba(test.drop(["PassengerId", "Unnamed: 0", "Transported"], axis=1))

In [56]:
test["Transported"] = pred

In [59]:
test[['PassengerId', "Transported"]].to_csv("results/proba_gpt_random_forest.csv", index=False)