<a href="https://colab.research.google.com/github/Grashch/Data-science/blob/main/Titanic_Tree1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [54]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier

In [14]:
train_data = pd.read_csv('/content/train.csv')
test_data = pd.read_csv('/content/test.csv')
train_data.shape, test_data.shape

((891, 12), (418, 11))

In [15]:
train_data.head().transpose()

Unnamed: 0,0,1,2,3,4
PassengerId,1,2,3,4,5
Survived,0,1,1,1,0
Pclass,3,1,3,1,3
Name,"Braund, Mr. Owen Harris","Cumings, Mrs. John Bradley (Florence Briggs Th...","Heikkinen, Miss. Laina","Futrelle, Mrs. Jacques Heath (Lily May Peel)","Allen, Mr. William Henry"
Sex,male,female,female,female,male
Age,22.0,38.0,26.0,35.0,35.0
SibSp,1,1,0,1,0
Parch,0,0,0,0,0
Ticket,A/5 21171,PC 17599,STON/O2. 3101282,113803,373450
Fare,7.25,71.2833,7.925,53.1,8.05


In [21]:
train_data.isna().mean(axis=0)

Unnamed: 0,0
PassengerId,0.0
Survived,0.0
Pclass,0.0
Name,0.0
Sex,0.0
Age,0.198653
SibSp,0.0
Parch,0.0
Ticket,0.0
Fare,0.0


In [18]:
train_data['Sex'].value_counts()

Unnamed: 0_level_0,count
Sex,Unnamed: 1_level_1
male,577
female,314


In [30]:
train_data['Age'].fillna(round(train_data['Age'].mean(), 2))
round(train_data['Age'].mean(), 2)

np.float64(29.7)

In [22]:
train_data['Embarked'].value_counts(dropna=False)

Unnamed: 0_level_0,count
Embarked,Unnamed: 1_level_1
S,644
C,168
Q,77
,2


In [33]:
categorical_features = []
numerical_features = []
numerical_features.extend(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'])
categorical_features.extend(['Sex', 'Embarked'])

In [38]:
encoder = OneHotEncoder()
encoder.fit(train_data[categorical_features])

In [42]:
encoder.categories_

[array(['female', 'male'], dtype=object),
 array(['C', 'Q', 'S', nan], dtype=object)]

In [43]:
categories = []
for i, feature in enumerate(categorical_features):
  categories.extend([f'{feature}:{value}' for value in encoder.categories_[i]])
categories

['Sex:female',
 'Sex:male',
 'Embarked:C',
 'Embarked:Q',
 'Embarked:S',
 'Embarked:nan']

In [46]:
train_data.loc[:,categories] = encoder.transform(train_data[categorical_features]).toarray()

In [49]:
X = train_data[categories + numerical_features]
y = train_data['Survived']

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [62]:
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)

In [65]:
def get_score(model):
  result = []
  y_pred = model.predict_proba(X_train)
  roc_auc = roc_auc_score(y_train, y_pred[:, 1])
  accuracy = accuracy_score(y_train, y_pred.argmax(axis=1))
  f1 = f1_score(y_train, y_pred.argmax(axis=1))
  result.append(['Train', roc_auc, accuracy, f1])
  y_pred = model.predict_proba(X_test)
  roc_auc = roc_auc_score(y_test, y_pred[:, 1])
  accuracy = accuracy_score(y_test, y_pred.argmax(axis=1))
  f1 = f1_score(y_test, y_pred.argmax(axis=1))
  result.append(['Test', roc_auc, accuracy, f1])
  return pd.DataFrame(columns=['Type', 'roc_auc', 'accuracy', 'f1'], data=result)

In [66]:
get_score(tree)

Unnamed: 0,Type,roc_auc,accuracy,f1
0,Train,0.999041,0.980738,0.974359
1,Test,0.746896,0.776119,0.69697


In [None]:
def data_processing(df):
  df['Sex'] = df['Sex'].map({'male': 0, 'female': 1}).astype(int)
  df['Embarked'] = df['Embarked'].map({'S': 1, 'C': 2, 'Q': 3}).astype(int)
  df['Age'] = df['Age'].fillna(df['Age'].mean())


In [29]:
train_data['Age'].mean()

np.float64(29.69911764705882)

In [25]:
train_data['Sex'] = train_data['Sex'].map({'male': 0, 'female': 1}).astype(int)
test_data['Sex'] = test_data['Sex'].map({'male': 0, 'female': 1}).astype(int)
train_data['Age'].mean()

np.float64(29.69911764705882)

In [5]:
train_data = train_data.dropna(subset=['Embarked'])
train_data['Embarked'] = train_data['Embarked'].map({'S': 1, 'C': 2, 'Q': 3}).astype(int)
train_data['Age'] = train_data['Age'].fillna(0)
test_data = test_data.dropna(subset=['Fare'])
test_data['Embarked'] = test_data['Embarked'].map({'S': 1, 'C': 2, 'Q': 3}).astype(int)
test_data['Age'] = test_data['Age'].fillna(0)

In [6]:
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X = train_data[features]
y = train_data['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify= X['Pclass'])

tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)

In [7]:
y_predict = tree.predict(X_test)

accuracy = accuracy_score(y_true=y_test, y_pred=y_predict)  # точность
precision = precision_score(y_true=y_test, y_pred=y_predict, average='macro')  # точность (средняя макропрецизия)
recall = recall_score(y_true=y_test, y_pred=y_predict, average='macro')  # полноту (средняя макро-полнота)
f1 = f1_score(y_true=y_test, y_pred=y_predict, average='macro')  # F1-метрика (средняя макро-F1)
accuracy, precision, recall, f1

(0.7640449438202247,
 0.7558193398957731,
 0.7565323423528045,
 0.7561643835616438)

In [8]:
grid = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': [1, 2, 3, 5],
    'min_samples_leaf': [27, 30, 31, 33]
}

search = GridSearchCV(tree, grid)

search.fit(X_train, y_train)

In [9]:
search.best_params_

{'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 31}

In [10]:
tree1 = DecisionTreeClassifier(**search.best_params_)
tree1.fit(X_train, y_train)

In [14]:
y_predict = tree1.predict(X_test)

accuracy = accuracy_score(y_true=y_test, y_pred=y_predict)  # точность
precision = precision_score(y_true=y_test, y_pred=y_predict, average='macro')  # точность (средняя макропрецизия)
recall = recall_score(y_true=y_test, y_pred=y_predict, average='macro')  # полноту (средняя макро-полнота)
f1 = f1_score(y_true=y_test, y_pred=y_predict, average='macro')  # F1-метрика (средняя макро-F1)
accuracy, precision, recall, f1

(0.8314606741573034, 0.847824653922215, 0.806381372662873, 0.8166012852410972)

In [12]:
def get_score(model):
    results = []

    # Оценка на обучающей выборке
    y_pred = model.predict_p(X_train)
    # Оцениваем качество классификации
    accuracy = accuracy_score(y_true=y_test, y_pred=y_predict)  # точность
    precision = precision_score(y_true=y_test, y_pred=y_predict, average='macro')  # точность (средняя макропрецизия)
    recall = recall_score(y_true=y_test, y_pred=y_predict, average='macro')  # полноту (средняя макро-полнота)
    f1 = f1_score(y_true=y_test, y_pred=y_predict, average='macro')  # F1-метрика (средняя макро-F1)
    # Выводим результаты оценки качества модели
    print(f"Точность (Accuracy): {accuracy:.4f}")
    print(f"Прецизионность (Precision): {precision:.4f}")
    print(f"Полный охват (Recall): {recall:.4f}")
    print(f"F1-оценка (F1 Score): {f1:.4f}")
    mse = mean_squared_error(y_train, y_pred)
    mae = mean_absolute_error(y_train, y_pred)
    results.append(['Train', mse, mae])

    # Оценка на тестовой выборке
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    results.append(['Test', mse, mae])

    # Возвращаем результаты в виде таблицы
    return pd.DataFrame(columns=['Type', 'MSE', 'MAE'], data=results)

In [13]:
get_score(tree)

Точность (Accuracy): 0.8315
Прецизионность (Precision): 0.8478
Полный охват (Recall): 0.8064
F1-оценка (F1 Score): 0.8166


NameError: name 'mean_squared_error' is not defined

In [None]:
grid = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': [5, 10, 15],
    'min_samples_leaf': [1, 2, 3]
}

search = GridSearchCV(tree, grid)

search.fit(X_train, y_train)

In [None]:
search.best_params_

In [None]:
tree1 = DecisionTreeClassifier(**search.best_params_)
tree1.fit(X_train, y_train)

In [None]:
get_score(tree1)

In [None]:
X = test_data[features]

y_pred = tree1.predict(X)

In [None]:
test_data['Survived'] = y_pred
test_data.head().transpose()