<a href="https://colab.research.google.com/github/Grashch/Data-science/blob/main/Titanic_Tree1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Directory that holds the Titanic CSV files. '/content' is where this Colab
# notebook reads them from; change this single constant to run elsewhere.
DATA_DIR = '/content'

train_data = pd.read_csv(f'{DATA_DIR}/train.csv')
test_data = pd.read_csv(f'{DATA_DIR}/test.csv')
# Show (rows, columns) of both frames as a quick sanity check.
train_data.shape, test_data.shape

((891, 12), (418, 11))

In [3]:
# Preview the first five rows, flipped so that each original column reads as a row.
train_data.head().T

Unnamed: 0,0,1,2,3,4
PassengerId,1,2,3,4,5
Survived,0,1,1,1,0
Pclass,3,1,3,1,3
Name,"Braund, Mr. Owen Harris","Cumings, Mrs. John Bradley (Florence Briggs Th...","Heikkinen, Miss. Laina","Futrelle, Mrs. Jacques Heath (Lily May Peel)","Allen, Mr. William Henry"
Sex,male,female,female,female,male
Age,22.0,38.0,26.0,35.0,35.0
SibSp,1,1,0,1,0
Parch,0,0,0,0,0
Ticket,A/5 21171,PC 17599,STON/O2. 3101282,113803,373450
Fare,7.25,71.2833,7.925,53.1,8.05


In [10]:
# Number of missing values in every column of the training frame.
train_data.isnull().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,0
SibSp,0
Parch,0
Ticket,0
Fare,0


In [11]:
# Fill missing ages with the mean age observed in the training frame.
mean_age = train_data['Age'].mean()
train_data['Age'] = train_data['Age'].fillna(mean_age)

In [12]:
# Columns that are one-hot encoded before modelling.
cat_features = [
    'Sex',
    'Embarked',
]
# Columns that are passed to the model unchanged.
num_features = [
    'Pclass',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
]

In [18]:
# Echo the categorical feature list to confirm its contents.
cat_features

['Sex', 'Embarked']

In [23]:
# One-hot encode the categorical columns.
# handle_unknown='ignore' makes transform() emit an all-zero group for a category
# that was not present during fit instead of raising, so the fitted encoder can be
# reused on other data. The categories learned from train_data are unaffected.
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(train_data[cat_features])

In [24]:
# Categories learned for each encoded column, in the same order as cat_features.
encoder.categories_

[array(['female', 'male'], dtype=object),
 array(['C', 'Q', 'S', nan], dtype=object)]

In [25]:
# Build one "<feature>:<value>" label per one-hot output column, following the
# encoder's own column order (features in cat_features order, values as learned).
categories = [
    f'{feature}:{value}'
    for feature, values in zip(cat_features, encoder.categories_)
    for value in values
]
categories

['Sex:female',
 'Sex:male',
 'Embarked:C',
 'Embarked:Q',
 'Embarked:S',
 'Embarked:nan']

In [27]:
# Densify the encoder's sparse output and store it under the labels built above.
encoded_values = encoder.transform(train_data[cat_features]).toarray()
train_data.loc[:, categories] = encoded_values
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex:female,Sex:male,Embarked:C,Embarked:Q,Embarked:S,Embarked:nan
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0.0,1.0,0.0,0.0,1.0,0.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1.0,0.0,1.0,0.0,0.0,0.0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1.0,0.0,0.0,0.0,1.0,0.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1.0,0.0,0.0,0.0,1.0,0.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0.0,1.0,0.0,0.0,1.0,0.0


In [31]:
# Model inputs: the one-hot columns followed by the numeric columns.
feature_columns = categories + num_features
X = train_data[feature_columns]
# Target: the survival flag.
y = train_data['Survived']
X.shape, y.shape

((891, 11), (891,))

In [41]:
# Hold out 30% of the rows for evaluation; stratify so both parts keep the same
# class balance, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
# Class balance of each part, in percent. Divide by the row count (a scalar)
# rather than by the shape tuple, which only worked through implicit broadcasting.
tuple(
    (labels.value_counts() / len(labels) * 100).round(2)
    for labels in (y_train, y_test)
)

(Survived
 0    61.64
 1    38.36
 Name: count, dtype: float64,
 Survived
 0    61.57
 1    38.43
 Name: count, dtype: float64)

In [52]:
def get_score(model, splits=None):
    """Score a fitted binary classifier on one or more labelled datasets.

    Parameters
    ----------
    model : fitted classifier
        Must expose ``predict_proba`` and ``classes_``.
    splits : dict, optional
        Maps a row label to an ``(X, y)`` pair. When omitted, the notebook-level
        split is used: ``{'Train': (X_train, y_train), 'Test': (X_test, y_test)}``.

    Returns
    -------
    pandas.DataFrame
        One row per split with the columns ``Type``, ``roc_auc``, ``accuracy``
        and ``f1``.
    """
    if splits is None:
        # Fall back to the train/test split defined at notebook level.
        splits = {'Train': (X_train, y_train), 'Test': (X_test, y_test)}

    class_labels = np.asarray(model.classes_)
    rows = []
    for split_name, (features, target) in splits.items():
        proba = model.predict_proba(features)
        # Translate the most probable column into its class label instead of
        # assuming that the column index and the label coincide.
        predicted = class_labels[proba.argmax(axis=1)]
        rows.append([
            split_name,
            # Column 1 is the probability of the second class in model.classes_.
            roc_auc_score(target, proba[:, 1]),
            accuracy_score(target, predicted),
            f1_score(target, predicted),
        ])
    return pd.DataFrame(columns=['Type', 'roc_auc', 'accuracy', 'f1'], data=rows)

In [44]:
# Baseline: a decision tree with default hyper-parameters.
# random_state is fixed because the tree breaks ties between equally good splits
# at random, so an unseeded fit can give different scores on every run.
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train, y_train)

In [53]:
# Score the baseline tree on both the train and the test split.
get_score(tree)

Unnamed: 0,Type,roc_auc,accuracy,f1
0,Train,0.99909,0.980738,0.974359
1,Test,0.74722,0.768657,0.686869


In [70]:
# Hyper-parameter grid used to regularise the tree.
# 'log_loss' is left out: scikit-learn documents it as the same Shannon
# information-gain criterion as 'entropy', so searching both repeats identical fits.
grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 3, 5, 7],
    'min_samples_leaf': [4, 5, 6, 7],
    'min_impurity_decrease': [0.005, 0.007, 0.01, 0.012],
}

# Search over a fresh, seeded estimator so that the cross-validation scores, and
# therefore the selected parameters, are the same on every run.
search = GridSearchCV(DecisionTreeClassifier(random_state=42), grid)
search.fit(X_train, y_train)

In [71]:
# Hyper-parameter combination selected by the grid search.
search.best_params_

{'criterion': 'entropy',
 'max_depth': 5,
 'min_impurity_decrease': 0.01,
 'min_samples_leaf': 4}

In [72]:
# Refit a tree with the hyper-parameters selected by the grid search.
# random_state is fixed so the fitted tree, its scores and its feature importances
# are reproducible; the grid does not contain random_state, so there is no clash.
tree1 = DecisionTreeClassifier(random_state=42, **search.best_params_)
tree1.fit(X_train, y_train)

In [73]:
# Score the tuned tree on both the train and the test split.
get_score(tree1)

Unnamed: 0,Type,roc_auc,accuracy,f1
0,Train,0.863603,0.839486,0.757282
1,Test,0.829067,0.787313,0.674286


In [78]:
# Raw importance of every input column, in the column order of X
# (the one-hot labels in `categories` first, then `num_features`).
tree1.feature_importances_

array([0.59618954, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.23496215, 0.12651063, 0.        , 0.        ,
       0.04233768])

In [82]:
# Pair every input column with its importance and show the five largest.
importance_table = pd.DataFrame({
    'features': categories + num_features,
    'importances': tree1.feature_importances_,
})
importance_table.sort_values('importances', ascending=False).head(5)

Unnamed: 0,features,importances
0,Sex:female,0.59619
6,Pclass,0.234962
7,Age,0.126511
10,Fare,0.042338
1,Sex:male,0.0
