### Importação de Libs

In [1]:
import pandas as pd
import numpy as np
from category_encoders import OneHotEncoder
import plotly.express as px
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from skopt import dummy_minimize
from sklearn.impute import KNNImputer
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate

### Making Functions and Most Important variables/constants

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
ID = test['PassengerId']
SEED = 4
IMPUTER = KNNImputer(weights='distance')
SCALER = MinMaxScaler()

In [3]:
def search_title(x):
    y = list(x)
    if y.index(','):
        y = y[y.index(',') + 2:y.index(" ", y.index(',') + 2)]
        return "".join(y)

In [4]:
def to_miss(x):
    arr = ['Mlle.', 'Mme.', 'Lady.', 'Ms.', 'Dona.']
    if x in arr:
        x = 'Miss.'
    return x

In [5]:
def to_mr(x):
    arr = ['Dr.', 'Rev.', 'Col.', 'Major.', 'Capt.', 'the.', 'Jonkheer.', 'Sir.', 'Don.', 'the' , 'Master.']
    if x in arr:
        x = 'Mr.'
    return x

In [6]:
def is_married(x):
    if x == 'Mrs.':
        return 1
    return 0

In [7]:
def is_alone(x):
    if x == 0:
        return 1
    return 0

In [8]:
def below_age(x):
    if x < 18:
        return 1
    return 0

##### Imputing Age

In [9]:
train[['Age']] = IMPUTER.fit_transform(train[['Age']])

##### Feature Engineering

In [10]:
train['Title'] = train['Name'].apply(search_title) #make a column of titles
train['Title'] = train['Title'].apply(to_miss) #separate titles that can go to Miss
train['Title'] = train['Title'].apply(to_mr) #separate titles that can go to Mr.

In [11]:
train['is_married'] = train['Title'].apply(is_married) #check if the person are married or not

In [12]:
train['below_age'] = train['Age'].apply(below_age) #check if the passanger is a child or not

In [13]:
train['family_size'] = train['SibSp'] + train['Parch'] #make a column about the family size on Titanic

In [14]:
train['is_alone'] = train['family_size'].apply(is_alone) #check if the passanger are alone

In [15]:
train['group_by_age'] = pd.qcut(train['Age'], q=4, labels=[1,2,3,4]).astype(int) #divide age by groups of Quantiles

##### Dropping NaN and Unwanted Columns

In [16]:
train.drop(columns=['PassengerId', 'Cabin', 'Ticket','Name'], inplace=True)

In [17]:
embarked_rows_to_drop = train[train[['Embarked']].isnull().any(axis=1)].index.values

In [18]:
train.drop(index=embarked_rows_to_drop, inplace=True)

### OneHotEncoding

In [19]:
ohe = OneHotEncoder(use_cat_names=True)
ohe_train = ohe.fit_transform(train)

  elif pd.api.types.is_categorical(cols):


In [20]:
X = ohe_train.drop(columns='Survived')
y = ohe_train['Survived']

### Scaling

In [21]:
X = SCALER.fit_transform(X)

In [22]:
X = pd.DataFrame(X, columns=ohe_train.columns[1:])

In [23]:
X

Unnamed: 0,Pclass,Sex_male,Sex_female,Age,SibSp,Parch,Fare,Embarked_S,Embarked_C,Embarked_Q,Title_Mr.,Title_Mrs.,Title_Miss.,is_married,below_age,family_size,is_alone,group_by_age
0,1.0,1.0,0.0,0.271174,0.125,0.000000,0.014151,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.1,0.0,0.000000
1,0.0,0.0,1.0,0.472229,0.125,0.000000,0.139136,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.1,0.0,1.000000
2,1.0,0.0,1.0,0.321438,0.000,0.000000,0.015469,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.333333
3,0.0,0.0,1.0,0.434531,0.125,0.000000,0.103644,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.1,0.0,0.666667
4,1.0,1.0,0.0,0.434531,0.000,0.000000,0.015713,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884,0.5,1.0,0.0,0.334004,0.000,0.000000,0.025374,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333
885,0.0,0.0,1.0,0.233476,0.000,0.000000,0.058556,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.000000
886,1.0,0.0,1.0,0.367921,0.125,0.333333,0.045771,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.3,0.0,0.333333
887,0.0,1.0,0.0,0.321438,0.000,0.000000,0.058556,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333


#### Train Test Split

In [24]:
X_train, X_test, y_train, y_test = train_test_split(
X, 
y, 
random_state=SEED,
test_size=0.20, 
stratify=y)

#### Validation

In [25]:
from lightgbm import LGBMClassifier

In [29]:
def search_hyperparams(params):
    SEED = 4

    learning_rate = params[0]
    num_leaves = params[1]
    min_child_samples = params[2]
    subsample = params[3]
    colsample_bytree = params[4]
    n_estimators = params[5]

    print(params, '\n')
    
    model = LGBMClassifier(random_state=SEED,
    learning_rate = learning_rate, 
    num_leaves = num_leaves, 
    min_child_samples = min_child_samples, 
    subsample = subsample,
    colsample_bytree = colsample_bytree,
    subsample_freq = 1,
    n_estimators = n_estimators)
    
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)

    return - accuracy_score(y_test, y_pred)

space = [(1e-3, 1e-1, 'log-uniform'), #learning rate
(2, 128), #num_leaves
(1, 100), #min_child_samples
(0.05, 1.0), #subsample
(0.1, 1.0), #colsample_bytree
(100, 1000)] #n_estimators

result = dummy_minimize(search_hyperparams, 
space,
random_state=SEED, 
verbose=1,
n_calls = 30)

learning_rate, num_leaves, min_child_samples, subsample, colsample_bytree, n_estimators = result.x

Iteration No: 1 started. Evaluating function at random point.
[0.06327656730105531, 71, 2, 0.8628399009188652, 0.6481320413049753, 493] 

Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.3850
Function value obtained: -0.7697
Current minimum: -0.7697
Iteration No: 2 started. Evaluating function at random point.
[0.0027050730096210155, 105, 95, 0.7134992241139206, 0.9128606713660984, 776] 

Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.0888
Function value obtained: -0.7247
Current minimum: -0.7697
Iteration No: 3 started. Evaluating function at random point.
[0.03620477422893411, 40, 53, 0.9842306433165472, 0.24745801726422886, 249] 

Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.0459
Function value obtained: -0.7865
Current minimum: -0.7865
Iteration No: 4 started. Evaluating function at random point.
[0.042062540164342105, 127, 57, 0.0919520550349246, 0.9609876709428125, 748] 

Iteration No: 4 ended. Evaluation done a

In [31]:
model = LGBMClassifier(learning_rate=learning_rate,
                      num_leaves=num_leaves,
                      min_child_samples=min_child_samples,
                      subsample=subsample,
                      colsample_bytree=colsample_bytree,
                      n_estimators=n_estimators,
                        subsample_freq = 1
                      )
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.797752808988764

### Definitive Model

In [None]:
model.fit(X, y)

##### Imputing Age and Fare

In [None]:
test[['Age']] = IMPUTER.fit_transform(test[['Age']])
test[['Fare']] = IMPUTER.fit_transform(test[['Fare']])

##### Feature Engineering on Df Set

In [None]:
test['Title'] = test['Name'].apply(search_title) #make a column of titles
test['Title'] = test['Title'].apply(to_miss) #separate titles that can go to Miss
test['Title'] = test['Title'].apply(to_mr) #separate titles that can go to Mr.
test['is_married'] = test['Title'].apply(is_married) #check if the person are married or not
test['below_age'] = test['Age'].apply(below_age) #check if the passanger is a child or not
test['family_size'] = test['SibSp'] + test['Parch'] #make a column about the family size on Titanic
test['is_alone'] = test['family_size'].apply(is_alone) #check if the passanger are alone
test['group_by_age'] = pd.qcut(test['Age'], q=4, labels=[1,2,3,4]).astype(int) #divide age by groups of Quantiles

#### Dropping Unwanted Columns

In [None]:
test.drop(columns=['PassengerId', 'Ticket', 'Cabin', 'Name'], inplace=True)

#### OneHotEncoding, Scaling and Predicting

In [None]:
test = ohe.fit_transform(test)
test = SCALER.fit_transform(test)
result = model.predict(test)
result = result.astype(int)

In [None]:
sub = pd.Series(result, index=ID, name='Survived')
sub.to_csv("model.csv", header=True)