### Importação de Libs

In [1]:
import pandas as pd
import numpy as np
from category_encoders import OneHotEncoder
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from skopt import dummy_minimize
from sklearn.impute import KNNImputer
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate

### Pré-processamento de Dados

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
ID = test['PassengerId']
SEED = 4
imputer = KNNImputer(weights='distance')
scaler = StandardScaler()

In [3]:
def search_title(x):
    y = list(x)
    if y.index(','):
        y = y[y.index(',') + 2:y.index(" ", y.index(',') + 2)]
        return "".join(y)

In [4]:
def to_miss(x):
    arr = ['Mlle.', 'Mme.', 'Lady.', 'Ms.', 'Dona.']
    if x in arr:
        x = 'Miss.'
    return x

In [5]:
def to_mr(x):
    arr = ['Dr.', 'Rev.', 'Col.', 'Major.', 'Capt.', 'the.', 'Jonkheer.', 'Sir.', 'Don.', 'the' , 'Master.']
    if x in arr:
        x = 'Mr.'
    return x

In [6]:
train['Title'] = train['Name'].apply(search_title)
train['Title'] = train['Title'].apply(to_miss)
train['Title'] = train['Title'].apply(to_mr)
train = train.drop(columns=['PassengerId', 'Cabin', 'Ticket'])

In [7]:
def is_married(x):
    if x == 'Mrs.':
        return 1
    return 0

In [8]:
train['is_married'] = train['Title'].apply(is_married)

In [9]:
train.drop(columns='Name', inplace=True)

In [10]:
embarked_rows_to_drop = train[train[['Embarked']].isnull().any(axis=1)].index.values

In [11]:
train.drop(index=embarked_rows_to_drop, inplace=True)

### Escalando & Tornando Colunas Categóricas em Dummies

In [12]:
ohe = OneHotEncoder(use_cat_names=True)
ohe_train = ohe.fit_transform(train)

  elif pd.api.types.is_categorical(cols):


In [13]:
ohe_columns = ohe_train.columns.values

In [14]:
ohe_train = imputer.fit_transform(ohe_train)
ohe_train = pd.DataFrame(ohe_train, columns=ohe_columns)

In [15]:
X = ohe_train.drop(columns='Survived')
y = ohe_train['Survived']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(
X, 
y, 
random_state=SEED,
test_size=0.20, 
stratify=y)

In [17]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [18]:
X = scaler.fit_transform(X)

### Tuning

In [19]:
(0.001, 1000), #'C'
('l1', 'l2'), #'penalty'
(100, 1000), #'max_iter'
('newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga') #'solver'


('newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga')

### Validação

In [20]:
model = LogisticRegression()

In [21]:
kfold = KFold(n_splits = 10, random_state = SEED, shuffle=True)
results = cross_validate(model, 
                         X_train, 
                         y_train, 
                         cv = kfold, 
                         return_train_score=False,
                        scoring='accuracy')
cv = round(results['test_score'].mean(), 2)

In [22]:
results['test_score']

array([0.81944444, 0.77464789, 0.87323944, 0.84507042, 0.83098592,
       0.81690141, 0.81690141, 0.84507042, 0.76056338, 0.81690141])

In [23]:
display(f'Taxa Média de Acerto de Modelo Light GBM: {cv * 100:.2f} %')

'Taxa Média de Acerto de Modelo Light GBM: 82.00 %'

### Modelo Final

In [24]:
model.fit(X, y)

LogisticRegression()

In [25]:
test.drop(columns=['PassengerId', 'Ticket', 'Cabin'], inplace=True)

In [26]:
test['Title'] = test['Name'].apply(search_title)
test['Title'] = test['Title'].apply(to_miss)
test['Title'] = test['Title'].apply(to_mr)
test['is_married'] = test['Title'].apply(is_married)

In [27]:
test.drop(columns=['Name'], inplace=True)

In [28]:
test['Age'] = imputer.fit_transform(test[['Age']])
test['Fare'] = imputer.fit_transform(test[['Fare']])

In [29]:
test = ohe.fit_transform(test)

  elif pd.api.types.is_categorical(cols):


In [30]:
test = scaler.fit_transform(test)

In [31]:
test = pd.DataFrame(test, columns=ohe_columns[1:])

In [32]:
result = model.predict(test)

In [33]:
result = result.astype(int)

In [34]:
sub = pd.Series(result, index=ID, name='Survived')

In [35]:
sub.to_csv("model.csv", header=True)