### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from category_encoders import OneHotEncoder
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from skopt import dummy_minimize
from sklearn.impute import KNNImputer
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate

### Helper Functions and Key Variables/Constants

In [2]:
# Seed passed to train_test_split further down so the hold-out split is repeatable.
SEED = 4

# Preprocessing objects shared by the training and test sections of the notebook.
IMPUTER = KNNImputer(weights='distance')
SCALER = StandardScaler()

# Raw data; both CSV files are read from the current working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Keep the passenger ids now: the PassengerId column is dropped from `test`
# later, but the ids are needed again to index the submission file.
ID = test['PassengerId']

In [3]:
def search_title(x):
    """Extract the title token that follows the surname in a passenger name.

    Names are expected to look like ``"Surname, Title. Given names"``. The
    single character right after the comma is skipped and everything up to
    the next space is returned, e.g. ``"Braund, Mr. Owen Harris"`` -> ``"Mr."``.

    Parameters
    ----------
    x : str
        Raw passenger name.

    Returns
    -------
    str or None
        The title token, or ``None`` when ``x`` is not a string or contains
        no comma.
    """
    if not isinstance(x, str):
        return None
    # partition() reports whether a comma exists without raising, and (unlike
    # testing the comma's index for truthiness) also works when the comma is
    # the very first character.
    _, comma, tail = x.partition(',')
    if not comma:
        return None
    # Skip the character after the comma, then cut at the next space; if there
    # is no further space the remainder itself is the title.
    return tail[1:].split(" ", 1)[0]

In [4]:
def to_miss(x):
    """Map the listed female title variants to 'Miss.'; return anything else unchanged."""
    miss_like = ('Mlle.', 'Mme.', 'Lady.', 'Ms.', 'Dona.')
    return 'Miss.' if x in miss_like else x

In [5]:
def to_mr(x):
    """Map the listed titles (including 'Master.' and 'the') to 'Mr.'; return anything else unchanged."""
    mr_like = (
        'Dr.', 'Rev.', 'Col.', 'Major.', 'Capt.', 'the.',
        'Jonkheer.', 'Sir.', 'Don.', 'the', 'Master.',
    )
    return 'Mr.' if x in mr_like else x

In [6]:
def is_married(x):
    """Return 1 when the title is exactly 'Mrs.', otherwise 0."""
    return 1 if x == 'Mrs.' else 0

In [7]:
def is_alone(x):
    """Return 1 when the family-size value is 0, otherwise 0."""
    return 1 if x == 0 else 0

In [8]:
def below_age(x):
    """Return 1 when the age is strictly under 18, otherwise 0."""
    return 1 if x < 18 else 0

##### Imputing Age

In [9]:
# Fit the shared imputer on the training ages, then fill the gaps in that same column.
train[['Age']] = IMPUTER.fit(train[['Age']]).transform(train[['Age']])

##### Feature Engineering

In [10]:
# Build the Title column in one pass: pull the title out of the name, then
# fold the variant titles into 'Miss.' and 'Mr.'.
train['Title'] = (
    train['Name']
    .apply(search_title)  # title token taken from the raw name
    .apply(to_miss)       # variants handled by to_miss become 'Miss.'
    .apply(to_mr)         # variants handled by to_mr become 'Mr.'
)

In [11]:
# 1 where the title is 'Mrs.', 0 otherwise.
train['is_married'] = train['Title'].map(is_married)

In [12]:
# 1 where the passenger is younger than 18, 0 otherwise.
train['below_age'] = train['Age'].map(below_age)

In [13]:
# Relatives travelling with the passenger: SibSp plus Parch.
train['family_size'] = train['SibSp'].add(train['Parch'])

In [14]:
# 1 where family_size is 0, 0 otherwise.
train['is_alone'] = train['family_size'].map(is_alone)

In [15]:
# Split the ages into four quantile-based groups labelled 1 to 4 and store the
# labels as integers.
train['group_by_age'] = (
    pd.qcut(train['Age'], q=4, labels=[1, 2, 3, 4])
    .astype(int)
)

##### Dropping NaN and Unwanted Columns

In [16]:
# Remove the columns that are not passed on to the encoder.
train = train.drop(columns=['PassengerId', 'Cabin', 'Ticket', 'Name'])

In [17]:
# Index labels of the rows whose Embarked value is missing.
embarked_rows_to_drop = train.index[train['Embarked'].isnull()].values

In [18]:
# Discard the rows collected in embarked_rows_to_drop.
train = train.drop(index=embarked_rows_to_drop)

### OneHotEncoding

In [19]:
# Fit the encoder on the whole training frame (the 'Survived' column is still
# part of it at this point), then encode that same frame.
ohe = OneHotEncoder(use_cat_names=True).fit(train)
ohe_train = ohe.transform(train)

  elif pd.api.types.is_categorical(cols):


In [20]:
# Target vector first, then every remaining encoded column as the feature matrix.
y = ohe_train['Survived']
X = ohe_train.drop('Survived', axis=1)

### Scaling

In [21]:
# Learn the scaling statistics from all of X and apply them to it.
X = SCALER.fit(X).transform(X)

#### Train Test Split

In [22]:
# Hold out 20% of the rows, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=SEED
)

#### Validation

In [23]:
# Train on the training split, then show the score on the held-out split.
model = LogisticRegression().fit(X_train, y_train)
model.score(X_test, y_test)

0.7640449438202247

### Definitive Model

In [24]:
# Refit the same LogisticRegression instance on all of X and y, i.e. including
# the rows that were held out for the score above.
model.fit(X, y)

LogisticRegression()

##### Imputing Age and Fare

In [25]:
# Fill missing test ages with the imputer that was fitted on the training ages
# instead of refitting it on the test rows.
test[['Age']] = IMPUTER.transform(test[['Age']])

# No imputer has been fitted for Fare yet. Fit a separate one on the training
# fares so that IMPUTER keeps its Age fit and this cell can be re-run safely.
fare_imputer = KNNImputer(weights='distance').fit(train[['Fare']])
test[['Fare']] = fare_imputer.transform(test[['Fare']])

##### Feature Engineering on Test Set

In [26]:
test['Title'] = test['Name'].apply(search_title)  # title token taken from the raw name
test['Title'] = test['Title'].apply(to_miss)  # variants handled by to_miss become 'Miss.'
test['Title'] = test['Title'].apply(to_mr)  # variants handled by to_mr become 'Mr.'
test['is_married'] = test['Title'].apply(is_married)  # 1 where the title is 'Mrs.'
test['below_age'] = test['Age'].apply(below_age)  # 1 where the passenger is younger than 18
test['family_size'] = test['SibSp'] + test['Parch']  # relatives travelling with the passenger
test['is_alone'] = test['family_size'].apply(is_alone)  # 1 where family_size is 0

# Bucket the test ages with the quartile edges of the *training* ages so that
# group k covers the same age range in both frames; running pd.qcut on the
# test ages would derive a different set of edges. The outer edges are opened
# to +/-inf so a test age outside the training range still lands in a group
# instead of becoming NaN (which would make astype(int) fail).
age_bins = pd.qcut(train['Age'], q=4, retbins=True)[1]
age_bins[0], age_bins[-1] = -np.inf, np.inf
test['group_by_age'] = pd.cut(test['Age'], bins=age_bins, labels=[1, 2, 3, 4]).astype(int)

#### Dropping Unwanted Columns

In [27]:
# Remove the same helper columns that were removed from the training frame.
test = test.drop(columns=['PassengerId', 'Ticket', 'Cabin', 'Name'])

#### OneHotEncoding, Scaling and Predicting

In [28]:
# Encode and scale the test set with the transformers that were already fitted
# on the training data. Refitting them here (fit_transform) would learn a
# separate column layout and separate means/variances from the test rows, so
# the features handed to `model` would not be guaranteed to line up with the
# ones it was trained on.
feature_cols = ohe_train.columns.drop('Survived')

# `ohe` was fitted on a frame that still contained 'Survived', so give the test
# frame a placeholder column in the same position before transforming; the
# reindex below removes it again and pins the training column order.
test = ohe.transform(test.assign(Survived=0)[train.columns])
test = test.reindex(columns=feature_cols, fill_value=0)
test = SCALER.transform(test)
result = model.predict(test)
result = result.astype(int)

  elif pd.api.types.is_categorical(cols):


In [29]:
# Predictions indexed by the passenger ids saved in ID, written out with a header row.
sub = pd.Series(result, index=ID).rename('Survived')
sub.to_csv("model.csv", header=True)