In [10]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
import pickle

In [11]:
df = pd.read_csv('train.csv')

In [12]:
import sys
sys.path.append('../')  # ou o caminho até a raiz do projeto
from src.data_processing import CategoricalFeatureEngineer


In [None]:
from sklearn.impute import SimpleImputer

cat_ohe_cols = ['Embarked', 'Title_Mapped', 'Deck', 'Age_Group', 'Fare_Group', 'Ticket_Prefix']
ordinal_cat_features = ['Pclass', 'Alone', 'Sex']  # sex já binarizado
numeric_features = ['Family_Size', 'Cabin_Count', 'Ticket_Group_Size']

# Imputers
cat_imputer = SimpleImputer(strategy='most_frequent')
num_imputer = SimpleImputer(strategy='median')

preprocessor = ColumnTransformer(transformers=[
    ('cat_impute', cat_imputer, cat_ohe_cols),
    ('ord_impute', cat_imputer, ordinal_cat_features),
    ('num_impute', num_imputer, numeric_features),
    ('ohe', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), cat_ohe_cols),
    ('ord', 'passthrough', ordinal_cat_features),
    ('num', StandardScaler(), numeric_features),
])

from sklearn.pipeline import Pipeline

full_pipeline = Pipeline(steps=[
    ('feature_engineering', CategoricalFeatureEngineer()),
    ('preprocessing', preprocessor)
])

In [14]:
from sklearn.pipeline import Pipeline
from cloudpickle import pickle

full_pipeline = Pipeline(steps=[
    ('feature_engineering', CategoricalFeatureEngineer()),
    ('preprocessing', preprocessor)
])

In [15]:
X_transformed = full_pipeline.fit_transform(df)

# Recupera nomes das colunas OHE
ohe_cols = full_pipeline.named_steps['preprocessing'].named_transformers_['ohe'].get_feature_names_out(cat_ohe_cols)

# Final feature names
final_columns = list(ohe_cols) + ordinal_cat_features + numeric_features

# DataFrame final
df_final = pd.DataFrame(X_transformed, columns=final_columns, index=df.index)

with open('../src/model/pipeline.pkl', 'wb') as f:
    pickle.dump(full_pipeline, f)

# Output
print("=== VARIÁVEIS TRANSFORMADAS ===")
df_final.head()

=== VARIÁVEIS TRANSFORMADAS ===


Unnamed: 0,Embarked_Q,Embarked_S,Title_Mapped_Miss,Title_Mapped_Mr,Title_Mapped_Mrs,Title_Mapped_Rare,Deck_B,Deck_C,Deck_D,Deck_E,...,Ticket_Prefix_STON,Ticket_Prefix_SW,Ticket_Prefix_W,Ticket_Prefix_WE,Pclass,Alone,Sex,Family_Size,Cabin_Count,Ticket_Group_Size
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,0.0,1.0,0.05916,-0.488483,-0.579162
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.05916,1.340249,-0.579162
2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,3.0,1.0,0.0,-0.560975,-0.488483,-0.579162
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.05916,1.340249,0.155928
4,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,1.0,1.0,-0.560975,-0.488483,-0.579162


In [16]:
from cloudpickle import pickle

with open('../src/model/pipeline.pkl', 'rb') as f:
    pipeline = pickle.load(f)

In [17]:
pipeline

0,1,2
,steps,"[('feature_engineering', ...), ('preprocessing', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('ohe', ...), ('ord', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [20]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Carregar dados
df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Carregar pipeline de pré-processamento
with open('../src/model/pipeline.pkl', 'rb') as f:
    pipeline = pickle.load(f)

# Separar features e target
X = df.drop(columns=['Survived'])
y = df['Survived']

# Processar dados de treino
X_processed = pipeline.transform(X)

# Separar treino e validação
X_train, X_val, y_train, y_val = train_test_split(X_processed, y, test_size=0.2, random_state=42, stratify=y)

# Treinar modelo
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Avaliar no conjunto de validação
val_preds = model.predict(X_val)
print("Validação - Previsões:", val_preds)

# Processar dados de teste
X_test_processed = pipeline.transform(df_test)

# Prever no conjunto de teste
test_preds = model.predict(X_test_processed)
print("Predições para o dataset de teste:")
print(test_preds)

# Salvar modelo treinado
with open('../src/model/model.pkl', 'wb') as f:
    pickle.dump(model, f)

Validação - Previsões: [0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 0 1 0 1 0 1 1 0 1 1 1
 0 0 1 1 1 0 1 1 0 0 1 1 1 1 0 1 0 0 0 0 1 1 1 1 1 0 0 0 0 1 0 0 0 1 0 1 1
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 1 1 1 0 0 0 0 1 1 1 0 0 0 0 0 1 0
 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 1 0 0 0 1 0 1 0 1 0 0 0 1 1 0 0 0 0
 1 1 0 0 1 0 0 0 0 0 0 1 0 1 1 1 1 0 0 0 1 0 0 0 0 0 1 0 0 1 0]
Predições para o dataset de teste:
[0 0 0 0 0 0 1 0 1 0 0 0 1 0 1 1 0 0 0 1 0 1 1 0 1 0 1 0 0 1 0 0 1 0 1 0 0
 0 0 0 0 1 0 1 1 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 1 1 1 0 0 1
 1 1 0 1 0 1 1 0 0 0 0 0 1 1 1 1 0 0 1 0 1 0 1 0 1 0 1 0 0 0 1 0 0 0 0 1 0
 1 1 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 1 0 1 0 0 0 0 0 1 1 1 1 1 1 1 0 0 1 0 0 1 0 0 0 0 0 0 1 1 1 1 1 0 0 1 0 1
 0 1 0 0 0 0 1 1 0 1 0 1 0 0 1 1 1 1 1 0 0 1 0 1 0 0 0 0 1 1 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 0 0 1 0 1 1 1 1 0 0 0 1 1 0 1 0 1 0 1 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 1 1 1 



# Teste Load

In [28]:
import pandas as pd
import pickle
import requests
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression  # Alterado para LogisticRegression

# Carregar dados
df = pd.read_csv('train.csv')

# Carregar pipeline de pré-processamento
with open('../src/model/pipeline.pkl', 'rb') as f:
    pipeline = pickle.load(f)

# Separar features e target
X = df.drop(columns=['Survived'])
y = df['Survived']

# Processar dados de treino
X_processed = pipeline.transform(X)

# Separar treino e validação
X_train, X_val, y_train, y_val = train_test_split(X_processed, y, test_size=0.2, random_state=42, stratify=y)

# Treinar modelo (agora LogisticRegression)
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train, y_train)

# Salvar modelo em pickle
model_path = 'new_model.pkl'
with open(model_path, 'wb') as f:
    pickle.dump(model, f)

# Enviar pickle via API
url = "http://localhost:8000/load"
with open(model_path, 'rb') as f:
    response = requests.post(url, files={'file': (model_path, f, 'application/octet-stream')})

print("Status code:", response.status_code)
print("Response:", response.json())

Status code: 200
Response: {'message': 'Model new_model.pkl loaded successfully'}
