In [1]:
import pickle
from pathlib import Path

import pandas as pd
import seaborn as sns

from sklearn import set_config
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.svm import SVC

from data_cleaner import clean, change, DataCleaner

set_config(display="diagram")

In [7]:
# Load the raw CSV and keep only the rows that have no missing values.
data = pd.read_csv('data/data2.csv').dropna()

In [8]:
# Split the frame into the 'Dangerous' target and the remaining feature columns.
y = data['Dangerous']
X = data.drop(columns=['Dangerous'])

In [9]:
# Columns routed to the one-hot encoder: the animal name plus symptoms1..symptoms5.
categorical_columns = ['AnimalName'] + [f'symptoms{idx}' for idx in range(1, 6)]

In [10]:
# Dense one-hot encoder for the categorical columns.
categorical_encoder = OneHotEncoder(
    handle_unknown='infrequent_if_exist',
    sparse_output=False,
)

# Encode the categorical columns; every other column is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[('Encoder', categorical_encoder, categorical_columns)],
    remainder='passthrough',
)

In [11]:
# Ordered model steps: DataCleaner (imported from data_cleaner), the column
# preprocessor, min-max scaling, and finally the SVC classifier.
# One-hot encoding happens inside 'Preprocessor' (a ColumnTransformer), so no
# separate encoder step is listed here.
model_steps = [
    ('cleaner', DataCleaner(clean_func=clean, change_func=change)),
    ('Preprocessor', preprocessor),
    ('MinMaxScaler', MinMaxScaler()),
    ('SVC', SVC()),
]
pipeline = Pipeline(steps=model_steps)

In [14]:
# Fit all pipeline steps on X and y as prepared above; the visible cells make
# no train/test split, so every remaining row is used for training.
pipeline.fit(X, y)

In [16]:
# Persist the fitted pipeline (all steps, including the DataCleaner) as a single pickle.
# NOTE: pickle stores classes and functions by reference, so `data_cleaner`
# (DataCleaner, clean, change) must be importable wherever this file is loaded.
model_path = Path('models/opoilv4.pkl')
# Create the output directory first; opening the file for writing would raise
# FileNotFoundError if 'models/' does not exist yet (e.g. on a fresh checkout).
model_path.parent.mkdir(parents=True, exist_ok=True)
with model_path.open('wb') as f:
    pickle.dump(pipeline, f, protocol=pickle.HIGHEST_PROTOCOL)