In [10]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
import pickle
import matplotlib.pyplot as plt
import warnings
from imblearn.over_sampling import RandomOverSampler
from sklearn.svm import SVC 
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score 
from sklearn.preprocessing import OneHotEncoder, RobustScaler 
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier 
from category_encoders import OrdinalEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    classification_report,
    confusion_matrix,
)
from preprocessing import data_wrangler
# Silence every warning category for the remainder of the kernel session.
# NOTE(review): this blanket filter also hides deprecation and convergence
# warnings raised by the estimators used later — consider narrowing it to
# specific categories once the notebook is stable.
warnings.filterwarnings('ignore')

In [11]:
# Run the project's `data_wrangler` preprocessing over the raw CSV export
TARGET = "App."
model_df = data_wrangler("comptab_2018-01-29 16_00_comma_separated.csv")

# The "App." column is the label; every remaining column is a feature
y = model_df[TARGET]
X = model_df.drop(columns=TARGET)

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [12]:
# Balance the training split only (the test split is left untouched):
# RandomOverSampler resamples X_train / y_train with a fixed seed.
oversampler = RandomOverSampler(random_state=42)
X_train_over, y_train_over = oversampler.fit_resample(X_train, y_train)

In [13]:
# One seed shared by every stochastic base estimator so that a fresh
# "Restart & Run All" rebuilds the same ensemble. Previously only the random
# forest was seeded; the decision tree, gradient boosting and the SVC
# probability calibration drew from the global RNG.
RANDOM_STATE = 42

# (name, estimator) pairs consumed by the VotingClassifier below
estimator = [
    # NOTE(review): `multi_class` is deprecated in recent scikit-learn releases;
    # kept as-is to preserve current behaviour — confirm against the pinned version.
    ('LR', LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=200)),
    # probability=True is not needed for hard voting but is kept so the fitted
    # SVC still exposes predict_proba; random_state makes that calibration repeatable.
    ('SVC', SVC(gamma='auto', probability=True, random_state=RANDOM_STATE)),
    ('DTC', DecisionTreeClassifier(max_depth=10, random_state=RANDOM_STATE)),
    ('RFC', RandomForestClassifier(random_state=RANDOM_STATE)),
    ('GBC', GradientBoostingClassifier(random_state=RANDOM_STATE)),
]

# Full pipeline: ordinal-encode the features, scale them with RobustScaler,
# then take a majority ("hard") vote across the base estimators.
model = make_pipeline(
    OrdinalEncoder(),
    RobustScaler(),
    VotingClassifier(estimators=estimator, voting="hard")
)

In [14]:
# Train the whole pipeline on the oversampled training split
model.fit(X=X_train_over, y=y_train_over)

In [18]:
# Build the destination relative to the parent directory instead of calling
# os.chdir(".."): changing the working directory climbed one level further on
# every re-run of this cell and invalidated the relative paths used by
# earlier cells.
model_dir = os.path.join(os.pardir, "serve", "model")

# Create the folder on first use so open() below cannot fail on a missing directory
os.makedirs(model_dir, exist_ok=True)

# A timestamped file name keeps every saved model side by side.
# NOTE(review): isoformat() puts ':' in the file name, which Windows file
# systems reject — confirm the target platform before changing the format.
model_path = os.path.join(model_dir, f'model_{pd.Timestamp.now().isoformat()}.pkl')

# Serialise the fitted pipeline with pickle
with open(model_path, 'wb') as file:
    pickle.dump(model, file)

print(f'Model saved to {model_path}')

Model saved to serve/model/model_2023-11-07T14:38:53.407380.pkl
