<a href="https://www.kaggle.com/code/andrey36912/notebooke9cda60ba3?scriptVersionId=194529437" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        file_path = os.path.join(root_dir, file_name)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score
import optuna
import joblib

In [None]:
# Load the raw dataset from the Kaggle input directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/exam-1/data.csv')

In [None]:
# Representative number of days for each `Stay` bucket label.
_STAY_LABEL_TO_DAYS = {
    '0-10': 5,
    '11-20': 15,
    '21-30': 25,
    '31-40': 35,
    '41-50': 45,
    '51-60': 55,
    '61-70': 65,
    '71-80': 75,
    '81-90': 85,
    '91-100': 95,
    'More than 100 Days': 110,
}


def convert_stay_to_days(stay):
    """Map a `Stay` bucket label to a single representative number of days.

    Args:
        stay: Bucket label such as ``'0-10'`` or ``'More than 100 Days'``.

    Returns:
        The number of days assigned to the bucket, or ``np.nan`` when the
        label is not one of the known buckets.
    """
    fallback = np.nan
    return _STAY_LABEL_TO_DAYS.get(stay, fallback)

# Derive the numeric `Stay_Days` column from the `Stay` bucket labels.
stay_labels = df['Stay']
df['Stay_Days'] = stay_labels.apply(convert_stay_to_days)

In [None]:
# Split the columns by dtype: int64/float64 count as numeric, object as categorical.
# NOTE: this runs after `Stay_Days` has been added to `df`, so the numeric list
# contains `Stay_Days`; `Stay` presumably lands in the categorical list as well
# (it is looked up by string labels) — both are prediction targets.
numerical_features = list(df.select_dtypes(include=['int64', 'float64']).columns)
categorical_features = list(df.select_dtypes(include=['object']).columns)

# Numeric gaps are filled with the column median.
for numeric_column in numerical_features:
    column_median = df[numeric_column].median()
    df[numeric_column] = df[numeric_column].fillna(column_median)

# Categorical gaps are filled with the first mode of the column.
for categorical_column in categorical_features:
    most_common_value = df[categorical_column].mode()[0]
    df[categorical_column] = df[categorical_column].fillna(most_common_value)

In [None]:
# Drop every row that has at least one numeric value outside the
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] fence of its column.
numeric_values = df[numerical_features]
Q1 = numeric_values.quantile(0.25)
Q3 = numeric_values.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Cell-wise outlier flags, then collapsed to one flag per row.
outlier_cells = numeric_values.lt(lower_bound) | numeric_values.gt(upper_bound)
outlier_rows = outlier_cells.any(axis=1)
df_cleaned = df[~outlier_rows]

print("Розмір даних до видалення викидів:", df.shape)
print("Розмір даних після видалення викидів:", df_cleaned.shape)

In [None]:
# Preprocessing step shared by the regression and classification pipelines.
#
# `numerical_features` / `categorical_features` are collected after the
# `Stay_Days` column has been added to `df`, so they still contain the two
# target columns. The feature matrix X drops both targets, and
# ColumnTransformer raises a ValueError when asked to select a column that is
# missing from its input, so the targets are filtered out of the lists that are
# handed to the transformer. The original lists are left untouched.
target_columns = {'Stay', 'Stay_Days'}
model_numerical_features = [
    column for column in numerical_features if column not in target_columns
]
model_categorical_features = [
    column for column in categorical_features if column not in target_columns
]

# Numeric columns: median imputation followed by standardisation.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: most-frequent imputation followed by one-hot encoding;
# categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, model_numerical_features),
        ('cat', categorical_transformer, model_categorical_features)
    ])

In [None]:
# Regression task: `Stay_Days` is the target; both stay columns are removed
# from the feature matrix. 80/20 train/test split with a fixed seed.
y_regression = df_cleaned['Stay_Days']
X = df_cleaned.drop(columns=['Stay_Days', 'Stay'])

X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X,
    y_regression,
    test_size=0.2,
    random_state=42,
)

In [None]:
def objective_regression(trial):
    """Optuna objective for the regression task.

    The trial selects one of LinearRegression, Ridge or RandomForest and, where
    applicable, that model's hyperparameters. The model is chained after the
    shared ``preprocessor`` and scored with cross-validation (``cv=3``) on the
    regression training split.

    Args:
        trial: Optuna trial used to sample the model and its hyperparameters.

    Returns:
        Mean cross-validated mean squared error (lower is better).
    """
    regressor_name = trial.suggest_categorical(
        'regressor', ['LinearRegression', 'Ridge', 'RandomForest']
    )

    if regressor_name == 'LinearRegression':
        regressor = LinearRegression()
    elif regressor_name == 'Ridge':
        regressor = Ridge(alpha=trial.suggest_float('alpha', 1e-5, 10.0))
    elif regressor_name == 'RandomForest':
        # Keyword arguments are evaluated left to right, so `max_depth` is
        # sampled before `n_estimators`.
        regressor = RandomForestRegressor(
            max_depth=trial.suggest_int('max_depth', 2, 32),
            n_estimators=trial.suggest_int('n_estimators', 10, 300),
        )

    candidate = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', regressor),
    ])

    # The scorer returns negated MSE; flip the sign to get a value to minimise.
    neg_mse_per_fold = cross_val_score(
        candidate, X_train_reg, y_train_reg, cv=3, scoring='neg_mean_squared_error'
    )
    return -neg_mse_per_fold.mean()

# Search for the regression model with the lowest cross-validated MSE.
study_regression = optuna.create_study(direction='minimize')
study_regression.optimize(objective_regression, n_trials=50)

# Rebuild the winning model from the best trial's parameters.
best_params_reg = study_regression.best_trial.params
winning_regressor = best_params_reg['regressor']
if winning_regressor == 'Ridge':
    best_model_reg = Ridge(alpha=best_params_reg['alpha'])
elif winning_regressor == 'LinearRegression':
    best_model_reg = LinearRegression()
else:
    best_model_reg = RandomForestRegressor(
        max_depth=best_params_reg['max_depth'],
        n_estimators=best_params_reg['n_estimators'],
    )

# Fit the final pipeline on the full regression training split.
pipeline_regression = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', best_model_reg),
])
pipeline_regression.fit(X_train_reg, y_train_reg)

In [None]:
# Evaluate the fitted regression pipeline on the held-out test split.
y_pred_reg = pipeline_regression.predict(X_test_reg)
mse_reg = mean_squared_error(y_true=y_test_reg, y_pred=y_pred_reg)
r2_reg = r2_score(y_true=y_test_reg, y_pred=y_pred_reg)

print(f'Mean Squared Error (Регресія): {mse_reg}')
print(f'R^2 Score (Регресія): {r2_reg}')

In [None]:
# Persist the fitted regression pipeline to the working directory.
joblib.dump(value=pipeline_regression, filename='best_regression_model.pkl')

# Classification task: the original `Stay` label is the target; the feature
# matrix is rebuilt the same way as for regression. 80/20 split, fixed seed.
y_classification = df_cleaned['Stay']
X = df_cleaned.drop(columns=['Stay_Days', 'Stay'])

X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(
    X,
    y_classification,
    test_size=0.2,
    random_state=42,
)

In [None]:
def objective_classification(trial):
    """Optuna objective for the classification task.

    The trial selects LogisticRegression or RandomForest together with that
    model's hyperparameters. The model is chained after the shared
    ``preprocessor`` and scored with cross-validation (``cv=3``) on the
    classification training split.

    Args:
        trial: Optuna trial used to sample the model and its hyperparameters.

    Returns:
        Mean cross-validated accuracy (higher is better).
    """
    classifier_name = trial.suggest_categorical(
        'classifier', ['LogisticRegression', 'RandomForest']
    )

    if classifier_name == 'LogisticRegression':
        classifier = LogisticRegression(
            C=trial.suggest_float('C', 1e-5, 10.0),
            max_iter=1000,
        )
    elif classifier_name == 'RandomForest':
        # Keyword arguments are evaluated left to right, so `max_depth` is
        # sampled before `n_estimators`.
        classifier = RandomForestClassifier(
            max_depth=trial.suggest_int('max_depth', 2, 32),
            n_estimators=trial.suggest_int('n_estimators', 10, 300),
        )

    candidate = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', classifier),
    ])

    accuracy_per_fold = cross_val_score(
        candidate, X_train_cls, y_train_cls, cv=3, scoring='accuracy'
    )
    return accuracy_per_fold.mean()

# Search for the classifier with the highest cross-validated accuracy.
study_classification = optuna.create_study(direction='maximize')
study_classification.optimize(objective_classification, n_trials=50)

# Rebuild the winning model from the best trial's parameters.
best_params_cls = study_classification.best_trial.params
if best_params_cls['classifier'] != 'LogisticRegression':
    best_model_cls = RandomForestClassifier(
        max_depth=best_params_cls['max_depth'],
        n_estimators=best_params_cls['n_estimators'],
    )
else:
    best_model_cls = LogisticRegression(C=best_params_cls['C'], max_iter=1000)

# Fit the final pipeline on the full classification training split.
# NOTE(review): `preprocessor` is the same object that the regression pipeline
# holds, and Pipeline.fit fits its steps in place — confirm sharing is intended.
pipeline_classification = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', best_model_cls),
])
pipeline_classification.fit(X_train_cls, y_train_cls)

In [None]:
# Evaluate the fitted classification pipeline on the held-out test split.
y_pred_cls = pipeline_classification.predict(X_test_cls)
accuracy_cls = accuracy_score(y_true=y_test_cls, y_pred=y_pred_cls)
# Weighted F1: per-class F1 averaged with class-support weights.
f1_cls = f1_score(y_true=y_test_cls, y_pred=y_pred_cls, average='weighted')

print(f'Accuracy (Класифікація): {accuracy_cls}')
print(f'F1 Score (Класифікація): {f1_cls}')


In [None]:
# Persist the fitted classification pipeline to the working directory.
joblib.dump(value=pipeline_classification, filename='best_classification_model.pkl')