In [None]:
!pip install -r /root/requirements.txt

In [2]:
!apt install vim

Reading package lists... Done
Building dependency tree       
Reading state information... Done
E: Unable to locate package vim


In [None]:
import pandas as pd
import pickle

# Setting the working directory
import os
base_path = "/home/sagemaker-user/"
os.chdir(base_path)

# "Id" is taken from the raw test file; predictions are made on the processed one.
submission = pd.read_csv("data/raw/test/test.csv")[["Id"]]
test = pd.read_csv("data/processed/test/test.csv")

# Load the saved model from disk.
# The training cell stores each run as models/lgbm/<YYYY-mm-dd_HH-MM-SS>/model.bin,
# so the greatest folder name (string order) is the most recent run.
# os.listdir() returns entries in arbitrary order, hence the explicit sort.
model_root = os.path.join("models", "lgbm")
model_runs = sorted(os.listdir(model_root))
if not model_runs:
    raise FileNotFoundError(f"no saved model runs found in {model_root!r}")
latest_model = model_runs[-1]

# pickle executes arbitrary code on load: only open model files written by this project.
with open(os.path.join(model_root, latest_model, "model.bin"), 'rb') as f:
    model = pickle.load(f)

pred_val = model.predict(test)

submission["quality"] = pred_val

def scale(df, offset=3):
    """Return a copy of ``df`` with ``offset`` added to its "quality" column.

    The input frame is left untouched, so the function can be called again on
    the same frame without the shift accumulating in it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains a numeric "quality" column.
    offset : int or float, default 3
        Value added to every entry of "quality".
        NOTE(review): presumably undoes a label shift applied before
        training — confirm against the preprocessing code.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns, in the same order, and the shifted
        "quality" values.

    Raises
    ------
    KeyError
        If ``df`` has no "quality" column.
    """
    return df.assign(quality=df["quality"] + offset)

# Shift the predicted "quality" values with scale() before the submission is written.
# NOTE(review): re-running this cell on its own applies the shift again — run it once per prediction.
submission = scale(submission)

In [101]:
# List the saved LightGBM run folders in sorted order; the folder names are
# timestamps, so the last entry is the most recent run.
sorted(os.listdir("models/lgbm"))

['2023-02-05_07-12-57', '2023-02-05_07-17-07', '2023-02-05_07-29-38']

In [None]:
# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing the submission file (without the index column).
os.makedirs('data/output', exist_ok=True)
submission.to_csv('data/output/submission.csv', index=False)

In [99]:
import numpy as np
import pandas as pd 
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score
import xgboost as xgb 
import lightgbm as lgbm
from HROCH import PHCRegressor
import optuna
from optuna.samplers import TPESampler
import datetime
import pickle
import json

# Suppressing warnings
# NOTE: the "ignore" filter silences every warning category from here on,
# including deprecation warnings emitted by the libraries used below.
import warnings
warnings.simplefilter("ignore")

# Setting the working directory
# Try the SageMaker home directory first and fall back to /root/ when it is
# not available. Only OSError (missing directory, no permission, ...) triggers
# the fallback; a bare `except:` would also swallow KeyboardInterrupt and
# unrelated programming errors.
import os
try:
    base_path = "/home/sagemaker-user/"
    os.chdir(base_path)
except OSError:
    base_path = "/root/"
    os.chdir(base_path)

# Read the processed training set (path is relative to the working directory)
train = pd.read_csv("data/processed/train/train.csv")

# The label column, every other column as a model input, and the number of
# distinct label values present in the training data
target = "quality"
features = list(filter(lambda name: name != target, train.columns))
n_classes = len(train[target].unique())

def objective_lgbm(trial):
    """Optuna objective: mean Cohen's kappa of a LightGBM classifier over 5 stratified folds.

    Reads the module-level ``train`` frame together with ``features`` and
    ``target``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters listed below.

    Returns
    -------
    float
        Mean of the per-fold ``cohen_kappa_score`` values.
    """
    # NOTE(review): LightGBM documents `min_child_samples` as an alias of
    # `min_data_in_leaf`; both are sampled here, so one of them is presumably
    # ignored — confirm which and drop the other from the search space.
    # NOTE(review): the two *_fraction ranges start at 0.0, which LightGBM
    # documents as outside the valid range — consider a small positive lower bound.
    params_lgbm = {
        'lambda_l1': trial.suggest_float('lambda_l1', 0.0, 1.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 1.0, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 10, 100),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.0, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.0, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 100),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        # Upper bound only: early stopping below ends training sooner.
        'num_iterations': 10000,
        'objective' : "multiclass",
        'metric' :'multi_logloss'
    }

    cv = StratifiedKFold(5, shuffle=True, random_state=42)
    X, y = train[features], train[target]
    fold_scores = []
    for train_idx, val_idx in cv.split(X, y):
        # split() yields positional indices, so select rows with .iloc;
        # label-based .loc is only equivalent for a default 0..n-1 index.
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        model = lgbm.LGBMClassifier(**params_lgbm)
        # Early stopping and logging are passed as callbacks: the
        # `early_stopping_rounds` / `verbose` fit arguments are deprecated in
        # LightGBM 3.3 and removed in 4.0.
        model.fit(X_train,
                  y_train,
                  eval_set=[(X_val, y_val)],
                  callbacks=[lgbm.early_stopping(stopping_rounds=50),
                             lgbm.log_evaluation(period=500)])

        pred_val = model.predict(X_val)

        fold_scores.append(cohen_kappa_score(y_val, pred_val))
    return np.mean(fold_scores)

# Search the hyperparameter space; the objective returns a kappa score, hence 'maximize'.
# NOTE(review): the sampler is unseeded, so each run samples different values —
# pass seed=... to TPESampler if reproducible studies are wanted.
study = optuna.create_study(direction='maximize', sampler = TPESampler())
study.optimize(func=objective_lgbm, n_trials=1)

# Refit on the full training set with the best sampled hyperparameters.
# NOTE(review): study.best_params only holds the values sampled through `trial`,
# so the fixed settings used inside objective_lgbm (num_iterations, objective,
# metric) are not passed to this model — confirm that this is intended.
model = lgbm.LGBMClassifier(**study.best_params)
model.fit(train.loc[:, features],
                 train.loc[:, target])

# One folder per run, named after the current timestamp.
date_time_str = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
file_path = os.path.join(base_path, "models", "lgbm", date_time_str)
# makedirs also creates models/ and models/lgbm/ when they do not exist yet
# (os.mkdir would raise FileNotFoundError on a fresh checkout).
os.makedirs(file_path, exist_ok=True)

# Persist the fitted model and the hyperparameters it was built with.
with open(os.path.join(file_path, 'model.bin'), 'wb') as f:
    pickle.dump(model, f)

with open(os.path.join(file_path, "params.json"), 'w') as f:
    json.dump(study.best_params, f)

[32m[I 2023-02-05 07:12:55,177][0m A new study created in memory with name: no-name-2fd8ceb0-4b53-4515-b223-009d7c3c701e[0m




[32m[I 2023-02-05 07:12:57,094][0m Trial 0 finished with value: 0.3522418597474428 and parameters: {'lambda_l1': 0.4779971398496391, 'lambda_l2': 3.454665595286564, 'num_leaves': 42, 'feature_fraction': 0.18409412753724097, 'bagging_fraction': 0.6073410675936389, 'bagging_freq': 8, 'min_child_samples': 14, 'min_data_in_leaf': 69, 'max_depth': 10}. Best is trial 0 with value: 0.3522418597474428.[0m




In [97]:
# List the entries of the current working directory.
os.listdir()

['src',
 '.config',
 'notebooks',
 'models',
 '.cache',
 'requirements.txt',
 'data',
 '.ipynb_checkpoints',
 'submission.csv',
 '.sagemaker-jumpstart-tasks-status.json',
 '.python_history',
 '.jupyter',
 '.ipython',
 '.aws',
 '.local',
 '.bash_history',
 '.yarnrc']