In [None]:
import os

import boto3
import pandas as pd
from sagemaker import session, get_execution_role
from sagemaker.serializers import CSVSerializer
from sagemaker.sklearn.estimator import SKLearn
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Constants

In [None]:
# Object key of the Excel workbook that is read from S3 as the raw dataset.
DATA_KEY = 'data.xlsx'
# Name of the label column (Russian for "complications: present/absent").
TARGET_COLUMN = "осложнения есть/нет"

# Set up the environment variables

In [None]:
# Create the SageMaker session first; the default bucket is resolved from it.
sagemaker_session = session.Session()
default_bucket = sagemaker_session.default_bucket()

# Execution role of this notebook and the region of the ambient boto3 session.
role = get_execution_role()
region = boto3.Session().region_name

# Preprocessing

## Read the data from S3 and remove the redundant columns

In [None]:
# BUCKET is not defined by any earlier cell, so on a fresh kernel the original
# f-string raised NameError. Resolve it explicitly: an optional DATA_BUCKET
# environment variable wins, otherwise the session's default bucket is used.
# NOTE(review): confirm the workbook really lives in the default bucket.
BUCKET = os.environ.get("DATA_BUCKET", default_bucket)

# Read the raw Excel workbook straight from S3 into a DataFrame.
data_location = f's3://{BUCKET}/{DATA_KEY}'
data = pd.read_excel(data_location)

data

In [None]:
# Remember the dimensions of the freshly loaded frame; col_num is reused later
# as the basis for the per-row missing-value threshold.
row_num = len(data.index)
col_num = len(data.columns)
(row_num, col_num)

In [None]:
# Remove the two columns that are not used as features.
data = data.drop(columns=['N', 'осложнения объед'])

## Drop records with too many missing values

In [None]:
# Count the missing cells of every record in one vectorised pass instead of
# slicing the frame once per row (the per-row `data.loc[[index]]` lookup was
# slow and would also miscount if index labels were ever duplicated).
null_counts_per_row = data.isna().sum(axis=1)

# A record is flagged for deletion when its missing-cell count reaches one
# tenth of the column count.
# NOTE(review): col_num was measured before the two redundant columns were
# dropped, so the threshold is based on the original width - confirm intent.
data["delete_flag"] = null_counts_per_row >= (col_num // 10)
data

In [None]:
# Collect the index labels of the flagged records and drop exactly those rows.
rows_to_remove = data.index[data["delete_flag"].to_numpy()]
data = data.drop(index=rows_to_remove)
data

In [None]:
# The helper column has served its purpose; remove it again.
data = data.drop(columns=['delete_flag'])
data

## Drop columns with too many nulls and fill the remaining nulls with the column mean

In [None]:
# Classify columns by their number of missing values:
#   * at least one tenth of the remaining rows missing -> drop the column
#   * otherwise, any missing value at all              -> fill it later
n = len(data)
null_counts = data.isna().sum()

too_sparse = null_counts >= n // 10
needs_filling = (null_counts > 0) & ~too_sparse

columns_to_be_deleted = null_counts[too_sparse].index.tolist()
columns_to_be_updated = null_counts[needs_filling].index.tolist()
print(columns_to_be_deleted)
print(columns_to_be_updated)

In [None]:
# Remove every column that was classified as too sparse.
data = data.drop(columns=columns_to_be_deleted)

In [None]:
# Replace the remaining gaps in each partially filled column with that
# column's mean value.
for column_name in columns_to_be_updated:
    column_mean = data[column_name].mean()
    data[column_name] = data[column_name].fillna(column_mean)
data

## One-hot encoding of the qualitative (categorical) features

In [None]:
# Columns that are expanded into indicator columns by pd.get_dummies.
quality_features_columns = [
    'группа',
    'подгруппа',
    'операция',
    'стенокардия ФК',
    'СН ФК',
    'ЦАГ перетоки',
    'КТ очаг ишемии',
]

# drop_first=True omits the first level of every encoded column.
data = pd.get_dummies(
    data=data,
    columns=quality_features_columns,
    drop_first=True,
)
data

## Split the data into the train and test datasets

In [None]:
# Feature matrix: everything except the label column.
x = data.drop(columns=[TARGET_COLUMN])
# Labels, kept as a single-column DataFrame.
y = data.loc[:, [TARGET_COLUMN]]

In [None]:
# Display the feature matrix for a visual check.
x

In [None]:
# Display the single-column label frame for a visual check.
y

In [None]:
# Hold out 30% of the records for testing and fit on the remaining 70%.
# The previous test_size=0.7 did the opposite (70% held out, 30% used for
# fitting), which looks like a train/test mix-up.
# random_state pins the shuffle so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y[TARGET_COLUMN],
    test_size=0.3,
    random_state=42
)

In [None]:
# Grid-search the inverse regularisation strength C over 1..1000.
#
# Each candidate is rated by its mean 5-fold cross-validated accuracy on the
# training split. The previous version rated a candidate on the very same rows
# it had just been fitted on, which rewards the weakest regularisation and
# says nothing about generalisation.
# NOTE(review): stratified 5-fold CV needs enough samples of every class in
# the training split - lower `cv` if the minority class is very small.
regularisation_tests = {
    c_value: cross_val_score(
        LogisticRegression(solver='liblinear', C=c_value),
        x_train,
        y_train,
        cv=5,
        scoring='accuracy',
    ).mean()
    for c_value in range(1, 1001)
}

In [None]:
# Tabulate the search results: one row per tested C with its score.
tests_results = pd.DataFrame({
    'C': list(regularisation_tests),
    'score': list(regularisation_tests.values()),
})

# Best score reached, and the first (smallest) C that achieves it.
max_preciseness = tests_results['score'].max()
is_best = tests_results['score'] == max_preciseness
regularisation_parameter = tests_results.loc[is_best, 'C'].iloc[0]

In [None]:
# Display the C value selected by the search above.
regularisation_parameter

In [None]:
# Fit the baseline model with the C value selected by the search above rather
# than a hard-coded literal, so the model follows the search result whenever
# the data or the split changes.
model = LogisticRegression(
    solver='liblinear',
    C=float(regularisation_parameter)
)
# fit() returns the estimator itself, so trained_model is the same object as model.
trained_model = model.fit(x_train, y_train)

In [None]:
# Mean accuracy of the baseline model on the training split.
trained_model.score(X=x_train, y=y_train)

In [None]:
# Mean accuracy of the baseline model on the held-out test split.
trained_model.score(X=x_test, y=y_test)

In [None]:
# Backward sequential selection: start from all features and keep removing
# them until 4 remain, rating each candidate subset by accuracy.
feature_selector = SequentialFeatureSelector(
    estimator=model,
    n_features_to_select=4,
    direction='backward',
    scoring='accuracy',
)
feature_selector.fit(x_train, y_train)

In [None]:
# Show the names of the features kept by the selector.
feature_selector.get_feature_names_out()

In [None]:
# Reduce both splits to the selected features.
new_x_train, new_x_test = (
    feature_selector.transform(features) for features in (x_train, x_test)
)

In [None]:
# Fit a fresh estimator with the same hyperparameters on the reduced feature
# set. Calling model.fit() here would refit the shared estimator in place, and
# since fit() returns the estimator itself, `trained_model` (the full-feature
# model) would silently become the 4-feature model as well.
new_trained_model = LogisticRegression(**model.get_params()).fit(
    new_x_train, y_train
)

In [None]:
# Mean accuracy of the reduced-feature model on the training split.
new_trained_model.score(X=new_x_train, y=y_train)

In [None]:
# Mean accuracy of the reduced-feature model on the held-out test split.
new_trained_model.score(X=new_x_test, y=y_test)

In [None]:
# Local staging directory for the CSV files that are uploaded to S3 later.
work_directory = 'data'
# exist_ok=True makes the cell safe to re-run and removes the gap between
# checking for the directory and creating it.
os.makedirs(work_directory, exist_ok=True)

In [None]:
# Write the training features into the staging directory. The path is derived
# from work_directory so it cannot drift from the directory that gets uploaded.
x_train.to_csv(os.path.join(work_directory, 'x_train.csv'), index=False)

In [None]:
# Write the test features into the staging directory. The path is derived
# from work_directory so it cannot drift from the directory that gets uploaded.
x_test.to_csv(os.path.join(work_directory, 'x_test.csv'), index=False)

In [None]:
# Write the training labels into the staging directory. The path is derived
# from work_directory so it cannot drift from the directory that gets uploaded.
y_train.to_csv(os.path.join(work_directory, 'y_train.csv'), index=False)

In [None]:
# Write the test labels into the staging directory. The path is derived
# from work_directory so it cannot drift from the directory that gets uploaded.
y_test.to_csv(os.path.join(work_directory, 'y_test.csv'), index=False)

## Upload the data into the default bucket

In [None]:
# Upload the whole staging directory to the session's default bucket under a
# key prefix equal to the directory name; the returned value is passed to the
# estimator as the "train" channel.
train_input = sagemaker_session.upload_data(
    path=work_directory,
    key_prefix=work_directory,
)

# Model creation and training

In [None]:
# Version of the SageMaker scikit-learn container and the training entry point.
FRAMEWORK_VERSION = "1.2-1"
script_path = "scripts/train.py"

# Settings of the training job. The hyperparameters are handed to the entry
# point script.
# NOTE(review): the estimator variable is named `sklearn`, like the
# scikit-learn package; later cells reference it, so the name is kept as is.
estimator_settings = dict(
    entry_point=script_path,
    framework_version=FRAMEWORK_VERSION,
    instance_type="ml.t3.medium",
    role=role,
    sagemaker_session=sagemaker_session,
    hyperparameters={"n_features_to_select": 4},
)
sklearn = SKLearn(**estimator_settings)

In [None]:
# Launch the training job on the uploaded data and block until it finishes.
sklearn.fit(inputs={"train": train_input}, wait=True)

# Deploy

In [None]:
# Deploy the trained estimator; requests are serialised as CSV.
# NOTE(review): despite its name, this predictor wraps the scikit-learn model.
# Nothing in the visible cells deletes the endpoint afterwards.
xgb_predictor = sklearn.deploy(
    instance_type='ml.t3.medium',
    initial_instance_count=1,
    serializer=CSVSerializer(),
)

# Check the batch transformation

In [None]:
prefix = "batch_transformation_check"

# S3 location the batch transform job reads its input from.
# NOTE(review): no visible cell uploads anything under this prefix - confirm
# that the test data has been placed there.
batch_input = f's3://{default_bucket}/{prefix}/test'

# S3 location where the batch transform job stores its predictions.
batch_output = f's3://{default_bucket}/{prefix}/batch-prediction'

In [None]:
# Create a batch transformer from the trained estimator; results are written
# to batch_output.
transformer = sklearn.transformer(
    output_path=batch_output,
    instance_type='ml.t3.medium',
    instance_count=1,
)

In [None]:
# Run the batch transform over every object under the input prefix, treating
# each line of the CSV input as one record.
transformer.transform(
    batch_input,
    data_type='S3Prefix',
    content_type='text/csv',
    split_type='Line',
)
# Block until the transform job has finished.
transformer.wait()