In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split

In [None]:
## Constants

In [None]:
# S3 bucket that stores the source workbook.
BUCKET = "sagemaker-medical-logistical-regression-data-storage"
# Object key of the workbook inside the bucket.
DATA_KEY = "data.xlsx"
# Name of the column the model is trained to predict.
TARGET_COLUMN = 'осложнения есть/нет'

In [None]:
## Read the data from S3 and remove the redundant columns

In [None]:
# Compose the S3 URI of the workbook from the configured bucket and key,
# then load it into a DataFrame.
data_location = f's3://{BUCKET}/{DATA_KEY}'
data = pd.read_excel(io=data_location)

# Show the loaded frame.
data

In [None]:
# Record the dimensions of the frame as loaded (rows, then columns).
row_num = data.shape[0]
col_num = data.shape[1]
(row_num, col_num)

In [None]:
# Remove the two columns that are not used as model inputs.
redundant_columns = ['N', 'осложнения объед']
data = data.drop(columns=redundant_columns)

In [None]:
## Drop records with too many missing values

In [None]:
# Flag every record whose number of missing cells reaches one tenth of
# col_num (the column count recorded when the sheet was loaded).
# The count is taken per row position in a single vectorized pass, so it does
# not depend on index labels being unique and avoids a per-row .loc lookup.
null_threshold = col_num // 10
nulls_per_row = data.isna().sum(axis=1)
# Keep a plain list of Python bools, one entry per row in frame order.
delete_flags = (nulls_per_row >= null_threshold).tolist()
delete_flags

In [None]:
# Attach the per-row flags as a temporary helper column.
data = data.assign(delete_flag=delete_flags)
data

In [None]:
# Collect the index labels of the flagged records and drop those rows.
flagged_index = data.index[data["delete_flag"].eq(True)]
data = data.drop(index=flagged_index)
data

In [None]:
# The helper column has served its purpose; remove it again.
data = data.drop(columns=['delete_flag'])
data

In [None]:
## Delete columns with too many nulls and fill the remaining nulls with the column mean

In [None]:
# Count the missing cells of every column once, then sort the columns into
# two groups:
#   * columns whose null count reaches a tenth of the row count -> delete
#   * columns with at least one null but below that threshold   -> fill later
n = len(data)
null_counts = data.isna().sum()
columns_to_be_deleted = [
    name for name, nulls in null_counts.items() if nulls >= n // 10
]
columns_to_be_updated = [
    name for name, nulls in null_counts.items() if 0 < nulls < n // 10
]
print(columns_to_be_deleted)
print(columns_to_be_updated)

In [None]:
# Remove the columns that were marked as having too many nulls.
data = data.drop(columns=columns_to_be_deleted)

In [None]:
# Compute the mean of every partially-empty column, then fill each column's
# nulls with its own mean in a single fillna call.
column_means = {col: data[col].mean() for col in columns_to_be_updated}
data = data.fillna(value=column_means)
data

In [None]:
## One-hot encoding of the qualitative (categorical) features

In [None]:
# Columns to be expanded into indicator columns.
quality_features_columns = [
    'группа',
    'подгруппа',
    'операция',
    'стенокардия ФК',
    'СН ФК',
    'ЦАГ перетоки',
    'КТ очаг ишемии',
]
# drop_first=True omits the first level of every encoded column.
data = pd.get_dummies(
    data=data,
    columns=quality_features_columns,
    drop_first=True,
)
data

In [None]:
## Split the data into the train and test datasets

In [None]:
# Features: every column except the target. Target: the TARGET_COLUMN series.
X = data.drop(columns=TARGET_COLUMN)
Y = data.loc[:, TARGET_COLUMN]

In [None]:
# Show the feature matrix (all columns except the target).
X

In [None]:
# Show the target column.
Y

In [None]:
# test_size is the fraction of records held out for evaluation: 0.3 keeps 70%
# of the rows for fitting and reserves 30% for testing. random_state pins the
# shuffle so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)

In [None]:
# Show the training features.
X_train

In [None]:
# Show the test features.
X_test

In [None]:
# Show the training target values.
y_train

In [None]:
# Show the test target values.
y_test

In [None]:
## Model creation and evaluation

In [None]:
# Sweep the inverse regularisation strength C over 1..100 and score each
# candidate with 5-fold cross-validation on the training split only.
# Scoring candidates on the test split would leak test information into the
# choice of C and make the final test score optimistically biased.
regularisation_tests = {
    c: cross_val_score(
        LogisticRegression(solver='liblinear', C=c), X_train, y_train, cv=5
    ).mean()
    for c in range(1, 101)
}
# Materialize the dict views as lists before handing them to pandas.
tests_results = pd.DataFrame({
    'C': list(regularisation_tests.keys()),
    'score': list(regularisation_tests.values()),
})

In [None]:
# Highest score reached in the sweep.
max_preciseness = tests_results['score'].max()
# Rows that reach that score; show the C of the first one.
best_rows = tests_results[tests_results['score'].eq(max_preciseness)]
best_rows['C'].iloc[0]

In [None]:
# Fit the final model with the C that scored best in the sweep rather than a
# hard-coded number, so the model stays in sync with tests_results on every
# re-run. idxmax picks the first row holding the maximum score.
best_c = float(tests_results.loc[tests_results['score'].idxmax(), 'C'])
model = LogisticRegression(solver='liblinear', C=best_c).fit(X_train, y_train)

In [None]:
# Predicted class labels for the test features.
model.predict(X_test)

In [None]:
# Mean accuracy of the fitted model on the test split.
model.score(X_test, y_test)

In [None]:
# Mean accuracy on the training split, for comparison with the test score.
model.score(X_train, y_train)