# Импорт необходимых библиотек

In [39]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from catboost import CatBoostRegressor

from sklearn.metrics  import mean_absolute_error
            
from sklearn.model_selection import cross_val_score


# Проверка модели

In [40]:
def data_preprocess(filename = "../dataset/covid_data_train.csv"):
    """Read the training CSV and return a cleaned DataFrame ready for modelling.

    Processing steps, in order:
      1. Read the file and discard the saved index column 'Unnamed: 0'
         when it is present.
      2. Keep only the first row for every ("name", "district") pair and
         drop rows whose target "inf_rate" is missing.
      3. Drop every column from "life_quality_place_rating" through
         "life_costs" (label slice, both ends included), plus "lng",
         "name" and "region_x".
      4. Replace all remaining missing values with 0.
      5. Replace "district" and "subject" with integer category codes.

    Args:
        filename: Path to the comma-separated, UTF-8 encoded training file.

    Returns:
        A pandas DataFrame holding the remaining feature columns together
        with the "inf_rate" target column.

    Raises:
        KeyError: If one of the required columns is absent from the file.
    """
    df = pd.read_csv(filename, sep=',', encoding='utf-8')

    # The index column only exists when the CSV was written with its index;
    # errors='ignore' keeps files saved without it usable.
    df = df.drop(columns='Unnamed: 0', errors='ignore')

    # One row per (name, district) pair, and only rows with a known target.
    df = df.drop_duplicates(subset=["name", "district"], keep='first')
    df = df.dropna(subset=["inf_rate"], axis=0)

    # Low-value features: a contiguous run of columns, the longitude,
    # the city name and the repeated region name.
    drop_columns = list(df.loc[:, "life_quality_place_rating":"life_costs"].columns)
    drop_columns += ["lng", "name", "region_x"]
    df = df.drop(columns=drop_columns)

    df = df.fillna(0)

    # NOTE(review): the codes are derived from the values present in this
    # file only, so they match another file's codes only if both files hold
    # the same set of values - confirm this is acceptable for train vs. test.
    for column in ("district", "subject"):
        df[column] = df[column].astype("category").cat.codes

    return df

In [41]:
def check_saved_model(model_path = "../model/CBR_model_v1"):
    """Tell whether the CatBoost model stored at model_path passes the MAE check.

    The model is loaded in 'cbm' format, the training data is obtained from
    data_preprocess() with its default file, and a 5-fold cross-validation
    scored with negative mean absolute error is run on it.

    Args:
        model_path: Path to the saved CatBoost model file.

    Returns:
        True when the mean absolute error averaged over the folds is at
        most 0.01, otherwise False.
    """
    regressor = CatBoostRegressor()
    regressor.load_model(fname=model_path, format='cbm')

    frame = data_preprocess()
    target = frame["inf_rate"]
    features = frame.drop("inf_rate", axis = 1)

    # NOTE(review): cross_val_score fits clones of the estimator on every
    # fold, so the weights loaded above are presumably not what is being
    # scored - confirm that re-training is the intended check.
    fold_scores = cross_val_score(
        estimator=regressor,
        X=features,
        y=target,
        scoring='neg_mean_absolute_error',
        cv=5,
        verbose=False,
    )

    # The scorer reports negated MAE; flip the sign back before comparing.
    mean_abs_error = -fold_scores.mean()
    return bool(mean_abs_error <= 0.01)

In [42]:
# Validate the stored model and display the verdict.
model_is_valid = check_saved_model(model_path="../model/CBR_model_v1")
print(model_is_valid)

True


# Получение и сохранение результата на тестовом датасете

In [46]:
def get_test_data(filename='../dataset/covid_data_test.csv'):
    """Read the test CSV and return the feature DataFrame used for prediction.

    Processing steps, in order:
      1. Read the file and discard the saved index column 'Unnamed: 0'
         when it is present.
      2. Drop every column from "life_quality_place_rating" through
         "life_costs" (label slice, both ends included), plus "lng",
         "name" and "region_x".
      3. Replace all missing values with 0.
      4. Replace "district" and "subject" with integer category codes.

    Unlike the training preprocessing, no rows are removed here.

    Args:
        filename: Path to the comma-separated, UTF-8 encoded test file.

    Returns:
        A pandas DataFrame with one row per input row and the remaining
        feature columns.

    Raises:
        KeyError: If one of the required columns is absent from the file.
    """
    df = pd.read_csv(filename, sep=',', encoding='utf-8')

    # The index column only exists when the CSV was written with its index;
    # errors='ignore' keeps files saved without it usable.
    df = df.drop(columns='Unnamed: 0', errors='ignore')

    # Same feature selection as for the training data.
    drop_columns = list(df.loc[:, "life_quality_place_rating":"life_costs"].columns)
    drop_columns += ["lng", "name", "region_x"]
    df = df.drop(columns=drop_columns)

    df = df.fillna(0)

    # NOTE(review): the codes are derived from the values present in this
    # file only, so they match the training codes only if both files hold
    # the same set of values - confirm against the training pipeline.
    for column in ("district", "subject"):
        df[column] = df[column].astype("category").cat.codes

    return df

In [49]:
def predict_and_save_value(filename="../dataset/covid_data_test.csv", model_path = "../model/CBR_model_v1"):
    """Predict "inf_rate" for the test data and save the predictions as CSV.

    The test data is prepared with get_test_data(filename), the CatBoost
    model is loaded from model_path in 'cbm' format, and its predictions are
    written to '../result/Team IVE.csv' (relative to the working directory)
    as a single "inf_rate" column together with the positional index.

    Args:
        filename: Path to the CSV file with the data to predict on.
        model_path: Path to the saved CatBoost model file.

    Returns:
        None. The result is the CSV file written to disk.

    Note:
        The original author mentions that a 'catboost_info' folder may be
        produced as well; nothing in this function creates it explicitly.
    """
    from pathlib import Path

    data = get_test_data(filename)
    model = CatBoostRegressor()
    model.load_model(fname=model_path, format='cbm')

    predicted = pd.Series(model.predict(data), name='inf_rate')

    # Make sure the target folder exists so the write cannot fail on a
    # fresh checkout that has no '../result' directory yet.
    output_path = Path('../result/Team IVE.csv')
    output_path.parent.mkdir(parents=True, exist_ok=True)

    predicted.to_csv(output_path, index=True)

In [50]:
# Produce the predictions for the test set and write them to disk.
predict_and_save_value(
    "../dataset/covid_data_test.csv",
    "../model/CBR_model_v1",
)