In [15]:
import os

import numpy as np
import pandas as pd

import joblib

import yaml
import json

import warnings
warnings.filterwarnings("ignore")

In [16]:
config_path = '../config/params.yml'

# Read the YAML config inside a context manager so the file handle is closed
# as soon as parsing finishes instead of being left to the garbage collector.
# NOTE(review): FullLoader is kept to preserve behaviour; if params.yml only
# holds plain mappings/lists/scalars, yaml.safe_load would be enough — confirm.
with open(config_path) as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

# Config sections used by the rest of the notebook
preproc = config['preprocessing']
training = config['train']
evaluate = config['evaluate']


# Check that the unique-values file has been saved: this fails early if the
# path from the config cannot be opened or does not contain valid JSON.
column_sequence_path = preproc['unique_values_path']
with open(column_sequence_path) as json_file:
    column_sequence = json.load(json_file)

In [17]:
# Display the 'preprocessing' section of the loaded config for inspection
preproc

{'change_type_columns': {'authors': 'category',
  'language_code': 'category',
  'publisher': 'category',
  'genre': 'category',
  'publication_year': 'int64',
  'century': 'category'},
 'date_transform': {'publication_date': 'publication_year'},
 'copy': {'publication_year': 'century'},
 'map_bins_columns': {'century': [[1950, 2000], ['20', '20.5', '21']]},
 'drop_columns': ['bookID',
  'isbn',
  'isbn13',
  'title',
  'publication_date',
  'text_reviews_count'],
 'train_path': '../data/processed/books_processed.csv',
 'unique_values_path': '../data/processed/unique_values.json',
 'test_size': 0.25,
 'target_column': 'average_rating',
 'random_state': 10,
 'train_path_proc': '../data/processed/train.csv',
 'test_path_proc': '../data/processed/test.csv',
 'raw_data_path': '../data/raw/books.csv'}

# Импорт данных

In [18]:
# Read the raw dataset from the path given in the preprocessing config
data_test = pd.read_csv(filepath_or_buffer=preproc['raw_data_path'])
# Preview the first four rows
data_test.head(4)

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,genre
0,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic,Fantasy
1,9,"Unauthorized Harry Potter Book Seven News: ""Ha...",W. Frederick Zimmerman,3.74,976540606,9780976540601,en-US,152,19,1,4/26/2005,Nimble Books,Fiction
2,12,The Ultimate Hitchhiker's Guide: Five Complete...,Douglas Adams,4.38,517226952,9780517226957,eng,815,3628,254,11/1/2005,Gramercy Books,Science Fiction
3,14,The Hitchhiker's Guide to the Galaxy (Hitchhik...,Douglas Adams,4.22,1400052920,9781400052929,eng,215,4930,460,8/3/2004,Crown,Science Fiction


In [19]:
# Column dtypes and non-null counts of the raw data
data_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9953 entries, 0 to 9952
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   bookID              9953 non-null   int64  
 1   title               9953 non-null   object 
 2   authors             9953 non-null   object 
 3   average_rating      9953 non-null   float64
 4   isbn                9953 non-null   object 
 5   isbn13              9953 non-null   int64  
 6   language_code       9953 non-null   object 
 7   num_pages           9953 non-null   int64  
 8   ratings_count       9953 non-null   int64  
 9   text_reviews_count  9953 non-null   int64  
 10  publication_date    9953 non-null   object 
 11  publisher           9953 non-null   object 
 12  genre               9953 non-null   object 
dtypes: float64(1), int64(5), object(7)
memory usage: 1011.0+ KB


# Препроцессинг

In [20]:
def date_transform(data: pd.DataFrame,
                   date_columns: dict,
                   copy_columns: dict) -> pd.DataFrame:
    """
    Add columns derived from date strings and integer copies of columns.

    The frame is modified in place and the same object is returned.

    :param data: dataset that receives the new columns
    :param date_columns: mapping ``source column -> new column``; the new
        column gets the last four characters of each source string
        (presumably the year of an m/d/yyyy date — verify against the data)
    :param copy_columns: mapping ``source column -> new column``; the new
        column gets the source values cast to ``int``
    :return: the same ``data`` object with the new columns assigned
    """
    trailing_four = slice(-4, None)

    # Keep only the trailing four characters of every date string
    for source_name in date_columns:
        data[date_columns[source_name]] = data[source_name].str[trailing_four]

    # Integer copies run second, so they may read columns created just above
    for source_name in copy_columns:
        data[copy_columns[source_name]] = data[source_name].astype(int)

    return data


def get_bins(data: float,
             first_val: float,
             second_val: float,
             labels: list) -> object:
    """
    Map a single numeric value to one of three bin labels.

    :param data: the scalar value to classify (Python or NumPy int/float)
    :param first_val: upper bound (inclusive) of the first bin
    :param second_val: upper bound (inclusive) of the second bin
    :param labels: bin labels; indices 0, 1 and 2 are used
    :return: ``labels[0]`` if ``data <= first_val``, ``labels[1]`` if
        ``first_val < data <= second_val``, otherwise ``labels[2]``
    :raises AssertionError: if ``data`` is not an int/float scalar
    """

    # NumPy scalars such as np.int64 are not instances of the builtin int, so
    # they are listed explicitly; values taken from a numeric column may
    # arrive in either form.
    assert isinstance(
        data, (int, float, np.integer, np.floating)
    ), "Проблема с типом данных в признаке"

    if data <= first_val:
        return labels[0]
    if first_val < data <= second_val:
        return labels[1]
    # Everything else (including NaN, for which both comparisons are False)
    return labels[2]


def transform_types(data: pd.DataFrame, change_type_columns: dict) -> pd.DataFrame:
    """
    Cast the listed columns to the requested dtypes.

    :param data: dataset
    :param change_type_columns: mapping ``column name -> target dtype``
    :return: the result of ``DataFrame.astype``; a failed cast raises
        instead of being skipped
    """
    converted = data.astype(dtype=change_type_columns, errors="raise")
    return converted


def check_columns_evaluate(data: pd.DataFrame, unique_values_path: str) -> pd.DataFrame:
    """
    Check that the dataset has exactly the expected features and reorder them.

    :param data: dataset to validate
    :param unique_values_path: path to a JSON file whose top-level keys are
        the expected feature names (presumably saved from train — verify)
    :return: ``data`` with its columns ordered like the keys in the JSON file
    :raises AssertionError: if the set of columns differs from the set of keys
    """
    with open(unique_values_path) as source:
        expected_columns = list(json.load(source).keys())

    # Compare as sets: order is fixed by the selection below, not checked here
    assert set(expected_columns) == set(data.columns), "Разные признаки"
    return data[expected_columns]

In [21]:
def pipeline_preprocess(data: pd.DataFrame, flg_evaluate: bool = True, **kwargs) -> pd.DataFrame:
    """
    Data preprocessing pipeline.

    Steps, in order: derive date columns, map configured columns to bins,
    cast dtypes, drop unused columns and the target, then either validate the
    columns against the saved unique-values file or save that file.

    :param data: raw dataset; it is not modified, the pipeline works on a copy
    :param flg_evaluate: if True, check and order the columns using the
        unique-values file; otherwise call ``save_unique_train_data``
    :param kwargs: preprocessing settings; the keys read here are
        ``date_transform``, ``copy``, ``map_bins_columns``,
        ``change_type_columns``, ``drop_columns``, ``target_column`` and
        ``unique_values_path``
    :return: the preprocessed dataset
    """

    # Work on a copy: the date step and the bin assignment below add and
    # overwrite columns, which would otherwise leak into the caller's frame.
    data = data.copy()

    # date handling (use the returned frame rather than relying on in-place
    # modification inside the helper)
    data = date_transform(data, kwargs["date_transform"], kwargs["copy"])

    # bin creation; each entry is [[first threshold, second threshold], labels]
    for key, bin_spec in kwargs["map_bins_columns"].items():
        # Look the settings up once per column instead of once per row
        first_val = bin_spec[0][0]
        second_val = bin_spec[0][1]
        labels = bin_spec[1]
        data[key] = data[key].apply(
            lambda x: get_bins(
                x,
                first_val=first_val,
                second_val=second_val,
                labels=labels
            ))

    # dtype conversion
    data = transform_types(data=data, change_type_columns=kwargs["change_type_columns"])

    # column removal; missing columns are ignored
    data = data.drop(kwargs["drop_columns"], axis=1, errors="ignore")
    data = data.drop(kwargs["target_column"], axis=1, errors="ignore")

    # either check the dataset against the features saved from train,
    # or save the unique values of the train features
    if flg_evaluate:
        data = check_columns_evaluate(
            data=data, unique_values_path=kwargs["unique_values_path"]
        )
    else:
        # NOTE(review): save_unique_train_data is not defined in the visible
        # part of this notebook — confirm it is defined or imported before
        # calling with flg_evaluate=False.
        save_unique_train_data(
            data=data,
            drop_columns=kwargs["drop_columns"],
            target_column=kwargs["target_column"],
            unique_values_path=kwargs["unique_values_path"],
        )

    return data

In [22]:
# Run the preprocessing pipeline, passing the 'preprocessing' config section as
# keyword arguments. flg_evaluate keeps its default (True), so the result is
# checked against, and ordered by, the keys of the unique-values JSON.
data_proc_test = pipeline_preprocess(data=data_test, **preproc)

In [23]:
# Display the preprocessed frame
data_proc_test

Unnamed: 0,authors,language_code,num_pages,ratings_count,publisher,genre,publication_year,century
0,J.K. Rowling,eng,352,6333,Scholastic,Fantasy,2003,21
1,W. Frederick Zimmerman,en-US,152,19,Nimble Books,Fiction,2005,21
2,Douglas Adams,eng,815,3628,Gramercy Books,Science Fiction,2005,21
3,Douglas Adams,eng,215,4930,Crown,Science Fiction,2004,21
4,Douglas Adams/Stephen Fry,eng,6,1266,Random House Audio,Science Fiction,2005,21
...,...,...,...,...,...,...,...,...
9948,William T. Vollmann/Larry McCaffery/Michael He...,eng,512,156,Da Capo Press,Fantasy,2004,21
9949,William T. Vollmann,eng,635,783,Penguin Books,Fantasy,1988,20.5
9950,William T. Vollmann,eng,415,820,Penguin Books,Fantasy,1993,20.5
9951,William T. Vollmann,eng,434,769,Ecco,Fiction,2007,21


In [24]:
# Column dtypes and non-null counts after preprocessing
data_proc_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9953 entries, 0 to 9952
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   authors           9953 non-null   category
 1   language_code     9953 non-null   category
 2   num_pages         9953 non-null   int64   
 3   ratings_count     9953 non-null   int64   
 4   publisher         9953 non-null   category
 5   genre             9953 non-null   category
 6   publication_year  9953 non-null   int64   
 7   century           9953 non-null   category
dtypes: category(5), int64(3)
memory usage: 580.7 KB


In [27]:
# Write the first 10 preprocessed rows, without the index, to the path
# configured as evaluate['predict_path']
data_proc_test.iloc[:10].to_csv(evaluate['predict_path'], index=False)

In [26]:
# Show one preprocessed row (position 100) as a column -> value dict
dict(data_proc_test.iloc[100])

{'authors': 'Edward Lear/Laura Huliska-Beith/Edward Mendelson',
 'language_code': 'eng',
 'num_pages': 48,
 'ratings_count': 66,
 'publisher': 'Sterling',
 'genre': 'Poetry',
 'publication_year': 2001,
 'century': '21'}

In [28]:
# Read the saved sample back and inspect its dtypes.
# NOTE(review): CSV does not store pandas dtypes, so read_csv infers them again
# from the text; confirm that whatever consumes this file re-applies the
# casts made in preprocessing (e.g. the category columns).
ddt = pd.read_csv(evaluate['predict_path'])
ddt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   authors           10 non-null     object 
 1   language_code     10 non-null     object 
 2   num_pages         10 non-null     int64  
 3   ratings_count     10 non-null     int64  
 4   publisher         10 non-null     object 
 5   genre             10 non-null     object 
 6   publication_year  10 non-null     int64  
 7   century           10 non-null     float64
dtypes: float64(1), int64(3), object(4)
memory usage: 768.0+ bytes


# Загрузка модели

In [11]:
# Path the trained model is loaded from (taken from the 'train' config section)
training['model_path']

'../models/model.joblib'

In [12]:
# Load the trained model from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files that come from a trusted source.
model = joblib.load(training['model_path'])

In [13]:
# Predict from the feature columns only. Dropping a previously added
# 'rating_predict' column keeps this cell re-runnable: without it, a second
# run would pass the model its own earlier output as an extra input column.
# On the first run the column does not exist yet and errors='ignore' makes
# the drop a no-op.
data_proc_test['rating_predict'] = model.predict(
    data_proc_test.drop(columns=['rating_predict'], errors='ignore')
)

In [14]:
# Display the frame together with the added 'rating_predict' column
data_proc_test

Unnamed: 0,authors,language_code,num_pages,ratings_count,publisher,genre,publication_year,century,rating_predict
0,J.K. Rowling,eng,352,6333,Scholastic,Fantasy,2003,21,4.129884
1,W. Frederick Zimmerman,en-US,152,19,Nimble Books,Fiction,2005,21,3.636753
2,Douglas Adams,eng,815,3628,Gramercy Books,Science Fiction,2005,21,4.359472
3,Douglas Adams,eng,215,4930,Crown,Science Fiction,2004,21,4.083691
4,Douglas Adams/Stephen Fry,eng,6,1266,Random House Audio,Science Fiction,2005,21,4.097188
...,...,...,...,...,...,...,...,...,...
9948,William T. Vollmann/Larry McCaffery/Michael He...,eng,512,156,Da Capo Press,Fantasy,2004,21,4.075848
9949,William T. Vollmann,eng,635,783,Penguin Books,Fantasy,1988,20.5,4.012947
9950,William T. Vollmann,eng,415,820,Penguin Books,Fantasy,1993,20.5,3.922579
9951,William T. Vollmann,eng,434,769,Ecco,Fiction,2007,21,3.846701
