In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool, cv
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [42]:
# Считываем данные из датасета
df = pd.read_csv('book_data.csv')
df.head(2)

Unnamed: 0,book_authors,book_desc,book_edition,book_format,book_isbn,book_pages,book_rating,book_rating_count,book_review_count,book_title,genres,image_url
0,Suzanne Collins,Winning will make you famous. Losing means cer...,,Hardcover,9780440000000.0,374 pages,4.33,5519135,160706,The Hunger Games,Young Adult|Fiction|Science Fiction|Dystopia|F...,https://images.gr-assets.com/books/1447303603l...
1,J.K. Rowling|Mary GrandPré,There is a door at the end of a silent corrido...,US Edition,Paperback,9780440000000.0,870 pages,4.48,2041594,33264,Harry Potter and the Order of the Phoenix,Fantasy|Young Adult|Fiction,https://images.gr-assets.com/books/1255614970l...


In [43]:
# Редактируем датасет
df['book_isbn'] = (~df['book_isbn'].isna()).astype(int)

df['book_pages'] = df['book_pages'].apply(
    lambda x: int(x.split()[0]) if pd.notnull(x) else np.nan)

del df['image_url']
del df['book_edition']
del df['book_rating_count']
del df['book_review_count']

for col in ('book_authors', 'book_desc', 'book_title', 'genres'):
    df[col].fillna('', inplace=True)

for col in ('book_format', 'book_isbn'):
    df[col].fillna(-999, inplace=True)

df['genres'] = df['genres'].apply(lambda x: x.replace('|', ' , '))

df.head(2)

Unnamed: 0,book_authors,book_desc,book_format,book_isbn,book_pages,book_rating,book_title,genres
0,Suzanne Collins,Winning will make you famous. Losing means cer...,Hardcover,1,374.0,4.33,The Hunger Games,"Young Adult , Fiction , Science Fiction , Dyst..."
1,J.K. Rowling|Mary GrandPré,There is a door at the end of a silent corrido...,Paperback,1,870.0,4.48,Harry Potter and the Order of the Phoenix,"Fantasy , Young Adult , Fiction"


In [44]:
# Разделение данных на обучающую и тестовую выборки
train_data = df.sample(frac=0.8, random_state=1)
test_data = df.drop(train_data.index)

In [45]:
# Определение целевой переменной и признаков
target = 'book_rating'
features = [col for col in train_data.columns if col != target]


In [46]:
# Определение признаков
categorical_features = [2, 3]
text_features = [0, 1, 5, 6]


In [47]:
# Разделение обучающего набора данных на обучающую и валидационную выборки
features_train, features_validation, target_train, target_validation = train_test_split(
    train_data[features], train_data[target], train_size=0.75, random_state=42)

features_test = test_data.drop('book_rating', axis=1)


In [48]:
# Моделька)
model = CatBoostRegressor(
    iterations=1000,
    random_seed=42,
    eval_metric='RMSE',
    thread_count=-1,
    use_best_model=True,
    task_type='GPU',
    logging_level='Silent'
)


In [49]:
# ЗАПУСК
model.fit(
    features_train, target_train,
    cat_features=categorical_features,
    text_features=text_features,
    eval_set=(features_validation, target_validation),
    plot=True
)


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRegressor at 0x1f9c63de050>

In [21]:
# Кросс-валидация модели
cv_params = model.get_params()
cv_data = cv(Pool(
    train_data[features],
    train_data[target],
    cat_features=categorical_features,
    text_features=text_features),
    cv_params,
    plot=True
)


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [22]:
submisstion = pd.DataFrame()
submisstion['book_title'] = features_test['book_title']
submisstion['book_rating'] = model.predict(features_test)
submisstion.to_csv('submission.csv', index=False)

submisstion

Unnamed: 0,book_title,book_rating
9,J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...,4.316881
14,The Da Vinci Code,3.950541
15,Memoirs of a Geisha,3.933447
18,Les Misérables,4.181789
20,Jane Eyre,3.982859
...,...,...
54284,The World God Only Knows 17,4.257336
54287,The World God Only Knows 03 (The World God Onl...,4.226160
54290,The World God Only Knows vol. 06 (The World Go...,4.094324
54299,He Wanted the Moon: The Madness and Medical Ge...,3.878354


In [50]:
# Оценка качества модели на тестовой выборке
mse = mean_squared_error(test_data[target], model.predict(test_data[features]))
print(f'MSE: {mse}')

MSE: 0.1081537905057378


In [33]:
model.get_feature_importance()

array([ 9.91637331, 19.67030493,  0.27352741,  2.86130878, 14.44836228,
       12.51764538, 40.31247792])

In [51]:
model2 = CatBoostRegressor(
    iterations=1000,
    random_seed=42,
    eval_metric='RMSE',
    thread_count=-1,
    use_best_model=True,
    task_type='GPU',
    logging_level='Silent'
)


In [70]:
# Считываем данные из датасета
df = pd.read_csv('book_data.csv')
df.head(2)


Unnamed: 0,book_authors,book_desc,book_edition,book_format,book_isbn,book_pages,book_rating,book_rating_count,book_review_count,book_title,genres,image_url
0,Suzanne Collins,Winning will make you famous. Losing means cer...,,Hardcover,9780440000000.0,374 pages,4.33,5519135,160706,The Hunger Games,Young Adult|Fiction|Science Fiction|Dystopia|F...,https://images.gr-assets.com/books/1447303603l...
1,J.K. Rowling|Mary GrandPré,There is a door at the end of a silent corrido...,US Edition,Paperback,9780440000000.0,870 pages,4.48,2041594,33264,Harry Potter and the Order of the Phoenix,Fantasy|Young Adult|Fiction,https://images.gr-assets.com/books/1255614970l...


In [71]:
# Редактируем датасет
df['book_isbn'] = (~df['book_isbn'].isna()).astype(int)

df['book_pages'] = df['book_pages'].apply(
    lambda x: int(x.split()[0]) if pd.notnull(x) else np.nan)

del df['image_url']
del df['book_edition']
del df['book_review_count']
del df['book_rating']

for col in ('book_authors', 'book_desc', 'book_title', 'genres'):
    df[col].fillna('', inplace=True)

for col in ('book_format', 'book_isbn'):
    df[col].fillna(-999, inplace=True)

df['genres'] = df['genres'].apply(lambda x: x.replace('|', ' , '))

df.head(2)


Unnamed: 0,book_authors,book_desc,book_format,book_isbn,book_pages,book_rating_count,book_title,genres
0,Suzanne Collins,Winning will make you famous. Losing means cer...,Hardcover,1,374.0,5519135,The Hunger Games,"Young Adult , Fiction , Science Fiction , Dyst..."
1,J.K. Rowling|Mary GrandPré,There is a door at the end of a silent corrido...,Paperback,1,870.0,2041594,Harry Potter and the Order of the Phoenix,"Fantasy , Young Adult , Fiction"


In [72]:
# Определение целевой переменной и признаков
target = 'book_rating_count'
features = [col for col in train_data.columns if col != target]


In [73]:
# Разделение данных на обучающую и тестовую выборки
train_data = df.sample(frac=0.8, random_state=1)
test_data = df.drop(train_data.index)


In [74]:
categorical_features = [2, 3]
text_features = [0, 1, 5, 6]
# Разделение обучающего набора данных на обучающую и валидационную выборки
features_train, features_validation, target_train, target_validation = train_test_split(
    train_data[features], train_data[target], train_size=0.75, random_state=42)

features_test = test_data.drop('book_rating_count', axis=1)


In [75]:
model2.fit(
    features_train, target_train,
    cat_features=categorical_features,
    text_features=text_features,
    eval_set=(features_validation, target_validation),
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRegressor at 0x1f9cab11d80>