In [1]:
!pip install lightfm
!pip install pandas
!pip install kaggle
!kaggle datasets download -d rdoume/beerreviews
!unzip beerreviews.zip
!pip install catboost

Collecting lightfm
  Downloading lightfm-1.17.tar.gz (316 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/316.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m153.6/316.4 kB[0m [31m4.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py) ... [?25l[?25hdone
  Created wheel for lightfm: filename=lightfm-1.17-cp310-cp310-linux_x86_64.whl size=808329 sha256=be1383fb2768b7b835cb23bc6f45202a9a727952f92bd261440e97f8464f0120
  Stored in directory: /root/.cache/pip/wheels/4f/9b/7e/0b256f2168511d8fa4dae4fae0200fdbd729eb424a912ad636
Successfully built lightfm
Installing collected packages: lightfm
Successfully installed lightfm-1.17
Dataset URL: https:/

In [20]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from collections import Counter
from sklearn.utils import shuffle
from catboost import CatBoostClassifier
from google.colab import files
import joblib

In [3]:
data = pd.read_csv('beer_reviews.csv')

In [4]:
data = data[["review_profilename", "beer_beerid", "beer_name", "review_overall"]]

In [5]:
data['id'] = range(1, len(data) + 1)

In [6]:
unique_users = data['review_profilename'].unique()
num_users = len(unique_users)
user_features = pd.DataFrame({
    'review_profilename': unique_users,
    'age': np.random.randint(18, 65, size=num_users),
    'gender': np.random.choice(['М', 'Ж'], size=num_users),
    'location': np.random.choice(['Москва', 'Питер', 'Казань', 'Белгород'], size=num_users),
})

In [7]:
data = pd.merge(data, user_features, on='review_profilename')

In [8]:
le_beer = LabelEncoder()
data['beer_beerid_encoded'] = le_beer.fit_transform(data['beer_beerid'])

In [9]:
X = data[['age', 'gender', 'location']]
X = pd.get_dummies(X, columns=['gender', 'location'])
y = data['beer_beerid_encoded']

In [10]:
subset_size = 1000
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_small, y_train_small = shuffle(X_train, y_train, random_state=42)
X_train_subset = X_train_small[:subset_size]
y_train_subset = y_train_small[:subset_size]

In [11]:
scaler = StandardScaler()
X_train_subset = scaler.fit_transform(X_train_subset)

In [12]:
min_samples = 3
class_counts = Counter(y_train_subset)
valid_classes = {cls for cls, count in class_counts.items() if count >= min_samples}

X_train_filtered = []
y_train_filtered = []

for x, y in zip(X_train_subset, y_train_subset):
    if y in valid_classes:
        X_train_filtered.append(x)
        y_train_filtered.append(y)

X_train_filtered = np.array(X_train_filtered)
y_train_filtered = np.array(y_train_filtered)

print("Распределение классов после фильтрации:", Counter(y_train_filtered))

Распределение классов после фильтрации: Counter({45728: 4, 1512: 4, 2877: 4, 7438: 3, 8862: 3, 956: 3, 509: 3, 874: 3, 2278: 3, 384: 3, 2140: 3, 1485: 3, 602: 3, 47620: 3, 92: 3, 3439: 3, 20804: 3, 3414: 3, 5072: 3, 13856: 3, 666: 3})


In [13]:
model = CatBoostClassifier(n_estimators=50, max_depth=3, random_state=42, loss_function='MultiClass')
model.fit(X_train_filtered, y_train_filtered)
y_pred = model.predict(X_train_filtered)

Learning rate set to 0.5
0:	learn: 2.9489290	total: 47ms	remaining: 2.3s
1:	learn: 2.8286536	total: 48ms	remaining: 1.15s
2:	learn: 2.7415904	total: 48.7ms	remaining: 763ms
3:	learn: 2.6799030	total: 49.6ms	remaining: 570ms
4:	learn: 2.6284938	total: 50.4ms	remaining: 454ms
5:	learn: 2.5904309	total: 51.1ms	remaining: 375ms
6:	learn: 2.5235118	total: 51.8ms	remaining: 318ms
7:	learn: 2.4554865	total: 52.5ms	remaining: 275ms
8:	learn: 2.3941899	total: 53.2ms	remaining: 242ms
9:	learn: 2.3329364	total: 53.9ms	remaining: 216ms
10:	learn: 2.2737352	total: 54.7ms	remaining: 194ms
11:	learn: 2.2417645	total: 55.5ms	remaining: 176ms
12:	learn: 2.1932839	total: 56.2ms	remaining: 160ms
13:	learn: 2.1492581	total: 56.9ms	remaining: 146ms
14:	learn: 2.1102522	total: 57.6ms	remaining: 135ms
15:	learn: 2.0575854	total: 58.4ms	remaining: 124ms
16:	learn: 2.0229454	total: 59.2ms	remaining: 115ms
17:	learn: 1.9790016	total: 59.9ms	remaining: 106ms
18:	learn: 1.9625255	total: 60.6ms	remaining: 98.9ms
1

In [None]:
print("Введите ваши данные:")
try:
    user_age = int(input("Ваш возраст: "))
    user_gender = input("Ваш пол (м/ж): ").strip().lower()
    user_location = input("Ваш город: ").strip()

    if user_gender == 'м':
        user_gender = 'Male'
    elif user_gender == 'ж':
        user_gender = 'Female'
    else:
        print("Ошибка: Некорректный ввод пола. Используйте 'м' или 'ж'.")
        exit()

    if user_location not in ['Москва', 'Питер', 'Казань', 'Белгород']:
        print("Ошибка: Город должен быть одним из ['Москва', 'Питер', 'Казань', 'Белгород'].")
        exit()

    user_data = pd.DataFrame([{
        'age': user_age,
        'gender': user_gender,
        'location': user_location,
    }])
    user_data = pd.get_dummies(user_data, columns=['gender', 'location'])
    user_data = user_data.reindex(columns=X.columns, fill_value=0)

    predictions_proba = model.predict_proba(user_data)[0]

    top_5_indices = predictions_proba.argsort()[-5:][::-1]
    top_5_beers = le_beer.inverse_transform(top_5_indices)

    beer_names_dict = {id_: f"Пиво {id_}" for id_ in top_5_beers}

    print("Мы рекомендуем вам попробовать следующие 5 сортов пива:")
    for beer_id in top_5_beers:
        beer_name = data.loc[beer_id, 'beer_name'] if beer_id in data.index else "Неизвестное пиво"
        print(f"- {beer_name} (ID: {beer_id})")
except ValueError:
    print("Ошибка: Пожалуйста, введите числовое значение для возраста.")

Введите ваши данные:
Ваш возраст: 19
Ваш пол (м/ж): ж
Ваш город: USA
Мы рекомендуем вам попробовать следующие 5 сортов пива:
- Rauch Ür Bock (ID: 24)
- Rauch Ür Bock (ID: 23)
- Caldera OBF 15 (ID: 17)
- Caldera Ginger Beer (ID: 13)
- Cauldron DIPA (ID: 4)




In [22]:
joblib.dump(model, 'model.joblib')
files.download('model.joblib')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>