In [1]:
!pip install lightfm
!pip install pandas
!pip install kaggle
!kaggle datasets download -d rdoume/beerreviews
!unzip beerreviews.zip
!pip install catboost

Collecting lightfm
  Downloading lightfm-1.17.tar.gz (316 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/316.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m307.2/316.4 kB[0m [31m11.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py) ... [?25l[?25hdone
  Created wheel for lightfm: filename=lightfm-1.17-cp310-cp310-linux_x86_64.whl size=808330 sha256=8a23477e9ef9d6c73d8c5bf94de67f70f7bc71ff015dc6c36237249fed1eadd8
  Stored in directory: /root/.cache/pip/wheels/4f/9b/7e/0b256f2168511d8fa4dae4fae0200fdbd729eb424a912ad636
Successfully built lightfm
Installing collected packages: lightfm
Successfully installed lightfm-1.17
Dataset URL: https:

In [17]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from collections import Counter
from sklearn.utils import shuffle
from catboost import CatBoostClassifier
from google.colab import files

In [3]:
data = pd.read_csv('beer_reviews.csv')

In [4]:
data = data.dropna(subset=["review_profilename", "beer_name", "beer_beerid"])
data = data[["review_profilename", "beer_beerid", "beer_name", "review_overall"]]

In [5]:
data['id'] = range(1, len(data) + 1)

In [6]:
unique_users = data['review_profilename'].unique()
num_users = len(unique_users)
user_features = pd.DataFrame({
    'review_profilename': unique_users,
    'age': np.random.randint(18, 65, size=num_users),
    'gender': np.random.choice(['М', 'Ж'], size=num_users),
    'location': np.random.choice(['Москва', 'Питер', 'Казань', 'Белгород'], size=num_users),
})

In [7]:
data = pd.merge(data, user_features, on='review_profilename')

In [8]:
le_beer = LabelEncoder()
data['beer_beerid_encoded'] = le_beer.fit_transform(data['beer_beerid'])

In [9]:
X = data[['age', 'gender', 'location']]
X = pd.get_dummies(X, columns=['gender', 'location'])
y = data['beer_beerid_encoded']

In [13]:
subset_size = 1000
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_small, y_train_small = shuffle(X_train, y_train, random_state=42)
X_train_subset = X_train_small[:subset_size]
y_train_subset = y_train_small[:subset_size]

In [14]:
scaler = StandardScaler()
X_train_subset = scaler.fit_transform(X_train_subset)

In [15]:
min_samples = 3
class_counts = Counter(y_train_subset)
valid_classes = {cls for cls, count in class_counts.items() if count >= min_samples}

X_train_filtered = []
y_train_filtered = []

for x, y in zip(X_train_subset, y_train_subset):
    if y in valid_classes:
        X_train_filtered.append(x)
        y_train_filtered.append(y)

X_train_filtered = np.array(X_train_filtered)
y_train_filtered = np.array(y_train_filtered)

print("Распределение классов после фильтрации:", Counter(y_train_filtered))

Распределение классов после фильтрации: Counter({2877: 5, 22886: 4, 874: 4, 9546: 4, 1166: 3, 1003: 3, 115: 3, 1819: 3, 692: 3, 92: 3, 934: 3, 1783: 3, 91: 3, 1621: 3, 24513: 3, 64: 3, 179: 3, 714: 3, 400: 3, 312: 3, 27365: 3, 44810: 3, 2898: 3, 666: 3, 1233: 3})


In [21]:
model = CatBoostClassifier(n_estimators=50, max_depth=3, random_state=42, loss_function='MultiClass')
model.fit(X_train_filtered, y_train_filtered)
y_pred = model.predict(X_train_filtered)

Learning rate set to 0.5
0:	learn: 3.1319090	total: 584us	remaining: 28.6ms
1:	learn: 3.0298801	total: 1.74ms	remaining: 41.7ms
2:	learn: 2.9252311	total: 2.88ms	remaining: 45.1ms
3:	learn: 2.8754657	total: 3.96ms	remaining: 45.5ms
4:	learn: 2.8242373	total: 5.04ms	remaining: 45.4ms
5:	learn: 2.7479633	total: 6.12ms	remaining: 44.9ms
6:	learn: 2.6964476	total: 7.1ms	remaining: 43.6ms
7:	learn: 2.6232177	total: 7.55ms	remaining: 39.6ms
8:	learn: 2.5704339	total: 8.33ms	remaining: 38ms
9:	learn: 2.5230225	total: 9.08ms	remaining: 36.3ms
10:	learn: 2.4743506	total: 9.82ms	remaining: 34.8ms
11:	learn: 2.4496809	total: 10.5ms	remaining: 33.4ms
12:	learn: 2.3799548	total: 11.3ms	remaining: 32ms
13:	learn: 2.3121560	total: 12ms	remaining: 30.9ms
14:	learn: 2.2560903	total: 12.8ms	remaining: 29.8ms
15:	learn: 2.2026000	total: 13.5ms	remaining: 28.6ms
16:	learn: 2.1432214	total: 14.2ms	remaining: 27.6ms
17:	learn: 2.1098132	total: 14.9ms	remaining: 26.5ms
18:	learn: 2.0725379	total: 15.6ms	rema

In [None]:
print("Введите ваши данные:")
try:
    user_age = int(input("Ваш возраст: "))
    user_gender = input("Ваш пол (м/ж): ").strip().lower()
    user_location = input("Ваш город: ").strip()

    if user_gender == 'м':
        user_gender = 'Male'
    elif user_gender == 'ж':
        user_gender = 'Female'
    else:
        print("Ошибка: Некорректный ввод пола. Используйте 'м' или 'ж'.")
        exit()

    if user_location not in ['Москва', 'Питер', 'Казань', 'Белгород']:
        print("Ошибка: Город должен быть одним из ['Москва', 'Питер', 'Казань', 'Белгород'].")
        exit()

    user_data = pd.DataFrame([{
        'age': user_age,
        'gender': user_gender,
        'location': user_location,
    }])
    user_data = pd.get_dummies(user_data, columns=['gender', 'location'])
    user_data = user_data.reindex(columns=X.columns, fill_value=0)

    predictions_proba = model.predict_proba(user_data)[0]

    top_5_indices = predictions_proba.argsort()[-5:][::-1]
    top_5_beers = le_beer.inverse_transform(top_5_indices)

    beer_names_dict = {id_: f"Пиво {id_}" for id_ in top_5_beers}

    print("Мы рекомендуем вам попробовать следующие 5 сортов пива:")
    for beer_id in top_5_beers:
        beer_name = data.loc[beer_id, 'beer_name'] if beer_id in data.index else "Неизвестное пиво"
        print(f"- {beer_name} (ID: {beer_id})")
except ValueError:
    print("Ошибка: Пожалуйста, введите числовое значение для возраста.")

Введите ваши данные:
Ваш возраст: 19
Ваш пол (м/ж): ж
Ваш город: USA
Мы рекомендуем вам попробовать следующие 5 сортов пива:
- Rauch Ür Bock (ID: 24)
- Rauch Ür Bock (ID: 23)
- Caldera OBF 15 (ID: 17)
- Caldera Ginger Beer (ID: 13)
- Cauldron DIPA (ID: 4)




In [None]:
files.download('model.joblib')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>