In [None]:
!pip install scikit-learn
!pip install joblib
!pip install catboost

In [None]:
import pandas as pd
import random
from datetime import datetime, timedelta

# Synthetic review generator.
# Every post draws a hidden 1-5 "tendency" for each rating column; each of the
# post's rows is then that tendency shifted by -1/0/+1 and clamped to the scale.

NUM_POSTS = 929        # number of distinct post_id values
ROWS_PER_POST = 100    # reviews generated per post
RATING_MIN, RATING_MAX = 1, 5
RATING_COLUMNS = ('color', 'carbon', 'flavor', 'sour', 'sweet')
SEED = 42

# Dedicated, seeded generator: reproducible output without touching the
# global `random` state that other cells may rely on.
_rng = random.Random(SEED)


def _noisy_rating(tendency):
    """Return ``tendency`` shifted by -1, 0 or +1, clamped to RATING_MIN..RATING_MAX."""
    return min(max(RATING_MIN, tendency + _rng.randint(-1, 1)), RATING_MAX)


# Column-wise accumulators; key order defines the DataFrame column order.
data = {
    'id': [],
    'body': [],
    'created_at': [],
    'name_id': [],
    'post_id': [],
    'color': [],
    'carbon': [],
    'flavor': [],
    'sour': [],
    'sweet': [],
    'total': []
}

num_rows = ROWS_PER_POST * NUM_POSTS  # expected size of the generated frame
base_date = datetime(2023, 8, 9)      # loop-invariant: all rows fall on/after this day

for post_id in range(NUM_POSTS):
    # One hidden tendency per rating column. Each column now uses its OWN
    # tendency ('color' previously reused the carbon tendency by mistake).
    tendencies = {column: _rng.randint(RATING_MIN, RATING_MAX) for column in RATING_COLUMNS}

    # NOTE(review): the raw sum of five tendencies lies in 5..25, so clamping it
    # to 1..5 made 'total' (almost) always 5. Assuming 'total' is an overall
    # 1-5 score, use the rounded mean of the tendencies instead - confirm.
    total_tendency = round(sum(tendencies.values()) / len(tendencies))

    for _ in range(ROWS_PER_POST):
        data['id'].append(len(data['id']))  # running row index doubles as the id
        data['body'].append('test')

        # Random timestamp: base_date plus 0..86400 seconds
        random_seconds = _rng.randint(0, 86400)
        data['created_at'].append(base_date + timedelta(seconds=random_seconds))

        data['name_id'].append(_rng.choice([2, 3, 4]))
        data['post_id'].append(post_id)

        # Per-row ratings scattered around the post's tendencies
        for column in RATING_COLUMNS:
            data[column].append(_noisy_rating(tendencies[column]))
        data['total'].append(_noisy_rating(total_tendency))

# Build the DataFrame
df = pd.DataFrame(data)
assert len(df) == num_rows, "unexpected number of generated rows"

# Show the result (pandas truncates the printed frame)
print(df)

In [None]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

# Train a multi-class CatBoost model that predicts which post a review belongs
# to from its five rating columns. Uses `df` built by the data-generation cell.

# Map each distinct 'post_id' to a consecutive class index to use as the target.
post_id_mapping = {post_id: idx for idx, post_id in enumerate(df['post_id'].unique())}
df['post_id_encoded'] = df['post_id'].map(post_id_mapping)

# Input features and target column
features = ['color', 'carbon', 'flavor', 'sour', 'sweet']
target = 'post_id_encoded'

X = df[features]
y = df[target]

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Wrap the splits in CatBoost data pools
train_pool = Pool(X_train, label=y_train)
test_pool = Pool(X_test, label=y_test)

# Fit the classifier.
# NOTE(review): task_type='GPU' presumably needs a GPU runtime; switch to 'CPU'
# when none is available - confirm against the target environment.
model = CatBoostClassifier(iterations=25, depth=10, learning_rate=0.3, loss_function='MultiClass', random_seed=42, task_type='GPU')
model.fit(train_pool, eval_set=test_pool)

# Persist the trained model under the two file names the inference cells below
# load, so the notebook works after Restart & Run All.
# NOTE: this overwrites any existing files with these names.
import joblib  # local import: this cell's import block does not bring in joblib
model.save_model('model.dump')
joblib.dump(model, 'catboost_model.pkl')

# Predict on both splits. np.ravel flattens the result in case predict returns
# a column vector, so the metrics always receive 1-D label arrays.
y_train_pred = np.ravel(model.predict(X_train))
y_test_pred = np.ravel(model.predict(X_test))

# Weighted F1 scores
train_f1 = f1_score(y_train, y_train_pred, average='weighted')
test_f1 = f1_score(y_test, y_test_pred, average='weighted')

# Accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Report the metrics
print("Train F1 Score:", train_f1)
print("Test F1 Score:", test_f1)
print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

In [37]:
from catboost import CatBoostClassifier
import joblib
import numpy as np
# Restore the classifier from the CatBoost model file on disk.
model = CatBoostClassifier()
model.load_model('model.dump')

# Score one sample, passed as a flat vector of five feature values.
sample_values = [2, 2, 3, 2, 2]
input_data = np.array(sample_values)
prediction = model.predict(input_data)
print(prediction)

[9]


In [35]:
# Restore the joblib-serialized model from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code - only load artifacts from a trusted source.
model_filename = "catboost_model.pkl"
loaded_model = joblib.load(
    model_filename
)

In [36]:
# Sanity-check the joblib-restored model on one sample (a flat vector of five
# feature values). Predict with `loaded_model` from the previous cell rather
# than whatever `model` happens to be left in the kernel, so the cell actually
# exercises the file that was just loaded.
input_data = np.array([2, 2, 3, 2, 2])
prediction = loaded_model.predict(input_data)
print(prediction)

[9]
