In [2]:
import pandas as pd
import warnings
warnings.filterwarnings(action = 'ignore')

final_df = pd.read_csv('/content/drive/MyDrive/VOD Project/준호/데이터스쿨3차_2308월/final_df.csv', index_col=0)
final_df

Unnamed: 0,userid,program,score,main_cat,sub_cat
0,59879000,소방서 옆 경찰서,0.122238,TV드라마,기타
1,59879000,신성한 이혼,0.861328,TV드라마,기타
2,59900000,초대: 스와핑 데이,0.292893,영화,멜로
3,59900000,후궁-제왕의첩,0.292893,영화,멜로
4,59900000,범죄도시3,0.250000,영화,액션/어드벤쳐
...,...,...,...,...,...
1257,67107000,고창 2부,0.000000,우리동네,연예/오락
1258,67107000,마크맨,0.292893,영화,액션/어드벤쳐
1259,67107000,고창 1부,0.000000,우리동네,연예/오락
1260,67107000,전라남도 여수 2부,0.000000,우리동네,연예/오락


In [39]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dot, Dense, Concatenate
from tensorflow.keras.optimizers import Adam
from collections import defaultdict

df = final_df.copy()

# 사용자 및 아이템에 대한 Label Encoding
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

df['user'] = user_encoder.fit_transform(df['userid'])
df['item'] = item_encoder.fit_transform(df['program'])

# 층화추출을 하기 위해 프로그램을 2개 이상 본 사람들만 남김
counts = df['user'].value_counts()
selected_users = counts[counts >= 2].index
df = df[df['user'].isin(selected_users)]

# 데이터 분할
X = df[['user', 'item']]
y = df['score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify = X.user)

# 신경망 모델 구축
def RecommenderNet(num_users, num_items, embedding_size):
    # Input layers
    user_input = Input(shape=(1,), name='user_input')
    item_input = Input(shape=(1,), name='item_input')

    # Embedding layers
    user_embedding = Embedding(output_dim=embedding_size, input_dim=num_users, input_length=1, name='user_embedding')(user_input)
    item_embedding = Embedding(output_dim=embedding_size, input_dim=num_items, input_length=1, name='item_embedding')(item_input)

    # Flatten the embedding layers
    user_vector = Flatten()(user_embedding)
    item_vector = Flatten()(item_embedding)

    # Concatenate the flattened embedding layers
    concat = Concatenate()([user_vector, item_vector])

    # Dense layer
    dense = Dense(128, activation='relu')(concat)
    output = Dense(1)(dense)

    model = Model(inputs=[user_input, item_input], outputs=output)
    return model

# 모델 정의 및 컴파일
embedding_size = 50
model = RecommenderNet(len(user_encoder.classes_), len(item_encoder.classes_), embedding_size)
model.compile(optimizer=Adam(0.001), loss='mean_squared_error')

# 모델 훈련
model.fit([X_train.user, X_train.item], y_train, batch_size=64, epochs=5, verbose=1)

# 추천 함수
def recommend_items(model, user_id, all_items, user_encoder, item_encoder, top_n=10):
    user_idx = user_encoder.transform([user_id])[0]
    user_idx_array = np.array([user_idx for _ in range(len(all_items))])
    items_idx_array = np.array([i for i in range(len(all_items))])

    predictions = model.predict([user_idx_array, items_idx_array]).flatten()
    top_items_idx = predictions.argsort()[-top_n:][::-1]
    top_items_ids = item_encoder.inverse_transform(top_items_idx)

    return top_items_ids

# precision@k 계산
def precision_at_k(model, X_test, y_test, k=10, threshold=4.0):
    user_item_pairs = X_test.to_numpy()
    actual_scores = y_test.to_numpy()
    predictions = model.predict([user_item_pairs[:, 0], user_item_pairs[:, 1]]).flatten()

    precision_scores = defaultdict(list)
    for idx, (user, item, prediction, actual) in enumerate(zip(user_item_pairs[:, 0], user_item_pairs[:, 1], predictions, actual_scores)):
        if actual >= threshold:
            precision_scores[user].append((prediction, 1))
        else:
            precision_scores[user].append((prediction, 0))

    precision_at_k_scores = []
    for user, user_scores in precision_scores.items():
        user_scores.sort(key=lambda x: x[0], reverse=True)
        top_k_scores = user_scores[:k]
        num_relevant_items = sum([score[1] for score in top_k_scores])
        precision_at_k_scores.append(num_relevant_items / k)

    return np.mean(precision_at_k_scores)

# 특정 사용자에 대한 추천
test_user_id = df['userid'].iloc[0]
all_items = df['program'].unique()
recommended_items = recommend_items(model, test_user_id, all_items, user_encoder, item_encoder, top_n=10)
print('recommended_items :', recommended_items)

# 평가 지표 계산
precision_score = precision_at_k(model, X_test, y_test, k=10)
print('precision_score :', precision_score)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
recommended_items : ['천원짜리 변호사' '경경아심 : 두근두근 내 마음을 들어봐' '아적자위여해1 : 나의 고슴도치 그녀1' '청락' '구해줘! 홈즈'
 'TV동물농장' '뽀로로 인기 동요' '하늘의 인연' '가가니별포 : 네가 어디 있든' '짱구는 못말려 23기']
precision_score : 0.0


In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dot, Dense, Concatenate
from tensorflow.keras.optimizers import Adam
from collections import defaultdict

df = final_df.copy()

# 사용자 및 아이템에 대한 Label Encoding
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

df['user'] = user_encoder.fit_transform(df['userid'])
df['item'] = item_encoder.fit_transform(df['program'])

# 데이터 분할
X = df[['user', 'item']]
y = df['score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 신경망 모델 구축
def RecommenderNet(num_users, num_items, embedding_size):
    # Input layers
    user_input = Input(shape=(1,), name='user_input')
    item_input = Input(shape=(1,), name='item_input')

    # Embedding layers
    user_embedding = Embedding(output_dim=embedding_size, input_dim=num_users, input_length=1, name='user_embedding')(user_input)
    item_embedding = Embedding(output_dim=embedding_size, input_dim=num_items, input_length=1, name='item_embedding')(item_input)

    # Flatten the embedding layers
    user_vector = Flatten()(user_embedding)
    item_vector = Flatten()(item_embedding)

    # Concatenate the flattened embedding layers
    concat = Concatenate()([user_vector, item_vector])

    # Dense layer
    dense = Dense(128, activation='relu')(concat)
    output = Dense(1)(dense)

    model = Model(inputs=[user_input, item_input], outputs=output)
    return model

# 모델 정의 및 컴파일
embedding_size = 25
model = RecommenderNet(len(user_encoder.classes_), len(item_encoder.classes_), embedding_size)
model.compile(optimizer=Adam(0.001), loss='mean_squared_error')

# 모델 훈련
model.fit([X_train.user, X_train.item], y_train, batch_size=64, epochs=5, verbose=1)

# 추천 함수
def recommend_items(model, user_id, all_items, user_encoder, item_encoder, top_n=10):
    user_idx = user_encoder.transform([user_id])[0]
    user_idx_array = np.array([user_idx for _ in range(len(all_items))])
    items_idx_array = np.array([i for i in range(len(all_items))])

    predictions = model.predict([user_idx_array, items_idx_array]).flatten()
    top_items_idx = predictions.argsort()[-top_n:][::-1]
    top_items_ids = item_encoder.inverse_transform(top_items_idx)

    return top_items_ids

# precision@k 계산
def precision_at_k(model, X_test, y_test, k=10, threshold=4.0):
    user_item_pairs = X_test.to_numpy()
    actual_scores = y_test.to_numpy()
    predictions = model.predict([user_item_pairs[:, 0], user_item_pairs[:, 1]]).flatten()

    precision_scores = defaultdict(list)
    for idx, (user, item, prediction, actual) in enumerate(zip(user_item_pairs[:, 0], user_item_pairs[:, 1], predictions, actual_scores)):
        if actual >= threshold:
            precision_scores[user].append((prediction, 1))
        else:
            precision_scores[user].append((prediction, 0))

    precision_at_k_scores = []
    for user, user_scores in precision_scores.items():
        user_scores.sort(key=lambda x: x[0], reverse=True)
        top_k_scores = user_scores[:k]
        num_relevant_items = sum([score[1] for score in top_k_scores])
        precision_at_k_scores.append(num_relevant_items / k)

    return np.mean(precision_at_k_scores)

# 특정 사용자에 대한 추천
test_user_id = df['userid'].iloc[0]
all_items = df['program'].unique()
recommended_items = recommend_items(model, test_user_id, all_items, user_encoder, item_encoder, top_n=10)
print('recommended_items :', recommended_items)

# 평가 지표 계산
precision_score = precision_at_k(model, X_test, y_test, k=5)
print('precision_score :', precision_score)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
recommended_items : ['연인 파트1' '뽀로로 인기 동요' '킹더랜드' '심야괴담회' '타요의 씽씽극장 동요2' '응답하라1994'
 '신발 벗고 돌싱포맨' '천원짜리 변호사' '가가니별포 : 네가 어디 있든' '가슴이 뛴다']
precision_score : 0.0


In [12]:
recommended_items

array(['연인 파트1', '짱구는 못말려 22기', '킹더랜드', '신발 벗고 돌싱포맨', '뽀로로 인기 동요',
       '천원짜리 변호사', '아씨두리안', '심야괴담회', '편의점 샛별이', '인간극장'], dtype=object)

In [13]:
precision_score

0.0

In [14]:
X_test

Unnamed: 0,user,item
1206,258,492
868,201,468
532,112,269
344,67,92
405,81,60
...,...,...
163,32,371
485,96,343
679,138,610
221,43,43
