In [1]:
# (Optional) install and import additional libraries here (numpy, pandas, etc.)
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity as cs

In [2]:
# read input.txt 
def read_user_id():
    """Read the user ids to process from ``./input.txt``.

    Each line of the file is treated as one entry. Surrounding whitespace is
    stripped and blank lines (for example a trailing empty line) are skipped,
    so every returned entry is a non-empty string.

    Returns:
        list[str]: the non-blank, stripped lines in file order.
    """
    with open('./input.txt', 'r') as f:
        stripped_lines = (line.strip() for line in f)
        # An empty entry would make a later int() conversion fail, so drop it here.
        return [entry for entry in stripped_lines if entry]

In [3]:
# write to output file output.txt
def write_output(prediction):
    """Save ``prediction`` as CSV in ``output.txt``, leaving out the row index.

    Args:
        prediction: object exposing ``to_csv`` (a pandas DataFrame in this notebook).
    """
    destination = 'output.txt'
    prediction.to_csv(destination, index=False)

In [4]:

def genre_tfidf_func():
    """Build the genre part of the movie representation.

    Reads ``./data/movies_w_imgurl.csv`` and splits its ``|``-separated
    ``genres`` column. Every genre is weighted by
    ``log10(number of movies / occurrences of the genre)``.

    Returns:
        pandas.DataFrame: indexed by ``movieId`` with one column per genre in
        sorted order. A cell holds the genre's IDF weight when the movie lists
        that genre and 0 otherwise.
    """
    movies = pd.read_csv('./data/movies_w_imgurl.csv')
    movies['genres'] = movies['genres'].str.split('|')

    # Inverse document frequency of every genre.
    n_movies = len(movies)
    occurrences = movies.explode('genres')['genres'].value_counts()
    idf = np.log10(n_movies / occurrences)

    # One weight column per genre; `genre=genre` pins the current loop value.
    weight_columns = {
        genre: movies['genres'].apply(
            lambda listed, genre=genre: idf[genre] if genre in listed else 0
        )
        for genre in sorted(occurrences.index)
    }

    table = pd.DataFrame({'movieId': movies['movieId'], **weight_columns})
    return table.set_index('movieId')

def tag_tfidf_func():
    """Build the tag part of the movie representation as TF-IDF weights.

    Reads ``./data/tags.csv``. A ``tag`` cell may hold several comma-separated
    tags; they are split and whitespace-stripped. Rows whose tag is missing
    are ignored.

    * TF  = occurrences of the tag on the movie / all tag occurrences on the movie
    * IDF = log10(number of distinct tagged movies / total occurrences of the tag)

    Returns:
        pandas.DataFrame: indexed by ``movieId`` with one column per tag, each
        cell holding TF * IDF (0 where the movie does not carry the tag).
    """
    # One row per (movie, single tag). Missing tags are dropped first because
    # NaN cannot be split into a list and would raise a TypeError.
    tags_exploded = (
        pd.read_csv('./data/tags.csv')
        .dropna(subset=['tag'])
        .assign(tag=lambda df: df['tag'].str.split(','))
        .explode('tag')
        .assign(tag=lambda df: df['tag'].str.strip())
    )

    # Calculate IDF
    total_count = tags_exploded['movieId'].nunique()
    tag_count = tags_exploded['tag'].value_counts()
    idf = np.log10(total_count / tag_count)

    # Calculate TF: per-movie tag counts normalised by the movie's total tag count.
    tag_counts = tags_exploded.groupby(['movieId', 'tag']).size().unstack(fill_value=0)
    tf = tag_counts.div(tag_counts.sum(axis=1), axis=0)

    # Calculate TF-IDF in one vectorised step; reindex makes the alignment explicit.
    tag_tfidf = tf.mul(idf.reindex(tf.columns), axis=1)
    # Return a plain table: the columns axis carries no 'tag' label.
    tag_tfidf.columns.name = None
    return tag_tfidf
    
def tfidf():
    """Compute the movie-to-movie cosine similarity matrix.

    The genre and tag TF-IDF tables are placed side by side (aligned on their
    index); a movie absent from one of the tables gets zeros for that table's
    columns. Cosine similarity is then taken between all rows.

    Returns:
        pandas.DataFrame: square frame whose rows and columns are both
        labelled with the index of the combined feature table.
    """
    feature_blocks = [genre_tfidf_func(), tag_tfidf_func()]
    features = pd.concat(feature_blocks, axis=1, sort=False).fillna(0)

    movie_labels = features.index
    return pd.DataFrame(cs(features), index=movie_labels, columns=movie_labels)


# Extract each requested user's movie ratings (e.g. userId = 1) and predict scores from them
def do(ids, top_n=30):
    """Predict scores for every movie and keep the best ``top_n`` per user.

    Ratings are read from ``./data/ratings.csv`` and the movie-to-movie
    similarity matrix comes from ``tfidf()``. For a user with ratings ``r_j``
    on movies ``j``, the predicted score of movie ``i`` is::

        sum_j(sim[i, j] * r_j) / (sum_j(sim[i, j]) + 1)

    rounded to 4 decimals.

    Args:
        ids: iterable of user ids; each entry is converted with ``int()``.
        top_n: number of highest-scoring movies kept per user (default 30).

    Returns:
        pandas.DataFrame: columns ``userId`` (int32), ``movieId`` (int32) and
        ``prediction_score``. Rows are grouped per user in the order of
        ``ids`` and sorted by score descending, then ``movieId`` ascending.
        An empty ``ids`` yields an empty frame with the same columns.
    """
    result_columns = ['userId', 'movieId', 'prediction_score']
    id_dtypes = {'userId': 'int32', 'movieId': 'int32'}

    ratings_df = pd.read_csv('./data/ratings.csv')
    simmat = tfidf()
    all_movie_ids = simmat.index.to_numpy().astype(np.int64)

    per_user_top = []
    for user_id in ids:
        user_id = int(user_id)
        user_ratings = ratings_df.loc[ratings_df['userId'] == user_id, ['movieId', 'rating']]

        # Take the similarity columns in the same row order as the ratings so
        # both stay aligned one-to-one (no de-duplication of movie ids).
        rated_movie_ids = user_ratings['movieId'].to_numpy()
        user_sim = simmat[rated_movie_ids].to_numpy().T          # (n_rated, n_movies)
        user_rating = user_ratings['rating'].to_numpy().reshape(-1, 1)
        sim_sum = user_sim.sum(axis=0).reshape(-1, 1)            # (n_movies, 1)

        # The +1 in the denominator keeps the division defined when the
        # similarities sum to zero (e.g. a user without ratings).
        estimated_scores = np.matmul(user_sim.T, user_rating) / (sim_sum + 1)
        estimated_scores = np.round(estimated_scores, 4)

        user_predictions = pd.DataFrame({
            'userId': user_id,
            'movieId': all_movie_ids,
            'prediction_score': estimated_scores.ravel(),
        })
        per_user_top.append(
            user_predictions
            .sort_values(by=['prediction_score', 'movieId'], ascending=[False, True])
            .head(top_n)
        )

    if not per_user_top:
        # pd.concat refuses an empty list; return an empty, correctly typed frame.
        return pd.DataFrame(columns=result_columns).astype(id_dtypes)

    # Concatenate once instead of growing a frame inside the loop.
    df_final = pd.concat(per_user_top, ignore_index=True)
    return df_final.astype(id_dtypes)
        

# Main

In [5]:
# Run the whole pipeline: load the requested user ids from input.txt ...
user_ids = read_user_id()
# ... compute the top predicted movies for each of those users ...
result = do(user_ids)
# ... and save the predictions to output.txt.
write_output(result)