In [54]:
# module
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import numpy as np

In [55]:
# Util Functions
from enum import Enum

# 파일을 읽어와 pandas의 DataFrame으로 반환해줌
def get_df_from(path: str, sep: str = ','):
    extension = path.split('.')[-1]
    try:
        if extension == "csv" or extension == "txt":
            df = pd.read_csv(path, sep=sep, on_bad_lines='skip')
            if df.empty:
                print("W: DataFrame is empty.")
                return None
            return df
        else:
            print("E: File extension is not supported.")
            return None
    except FileNotFoundError:
        print(f"E: File not found. Check the path: {path}")
        return None
    except pd.errors.ParserError as e:
        print(f"E: Error parsing CSV file: {e}")
        return None
    except Exception as e:
        print(f"E: An unexpected error occured: {e}")
        return None

# 리스트의 평균을 반환.(기본값 0)
def calculate_mean(lst):
    if lst: # 빈 리스트가 아닌 경우에만 평균 계산
        return np.mean(lst)
    return 0 # 빈 리스트인 경우 NaN 반환

# 리스트의 합을 반환.(기본값 0)
def calculate_sum(lst):
    if lst:
        return np.sum(lst)
    return 0


class ChartShape(Enum):
    BAR = 1
    HISTOGRAM = 2
    LINE = 3
    HEATMAP = 4

# Chart 뽑는 함수
def get_chart_from(df: pd.DataFrame, x_col: str, y_col: str, shape: ChartShape = ChartShape.BAR, real_time: bool = False):
    plt.figure(figsize=(10, 6))
    if shape == ChartShape.BAR:
        plt.bar(df[x_col], df[y_col])
    elif shape == ChartShape.HISTOGRAM:
        plt.hist(df[x_col], bins=10)
    elif shape == ChartShape.LINE:
        plt.plot(df[x_col], df[y_col])
    else:
        plt.plot(df[x_col], df[y_col])

    if real_time:
        plt.gca().xaxis.set_major_locator(mdates.MonthLocator()) # 월 단위로 큰 눈금 표시
        plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d')) # 날짜 형식 지정
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.grid(True)
    plt.tight_layout()
    plt.xticks(df[x_col], rotation=45, ha='right')
    plt.show()

def get_chart_from_series(sr: pd.Series, x_col: str, y_col: str, shape: ChartShape = ChartShape.BAR):
    if shape == ChartShape.BAR:
        sr.plot.bar()
    if shape == ChartShape.LINE:
        sr.plot.line()
    if shape == ChartShape.HISTOGRAM:
        sr.plot.hist()
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.show()

def get_integer(question: str):
    try:
        id = int(input(question))
        return id
    except:
        print(f"E: invalid integer")
        return 0

In [56]:
# Function

folder_path = '../public/kmrd-small/'

file_paths = {
    'castings': folder_path + 'castings.csv',
    'countries': folder_path + 'countries.csv',
    'genres': folder_path + 'genres.csv',
    'movies': folder_path + 'movies.txt',
    'peoples': folder_path + 'peoples.txt',
    'rates': folder_path + 'rates.csv',
}

def get_df_strict_from(path: str, sep: str = ','):
    df = get_df_from(path, sep)
    for _ in range(3):
        if df is not None:
            return df
        new_path = input('오류가 발생했습니다. 정확한 경로를 확인해주세요: ')
        new_sep = input('구분자를 입력해주세요: ')
        df = get_df_from(new_path, new_sep)
    exit()

- 랜덤 영화 추천

> MovieDataLoader
  - 데이터를 load 하고 전처리하는 class
  - 입력 파라미터: 파일 경로
  - return value: 데이터

> RandomRecommender
  - rates.csv, movies.txt 사용
  - 추천 실행 함수: run()
  - rate_random column 도 추가, random 값을 반영해서 결과를 return 한다.

In [57]:
dfs = {}
names = ['rates', 'movies']
for name in names:
    path = file_paths[name]
    if name == 'movies' or name == 'peoples':
        dfs[name] = get_df_strict_from(path, '\t')
    else:
        dfs[name] = get_df_strict_from(path)

In [58]:
class MovieDateLoader:
    def __init__(self, name: str):
        path = file_paths[name]
        if name == 'movies' or name == 'peoples':
            self.df =  get_df_strict_from(path, '\t')
        else:
            self.df = get_df_strict_from(path)

        if name == 'rates':
            self.rates_mean = self.df[self.df.notnull()]['rate'].mean()
            self.df['rate'].fillna(self.rates_mean)
        self.df.fillna(0)
    
class RandomRecommender:
    def __init__(self):
        self.dfs = {}
        self.dfs['rates'] = MovieDateLoader('rates')
        self.dfs['movies'] = MovieDateLoader('movies')

    def run(self, cnt):
        rates_df = self.dfs['rates'].df
        movies_df = self.dfs['movies'].df
        rates_mean = self.dfs['rates'].rates_mean

        # 랜덤으로 영화 선택
        rand_nums = np.random.choice(len(movies_df), size=cnt, replace=False)
        res = movies_df.loc[rand_nums]

        # 영화별 별점 평균값 df 생성
        rates_mean_df = pd.DataFrame(rates_df.groupby('movie')['rate'].mean())
        rates_mean_df.columns = ['rates_mean']
        res = pd.merge(res, rates_mean_df, left_on='movie', right_index=True, how='left')
        res['rates_mean'] = res['rates_mean'].fillna(rates_mean)
        res['rate_random'] = np.random.uniform(1, 10, size=cnt).round(3)
        res = res.sort_values(by='rate_random', ascending=False)
        return res.reset_index()
        

> Analyzer

- 결과를 분석하는 class
  - 평균 절대 오차(MAE)
  - 평균 제곱 오차(MSE)
  - 평균 제곱근 오차(RSME)
  - 평균 절대 비율 오차(MAPE)

In [59]:
class Analyzer():
    def __init__(self):
        pass

    def mae(self, df: pd.DataFrame, real: str, pred: str):
        cnt = len(df)
        return np.sum(np.abs(df[real] - df[pred])) / cnt
        
    def mse(self, df: pd.DataFrame, real: str, pred: str):
        cnt = len(df)
        return np.sum(np.power(df[real] - df[pred], 2)) / cnt
    
    def rmse(self, df: pd.DataFrame, real: str, pred: str):
        return np.sqrt(self.mse(df, real, pred))
    
    def mape(self, df: pd.DataFrame, real: str, pred: str):
        cnt = len(df)
        return np.sum(np.abs(df[real] - df[pred]) / np.abs(df[real])) / cnt
        
    def analyze(self, df: pd.DataFrame, real: str, pred: str):
        print(f"MAE: {self.mae(df, real, pred)}")
        print(f"MSE: {self.mse(df, real, pred)}")
        print(f"RMSE: {self.rmse(df, real, pred)}")
        print(f"MAPE: {self.mape(df, real, pred)}")
        

In [60]:
recommender = RandomRecommender()
analyzer = Analyzer()
cnt = get_integer('몇개의 영화를 추천드릴까요?')
recommends = recommender.run(cnt)

print(recommends)
analyzer.analyze(recommends, 'rates_mean', 'rate_random')


   index  movie   title                                          title_eng  \
0    139  10140   매드 맥스                                     Mad Max , 1979   
1    764  10765  공군 대전략                           Battle Of Britain , 1969   
2    473  10474     알파빌  Alphaville, Une Etrange Aventure De Lemmy Caut...   
3    842  10843  칠수와 만수                          Chil-su And Man-su , 1988   
4    576  10577  정글의 여인                         Dance Of The Dwarfs , 1984   

     year     grade  rates_mean  rate_random  
0  1980.0  청소년 관람불가    7.601695        8.500  
1     NaN         G    8.953258        8.163  
2     NaN        NR    8.953258        7.576  
3  1988.0   15세 관람가    8.686047        4.642  
4     NaN        PG    8.953258        3.467  
MAE: 2.519225404201178
MSE: 9.956329126078261
RMSE: 3.1553651335587554
MAPE: 0.2877221081249853
