In [159]:
# module
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import numpy as np

In [160]:
# Util Functions
from enum import Enum

# Read a delimited text file and return it as a pandas DataFrame.
def get_df_from(path: str, sep: str = ','):
    """Load a ``.csv`` or ``.txt`` file into a DataFrame.

    Args:
        path: Location of the file; the text after the last ``.`` is
            treated as the extension.
        sep: Field separator forwarded to ``pd.read_csv``.

    Returns:
        The parsed DataFrame, or ``None`` (after printing a message) when
        the extension is unsupported, the file does not exist, parsing
        fails, any other error is raised while reading, or the parsed
        frame is empty. Malformed lines are skipped by the parser.
    """
    extension = path.rsplit('.', 1)[-1]
    try:
        if extension not in ("csv", "txt"):
            print("E: File extension is not supported.")
            return None

        frame = pd.read_csv(path, sep=sep, on_bad_lines='skip')
        if not frame.empty:
            return frame

        print("W: DataFrame is empty.")
        return None
    except FileNotFoundError:
        print(f"E: File not found. Check the path: {path}")
    except pd.errors.ParserError as e:
        print(f"E: Error parsing CSV file: {e}")
    except Exception as e:
        print(f"E: An unexpected error occured: {e}")
    # Every handled failure above ends up here.
    return None

# Return the mean of a collection (0 when it is empty).
def calculate_mean(lst):
    """Return the arithmetic mean of ``lst``, or 0 when there is nothing to average.

    Emptiness is decided by length rather than truthiness, so numpy arrays
    and pandas Series (whose truth value is ambiguous) are accepted as well
    as plain lists. Objects without a length (``None``, scalars) fall back
    to a truthiness check.

    Args:
        lst: A list, tuple, numpy array, pandas Series, or ``None``.

    Returns:
        ``np.mean(lst)`` for non-empty input, otherwise ``0``.
    """
    try:
        is_empty = len(lst) == 0
    except TypeError:
        # No len(): None, plain scalars, 0-d arrays.
        is_empty = not lst
    if is_empty:
        return 0
    return np.mean(lst)

# Return the sum of a collection (0 when it is empty).
def calculate_sum(lst):
    """Return the sum of ``lst``, or 0 when there is nothing to add.

    Emptiness is decided by length rather than truthiness, so numpy arrays
    and pandas Series (whose truth value is ambiguous) are accepted as well
    as plain lists. Objects without a length (``None``, scalars) fall back
    to a truthiness check.

    Args:
        lst: A list, tuple, numpy array, pandas Series, or ``None``.

    Returns:
        ``np.sum(lst)`` for non-empty input, otherwise ``0``.
    """
    try:
        is_empty = len(lst) == 0
    except TypeError:
        # No len(): None, plain scalars, 0-d arrays.
        is_empty = not lst
    if is_empty:
        return 0
    return np.sum(lst)


class ChartShape(Enum):
    """Chart kinds that the plotting helpers in this file accept."""

    BAR = 1
    HISTOGRAM = 2
    LINE = 3
    # get_chart_from falls back to a line plot for this member and
    # get_chart_from_series draws nothing for it.
    HEATMAP = 4

# Draw a chart from two DataFrame columns.
def get_chart_from(df: pd.DataFrame, x_col: str, y_col: str, shape: ChartShape = ChartShape.BAR, real_time: bool = False):
    """Plot ``df[y_col]`` against ``df[x_col]`` and show the figure.

    Args:
        df: Source data.
        x_col: Column used for the x axis (and the only column used for a
            histogram).
        y_col: Column used for the y axis.
        shape: Chart kind. ``BAR`` and ``HISTOGRAM`` have dedicated
            renderers; every other member (``LINE``, ``HEATMAP``) is drawn
            as a line plot.
        real_time: When true, the x axis is treated as dates: major ticks
            are placed monthly and formatted as ``%Y-%m-%d``.
    """
    plt.figure(figsize=(10, 6))
    if shape == ChartShape.BAR:
        plt.bar(df[x_col], df[y_col])
    elif shape == ChartShape.HISTOGRAM:
        plt.hist(df[x_col], bins=10)
    else:
        # LINE, plus any shape without its own renderer.
        plt.plot(df[x_col], df[y_col])

    if real_time:
        axis = plt.gca().xaxis
        axis.set_major_locator(mdates.MonthLocator())  # major tick per month
        axis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
        # Only rotate the labels here: passing explicit tick positions to
        # plt.xticks would replace the month locator set just above.
        plt.gcf().autofmt_xdate(rotation=45, ha='right')
    else:
        plt.xticks(df[x_col], rotation=45, ha='right')

    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.grid(True)
    # Lay out last so the rotated tick labels are taken into account.
    plt.tight_layout()
    plt.show()

def get_chart_from_series(sr: pd.Series, x_col: str, y_col: str, shape: ChartShape = ChartShape.BAR):
    """Plot a Series with its own ``.plot`` accessor and show the figure.

    Args:
        sr: Series to draw.
        x_col: Text used as the x-axis label.
        y_col: Text used as the y-axis label.
        shape: ``BAR``, ``LINE`` or ``HISTOGRAM``; any other value draws
            nothing before the labels are applied.
    """
    renderers = (
        (ChartShape.BAR, 'bar'),
        (ChartShape.LINE, 'line'),
        (ChartShape.HISTOGRAM, 'hist'),
    )
    for candidate, method_name in renderers:
        if shape == candidate:
            getattr(sr.plot, method_name)()

    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.show()

def get_integer(question: str):
    """Prompt the user and return the answer as an integer.

    Args:
        question: Prompt text passed to ``input``.

    Returns:
        The parsed integer, or ``0`` (after printing an error) when the
        answer is not a valid integer or the input stream is exhausted.
        Other exceptions, including ``KeyboardInterrupt``, propagate.
    """
    try:
        return int(input(question))
    except (ValueError, EOFError):
        print("E: invalid integer")
        return 0

In [161]:
# Function

# Directory that holds the KMRD data files, relative to this notebook.
folder_path = '../public/kmrd-small/'

# Table name -> path of the file it is loaded from.
file_paths = {
    'castings': f'{folder_path}castings.csv',
    'countries': f'{folder_path}countries.csv',
    'genres': f'{folder_path}genres.csv',
    'movies': f'{folder_path}movies.txt',
    'peoples': f'{folder_path}peoples.txt',
    'rates': f'{folder_path}rates.csv',
}

def get_df_strict_from(path: str, sep: str = ','):
    """Load a DataFrame, asking the user for a corrected path on failure.

    The file is first read with ``get_df_from(path, sep)``. While that
    yields ``None`` the user is prompted for a new path and separator, up
    to three times, and the result of every attempt (including the last
    one) is checked.

    Args:
        path: Initial file path.
        sep: Initial field separator.

    Returns:
        The first successfully loaded DataFrame.

    Raises:
        SystemExit: If the initial read and all three retries fail.
    """
    df = get_df_from(path, sep)
    retries_left = 3
    while df is None and retries_left > 0:
        new_path = input('오류가 발생했습니다. 정확한 경로를 확인해주세요: ')
        # An empty answer falls back to a comma instead of an invalid separator.
        new_sep = input('구분자를 입력해주세요: ') or ','
        df = get_df_from(new_path, new_sep)
        retries_left -= 1

    if df is None:
        raise SystemExit("E: Could not load the data file; giving up.")
    return df

### 분류 평가

- `rate_random_class` 필드를 추가해서 평점을 반올림
- 이를 이용해 분류 평가한다.
  - Confusion Matrix
  - Accuracy
  - Precision
  - Recall
  - F1 Score

In [None]:
# Loaded tables, keyed by table name.
dfs = {}


def movie_data_loader(names):
    """Load each named table into the module-level ``dfs`` dict.

    Paths come from ``file_paths``. The 'movies' and 'peoples' tables are
    read with a tab separator; every other table uses the loader's default.

    Args:
        names: Iterable of keys of ``file_paths``.
    """
    tab_separated = ('movies', 'peoples')
    for name in names:
        path = file_paths[name]
        dfs[name] = (
            get_df_strict_from(path, '\t')
            if name in tab_separated
            else get_df_strict_from(path)
        )

class AvgMovieRecommender:
    """Recommend movies ranked by their average rating.

    The ratings are read from the module-level ``dfs['rates']`` table,
    using its 'movie' and 'rate' columns.
    """

    def __init__(self):
        pass

    def run(self, cnt, min_ratings=3):
        """Return the ``cnt`` best-rated movies with a random baseline.

        Args:
            cnt: Maximum number of movies to return.
            min_ratings: Movies with fewer ratings than this are ignored.

        Returns:
            A DataFrame with one row per movie (the first rating row seen
            for that movie), sorted by descending average rating, with the
            original row label in an 'index' column and these added columns:
            'rate_avg_movie', 'rate_avg_movie_class' (rounded average),
            'rate_random' (uniform in [0, 10)) and 'rate_random_class'
            (rounded random value). Fewer than ``cnt`` rows are returned
            when not enough movies qualify.
        """
        rates_df = dfs['rates']
        rates_mean = rates_df['rate'].mean()

        # Drop movies that have fewer than `min_ratings` ratings.
        movie_counts = rates_df['movie'].value_counts()
        ids_over_n = movie_counts[movie_counts >= min_ratings].index
        rates_df = rates_df[rates_df['movie'].isin(ids_over_n)]

        # Average rating per movie.
        rates_mean_df = pd.DataFrame(rates_df.groupby('movie')['rate'].mean())
        rates_mean_df.columns = ['rate_avg_movie']

        # TODO: a per-user average column (rate_avg_user) is not produced yet.

        # One row per movie, then attach the movie's average rating.
        res = rates_df.drop_duplicates(subset=['movie'])
        res = pd.merge(res, rates_mean_df, left_on='movie', right_index=True, how='left')
        res['rate_avg_movie'] = res['rate_avg_movie'].fillna(rates_mean)
        res['rate_avg_movie_class'] = np.round(res['rate_avg_movie'], 0).astype(int)

        res = res.sort_values('rate_avg_movie', ascending=False).head(cnt).copy()

        # Random baseline; sized by the rows actually kept, which may be
        # fewer than `cnt`.
        res['rate_random'] = np.random.uniform(0, 10, size=len(res)).round(3)
        res['rate_random_class'] = np.round(res['rate_random'], 0).astype(int)
        return res.reset_index()
        
# Load the 'rates' and 'movies' tables into the module-level `dfs` dict.
movie_data_loader(['rates', 'movies'])
        
# Quick check of the recommender: ask for the top 3 movies.
recommender = AvgMovieRecommender()
recommender.run(3)

Unnamed: 0,index,user,movie,rate,time,rate_avg_movie,rate_avg_movie_class,rate_random,rate_random_class
0,129634,41731,10518,10,1549526640,10.0,10,2.379,2
1,101847,22382,10384,10,1528194300,10.0,10,0.719,1
2,14837,824,10094,9,1169752920,9.875,10,8.326,8


> Analyzer

- 결과를 분석하는 class
  - 평균 절대 오차(MAE)
  - 평균 제곱 오차(MSE)
  - 평균 제곱근 오차(RMSE)
  - 평균 절대 비율 오차(MAPE)

In [163]:
def create_confusion_matrix(y_true, y_pred, n_labels=10):
    """Build a confusion matrix without sklearn.

    Each entry of ``y_true`` is a frequency array whose position ``i``
    holds how many times true label ``i + 1`` occurred; the paired entry
    of ``y_pred`` is a single predicted label. Row ``i`` / column ``j`` of
    the result accumulates the count of true label ``i + 1`` seen together
    with predicted label ``j + 1``.

    Args:
        y_true: Sequence of numpy frequency arrays. Entries that are not
            numpy arrays (for example NaN) are skipped.
        y_pred: Sequence of predicted labels, same length as ``y_true``.
        n_labels: Number of classes; labels run from 1 to ``n_labels``.

    Returns:
        An ``(n_labels, n_labels)`` integer array.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` differ in length.
    """
    if len(y_true) != len(y_pred):
        raise ValueError("y_true와 y_pred의 길이가 같아야 합니다.")

    cm = np.zeros((n_labels, n_labels), dtype=int)

    for true_label, pred_label in zip(y_true, y_pred):
        if not isinstance(true_label, np.ndarray):
            continue
        # Clamp to the valid label range: a prediction of 0 would otherwise
        # index column -1 and be counted as the highest label.
        col = min(max(int(pred_label), 1), n_labels) - 1
        for index, cnt in enumerate(true_label):
            cm[index, col] += cnt
    return cm



In [164]:
def get_freq_list(x, n_labels=10):
    """Count how often each label from 1 to ``n_labels`` occurs in ``x``.

    Args:
        x: Iterable of labels.
        n_labels: Highest valid label.

    Returns:
        A float array of length ``n_labels`` whose position ``i`` holds the
        number of occurrences of label ``i + 1``. Values outside
        ``1..n_labels`` are ignored, so a 0 is not wrapped around into the
        last bucket.
    """
    res = np.zeros(n_labels)
    for item in x:
        if 1 <= item <= n_labels:
            res[item - 1] += 1
    return res

class Analyzer():
    """Regression and classification metrics for recommendation results."""

    def __init__(self):
        pass

    def mae(self, df: pd.DataFrame, real: str, pred: str):
        """Return the mean absolute error between columns ``real`` and ``pred``."""
        return np.mean(np.abs(df[real] - df[pred]))

    def mse(self, df: pd.DataFrame, real: str, pred: str):
        """Return the mean squared error between columns ``real`` and ``pred``."""
        return np.mean(np.power(df[real] - df[pred], 2))

    def rmse(self, df: pd.DataFrame, real: str, pred: str):
        """Return the root mean squared error (square root of ``mse``)."""
        return np.sqrt(self.mse(df, real, pred))

    def mape(self, df: pd.DataFrame, real: str, pred: str):
        """Return the mean absolute percentage error, in percent.

        Each error is divided by the absolute ``real`` value, so a ``real``
        value of 0 is not handled specially and yields a non-finite result.
        """
        ratios = np.abs(df[real] - df[pred]) / np.abs(df[real])
        return np.mean(ratios) * 100

    def print_regression(self, df: pd.DataFrame, title: str, y_true: str, y_pred: str):
        """Print MAE, MSE, RMSE and MAPE for one prediction column."""
        print(f"--------- {title} Regression --------\n")
        print(f"MAE: {self.mae(df, y_true, y_pred)}")
        print(f"MSE: {self.mse(df, y_true, y_pred)}")
        print(f"RMSE: {self.rmse(df, y_true, y_pred)}")
        print(f"MAPE: {self.mape(df, y_true, y_pred)}")

    def analyze(self, df: pd.DataFrame):
        """Print regression metrics for the random and the average-rating predictions."""
        self.print_regression(df, 'Random', 'rate', 'rate_random')
        # Own title so the two reports can be told apart.
        self.print_regression(df, 'Avg Movie', 'rate', 'rate_avg_movie')

    def confusion_items(self, df: pd.DataFrame):
        """Return per-class TP / FP / TN / FN counts of a confusion matrix.

        Args:
            df: Square confusion matrix (array-like) with true labels on the
                rows and predicted labels on the columns.

        Returns:
            A dict with keys 'tp', 'fp', 'tn' and 'fn', each a 1-D array
            holding one value per class.
        """
        cm = np.asarray(df)
        tp = np.diag(cm)
        fp = cm.sum(axis=0) - tp  # column sums: predicted as the class, but wrong
        fn = cm.sum(axis=1) - tp  # row sums: the class, but predicted otherwise
        tn = cm.sum() - tp - fp - fn
        return {
            'tp': tp, 'fp': fp, 'tn': tn, 'fn': fn
        }

    def _classification_scores(self, confus_mat):
        """Return accuracy and per-class precision / recall / F1 of a confusion matrix.

        Classes whose denominator is zero get a score of 0, and accuracy is
        0.0 for an all-zero matrix.
        """
        items = self.confusion_items(confus_mat)
        tp, fp, fn = items['tp'], items['fp'], items['fn']

        # Multi-class accuracy: correctly classified samples over all samples.
        total = np.sum(confus_mat)
        accu = tp.sum() / total if total else 0.0

        # Zero-initialised outputs: np.divide leaves the positions excluded
        # by `where` untouched, so they must not start out uninitialised.
        prec = np.zeros_like(tp, dtype=float)
        recall = np.zeros_like(tp, dtype=float)
        f1_score = np.zeros_like(tp, dtype=float)
        np.divide(tp, tp + fp, out=prec, where=(tp + fp) != 0)
        np.divide(tp, tp + fn, out=recall, where=(tp + fn) != 0)
        np.divide(2 * prec * recall, prec + recall, out=f1_score, where=(prec + recall) != 0)
        return {
            'accuracy': accu, 'precision': prec, 'recall': recall, 'f1': f1_score
        }

    def analyze_classify(self, df: pd.DataFrame, y_pred: str):
        """Print the confusion matrix and classification scores for ``y_pred``.

        For every movie in ``df`` the frequency of each rating found in the
        module-level ``dfs['rates']`` table serves as the true labels; the
        ``y_pred`` column of ``df`` is the predicted class.

        Args:
            df: Recommendation result with a 'movie' column.
            y_pred: Name of the predicted-class column in ``df``.
        """
        rates_df = dfs['rates']
        rates_classify = pd.DataFrame(rates_df.groupby('movie')['rate'].apply(list).apply(get_freq_list))
        rates_classify.columns = ['rates_freq']
        res = pd.merge(df, rates_classify, how='left', left_on='movie', right_index=True)
        confus_mat = create_confusion_matrix(res['rates_freq'], res[y_pred])
        print(confus_mat)

        scores = self._classification_scores(confus_mat)
        accu = scores['accuracy']
        prec = scores['precision']
        recall = scores['recall']
        f1_score = scores['f1']
        print(f"Accuracy : {accu}")
        print(f"Precision :{prec}")
        print(f"Recall :{recall}")
        print(f"F1_Score :{f1_score}")



In [167]:
# Recommend `cnt` movies, then report regression and classification metrics.
recommender = AvgMovieRecommender()
analyzer = Analyzer()
# Interactive alternative: cnt = get_integer('몇개의 영화를 추천드릴까요?')
cnt = 550
try:
    recommends = recommender.run(cnt)

    print(recommends)
    analyzer.analyze(recommends)

    analyzer.analyze_classify(recommends, 'rate_avg_movie_class')
    analyzer.analyze_classify(recommends, 'rate_random_class')
except Exception as e:
    # Report what failed instead of hiding it; KeyboardInterrupt and
    # SystemExit are no longer swallowed.
    print(f"Error occured: {type(e).__name__}: {e}")



      index   user  movie  rate        time  rate_avg_movie  \
0    129634  41731  10518    10  1549526640       10.000000   
1    101847  22382  10384    10  1528194300       10.000000   
2     14837    824  10094     9  1169752920        9.875000   
3     22616   1525  10499    10  1315860540        9.800000   
4      5672    170  10294    10  1217863200        9.764706   
..      ...    ...    ...   ...         ...             ...   
545     795     25  10836     7  1497869580        6.210526   
546    2040     70  10408     6  1274207880        6.157895   
547    3155     92  10524     1  1561040760        6.145161   
548     261      4  10920     6  1442566620        6.064516   
549    2305     74  10392     8  1378137900        6.045455   

     rate_avg_movie_class  rate_random  rate_random_class  
0                      10        4.193                  4  
1                      10        8.047                  8  
2                      10        3.591                  4  
3  