In [3]:
# module
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import numpy as np

In [4]:
# Util Functions
from enum import Enum

# 파일을 읽어와 pandas의 DataFrame으로 반환해줌
def get_df_from(path: str, sep: str = ','):
    extension = path.split('.')[-1]
    try:
        if extension == "csv" or extension == "txt":
            df = pd.read_csv(path, sep=sep, on_bad_lines='skip')
            if df.empty:
                print("W: DataFrame is empty.")
                return None
            return df
        else:
            print("E: File extension is not supported.")
            return None
    except FileNotFoundError:
        print(f"E: File not found. Check the path: {path}")
        return None
    except pd.errors.ParserError as e:
        print(f"E: Error parsing CSV file: {e}")
        return None
    except Exception as e:
        print(f"E: An unexpected error occured: {e}")
        return None

# 리스트의 평균을 반환.(기본값 0)
def calculate_mean(lst):
    if lst: # 빈 리스트가 아닌 경우에만 평균 계산
        return np.mean(lst)
    return 0 # 빈 리스트인 경우 NaN 반환

# 리스트의 합을 반환.(기본값 0)
def calculate_sum(lst):
    if lst:
        return np.sum(lst)
    return 0


class ChartShape(Enum):
    BAR = 1
    HISTOGRAM = 2
    LINE = 3
    HEATMAP = 4

# Chart 뽑는 함수
def get_chart_from(df: pd.DataFrame, x_col: str, y_col: str, shape: ChartShape = ChartShape.BAR, real_time: bool = False):
    plt.figure(figsize=(10, 6))
    if shape == ChartShape.BAR:
        plt.bar(df[x_col], df[y_col])
    elif shape == ChartShape.HISTOGRAM:
        plt.hist(df[x_col], bins=10)
    elif shape == ChartShape.LINE:
        plt.plot(df[x_col], df[y_col])
    else:
        plt.plot(df[x_col], df[y_col])

    if real_time:
        plt.gca().xaxis.set_major_locator(mdates.MonthLocator()) # 월 단위로 큰 눈금 표시
        plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d')) # 날짜 형식 지정
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.grid(True)
    plt.tight_layout()
    plt.xticks(df[x_col], rotation=45, ha='right')
    plt.show()

def get_chart_from_series(sr: pd.Series, x_col: str, y_col: str, shape: ChartShape = ChartShape.BAR):
    if shape == ChartShape.BAR:
        sr.plot.bar()
    if shape == ChartShape.LINE:
        sr.plot.line()
    if shape == ChartShape.HISTOGRAM:
        sr.plot.hist()
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.show()

def get_integer(question: str):
    try:
        id = int(input(question))
        return id
    except:
        print(f"E: invalid integer")
        return 0

In [5]:
# Function

folder_path = '../public/kmrd-small/'

file_paths = {
    'castings': folder_path + 'castings.csv',
    'countries': folder_path + 'countries.csv',
    'genres': folder_path + 'genres.csv',
    'movies': folder_path + 'movies.txt',
    'peoples': folder_path + 'peoples.txt',
    'rates': folder_path + 'rates.csv',
}

def get_df_strict_from(path: str, sep: str = ','):
    df = get_df_from(path, sep)
    for _ in range(3):
        if df is not None:
            return df
        new_path = input('오류가 발생했습니다. 정확한 경로를 확인해주세요: ')
        new_sep = input('구분자를 입력해주세요: ')
        df = get_df_from(new_path, new_sep)
    exit()

### 정밀도와 재현율

- 사용자별 평가를 10개만 한 사용자 && 영화별 평점 개수가 [60, 200)인 영화

In [6]:
dfs = {}
names = ['rates', 'movies']
for name in names:
    path = file_paths[name]
    if name == 'movies' or name == 'peoples':
        dfs[name] = get_df_strict_from(path, '\t')
    else:
        dfs[name] = get_df_strict_from(path)

In [7]:
from sklearn.preprocessing import MultiLabelBinarizer

In [8]:
rates_df = dfs['rates']
movies_df = dfs['movies']

user_rates_count = rates_df.value_counts('user')
user_rates_10 = rates_df.groupby('user')['movie'].apply(list)[user_rates_count == 10]

user_rates_10_idx = user_rates_10.index
rates_df_10 = rates_df[rates_df['user'].isin(user_rates_10_idx)]
movie_rates_count = rates_df_10.value_counts('movie')

movie_rates_60to200 = movie_rates_count[(movie_rates_count >= 60) & (movie_rates_count < 200)].index

result = pd.DataFrame(user_rates_10.apply(lambda x: [item for item in x if item in movie_rates_60to200]))

result.columns = ['movie']
result['cnt'] = result['movie'].apply(len)

result = result.sort_values('cnt', ascending=False)
result = result[result['cnt'] > 4]

mlb = MultiLabelBinarizer()

movie_mat = pd.DataFrame(mlb.fit_transform(result['movie']), columns=mlb.classes_, index=result.index)

movie_mat['m'] = movie_mat.sum(axis=1)

movie_mat = movie_mat.sort_values(by='m', ascending=False)

print(movie_mat)
# result.pivot_table(index="user", columns='movie', aggfunc='count')

       10001  10016  10048  10071  10102  10114  10200  10936  m
user                                                            
1917       1      0      1      1      1      1      1      1  7
10418      0      1      1      1      1      0      1      1  6
1105       1      1      1      1      0      0      1      1  6
1805       1      0      1      1      1      1      1      0  6
5136       0      0      1      1      1      1      1      1  6
2277       1      1      1      0      1      0      1      1  6
1561       1      0      1      1      0      1      1      1  6
1980       1      1      0      1      1      1      0      1  6
1312       1      1      1      0      0      0      1      1  5
3189       1      0      0      1      0      1      1      1  5
3674       0      1      0      1      0      1      1      1  5
2469       1      1      1      1      1      0      0      0  5
5166       0      1      0      1      1      1      0      1  5
729        1      1      

In [9]:
def get_movie_id_from_name(lst: list):
    res = []
    for name in lst:
        res.append(movies_df[movies_df['title'] == name]['movie'].values[0])
    return res
        

recommends = ['가위손', '나 홀로 집에', '대부', '사운드 오브 뮤직']

recommends = get_movie_id_from_name(recommends)
print(recommends)
movie_mat_sum = movie_mat.sum()

movie_mat['K'] = len(recommends)
precision = []

movie_mat['AP'] = 0
for idx, id in enumerate(recommends):
    movie_mat[f"Precision@{idx + 1}"] = movie_mat[recommends[:idx + 1]].sum(axis=1) / (idx + 1)
    movie_mat['AP'] += movie_mat[f"Precision@{idx + 1}"] * movie_mat[id]

movie_mat['AP'] /= movie_mat['m']
movie_mat['tp'] = movie_mat[recommends].sum(axis=1)
movie_mat['fp'] = len(recommends) - movie_mat['tp']
movie_mat['fn'] = movie_mat['m'] - movie_mat['tp']
movie_mat['precision'] = movie_mat['tp'] / (movie_mat['tp'] + movie_mat['fp'])
movie_mat['recall'] = movie_mat['tp'] / (movie_mat['tp'] + movie_mat['fn'])
print(movie_mat)


[10936, 10016, 10071, 10102]
       10001  10016  10048  10071  10102  ...  tp  fp  fn  precision    recall
user                                      ...                                 
1917       1      0      1      1      1  ...   3   1   4       0.75  0.428571
10418      0      1      1      1      1  ...   4   0   2       1.00  0.666667
1105       1      1      1      1      0  ...   3   1   3       0.75  0.500000
1805       1      0      1      1      1  ...   2   2   4       0.50  0.333333
5136       0      0      1      1      1  ...   3   1   3       0.75  0.500000
2277       1      1      1      0      1  ...   3   1   3       0.75  0.500000
1561       1      0      1      1      0  ...   2   2   4       0.50  0.333333
1980       1      1      0      1      1  ...   4   0   2       1.00  0.666667
1312       1      1      1      0      0  ...   2   2   3       0.50  0.400000
3189       1      0      0      1      0  ...   2   2   3       0.50  0.400000
3674       0      1    

In [29]:
from sklearn.metrics import average_precision_score

print(movie_mat[10001:10071])
res = average_precision_score([1, 0, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 0, 0, 0, 0])
print(res)

Empty DataFrame
Columns: [10001, 10016, 10048, 10071, 10102, 10114, 10200, 10936, m, K, AP, Precision@1, Precision@2, Precision@3, Precision@4, tp, fp, fn, precision, recall]
Index: []
0.8214285714285714
