In [None]:
# Mount Google Drive at /content/drive; CFG.PATH below points inside this mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install category_encoders, which the import cell below loads as `ce`.
!pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### 한글 깨짐 방지

# Import

In [None]:
import pandas as pd
import numpy as np

import category_encoders as ce
from sklearn.preprocessing import LabelEncoder

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import lightgbm as lgb


## Set CFG

In [None]:
class CFG:
    """Notebook-wide configuration constants."""
    # Seed handed to the imputer as `random_state`.
    SEED = 42
    # Root folder used as the prefix of every CSV path read or written in this notebook.
    # NOTE(review): "FIANL" looks like a typo of "FINAL" — confirm the real Drive folder name before changing it.
    PATH = "/content/drive/MyDrive/Dacon/FIANL_MOVIE_RECOM/"
    

## Load Dataset

In [None]:
# Read the raw movie table from Drive as UTF-8 text and preview the first rows.
movies_csv_path = f"{CFG.PATH}DATA/movies_movie_20230117.csv"
with open(movies_csv_path, mode='r', encoding='utf-8') as csv_file:
    data = pd.read_csv(csv_file)
data.head(3)

Unnamed: 0,id,title,poster,director,cast,genre,nation,running_time,release_date,ratings,synopsis,keyword,status,avg_grade,viewer,cnt_click
0,10526,블러드 심플,https://movie-phinf.pstatic.net/20190926_233/1...,조엘 코엔,존 게츠|프란시스 맥도맨드|댄 헤다야,스릴러|범죄|드라마,미국,96.0,2019-10-17,15세 관람가,"삐뚤어진 욕망, 한 번의 잘못된 선택 \n추잡한 비극이 시작된다!텍사스의 한마을,...",선택|애비|레이|비저|욕망,0,8.8,48,0
1,10621,이창,https://movie-phinf.pstatic.net/20160725_260/1...,알프레드 히치콕,제임스 스튜어트,스릴러|미스터리,미국,112.0,1957-02-24,15세 관람가,사진작가 제프는 카레이싱 촬영 도중 다리를 다쳐 휠체어에 의지한 채 자신의 방에서 ...,제프|남편|주변|이웃|시작,1,10.4,45,0
2,13481,지상 최대의 쇼,https://movie-phinf.pstatic.net/20200219_193/1...,세실 B. 드밀,베티 허튼|코넬 와일드|찰톤 헤스톤,,미국,,1955-05-20,,서커스단의 갖가지 애환을 큰 스케일로 그린 대작. 스츄어트는 항상 분장을 지우지않는...,작품|서커스|갖가지|애환|스케일,1,7.1,7,0


# 결측치 채우기

In [None]:
# Missing values per column, before imputation.
data.isna().sum()

id                0
title             0
poster            0
director          4
cast            104
genre            34
nation            0
running_time     15
release_date      0
ratings          25
synopsis          7
keyword           8
status            0
avg_grade         0
viewer            0
cnt_click         0
dtype: int64

In [None]:

def imute_missingValue( movie_info ):
    """Fill the missing values of ``movie_info`` and return the same frame.

    Text columns (object / string dtype) get the placeholder "정보없음".
    Numeric columns are imputed with ``IterativeImputer`` driven by a
    LightGBM regressor, fitted once on all numeric columns together.

    Parameters
    ----------
    movie_info : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in. Columns of any other dtype, and
        numeric columns that are entirely missing, are left untouched.
    """
    missing_cols = [col for col in movie_info.columns if movie_info[col].isnull().any()]

    # 1. Text columns. The dtype is inspected instead of the first cell,
    #    because the first cell may itself be NaN (a float), which would
    #    misroute a text column to the numeric imputer.
    col_list_str = [
        col for col in missing_cols
        if pd.api.types.is_object_dtype(movie_info[col])
        or pd.api.types.is_string_dtype(movie_info[col])
    ]
    for col in col_list_str:
        movie_info[col] = movie_info[col].fillna("정보없음")

    # 2. Numeric columns. Columns with no observed value at all are excluded:
    #    the imputer would drop them and shift the output column positions.
    numeric_cols = [
        col for col in movie_info.columns
        if col not in col_list_str
        and pd.api.types.is_numeric_dtype(movie_info[col])
        and movie_info[col].notnull().any()
    ]
    col_list_num = [col for col in numeric_cols if col in missing_cols]

    if col_list_num:
        # Fit on every numeric column at once. When IterativeImputer sees a
        # single feature it only applies the initial mean fill, so fitting
        # column by column never used the LightGBM estimator.
        imputer = IterativeImputer(estimator=lgb.LGBMRegressor(), random_state=CFG.SEED)
        imputed = imputer.fit_transform(movie_info[numeric_cols])
        # Write back only the columns that actually had gaps, so complete
        # integer columns keep their dtype.
        for col in col_list_num:
            movie_info[col] = imputed[:, numeric_cols.index(col)]

    return movie_info

In [None]:
# Fill the missing values; the function edits the frame it receives and returns that same frame.
data = imute_missingValue(data)

In [None]:
# Missing values per column, after imputation.
data.isna().sum()

id              0
title           0
poster          0
director        0
cast            0
genre           0
nation          0
running_time    0
release_date    0
ratings         0
synopsis        0
keyword         0
status          0
avg_grade       0
viewer          0
cnt_click       0
dtype: int64

# 나라 수정 
- 대한민국, 미국 빼고 모두 드랍 
- '미국|캐나다' 인 경우 : 미국만 남기고 나머지 국가 제거 

In [None]:
# Nations kept in the "nation" column; every other nation token is removed.
KEEP_NATIONS = ("대한민국", "미국")


def _keep_target_nations(nation_field):
    """Return the "|"-joined, sorted subset of nation tokens found in KEEP_NATIONS.

    A value with no kept nation becomes the empty string.
    """
    kept = [nation for nation in nation_field.split("|") if nation in KEEP_NATIONS]
    return "|".join(sorted(kept))


# Assign the whole column at once. Writing through `data["nation"].iloc[idx]`
# is chained assignment: it raises SettingWithCopyWarning and does not update
# `data` at all when pandas Copy-on-Write is enabled.
data["nation"] = data["nation"].apply(_keep_target_nations)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


# 날짜 형태 맞추기 
- 만약 개봉 월 & 개봉일 정보가 없을 경우 12-31일로 맞춰줌 


In [None]:
def _normalize_release_date(raw):
    """Convert a release date string to compact ``YYYYMMDD`` form.

    * ``YYYY-MM-DD`` -> dashes removed.
    * ``YYYY-MM``    -> dashes removed, day set to ``31``.
    * ``YYYY``       -> ``1231`` appended.
    * An 8-digit value without dashes is returned unchanged, so re-running
      the cell does not corrupt dates that are already normalised.
    """
    parts = raw.split("-")
    if len(parts) == 1:
        if len(raw) == 8 and raw.isdigit():
            return raw
        return raw + "1231"
    if len(parts) == 2:
        # Join the parts first: appending to the raw text left the dash in
        # place ("2023-01" became "2023-0131").
        # NOTE(review): day 31 is not a real date for shorter months — confirm
        # that downstream code treats this value as a label and never parses it.
        return "".join(parts) + "31"
    return "".join(parts)


data["release_date"] = data["release_date"].apply(_normalize_release_date)
data["release_date"]

0       20191017
1       19570224
2       19550520
3       19630301
4       19730629
          ...   
1628    20221228
1629    20230125
1630    20230112
1631    20230102
1632    20231231
Name: release_date, Length: 1633, dtype: object

In [None]:
# Total number of missing cells left in the whole frame.
data.isna().sum().sum()

0

## 이미지 / 시놉시스 저장 테이블 생성

In [None]:
# Lookup table holding the poster URL of each movie.
img_df = data.loc[:, ["id", "title", "poster"]].copy()

# Lookup table holding the synopsis text of each movie.
synop_df = data.loc[:, ["id", "title", "synopsis"]].copy()

# Snapshot of the frame as it stands at this point, before any encoding.
data.to_csv(
    f"{CFG.PATH}DATA/fillna_movie.csv",
    index=False,
    encoding="utf-8-sig",
    header=data.columns,
)

In [None]:
# Count movies per release year (first four characters of release_date).
release_year = data["release_date"].str[:4]
tmp = (
    data.assign(release_date=release_year)
    .groupby("release_date")["id"]
    .count()
    .reset_index()
)

# Years with more than 5 movies keep their own row; the remaining years are
# summed into a single "재개봉" row appended at the end.
is_common_year = tmp["id"] > 5
col = [
    [year, n_movies]
    for year, n_movies in zip(tmp.loc[is_common_year, "release_date"], tmp.loc[is_common_year, "id"])
]
cnt = tmp.loc[~is_common_year, "id"].sum()
col.append(["재개봉", cnt])

tmp_df = pd.DataFrame(col)
tmp_df


Unnamed: 0,0,1
0,2019,440
1,2020,366
2,2021,422
3,2022,354
4,2023,41
5,재개봉,10


# Encoding 

In [None]:
class Enc:
    """Namespace of column-encoding helpers; every method is called as ``Enc.<name>(df, col)``."""

    @staticmethod
    def binaryEncoding(df , col):
        """Binary-encode ``df[col]`` with category_encoders.

        Returns a new frame: all columns of ``df`` (including ``col``)
        followed by the columns the encoder produced.
        """
        enc_binary = ce.BinaryEncoder()
        df_binary = enc_binary.fit_transform(df[col])
        return pd.concat([df, df_binary], axis=1)

    @staticmethod
    def labelEncoding(df , col):
        """Replace ``df[col]`` with integer codes from ``LabelEncoder``.

        ``df`` is modified in place and also returned.
        """
        enc_label = LabelEncoder()
        df[col] = enc_label.fit_transform(df[col])
        return df

    @staticmethod
    def countVec( df , col ):
        """Build 0/1 indicator columns for a "|"-separated multi-value column.

        Parameters
        ----------
        df : pandas.DataFrame
            Source frame; it is not modified.
        col : str
            Name of a column whose cells are strings such as ``"a|b|c"``.

        Returns
        -------
        pandas.DataFrame
            One float column per distinct token (sorted), one row per row of
            ``df``, sharing ``df``'s index so that ``pd.concat(axis=1)`` lines
            the rows up whatever that index is.
        """
        str_connector = "|"

        # Split each cell once and reuse the pieces below.
        token_lists = [cell.split(str_connector) for cell in df[col]]

        # Sorted vocabulary. A set replaces pd.unique, whose support for plain
        # Python lists is deprecated.
        uniq_col_lst = sorted({token for tokens in token_lists for token in tokens})

        # All-zero indicator matrix aligned with the source rows.
        dummy = pd.DataFrame(
            np.zeros((len(token_lists), len(uniq_col_lst))),
            index=df.index,
            columns=uniq_col_lst,
        )

        # Flag the tokens present in each row (positional row access).
        for row_pos, tokens in enumerate(token_lists):
            indices = dummy.columns.get_indexer(tokens)
            dummy.iloc[row_pos, indices] = 1

        return dummy


## apply encoding

In [None]:
# Age rating -> integer codes (label encoding overwrites the "ratings" column).
data = Enc.labelEncoding(data, "ratings")

# Director -> binary-encoded columns appended to the frame.
data = Enc.binaryEncoding(data, "director")

# Genre -> one indicator column per "|"-separated genre token.
genre_flags = Enc.countVec(data, "genre")
data = pd.concat([data, genre_flags], axis=1)

# Nation -> binary-encoded columns appended to the frame.
data = Enc.binaryEncoding(data, "nation")

# Keyword tags -> one indicator column per "|"-separated keyword token.
# NOTE(review): a keyword equal to a genre name would yield two columns with
# the same label — confirm whether that happens in this dataset.
keyword_flags = Enc.countVec(data, "keyword")
data = pd.concat([data, keyword_flags], axis=1)

data.head(2)

Unnamed: 0,id,title,poster,director,cast,genre,nation,running_time,release_date,ratings,...,희귀질환,희망,희망이,희생,히로,히말라야,히어로,히트,히트송,힙합
0,10526,블러드 심플,https://movie-phinf.pstatic.net/20190926_233/1...,조엘 코엔,존 게츠|프란시스 맥도맨드|댄 헤다야,스릴러|범죄|드라마,미국,96.0,20191017,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10621,이창,https://movie-phinf.pstatic.net/20160725_260/1...,알프레드 히치콕,제임스 스튜어트,스릴러|미스터리,미국,112.0,19570224,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Drop columns

In [None]:

# Raw text columns that are no longer needed once their encoded versions exist.
drop_cols = [
    "title",
    "poster",
    "cast",
    "synopsis",
    "nation",
    "genre",
    "director",
    "keyword",
]
data = data.drop(labels=drop_cols, axis="columns")

In [None]:
# Display the encoded frame after the raw text columns were dropped.
data

Unnamed: 0,id,running_time,release_date,ratings,status,avg_grade,viewer,cnt_click,director_0,director_1,...,희귀질환,희망,희망이,희생,히로,히말라야,히어로,히트,히트송,힙합
0,10526,96.000000,20191017,1,0,8.8,48,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10621,112.000000,19570224,1,1,10.4,45,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,13481,100.365884,19550520,3,1,7.1,7,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,13512,165.000000,19630301,3,1,10.3,21,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,13775,120.000000,19730629,0,1,8.7,24,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1628,224919,96.000000,20221228,0,1,5.3,3,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1629,224929,102.000000,20230125,1,2,0.0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1630,225173,99.000000,20230112,1,1,9.8,6,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1631,225699,158.000000,20230102,0,0,9.7,12,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# export data 

In [None]:
# Write the fully encoded frame to Drive (UTF-8 with BOM, no index column).
clean_csv_path = f"{CFG.PATH}DATA/clean_movie.csv"
data.to_csv(
    clean_csv_path,
    index=False,
    encoding="utf-8-sig",
    header=data.columns,
)