## 패키지 설치 및 로드


In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## 데이터 로드 및 전처리

In [2]:
# Load the two input tables used by this notebook.
# Original movie dataset, stored as CSV.
original_data = pd.read_csv(filepath_or_buffer='movie_finished_toFDA_V9.csv')

# Daily records, stored as Parquet.
movie_after_14 = pd.read_parquet(path='kofic_data/kofic_data.parquet')

In [3]:
# Inspect the schema. DataFrame.info() writes its report to stdout and returns
# None, so it is called directly; wrapping it in print() would append a stray
# "None" line to the cell output.
movie_after_14.info()
# Preview the first rows as the cell's rich output.
movie_after_14.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13090 entries, 0 to 13089
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   date         13090 non-null  object
 1   rank         13090 non-null  object
 2   movieCd      13090 non-null  object
 3   movieNm      13090 non-null  object
 4   openDt       13090 non-null  object
 5   salesAmt     13090 non-null  object
 6   salesShare   13090 non-null  object
 7   salesInten   13090 non-null  object
 8   salesChange  13090 non-null  object
 9   salesAcc     13090 non-null  object
 10  audiCnt      13090 non-null  object
 11  audiInten    13090 non-null  object
 12  audiChange   13090 non-null  object
 13  audiAcc      13090 non-null  object
 14  scrnCnt      13090 non-null  object
 15  showCnt      13090 non-null  object
dtypes: object(16)
memory usage: 1.6+ MB
None


Unnamed: 0,date,rank,movieCd,movieNm,openDt,salesAmt,salesShare,salesInten,salesChange,salesAcc,audiCnt,audiInten,audiChange,audiAcc,scrnCnt,showCnt
0,20160101,1,20136068,히말라야,2015-12-16,4529795647,38.3,2303664641,103.5,43999571095,554307,259690,88.1,5683716,955,4678
1,20160101,10,20156807,뽀로로 극장판 컴퓨터 왕국 대모험,2015-12-10,96435100,0.8,27436400,39.8,2998525700,12243,2749,29.0,404003,151,245
2,20160101,2,20150025,내부자들: 디 오리지널,2015-12-31,2092539355,17.7,864281640,70.4,3329327070,247992,88926,55.9,408140,822,2088
3,20160101,3,20154141,스타워즈: 깨어난 포스,2015-12-17,1400855545,11.9,603603060,75.7,25550334085,154446,59442,62.6,2957680,552,2067
4,20160101,4,20144442,조선마술사,2015-12-30,1153061953,9.8,387563159,50.6,2659511747,145065,40336,38.5,369722,608,2319


In [4]:
# Order the rows by movie code ahead of merging; ignore_index renumbers the
# index from 0 so no separate reset_index step is needed.
movie_after_14 = movie_after_14.sort_values('movieCd', ignore_index=True)
movie_after_14.head()

Unnamed: 0,date,rank,movieCd,movieNm,openDt,salesAmt,salesShare,salesInten,salesChange,salesAcc,audiCnt,audiInten,audiChange,audiAcc,scrnCnt,showCnt
0,20180328,8,19720061,정무문,1973-07-27,26872000,0.7,26872000,100.0,55326000,3000,3000,100.0,9528,11,11
1,20160719,9,19818004,불의 전차,2016-06-16,12462000,0.5,6981000,127.4,294463100,1951,1055,117.7,44957,19,22
2,20190613,8,19880001,이웃집 토토로,2001-07-28,20093840,0.7,-6760790,-25.2,954379710,2550,-879,-25.6,116824,191,247
3,20190619,8,19880001,이웃집 토토로,2001-07-28,16411980,0.5,-3272090,-16.6,1220881670,2076,-397,-16.1,149037,146,197
4,20190622,10,19880001,이웃집 토토로,2001-07-28,18613660,0.2,9028500,94.2,1256031950,2168,1096,102.2,153147,59,84


In [5]:
# Parse both date columns: a lone-space placeholder is first turned into a
# missing value, then the column is converted to datetime with its own format.
for col, fmt in (('openDt', '%Y-%m-%d'), ('date', '%Y%m%d')):
    blanks_as_na = movie_after_14[col].replace(" ", pd.NA)
    movie_after_14[col] = pd.to_datetime(blanks_as_na, format=fmt)

# D_D = number of days between the record date and the opening date.
# Rows where either date is missing end up as NaN, so the column is float.
movie_after_14['D_D'] = (movie_after_14['date'] - movie_after_14['openDt']).dt.days

# Summary statistics and per-value frequencies of D_D.
d_d = movie_after_14['D_D']
d_d_stats = d_d.describe()
d_d_value_counts = d_d.value_counts()

# Report the summary statistics.
print("D_D Statistics:", d_d_stats, sep="\n")

# Report the frequencies, ordered by D_D value rather than by count.
print("\nD_D Value Counts:", d_d_value_counts.sort_index(), sep="\n")

D_D Statistics:
count    13070.000000
mean       113.584468
std        749.615344
min       -176.000000
25%          3.000000
50%          9.000000
75%         18.000000
max      16315.000000
Name: D_D, dtype: float64

D_D Value Counts:
D_D
-176.0      1
-42.0       1
-38.0       1
-37.0       1
-36.0       1
           ..
 9592.0     1
 9593.0     1
 9907.0     1
 9979.0     1
 16315.0    1
Name: count, Length: 421, dtype: int64


In [6]:
# Keep only the rows whose movie appears in the original dataset.
# The codes are cast to str before being matched against movieCd.
movie_code_list = original_data['kobis_movie_code'].astype(str).unique()
in_original = movie_after_14['movieCd'].isin(movie_code_list)
filtered_data = movie_after_14.loc[in_original]

In [7]:
# Sanity check: the number of distinct movie codes before and after filtering.
n_original = len(movie_code_list)
n_filtered = filtered_data['movieCd'].nunique()
print(f"Number of movies in the original data: {n_original}")
print(f"Number of movies in the filtered data: {n_filtered}")

Number of movies in the original data: 409
Number of movies in the filtered data: 409


In [8]:
# Keep rows with D_D of at least 14 (NaN fails the comparison and is dropped),
# renumber the index, then store D_D as an integer column.
at_least_14 = filtered_data['D_D'] >= 14
filtered_after_14 = (
    filtered_data
    .loc[at_least_14]
    .reset_index(drop=True)
    .astype({'D_D': int})
)

In [9]:
# Narrow the frame to the six columns that are exported at the end of the notebook.
keep_cols = ['movieCd', 'movieNm', 'openDt', 'date', 'D_D', 'audiCnt']
df = filtered_after_14[keep_cols]
df.head()

Unnamed: 0,movieCd,movieNm,openDt,date,D_D,audiCnt
0,20068576,그날의 분위기,2016-01-14,2016-01-28,14,4229
1,20068576,그날의 분위기,2016-01-14,2016-01-31,17,6317
2,20068576,그날의 분위기,2016-01-14,2016-01-30,16,6650
3,20068576,그날의 분위기,2016-01-14,2016-02-01,18,2750
4,20068576,그날의 분위기,2016-01-14,2016-02-02,19,2573


In [10]:
# Scratch check of a day-offset range: the stop value is exclusive, so this yields 14..20.
np.arange(14, 21)

array([14, 15, 16, 17, 18, 19, 20])

In [11]:
# Count rows per day offset for D_D values 14 through 21, listed in ascending D_D order.
in_window = df['D_D'].isin(np.arange(14, 22))
df.loc[in_window, 'D_D'].value_counts().sort_index()

D_D
14    265
15    237
16    238
17    240
18    232
19    218
20    201
21    150
Name: count, dtype: int64

In [12]:
# Export the selected columns to CSV without the index column.
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists first; otherwise a run on a fresh checkout fails here.
output_path = Path('./processed_data/audi_cnt_after_14.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

---
---