In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the 'Sheet1' worksheet of the IMDb workbook into a DataFrame.
file_path = '/mnt/data/imdb_final_2.xlsx'
df = pd.read_excel(
    io=file_path,
    sheet_name='Sheet1',
)


# Number of TV series per year: one bar for each distinct value of 'year'.
plt.figure(figsize=(10, 6))
# countplot returns the Axes it drew on, so the title can be set on it directly.
sns.countplot(x='year', data=df).set_title('Number of TV Series by Year')
plt.xticks(rotation=45)  # tilt the x tick labels by 45 degrees
plt.show()

# Number of TV series per genre.
plt.figure(figsize=(12, 8))
# 'genre' holds a comma-separated list per row. Split it and explode to one
# entry per (row, genre) pair, keeping the original row label as the index.
# Whitespace is stripped so that "Drama, Comedy" contributes "Comedy" rather
# than " Comedy", which would otherwise be counted as a separate category.
genres = df['genre'].str.split(',').explode().str.strip()
# Drop rows without a genre and empty fragments (e.g. from a trailing comma).
genres = genres[genres.notna() & (genres != '')]
# Horizontal bars, most frequent genre first.
sns.countplot(y=genres, order=genres.value_counts().index)
plt.title('Number of TV Series by Genre')
plt.show()

# Number of TV series per production country, most frequent country first.
plt.figure(figsize=(12, 8))
sns.countplot(
    data=df,
    y='production_country',
    order=df['production_country'].value_counts().index,
).set_title('Number of TV Series by Production Country')
plt.show()

# Number of TV series per age rating: one bar for each distinct 'age_rating'.
plt.figure(figsize=(10, 6))
sns.countplot(x='age_rating', data=df).set_title('Number of TV Series by Age Rating')
plt.xticks(rotation=45)  # tilt the x tick labels by 45 degrees
plt.show()

# Keep only the float64 / int64 columns; other dtypes are left out of the
# correlation.
# NOTE(review): 'runtime' is converted to a number only further down in this
# script, so if it is stored as text it will not appear here - confirm whether
# that ordering is intended.
numeric_df = df.select_dtypes(include=['float64', 'int64'])

# Correlation matrix of the selected columns, annotated to two decimals.
plt.figure(figsize=(14, 12))
sns.heatmap(
    numeric_df.corr(),
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
).set_title('Correlation Matrix')
plt.show()

# Convert the runtime column to a number of minutes
# (for example, '1시간 0분' becomes 60).
def convert_runtime(runtime):
    """Convert a runtime string written with 시간 (hours) / 분 (minutes) to minutes.

    Accepted string forms:
      * ``'1시간 30분'`` - hours and minutes -> 90
      * ``'2시간'``      - hours only        -> 120
      * ``'45분'``       - minutes only      -> 45

    Args:
        runtime: The raw cell value. Anything that is not a ``str`` is
            returned unchanged.

    Returns:
        The total number of minutes as an ``int`` for string input, otherwise
        the input value itself.

    Raises:
        ValueError: If an hour or minute component is not an integer.
    """
    if not isinstance(runtime, str):
        return runtime

    hours_text, hour_marker, minutes_text = runtime.partition('시간')
    if not hour_marker:
        # No hour marker: the whole string is a minute count.
        return int(runtime.replace('분', '').strip())

    minutes_text = minutes_text.replace('분', '').strip()
    # An hours-only value such as '2시간' has nothing after the marker;
    # treat that as zero minutes instead of failing on int('').
    minutes = int(minutes_text) if minutes_text else 0
    return int(hours_text.strip()) * 60 + minutes

# Replace each 'runtime' cell with the result of convert_runtime.
df['runtime'] = df['runtime'].apply(convert_runtime)

# Distribution of runtime: histogram with a KDE curve, missing values dropped.
plt.figure(figsize=(12, 6))
sns.histplot(df['runtime'].dropna(), kde=True).set(
    title='Distribution of Runtime',
    xlabel='Runtime (minutes)',
)
plt.show()
