# K - 드라마 탑100 데이터 분석

In [13]:
import pandas as pd
import matplotlib.pyplot as plt

import sys
sys.path.append(os.path.dirname(os.path.abspath(os.path.dirname('set_matplotlib_hangul.py'))))

import plotly.express as px
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objects as go
init_notebook_mode(connected=True)

# So that warning doesn't pop up
import warnings
warnings.filterwarnings("ignore")


In [4]:
# 데이터 불러오기

data = pd.read_csv('./top100_kdrama.csv')

data.head()

Unnamed: 0,Name,Year of release,Aired Date,Aired On,Number of Episode,Network,Duration,Content Rating,Synopsis,Cast,Genre,Tags,Rank,Rating
0,Move to Heaven,2021,"May 14, 2021",Friday,10,Netflix,52 min.,18+ Restricted (violence & profanity),Geu Roo is a young autistic man. He works for ...,"Lee Je Hoon, Tang Jun Sang, Hong Seung Hee, Ju...","Life, Drama, Family","Autism, Uncle-Nephew Relationship, Death, Sava...",#1,9.2
1,Hospital Playlist,2020,"Mar 12, 2020 - May 28, 2020",Thursday,12,"Netflix, tvN",1 hr. 30 min.,15+ - Teens 15 or older,The stories of people going through their days...,"Jo Jung Suk, Yoo Yeon Seok, Jung Kyung Ho, Kim...","Friendship, Romance, Life, Medical","Strong Friendship, Multiple Mains, Best Friend...",#2,9.1
2,Flower of Evil,2020,"Jul 29, 2020 - Sep 23, 2020","Wednesday, Thursday",16,tvN,1 hr. 10 min.,15+ - Teens 15 or older,Although Baek Hee Sung is hiding a dark secret...,"Lee Joon Gi, Moon Chae Won, Jang Hee Jin, Seo ...","Thriller, Romance, Crime, Melodrama","Married Couple, Deception, Suspense, Family Se...",#3,9.1
3,Hospital Playlist 2,2021,"Jun 17, 2021 - Sep 16, 2021",Thursday,12,"Netflix, tvN",1 hr. 40 min.,15+ - Teens 15 or older,Everyday is extraordinary for five doctors and...,"Jo Jung Suk, Yoo Yeon Seok, Jung Kyung Ho, Kim...","Friendship, Romance, Life, Medical","Workplace, Strong Friendship, Best Friends, Mu...",#4,9.1
4,My Mister,2018,"Mar 21, 2018 - May 17, 2018","Wednesday, Thursday",16,tvN,1 hr. 17 min.,15+ - Teens 15 or older,Park Dong Hoon is a middle-aged engineer who i...,"Lee Sun Kyun, IU, Park Ho San, Song Sae Byuk, ...","Psychological, Life, Drama, Family","Age Gap, Nice Male Lead, Strong Female Lead, H...",#5,9.1


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               100 non-null    object 
 1   Year of release    100 non-null    int64  
 2   Aired Date         100 non-null    object 
 3   Aired On           100 non-null    object 
 4   Number of Episode  100 non-null    int64  
 5   Network            100 non-null    object 
 6   Duration           100 non-null    object 
 7   Content Rating     100 non-null    object 
 8   Synopsis           100 non-null    object 
 9   Cast               100 non-null    object 
 10  Genre              100 non-null    object 
 11  Tags               100 non-null    object 
 12  Rank               100 non-null    object 
 13  Rating             100 non-null    float64
dtypes: float64(1), int64(2), object(11)
memory usage: 11.1+ KB


In [8]:
data.isna().sum() # No missing value

Name                 0
Year of release      0
Aired Date           0
Aired On             0
Number of Episode    0
Network              0
Duration             0
Content Rating       0
Synopsis             0
Cast                 0
Genre                0
Tags                 0
Rank                 0
Rating               0
dtype: int64

In [10]:
data.columns

Index(['Name', 'Year of release', 'Aired Date', 'Aired On',
       'Number of Episode', 'Network', 'Duration', 'Content Rating',
       'Synopsis', 'Cast', 'Genre', 'Tags', 'Rank', 'Rating'],
      dtype='object')

### 변수 설명
```
Name: 드라마 이름
Year of release: 드라마 방영 년도
Aired Date: 방영일자 (시작일) - (종영일)
Aired On: 방영 용일
Number of Episode: 에피소드 수
Network: 방영 매체
Duration: 에피소드 당 대략적 시간
Content Rating: 방영등급
Synopsis: 드라마 간략설명
Genre: 드라마 장르
Tags: 태그
Rank: 랭킹
Rating: 평점
```

## EDA


#### 년도 기준

In [29]:
# data.groupby('Year of release').size().reset_index().rename(columns = {0:'Count'})

fig = px.bar(data_frame = data.groupby('Year of release').size().reset_index().rename(columns = {0:'cnt'}),
              x = 'Year of release',
              y = 'cnt')
fig.update_layout(title = {'text':'년도 별 방영 드라마 수',
                           'font_size':20})
fig.show()

In [47]:
data.groupby(['Year of release','Aired On']).size().reset_index(). head()

Unnamed: 0,Year of release,Aired On,0
0,2003,"Monday, Tuesday",1
1,2012,Tuesday,1
2,2012,"Wednesday, Thursday",1
3,2013,"Monday, Tuesday",2
4,2013,"Wednesday, Thursday",3


In [34]:
fig = px.bar(data_frame = data.groupby(['Year of release','Aired On']).size().reset_index().rename(columns = {0:'cnt'}),
             x = 'Year of release',
             y = 'cnt',
             color = 'Aired On',
             barmode = 'stack')

fig.update_layout(title = {'text':'년도별, 요일별 드라마 방영 수'},
                  legend_title = '방영 요일')

fig.show()

### 에피소드 기준

In [41]:
data['Number of Episode'].value_counts().reset_index().head()

Unnamed: 0,index,Number of Episode
0,16,42
1,20,13
2,12,10
3,6,6
4,32,4


In [46]:
num_episode = data['Number of Episode'].value_counts().reset_index().rename(columns={'Number of Episode':'cnt','index':'Ep_cnt'})

fig = px.bar(data_frame = num_episode,
             x = 'Ep_cnt', y = 'cnt',
             title = '에피소드 수 별 드라마 수')

fig.update_layout(xaxis_title = '에피소드 수')

fig.update_xaxes(type='category')

fig.show()

### 방영 매체 기준 

In [50]:
data['Network'].value_counts()

tvN                19
SBS                17
Netflix,  tvN      16
KBS2               10
MBC                 9
jTBC                8
Netflix             7
OCN                 7
Netflix,  SBS       2
jTBC,  Netflix      2
Netflix,  OCN       1
jTBC,  Viki         1
KBS2,  Netflix      1
Name: Network, dtype: int64

In [51]:
from collections import Counter

network_list = []
for networks in data['Network'].to_list():
    networks = networks.strip().split(", ")
    for network in networks:
        network_list.append(network)
        
network_df = pd.DataFrame.from_dict(Counter(network_list),orient='index').rename(columns={0:'cnt'})
network_df.sort_values(by='cnt',ascending = False,inplace = True)
network_df

Unnamed: 0,cnt
Netflix,26
tvN,19
SBS,17
tvN,16
jTBC,11
KBS2,11
MBC,9
OCN,7
Netflix,3
SBS,2


In [63]:
fig = px.bar(data_frame = network_df,
             x = network_df.index,
             y = 'cnt')

fig.update_layout(title = "방영 매체 분포",
                  xaxis_title = '방영 매체')

fig.show()

In [69]:
fig = px.pie(data_frame = network_df,
             values = 'cnt',
             names = network_df.index,
             color_discrete_sequence = px.colors.qualitative.Prism)

fig.update_traces(textposition ='inside',
                  textinfo = 'label+percent',
                  pull = [0.05] * len(network_df.index.to_list()),
                  insidetextorientation='horizontal')

fig.update_layout(paper_bgcolor = 'white',
                  title = '방영 매체 분포',
                  legend_title = '방영 매체'
                  )

fig.show()

### 방영 시간 기준

In [74]:
duration_df = data['Duration'].value_counts().reset_index().rename(columns={'Duration' : 'cnt', 'index':'Duration'})

In [75]:
fig = px.bar(data_frame=duration_df,
           x = 'Duration',
           y = 'cnt')

fig.update_layout(title='방영 시간 분포')

fig.show()

### 등급 연령 기준

In [79]:
fig = px.bar(data_frame=data['Content Rating'].value_counts().reset_index().rename(columns={'Content Rating' : 'cnt', 'index':'Content Rating'}),
            x = 'Content Rating',
            y = 'cnt')

fig.update_layout(title='등급 연령 분포')

fig.show()


In [81]:
fig = px.bar(
    data_frame = data.groupby(['Year of release','Content Rating']).size().reset_index().rename(columns={0:'cnt'}),
    x = 'Year of release',
    y = 'cnt',
    color = 'Content Rating',
    barmode = 'stack'
)

fig.update_layout(title='연도별 드라마 등급 분포')

fig.show()

### 장르 기준

In [83]:
gener_list = []
for geners in data['Genre'].str.strip().tolist():
    gener = geners.split(', ')

    for gen in gener:
        gener_list.append(gen)

gen_df = pd.DataFrame.from_dict(Counter(gener_list), orient='index').rename(columns={0:'cnt'})
gen_df.sort_values(by='cnt',ascending = False, inplace = True)
gen_df.head()

Unnamed: 0,cnt
Drama,51
Romance,46
Mystery,25
Thriller,21
Comedy,21


In [84]:
fig = px.bar(data_frame = gen_df,
             x = gen_df.index,
             y = 'cnt')

fig.update_layout(title = '장르 분포',
                  xaxis_title = '장르')

fig.show()

In [88]:
data['Tags'].tolist()[0]

'Autism, Uncle-Nephew Relationship, Death, Savant Syndrome, Mourning, Tearjerker, Father-Son Relationship, Life Lesson, Ex-convict, Cleaning And Organizing'

In [89]:
tag_list = []
for tags in data['Tags'].tolist():
    tag = tags.split(', ')

    for t in tag:
        tag_list.append(t)

tag_df = pd.DataFrame.from_dict(Counter(tag_list), orient='index').rename(columns={0:'cnt'})
tag_df.sort_values(by='cnt',ascending = False, inplace = True)
tag_df.head()

Unnamed: 0,cnt
Strong Female Lead,43
Smart Female Lead,22
Bromance,21
Smart Male Lead,19
Murder,18


In [91]:
fig = px.bar(data_frame = tag_df,
             x = tag_df.index,
             y = 'cnt')

fig.update_layout(title = '드라마 태그 분포',
                  xaxis_title = '태그')

fig.show()

In [93]:
# 상위 10개 축소
fig = px.bar(data_frame = tag_df.head(10),
             x = tag_df.iloc[:10].index,
             y = 'cnt')

fig.update_layout(title = '드라마 태그 분포',
                  xaxis_title = '태그')

fig.show()

배우 기준

In [96]:
actors_list = []

for actors in data['Cast'].tolist():
    actors = actors.split(', ')
    for actor in actors:
        actors_list.append(actor)

actors_df = pd.DataFrame.from_dict(Counter(actors_list),orient = 'index').rename(columns = {0:'cnt'})
actors_df.sort_values(by='cnt',ascending = False,inplace = True)
actors_df.head()

Unnamed: 0,cnt
Lee Joon Hyuk,6
Jung Kyung Ho,5
Kim Ji Won,5
Song Joong Ki,5
Ji Sung,4


## 드라마 추천 시스템

In [97]:
# 드라마 키워드 검색
def kdrama_available(key):
    
    keyword = key
    
    print("Movie with keyword: [{}]".format(keyword))
    
    for i, mov in enumerate(data[data['Name'].str.contains(keyword)]['Name'].to_list()):
        print("{}) {} ".format(i+1,mov))

In [98]:
kdrama_available('Love')

Movie with keyword: [Love]
1) Moon Lovers 
2) It's Okay, That's Love 
3) My Love from the Star 
4) Just Between Lovers 


In [99]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


data['combined_features'] = data['Synopsis'] + " " + data['Genre'] + " " + data['Tags']

cv = CountVectorizer()
count_matrix = cv.fit_transform(data['combined_features'])
cosine_sim = cosine_similarity(count_matrix)

In [100]:
def kdrama_recommendation(mov,sim_num = 5):

    user_choice = mov
    
    try:
        ref_index = data[data['Name'].str.contains(user_choice, case = False)].index[0]

        similar_movies = list(enumerate(cosine_sim[ref_index]))

        sorted_simmilar_movies = sorted(similar_movies, key = lambda x: x[1], reverse = True)[1:]

        print('\n추천 한국 드라마 [{}]'.format(user_choice))
        print('-'*(24 + len(user_choice)))

        for i, element in enumerate(sorted_simmilar_movies):
            similar_movie_id = element[0]
            similar_movie_title = data['Name'].iloc[similar_movie_id]
            s_score = element[1]
            print('{:40} -> {:.3f}'.format(similar_movie_title, s_score))

            if i > sim_num:
                break
    except IndexError:
        print("\n[{}] is not in our database!".format(user_choice))
        print("We couldn't recommend anyting...Sorry...")

In [101]:
kdrama_recommendation('Just Between Love')


추천 한국 드라마 [Just Between Love]
-----------------------------------------
Chicago Typewriter                       -> 0.651
Cruel City                               -> 0.589
Kingdom                                  -> 0.534
Live                                     -> 0.530
Jewel in the Palace                      -> 0.517
Hotel del Luna                           -> 0.516
Kingdom                                  -> 0.516
