In [None]:
#hide
import json
import pickle

import matplotlib.pyplot as plt
import networkx as nx
import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import requests
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Download the NLTK resources used by this notebook: the 'punkt' tokenizer
# models and the 'wordnet' corpus.
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

# Basic Statistics
> "Awesome summary"

- toc: false
- branch: master
- badges: true
- comments: true
- categories: [fastpages, jupyter]
- image: images/some_folder/your_image.png
- hide: false
- search_exclude: true
- metadata_key1: metadata_value1
- metadata_key2: metadata_value2

This part of the webpage gives a further introduction to the data used for analysis in this project. The Data [ref] page introduces the basics of the data, whereas this page dives further into what the data contains and some of its properties. A full analysis can be found in the Explainer Notebook [ref]. 

This page contains three parts: first, properties of the Game of Thrones network are presented; second, properties of the dialogues; and last, the reviews and ratings from IMDB.


## Game Of Thrones Network

This part introduces the data used for the network analysis. The focus is on the properties of the characters, namely their religion, allegiance, culture, number of appearances in the series, and status, i.e. whether they are dead or alive at the end of the series. 


In [None]:
#hide
# Load the character graph. nx.read_gpickle was removed in NetworkX 3.0, so the
# pickled graph is read with the standard library instead.
# NOTE: unpickling can execute arbitrary code - only load graph files from a trusted source.
with open("/work/got_G.gpickle", "rb") as graph_file:
    G = pickle.load(graph_file)

attributes = ["status", "appearances", "culture", "allegiance", "religion"]

# Collect one list of values per node attribute, in node iteration order.
property_dict = {attribute: [] for attribute in attributes}
for _, node_data in G.nodes(data=True):
    for attribute in attributes:
        value = node_data[attribute]
        if attribute == "appearances":
            # An empty-string value is counted as 0 appearances; anything else is cast to int.
            value = 0 if value == "" else int(value)
        property_dict[attribute].append(value)
df_property = pd.DataFrame.from_dict(property_dict, orient="columns")

# For every attribute build a value-count table (dfs) and a bar chart of it (figs),
# both keyed by the attribute name so later cells can display them individually.
dfs = {}
figs = {}
for attribute in attributes:
    label = attribute.capitalize()
    counts = df_property[attribute].value_counts().reset_index()
    counts.columns = [label, "Counts"]
    dfs[attribute] = counts

    figs[attribute] = px.bar(counts, x=label, y="Counts", color=label,
                             title="Distribution of character " + attribute)


The network data contains 224 characters, which were found by scraping the gameofthrones.fandom.com webpage. 

First we investigate the distribution of religion across these characters, followed by the distributions of allegiance, culture, status and number of appearances.

In [None]:
#hide_input
# Display the bar chart of value counts for the "religion" attribute stored in figs.
figs["religion"].show()

In [None]:
#hide_input
# Display the bar chart of value counts for the "allegiance" attribute stored in figs.
figs["allegiance"].show()

In [None]:
#hide_input
# Display the bar chart of value counts for the "culture" attribute stored in figs.
figs["culture"].show()

In [None]:
#hide_input
# Display the bar chart of value counts for the "status" attribute stored in figs.
figs["status"].show()

In [None]:
#hide_input
# Display the bar chart of value counts for the "appearances" attribute stored in figs.
figs["appearances"].show()

## Character dialogues

Next we dive into the character dialogues, which are extracted from transcripts. This dataset contains dialogue from all characters in the series and comes from a different source than the data in the previous part of this page. Therefore we restrict it to the characters that are present in the network used for analysis in Text Analysis [ref]. 

Originally the data contains 817 characters; the original dataset can be found here: transcripts [ref]. 

In [None]:
#hide
# Download the dialogue data set, which is parsed below as a JSON list of episode records.
resp = requests.get(
    "https://raw.githubusercontent.com/jeffreylancaster/game-of-thrones/master/data/script-bag-of-words.json",
    timeout=30,
)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
resp.raise_for_status()
diag = resp.json()

# Map every speaker name to the list of text entries attributed to that name,
# accumulated over all episode records.
char_diag = {}
for element in diag:
    for textObj in element['text']:
        char_diag.setdefault(textObj['name'], []).append(textObj['text'])
            

We are going to investigate how many episodes and seasons each character appears in, and also the average token length, i.e. how much dialogue is present for each character. 

## Reviews and ratings

Lastly we dive into the data from IMDB, from which ratings and reviews are extracted. Here we investigate how the ratings are distributed in general, as well as the average rating per episode and per season. 

In [None]:
#hide
# Load the scraped IMDB data; the context manager closes the file once it is parsed.
with open("/work/imdb_reviews.json") as f:
    ratings = json.load(f)

# episode_rating: "S<season> E<episode>" -> IMDB user rating of that episode
# season_rating:  "S<season>"            -> mean IMDB user rating over the season's episodes
episode_rating = {}
season_rating = {}
# Seasons and episodes are numbered by their position in the file (starting at 1),
# not by their dictionary keys.
for season_no, episodes in enumerate(ratings.values(), start=1):
    season_label = "S" + str(season_no)
    season_scores = []
    for episode_no, episode_data in enumerate(episodes.values(), start=1):
        score = episode_data['ratings']['demographics']['imdb users']['rating']
        season_scores.append(score)
        # The first episode of a season is labelled E1 (the counter was previously
        # incremented twice, which shifted every label by one).
        episode_rating[season_label + " E" + str(episode_no)] = score
    # A season without episodes has no defined mean; store NaN instead of dividing by zero.
    season_rating[season_label] = (
        sum(season_scores) / len(season_scores) if season_scores else float("nan")
    )

Ratings per season

In [None]:
#hide_input
# Turn the season -> mean rating mapping into a two-column frame and plot it as bars.
df_season_rating = pd.DataFrame.from_dict(season_rating, orient="index").reset_index()
df_season_rating.columns = ["Season", "IMDB rating"]
fig_season_rating = px.bar(
    df_season_rating,
    x="Season",
    y="IMDB rating",
    color="Season",
    title="Rating pr. season",
)
fig_season_rating.show()

Ratings per episode

In [None]:
#hide_input
# Turn the episode label -> rating mapping into a two-column frame and plot it as bars.
df_episode_rating = pd.DataFrame.from_dict(episode_rating, orient="index").reset_index()
df_episode_rating.columns = ["Episode", "IMDB rating"]
fig_episode_rating = px.bar(
    df_episode_rating,
    x="Episode",
    y="IMDB rating",
    color="Episode",
    title="Rating pr. episode",
)
fig_episode_rating.show()

Ratings in general

In [None]:
#hide_input
# Histogram over all per-episode ratings; the frame keeps the episode labels as its index.
df_ratings_dist = pd.DataFrame.from_dict(episode_rating, orient="index")
df_ratings_dist.columns = ["IMDB rating"]
fig = px.histogram(
    df_ratings_dist,
    x="IMDB rating",
    title="Rating distribution",
)
fig.show()

What is the average review length?

In [None]:
#hide
# episode_review / season_review: key -> list of review texts
# episode_tokens / season_tokens: key -> mean number of tokens per review
# NOTE(review): the episode_* dicts are keyed by the episode key alone; if the same key
# occurs in several seasons, later seasons overwrite earlier ones - confirm keys are unique.
episode_review = {}
season_review = {}
episode_tokens = {}
season_tokens = {}

for season, episodes in ratings.items():
    season_texts = []
    season_token_total = 0
    for episode, episode_data in episodes.items():
        texts = [review['content'] for review in episode_data['reviews']]
        token_counts = [len(word_tokenize(text)) for text in texts]

        episode_review[episode] = texts
        # An episode without reviews has no defined mean; store NaN instead of dividing by zero.
        episode_tokens[episode] = (
            sum(token_counts) / len(token_counts) if token_counts else float("nan")
        )

        season_texts.extend(texts)
        season_token_total += sum(token_counts)

    season_review[season] = season_texts
    # Season mean is taken over all reviews of the season, not over the episode means.
    season_tokens[season] = (
        season_token_total / len(season_texts) if season_texts else float("nan")
    )

In [None]:
#hide_input
# Bar chart of the mean review length (in tokens) for every episode.
df_episode_review = pd.DataFrame.from_dict(episode_tokens, orient= 'index')
df_episode_review = df_episode_review.reset_index()
df_episode_review.columns = ['Episode', "Review length"]
# Title corrected: the chart shows review length, not episode length.
fig_episode_review = px.bar(df_episode_review, x="Episode",
             y="Review length", color="Episode", title="Review length pr. episode")
fig_episode_review.show()

In [None]:
#hide_input
# Bar chart of the mean review length (in tokens) for every season.
df_season_review = pd.DataFrame.from_dict(season_tokens, orient= 'index')
df_season_review = df_season_review.reset_index()
df_season_review.columns = ['Season', "Review length"]
# Title corrected: the chart shows review length, not episode length.
fig_season_review = px.bar(df_season_review, x="Season",
             y="Review length", color="Season", title="Review length pr. season")
fig_season_review.show()

### TODO: Should we add something about the rating demographics?

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=0aadd790-0254-407e-bf1a-a0259cad43c9' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>