In [1]:
import requests
import time
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Scrape the paginated IMDb list of TV shows into a DataFrame with one row per show.
# The list is split over three pages that differ only in the `page` query parameter.
LIST_URL = 'https://www.imdb.com/list/ls008957859/?st_dt=&mode=detail&sort=list_order,asc&page={}'
urls = [LIST_URL.format(page) for page in range(1, 4)]

all_shows = []

for list_url in urls:
    # A timeout keeps the cell from hanging forever; raise_for_status surfaces HTTP
    # errors instead of silently parsing an error page into zero shows.
    response = requests.get(list_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Deliberately not called `show`: that name is later bound to bokeh.plotting.show,
    # and re-running this cell would otherwise clobber it.
    list_items = soup.find_all('div', {'class' : 'lister-item mode-detail'})

    for item in list_items:
        header = item.find('h3', {'class': 'lister-item-header'})
        if header is None:
            continue

        rank_tag = header.find('span', {'class' : 'lister-item-index unbold text-primary'})
        title_tag = header.find('a')
        year_tag = header.find('span',  {'class' : 'lister-item-year text-muted unbold'})
        # Skip entries missing any of the expected tags rather than crashing on None.
        if rank_tag is None or title_tag is None or year_tag is None:
            continue

        all_shows.append([rank_tag.get_text(),
                          title_tag.get_text(),
                          title_tag.get('href'),
                          year_tag.get_text()])

# Passing `columns=` here also works when nothing was scraped (empty list).
df = pd.DataFrame(all_shows, columns=['Rate', 'Name', 'Link', 'Year'])

# The year text is sliced positionally: characters 1-4 are taken as the start year and
# everything from position 6 up to the last character as the end year.
# NOTE(review): this assumes text shaped like "(2008–2013)" — confirm against the live page.
df['Start_year'] = df['Year'].str.slice(1,5)
df['End_year'] = df['Year'].str.slice(6,-1)

# Keep only shows with a non-blank end year; reset_index returns a fresh frame, so the
# column assignments below do not operate on a filtered view.
df = df[~df['End_year'].isin(['', ' '])].reset_index(drop=True)

# Placeholders for the per-show rating means; float so that means can be stored
# without a dtype change.
df['mean_all'] = 0.0
df['mean_last'] = 0.0

# Index by show name while keeping 'Name' available as a regular column.
df = df.set_index('Name', drop=False)

In [None]:
# For every show, walk through its season pages, collect the episode ratings and store
# the mean over all episodes ('mean_all') and over the last season ('mean_last').

# The result columns must be float before per-row means are written into them.
df = df.astype({'mean_all': float, 'mean_last': float})

for index, row in df.iterrows():
    print('Now scraping ', row['Name'])
    episodes_ex = []   # flat list of every episode rating of the show
    episodes = []      # one list of ratings per scraped season

    for season in range(1,30):
        url = 'https://www.imdb.com' + row['Link'] + 'episodes?season=' + str(season)
        # Timeout so one stalled request cannot block the whole scrape.
        response = requests.get(url, timeout=30)

        print('Season', str(season), '. Response: ', response.ok)
        soup = BeautifulSoup(response.text, 'html.parser')

        rating = soup.find_all('span', {'class' : 'ipl-rating-star__rating'})
        # Only every 23rd matching span is used.
        # NOTE(review): presumably each episode's rating widget repeats this class 23 times — confirm.
        rate = [float(span.get_text()) for span in rating[::23]]
        time.sleep(0.1)    # small pause between requests

        # Identical ratings to the previous season are treated as "this season does
        # not exist" and end the scrape for this show.
        if episodes and episodes[-1] == rate:
            print('All season scraped: season', str(season), 'does not exist^^.')
            break

        episodes_ex.extend(rate)
        episodes.append(rate)

    # The last stored season holds the same values the loop ended on, whether it stopped
    # on the duplicate check or ran out of seasons.
    last_season = episodes[-1] if episodes else []

    # Store NaN explicitly when there is nothing to average, instead of letting
    # np.mean([]) emit a RuntimeWarning.
    df.loc[index, 'mean_all'] = np.mean(episodes_ex) if episodes_ex else np.nan
    df.loc[index, 'mean_last'] = np.mean(last_season) if last_season else np.nan

    print('\n')

In [2]:
# Disabled save/load of the scraped data (machine-specific path, left commented out):
#df.to_csv(r'C:\Users\Bruger\Dropbox\Python\Ratings\data.csv', index = True)
#df = pd.read_csv(r'C:\Users\Bruger\Dropbox\Python\Ratings\data.csv', index_col = 0)

# Keep everything except the final 23 rows (positional slice).
# NOTE(review): the reason for dropping exactly 23 rows is not visible here — confirm.
# Re-running this cell trims another 23 rows each time.
df = df.iloc[:-23]

In [3]:
# Keep only shows whose overall mean differs from their last-season mean.
# `.copy()` makes the result an independent frame, so the column assignments below do
# not trigger pandas' SettingWithCopyWarning.
df = df[df['mean_all'] != df['mean_last']].copy()

# True when the last season's mean rating is above the show's overall mean.
df['Last_season_better'] = df['mean_all'] < df['mean_last']

# Signed difference (last season minus overall).
df['Last_col'] = df['mean_last'] - df['mean_all']

In [6]:
from bokeh.plotting import figure, show, reset_output
from bokeh.io import output_file, output_notebook
from bokeh.models import ColumnDataSource, HoverTool, Label
from bokeh.transform import linear_cmap
from bokeh.palettes import Turbo256
from bokeh.models import ColumnDataSource
from bokeh.models.glyphs import Text

# Configure where Bokeh sends its output. The calls run in this order:
# reset_output() first, then output_notebook() and output_file() are both enabled.
# NOTE(review): per the Bokeh docs, reset_output clears previously configured output
# modes, so re-running this cell should not stack up targets — confirm for the
# installed Bokeh version.
reset_output()
output_notebook()
output_file('Plot.html')

In [7]:
# Interactive scatter plot: overall mean rating (x) against last-season mean rating (y),
# one circle per show, coloured by the difference between the two.

# Bokeh turns the DataFrame index into a column when building a ColumnDataSource.
# That fails if the index name duplicates an existing column (e.g. 'Name' as both index
# and column), so the index is dropped in that case and kept as a column otherwise.
plot_df = df.reset_index(drop=df.index.name in df.columns)
data = ColumnDataSource(plot_df)

# Specify the selection tools to be made available
select_tools = ['box_zoom','tap', 'reset']

# Create the figure
fig = figure(plot_height=600,
             plot_width=900,
             x_axis_label='Average rating of all episodes',
             y_axis_label='Average rating of last season episodes',
             title='Rating of the last season against the average episode rating',
             toolbar_location='below',
             tools=select_tools)

# Map the rating difference onto the Turbo palette. Series.min()/.max() skip NaN,
# unlike the builtin min()/max(), whose result depends on where a NaN sits.
colors = linear_cmap('Last_col',
                     palette=Turbo256,
                     low=plot_df['Last_col'].min(),
                     high=plot_df['Last_col'].max())

# Add a circle representing each show
fig.circle(x='mean_all',
           y='mean_last',
           source=data,
           fill_alpha=0.4,
           color=colors,
           selection_color='deepskyblue',
           nonselection_color='lightgray',
           size = 8,
           nonselection_alpha=0.3)

# Format the tooltip
tooltips = [
            ('Show','@Name'),
            ('Overall ranking', '@Rate'),
            ('Last season better than average', '@Last_season_better'),
            ('Avg. rating of all episodes', '@mean_all{1.00}'),
            ('Avg. rating of last season episodes','@mean_last{1.00}'),
           ]

# Invisible circles on the same data source that only show up on hover; the HoverTool
# is attached to this renderer.
hover_glyph = fig.circle(x='mean_all', y='mean_last', source=data,
                         size=10, alpha=0,
                         hover_fill_color='black', hover_alpha=0.5)

# Add the HoverTool to the figure
fig.add_tools(HoverTool(tooltips=tooltips, renderers=[hover_glyph]))

# Reference diagonal y = x: points above it have a last season rated above the show's average.
diagonal = list(range(11))
fig.line(x=diagonal, y=diagonal,
         color='black', line_width=0.2)

# Visualize
show(fig)