In [1]:
import pandas as pd
import plotly.graph_objs as go
import string
import re
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import cufflinks as cf
from plotly.offline import iplot
from tqdm import tqdm

from collections import Counter
from typing import Dict, Text
from ast import literal_eval
from datetime import datetime
import plotly.express as px

cf.go_offline()
cf.set_config_file(offline = False, world_readable = True)

import plotly.io as pio
pio.renderers.default='notebook'

import warnings
warnings.simplefilter('ignore')

# **import данных**

In [2]:
movies = pd.read_csv('movies.csv')

In [3]:
movies.columns = [col.upper() for col in movies.columns]
movies.head()

Unnamed: 0,MOVIEID,TITLE,GENRES
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
movies.shape

(86537, 3)

In [5]:
rating = pd.read_csv('ratings.csv')

In [6]:
links = pd.read_csv('links.csv')

In [7]:
links.shape

(86537, 3)

In [8]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [9]:
rating.columns = [col.upper() for col in rating.columns]
rating.head()

Unnamed: 0,USERID,MOVIEID,RATING,TIMESTAMP
0,1,1,4.0,1225734739
1,1,110,4.0,1225865086
2,1,158,4.0,1225733503
3,1,260,4.5,1225735204
4,1,356,5.0,1225735119


In [10]:
df = movies.merge(rating, how="left", on="MOVIEID")
df.head()

Unnamed: 0,MOVIEID,TITLE,GENRES,USERID,RATING,TIMESTAMP
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,1225735000.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2.0,5.0,835816000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.0,974518000.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10.0,3.0,1430666000.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,12.0,5.0,862500700.0


In [11]:
del rating

# **Определение популярных жанров фильмов**

In [12]:
genres_list = []
for i in tqdm(df['GENRES']):
    genres_list.extend(i.split('|'))

n_genres = len(genres_list)
df_plot = pd.DataFrame(Counter(genres_list).most_common(n_genres), columns=['genre', 'total'])

fig = px.bar(df_plot, x='genre', y='total',
             title="Количество оценок по жанрам",
             labels={'total':'Количество оценок',
                     'genre':'Жанр'})
fig.update_layout(xaxis_tickangle=-45)

fig.show()

100%|██████████████████████████| 33835460/33835460 [00:05<00:00, 5685116.53it/s]


In [13]:
def visualization_helper(df):
    ratings_list = [(5.0, 4.0), (4.0, 3.0), (3.0, 2.0), (2.0, 1.0), (1.0, 0.0)]
    
    fig_array = []
    title_list = []
    for i, r in enumerate(ratings_list):
        if i == 0:
            df_plot = df[(df['RATING'] >= r[1]) & (df['RATING'] <= r[0])]['TITLE'].value_counts()[:20].reset_index()
        else:
            df_plot = df[(df['RATING'] >= r[1]) & (df['RATING'] < r[0])]['TITLE'].value_counts()[:20].reset_index()

        fig = px.bar(df_plot, x='count', y='TITLE', orientation = 'h')
        fig_array.append(fig)
        title_list.append(f"Топ 20 фильмов с рейтингом в диапазоне {r}")

    fig = cf.subplots(fig_array, shape = (5, 1),
                      subplot_titles = title_list, vertical_spacing = 0.05)
    
    fig['layout']['height'] = len(ratings_list) * 500
    fig['layout']['title'] = 'Рейтинг фильмов'
    
    fig['layout'].update(showlegend = False)
    iplot(fig)

In [14]:
visualization_helper(df)

# **Работа с рейтингами**

In [15]:
df['TIMESTAMP'] = pd.to_datetime(df['TIMESTAMP'], unit='s')
df['YEAR'] = df['TIMESTAMP'].dt.year
df['MONTH'] = df['TIMESTAMP'].dt.month
df['TIME'] = df['TIMESTAMP'].dt.time
df['DATE'] = df['TIMESTAMP'].dt.date

In [16]:
df['WEEK_DAY'] = df['TIMESTAMP'].dt.day_name()
df['DAY_NUM'] = df['TIMESTAMP'].dt.day_of_week

In [17]:
df.head()

Unnamed: 0,MOVIEID,TITLE,GENRES,USERID,RATING,TIMESTAMP,YEAR,MONTH,TIME,DATE,WEEK_DAY,DAY_NUM
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,2008-11-03 17:52:19,2008.0,11.0,17:52:19,2008-11-03,Monday,0.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2.0,5.0,1996-06-26 19:06:11,1996.0,6.0,19:06:11,1996-06-26,Wednesday,2.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.0,2000-11-18 03:27:04,2000.0,11.0,03:27:04,2000-11-18,Saturday,5.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10.0,3.0,2015-05-03 15:19:54,2015.0,5.0,15:19:54,2015-05-03,Sunday,6.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,12.0,5.0,1997-05-01 15:32:18,1997.0,5.0,15:32:18,1997-05-01,Thursday,3.0


## Фильмы без рейтинга

In [18]:
stat_no_rating = df.loc[df['RATING'].isnull(), ['TITLE', 'RATING']].fillna(0).groupby('TITLE')['RATING'].count().reset_index()

In [19]:
df.loc[df['RATING'].isnull(), ['TITLE', 'RATING']].shape

(3298, 2)

In [20]:
stat_no_rating.head()

Unnamed: 0,TITLE,RATING
0,#387 (2020),1
1,#FromJennifer (2017),1
2,10949 femmes (2015),1
3,"11 Days, 11 Nights 2 (1990)",1
4,11 Metri (2011),1


In [21]:
stat_no_rating.max()

TITLE     Éducation anglaise (1983)
RATING                            1
dtype: object

# **Динамика оценок пользователей**

In [22]:
df_plot = df.groupby('RATING')['MOVIEID'].count().reset_index()

fig = px.bar(df_plot, x='RATING', y='MOVIEID',
             title="Количество оценок по рейтингу",
             labels={'MOVIEID':'Количество оценок',
                     'RATING':'Рейтинг'})

fig.show()

In [23]:
def visualization_year(df, years):
    years_list = sorted(df[df['YEAR'].between(years[0], years[1])]['YEAR'].astype(int).dropna().unique())
    
    fig_array = []
    title_list = []
    for i, r in enumerate(years_list):
        df_plot = df[df['YEAR'] == r]['RATING'].value_counts().reset_index()
        
        fig = px.bar(df_plot, x='count', y='RATING', orientation = 'h')
        fig_array.append(fig)
        title_list.append(f"Оценки фильмов за {r} год")

    fig = cf.subplots(fig_array, shape = (len(years_list), 1),
                      subplot_titles = title_list, vertical_spacing = 0.05)
    
    fig['layout']['height'] = len(years_list) * 250
    fig['layout']['title'] = 'Динамика оценок по годам'
    
    fig['layout'].update(showlegend = False)
    iplot(fig)

In [24]:
df['YEAR'].min()

1995.0

In [25]:
visualization_year(df, [1995, 2009])

In [26]:
df['YEAR'].max()

2023.0

In [27]:
visualization_year(df, [2010, 2023])

In [28]:
df_plot = df.groupby('YEAR')['MOVIEID'].count().reset_index()
df_plot['YEAR'] = df_plot['YEAR'].astype(int)

fig = px.bar(df_plot, x='YEAR', y='MOVIEID',
             title="Количество оценок по годам",
             labels={'MOVIEID':'Количество оценок',
                     'YEAR':'Год'})

fig.show()

# **Пользователи, у которых больше всего оценок**

In [29]:
df_plot = df['USERID'].astype(str).value_counts().reset_index()[:20] 

fig = px.bar(df_plot, x='USERID', y='count',
             title="Пользователи, у которых больше всего оценок",
             labels={'count':'Количество оценок',
                     'USERID':'USERID пользователя'})

fig.show()

In [30]:
df_plot = df['USERID'].astype(str).value_counts().reset_index()[:20]
df_plot['count'] = df_plot['count'] / movies.shape[0]* 100

fig = px.bar(df_plot, x='USERID', y='count',
             title="Пользователи, у которых больше всего оценок",
             labels={'count':'Процент оцененных фильмов Movielens, %',
                     'USERID':'USERID пользователя'})

fig.show()

# **Рейтинг фильма в зависимости от года выпуска**

In [31]:
df['year_of_release'] = df['TITLE'].str.extract("\((\d{4})\)", expand=True)
df['year_of_release'] = pd.to_datetime(df['year_of_release'], format='%Y')
df['year_of_release'] = df['year_of_release'].dt.year
df['TITLE'] = df['TITLE'].str[:-7]

In [32]:
df.head()

Unnamed: 0,MOVIEID,TITLE,GENRES,USERID,RATING,TIMESTAMP,YEAR,MONTH,TIME,DATE,WEEK_DAY,DAY_NUM,year_of_release
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,2008-11-03 17:52:19,2008.0,11.0,17:52:19,2008-11-03,Monday,0.0,1995.0
1,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,2.0,5.0,1996-06-26 19:06:11,1996.0,6.0,19:06:11,1996-06-26,Wednesday,2.0,1995.0
2,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,7.0,4.0,2000-11-18 03:27:04,2000.0,11.0,03:27:04,2000-11-18,Saturday,5.0,1995.0
3,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,10.0,3.0,2015-05-03 15:19:54,2015.0,5.0,15:19:54,2015-05-03,Sunday,6.0,1995.0
4,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,12.0,5.0,1997-05-01 15:32:18,1997.0,5.0,15:32:18,1997-05-01,Thursday,3.0,1995.0


In [33]:
movies['year_of_release'] = movies['TITLE'].str.extract("\((\d{4})\)", expand=True)
movies['year_of_release'] = pd.to_datetime(movies['year_of_release'], format='%Y')
movies['year_of_release'] = movies['year_of_release'].dt.year
movies['TITLE'] = movies['TITLE'].str[:-7]
movies.head()

Unnamed: 0,MOVIEID,TITLE,GENRES,year_of_release
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995.0
1,2,Jumanji,Adventure|Children|Fantasy,1995.0
2,3,Grumpier Old Men,Comedy|Romance,1995.0
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995.0
4,5,Father of the Bride Part II,Comedy,1995.0


In [34]:
movies.groupby('year_of_release')['TITLE'].nunique().sort_values().iplot(kind = 'bar', 
                                                       color = 'purple', 
                                                       title = 'Количество фильмов, вышедших в прокат по годам',
                                                        xTitle = 'Год выхода в прокат фильма',
                                                       yTitle = 'Количество фильмов')

In [35]:
df.groupby('year_of_release')['RATING'].mean().sort_values().iplot(kind = 'bar', 
                                                       color = 'purple', 
                                                       title = 'Средний рейтинг фильмов, вышедших в прокат по годам',
                                                        xTitle = 'Год выхода в прокат фильма',
                                                       yTitle = 'Средний рейтинг фильмов')

In [36]:
def visualization_movies(df):
    ratings_list = [(5.0, 4.0), (4.0, 3.0), (3.0, 2.0), (2.0, 1.0), (1.0, 0.0)]
    
    fig_array = []
    title_list = []
    
    data_full = df.groupby('year_of_release')['TITLE'].count().reset_index()
    data_full.sort_values(['year_of_release'], inplace = True)
    data_full.rename({'TITLE': 'TITLE_full'}, axis = 1, inplace=True)
    
    for i, r in enumerate(ratings_list):
        if i == 0:
            df_plot = df[(df['RATING'] >= r[1]) & (df['RATING'] <= r[0])].groupby('year_of_release')['TITLE'].count().reset_index()
            
        else:
            df_plot = df[(df['RATING'] >= r[1]) & (df['RATING'] < r[0])].groupby('year_of_release')['TITLE'].count().reset_index()

        df_plot = df_plot.merge(data_full, on = ['year_of_release'], how = 'left')
        df_plot['TITLE'] = df_plot['TITLE'] / df_plot['TITLE_full'] * 100
        df_plot.sort_values(['year_of_release'], inplace = True)
            
        fig = px.bar(df_plot, x='year_of_release', y='TITLE')
        fig_array.append(fig)
        title_list.append(f"Динамика по годам фильмов с рейтингом диапазоне {r}")

    fig = cf.subplots(fig_array, shape = (5, 1),
                      subplot_titles = title_list, vertical_spacing = 0.05)
    
    fig['layout']['height'] = len(ratings_list) * 500
    fig['layout']['title'] = 'Динамика по годам рейтингов фильмов, проценты'
    
    fig['layout'].update(showlegend = False)
    iplot(fig)

In [37]:
visualization_movies(df)

# **Рейтинг фильма в зависимости от жанра**

In [38]:
df.head(3)

Unnamed: 0,MOVIEID,TITLE,GENRES,USERID,RATING,TIMESTAMP,YEAR,MONTH,TIME,DATE,WEEK_DAY,DAY_NUM,year_of_release
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,2008-11-03 17:52:19,2008.0,11.0,17:52:19,2008-11-03,Monday,0.0,1995.0
1,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,2.0,5.0,1996-06-26 19:06:11,1996.0,6.0,19:06:11,1996-06-26,Wednesday,2.0,1995.0
2,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,7.0,4.0,2000-11-18 03:27:04,2000.0,11.0,03:27:04,2000-11-18,Saturday,5.0,1995.0


In [39]:
genres_list = list(np.unique(genres_list))
genres_list

['(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

In [40]:
df_plot = pd.DataFrame()

for genre in tqdm(genres_list):
    df_temp = df.loc[df['GENRES'].str.contains(genre, regex=False), ['RATING', 'USERID']].groupby('RATING')['USERID'].count().reset_index()
    df_temp['GENRE'] = genre
    df_plot = df_plot._append(df_temp, ignore_index=True)

100%|███████████████████████████████████████████| 20/20 [01:19<00:00,  3.99s/it]


In [41]:
df_plot.head()

Unnamed: 0,RATING,USERID,GENRE
0,0.5,2339,(no genres listed)
1,1.0,1796,(no genres listed)
2,1.5,1600,(no genres listed)
3,2.0,3317,(no genres listed)
4,2.5,4569,(no genres listed)


In [42]:
def visualization_genres_rating(df, list_agg):

    fig_array = []
    title_list = []
    
    for i, r in enumerate(list_agg):
            
        fig = px.bar(df_plot.loc[df_plot['GENRE'] == r], x='RATING', y='USERID')
        fig_array.append(fig)
        title_list.append(f"Динамика оценок по жанру {r}")

    fig = cf.subplots(fig_array, shape = (len(list_agg), 1),
                      subplot_titles = title_list)
    
    fig['layout']['height'] = len(list_agg) * 150
    fig['layout']['title'] = 'Динамика оценок по жанрам'
    
    fig['layout'].update(showlegend = False)
    iplot(fig)

In [43]:
visualization_genres_rating(df_plot, genres_list)

# **Жанры по дню недели просмотра**

In [44]:
df_plot = pd.DataFrame()

for genre in tqdm(genres_list):
    df_temp = df.loc[df['GENRES'].str.contains(genre, regex=False), ['WEEK_DAY', 'USERID', 'DAY_NUM']].groupby(['WEEK_DAY', 'DAY_NUM'])['USERID'].count().reset_index()
    df_temp['GENRE'] = genre
    df_temp.sort_values(by = 'DAY_NUM', inplace=True)
    df_plot = df_plot._append(df_temp, ignore_index=True)

100%|███████████████████████████████████████████| 20/20 [01:21<00:00,  4.05s/it]


In [45]:
df_plot.head()

Unnamed: 0,WEEK_DAY,DAY_NUM,USERID,GENRE
0,Monday,0.0,7823,(no genres listed)
1,Tuesday,1.0,7458,(no genres listed)
2,Wednesday,2.0,7322,(no genres listed)
3,Thursday,3.0,6977,(no genres listed)
4,Friday,4.0,7797,(no genres listed)


In [46]:
def visualization_genres_weekday(df, list_agg):

    fig_array = []
    title_list = []
    
    for i, r in enumerate(list_agg):
            
        fig = px.bar(df_plot.loc[df_plot['GENRE'] == r], x='WEEK_DAY', y='USERID')
        fig_array.append(fig)
        title_list.append(f"Динамика просмотров в зависимости от дня недели фильмов в жанре {r}")

    fig = cf.subplots(fig_array, shape = (len(list_agg), 1),
                      subplot_titles = title_list)
    
    fig['layout']['height'] = len(list_agg) * 150
    fig['layout']['title'] = 'Динамика просмотров по жанру в зависимости от дня недели'
    
    fig['layout'].update(showlegend = False)
    iplot(fig)

In [47]:
visualization_genres_weekday(df_plot, genres_list)

# **Жанры по дню просмотра**

In [48]:
df_plot = pd.DataFrame()

for genre in tqdm(genres_list):
    df_temp = df.loc[df['GENRES'].str.contains(genre, regex=False), ['DATE', 'USERID']].groupby('DATE')['USERID'].count().reset_index()
    df_temp['GENRE'] = genre
    df_temp.sort_values(by = 'DATE', inplace=True)
    df_plot = df_plot._append(df_temp, ignore_index=True)

100%|███████████████████████████████████████████| 20/20 [01:19<00:00,  3.97s/it]


In [49]:
df_plot.head()

Unnamed: 0,DATE,USERID,GENRE
0,2011-02-28,2,(no genres listed)
1,2011-03-02,1,(no genres listed)
2,2011-04-27,1,(no genres listed)
3,2011-05-25,1,(no genres listed)
4,2011-11-25,1,(no genres listed)


In [50]:
def visualization_genres_weekday(df, list_agg):

    fig_array = []
    title_list = []
    
    for i, r in enumerate(list_agg):
            
        fig = px.line(df_plot.loc[df_plot['GENRE'] == r], x='DATE', y='USERID')
        fig_array.append(fig)
        title_list.append(f"Динамика просмотров в зависимости от дня фильмов в жанре {r}")

    fig = cf.subplots(fig_array, shape = (len(list_agg), 1),
                      subplot_titles = title_list)
    
    fig['layout']['height'] = len(list_agg) * 250
    fig['layout']['title'] = 'Динамика просмотров по жанру в зависимости от дня'
    
    fig['layout'].update(showlegend = False)
    iplot(fig)

In [51]:
visualization_genres_weekday(df_plot, genres_list)

# **Жанры по месяцу просмотра**

In [52]:
df_plot = pd.DataFrame()

for genre in tqdm(genres_list):
    df_temp = df.loc[df['GENRES'].str.contains(genre, regex=False), ['MONTH', 'USERID']].groupby('MONTH')['USERID'].count().reset_index()
    df_temp['GENRE'] = genre
    df_temp.sort_values(by = 'MONTH', inplace=True)
    df_plot = df_plot._append(df_temp, ignore_index=True)

100%|███████████████████████████████████████████| 20/20 [01:17<00:00,  3.87s/it]


In [53]:
df_plot.head()

Unnamed: 0,MONTH,USERID,GENRE
0,1.0,5194,(no genres listed)
1,2.0,4778,(no genres listed)
2,3.0,5047,(no genres listed)
3,4.0,5029,(no genres listed)
4,5.0,5297,(no genres listed)


In [54]:
def visualization_genres_weekday(df, list_agg):

    fig_array = []
    title_list = []
    
    for i, r in enumerate(list_agg):
            
        fig = px.bar(df_plot.loc[df_plot['GENRE'] == r], x='MONTH', y='USERID')
        fig_array.append(fig)
        title_list.append(f"Динамика просмотров в зависимости от меясца фильмов в жанре {r}")

    fig = cf.subplots(fig_array, shape = (len(list_agg), 1),
                      subplot_titles = title_list)
    
    fig['layout']['height'] = len(list_agg) * 150
    fig['layout']['title'] = 'Динамика просмотров по жанру в зависимости от месяца'
    
    fig['layout'].update(showlegend = False)
    iplot(fig)

In [55]:
visualization_genres_weekday(df_plot, genres_list)