In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

import os
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
RND_ST = 12345

In [2]:
from IPython.core.display import display, HTML
display(HTML("""
<style>
.container { 
   width:90% !important; 
   position: relative; 
   right: 25px; 
}
</style>
"""))

  from IPython.core.display import display, HTML


In [3]:
def pth_exist(pth):
    if os.path.exists(pth):
        df = pd.read_csv(pth)
        return df
    else:
        print('Путь не таков')
        return None

In [4]:
df = pth_exist(r'C:\Users\temoc\OneDrive\Рабочий стол\kaggle\reccom\data\TMDB_movie_dataset_v11.csv')

In [5]:
def info_func(df):
    print('Общая информация')
    print('')
    df.info()
    print('')
    print(df.describe())
    print('')
    print('Кол-во пропущенных значений')
    print('')
    print(df.isna().sum().sort_values(ascending=False))
    print('')
    print('Процент пропущенных значений')
    print('')
    print((df.isna().sum()/len(df)*100).sort_values(ascending=False))
    print('')
    print('Как выглядит датасет')
    print('')
    print(df.head(10))

In [6]:
info_func(df)

Общая информация

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1151022 entries, 0 to 1151021
Data columns (total 24 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   id                    1151022 non-null  int64  
 1   title                 1151009 non-null  object 
 2   vote_average          1151022 non-null  float64
 3   vote_count            1151022 non-null  int64  
 4   status                1151022 non-null  object 
 5   release_date          957813 non-null   object 
 6   revenue               1151022 non-null  int64  
 7   runtime               1151022 non-null  int64  
 8   adult                 1151022 non-null  bool   
 9   backdrop_path         304206 non-null   object 
 10  budget                1151022 non-null  int64  
 11  homepage              121619 non-null   object 
 12  imdb_id               606248 non-null   object 
 13  original_language     1151022 non-null  object 
 14  original_title  

Возможна фильтрация данных - отсечение фильмов с малым кол-вом оценок (определить можно через 95%). Нужно ли? Тогда эти фильмы может вообще никто не посмотреть. А данные о фильмах обновляются во времени (оценки, кол-во оценок). Тем более есть примеры фильмов которые в свое время провалились, но спустя время стали не только популярными, но и культовыми (Бойцовский клуб, Большой Лебовски итд).

Можно построить разные варианты рекомендательной системы
* просто советовать самые популярные фильмы, по рейтингу (наверное может подойти, скорее не как рекомендация, а как рейтинг самых популярных фильмов). Можно добавить фильтрацию по какому-нибудь определенному жанру, году, возрастному ограничению. 
* советовать фильм на основе другого фильма. сравнивать фильмы по схожести описания, слогана, состава команды, актеров, жанра, кейвордов. вроде как косунисуное сходство.
* на основе опыта (его оценик опредленного фильма) другого юзера - определить насколько мне понравится фильм
* совмещение 2 и 3 способа. это что то типа "мы вам рекомендуем вот это на основе того, что вы смотрели раньше".

для того, что я предпологал сделать, а именно вписать 3 фильма которые понравились пользователю, и на основе их подобрать 10 фильмов (возможно добавить кнопку "смотрел" около выданных фильмов, чтобы заменить на фильм дальше (11 фильм, 12, 13 итд)) которые могли бы понравится пользователю - скорее всего подходит 2 вариант. 1 вариант можно внедрить просто как рейтинг фильмов, которые "культовые" и стоит посмотреть в не зависимости от предпочтений юзера. 3 и 4 фильмы видимо требуют аккаунт системы, чтобы можно было сохранять историю просмотров по юзерам (видимо +сервера тут же) - что мне недоступно, если я собираюсь внедрить это на какой нибудь сайт сделанный на fastapi. но в качестве кода можно конечно 3 и 4 вариант реализовать просто в коде на основе датасета этого.  

по текущим задачам:
* реализовать в коде второй вариант
* разобраться в fastapi и реализовать в нем первый пункт
* (бонус) как вариант добавить окошко в котором можно будет ввести фильм и данные о нем, а модель предскажет его рейтинг, условно вышел новый фильм и пользователь думает сходить или нет, а модель выдает предположительный рейтинг основываясь на данных о фильме 

# Основная часть

In [11]:
df_first = df[
    (df['vote_average'] > 0) & 
    (~df['adult']) & 
    (df['keywords'].notna()) & 
    (df['tagline'].notna()) & 
    (df['genres'].notna()) 
].sample(frac=0.02, random_state=RND_ST)

In [118]:
df_second = df[
    (df['vote_average'] > 0) & 
    (~df['adult']) & 
    (df['keywords'].notna()) & 
    (df['tagline'].notna()) & 
    (df['genres'].notna()) 
].sample(frac=0.5, random_state=RND_ST)

In [12]:
float_rows = df_first[df_first['keywords'].apply(lambda x: isinstance(x, float))]

print("Строки с float значениями:")
print(float_rows['keywords'])

Строки с float значениями:
Series([], Name: keywords, dtype: object)


In [13]:
df_first.head(10)

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,original_title,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords
340552,322655,Kylie Minogue: Kiss Me Once - Live at the SSE ...,10.0,1,Released,2015-03-23,0,110,False,/xVZJ5NcN01LrRi7OsRuXXhzcBt2.jpg,...,Kylie Minogue: Kiss Me Once - Live at the SSE ...,Live at The SSE Hydro in Glasgow. The thirteen...,1.531,/zGV4DOaYAtkUZ7fulMesshTIJ8E.jpg,Live at the SSE Hydro,Music,Parlophone,United Kingdom,English,"concert, glasgow, scotland"
256863,1378472,Jack's Lantern,9.0,1,Released,2024-10-27,0,24,False,/ffZkPF1wv2mmAmBOoDkbHR530uM.jpg,...,Jack's Lantern,"This suspenseful Halloween drama, is a proof o...",0.0,/bTctmUCP8kycLJgbB7auCNX8ZfD.jpg,"It's Halloween night, and Jack wants to carve ...","Drama, Thriller",,United States of America,,"halloween, pumpkin carving, drama, indie film,..."
267143,1393449,The Wildman Massacre,10.0,1,Released,2024-11-15,0,90,False,/8aOPvHKGF1Pa2Mfkjb6PMrOIAPI.jpg,...,The Wildman Massacre,"A hunter searches for his daughter, who has be...",1.4,/yoHzdDDsWI8nBauOmuoyCqRanCg.jpg,Some secrets don't stay buried.,"Horror, Comedy, Mystery, Drama",Night Cap Canada,Canada,English,"bigfoot, mysterious, shocking, sasquatch, bigf..."
4212,10658,Howard the Duck,5.394,911,Released,1986-08-01,37962774,110,False,/t3wX0hMH0esnx4SZEbugYCnCoMl.jpg,...,Howard the Duck,A scientific experiment unknowingly brings ext...,27.691,/eU0dWo8PJgsSAZFbcyHiUpuLSyW.jpg,You will believe that a duck can talk.,"Comedy, Fantasy, Science Fiction","Lucasfilm Ltd., Universal Pictures, Marvel Ent...",United States of America,English,"duck, based on comic, anthropomorphism, extrat..."
54768,125413,Fear Strikes Out,7.1,17,Released,1957-03-20,0,100,False,/4SU6I3O80HiNinufaWYQKaMJF3.jpg,...,Fear Strikes Out,"True story of the life of Jimmy Piersall, who ...",4.411,/f4RPc5nOxMaAlTlzsc5YdcPB09s.jpg,"This is Jimmy, balanced on the ragged edge of ...",Drama,Paramount,United States of America,English,"baseball, mental breakdown, psychiatrist, over..."
1313,2253,Valkyrie,6.909,3384,Released,2008-12-25,200276000,121,False,/h1bNQ2MZzKLuKa7N0nOuSXlgMwT.jpg,...,Valkyrie,"Wounded in Africa during World War II, Nazi Co...",19.493,/8eoRtXswC8IQDsqW7iJWO56NhAv.jpg,Many saw evil. They dared to stop it.,"Drama, Thriller, War","United Artists, Bad Hat Harry Productions, Ach...","Germany, United States of America","English, German","friendship, suicide, plan, berlin, germany, hu..."
45510,38021,Changing Times,5.4,23,Released,2004-01-01,0,90,False,/oEOwWlYlNmqJfYGJ9P30iYkgvuR.jpg,...,Les Temps qui changent,"In Tangiers where he traveled for his work, a ...",2.466,/cUF2Dbtsayn0q2KebksAzvbLlwN.jpg,Can your first love also be your last?,"Drama, Romance","Agora Films, Gemini Films, TPS Star, Banque Po...",France,French,"marriage, morocco, tangier morocco, twins, fir..."
29122,888097,Slayers,4.315,46,Released,2022-10-21,0,88,False,/nKdCl2roOng0DiLRgk4W3P4RTSi.jpg,...,Slayers,A group of superstar influencers are drawn to ...,9.252,/1dgNc7FB5RMm6w4D6S0UpZhsUbW.jpg,Vengeance bites back.,"Horror, Comedy, Fantasy","BondIt Media Capital, Fasehun Films, Film Mode...",United States of America,English,vampire
5824,354861,Father Figures,5.797,577,Released,2017-12-21,25601244,125,False,/diBOwmjmy0RPXs2oRxGsmfMDxsK.jpg,...,Father Figures,Upon learning that their mother has been lying...,11.101,/avRiH6kbsw3Wxi1J7857alb9yyt.jpg,Finding Their Father Would Be a Family Miracle,Comedy,"Alcon Entertainment, The Montecito Picture Com...",United States of America,English,"florida, hitchhiker, twins, entrepreneur, divo..."
19230,124623,I Know That Voice,7.049,91,Released,2014-01-07,0,95,False,/p4r3FHgoELOxIoYlF9QVUBskfIQ.jpg,...,I Know That Voice,Filmmaker Lawrence Shapiro discusses voice-ove...,7.933,/eIYd9xNhFSyKhvd9oBvXq9kXdc6.jpg,A documentary that puts a face to the voices w...,Documentary,"Dundee Entertainment, Cinovative, Record Farm ...",United States of America,English,"voice acting, film history"


In [14]:
#pulp_fiction = df[df['title'].str.contains('Pulp Fiction', case=False, na=False)]

#if not pulp_fiction.empty:
    #df_first = pd.concat([df_first, pulp_fiction]).drop_duplicates()

In [15]:
df_first['tagline'] = df_first['tagline'].fillna('')
df_first['description'] = df_first['overview'] + " " + df_first['tagline']
df_first['description'] = df_first['description'].fillna('')

In [16]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0.01, stop_words='english')
tf_matrix = tf.fit_transform(df_first['description'])

In [20]:
tf_matrix.shape

(1074, 456)

In [21]:
cos_sim = linear_kernel(tf_matrix, tf_matrix)

In [22]:
cos_sim[0]

array([1., 0., 0., ..., 0., 0., 0.])

In [23]:
df_first = df_first.reset_index()
titles = df_first['title']
indices = pd.Series(df_first.index, index=df_first['title'])

In [24]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cos_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [25]:
get_recommendations('Godzilla vs. Kong').head(10)

912                  High as Mike
367    Wristcutters: A Love Story
878               Passage to Mars
656                 Six of a Kind
925                 Man About Dog
483        Mr. Mouse Takes a Trip
798                    Navigation
269              A Five Star Life
77         To get lost on purpose
700              Lost in Thailand
Name: title, dtype: object

In [26]:
keywords = df_first['keywords'].value_counts()
keywords[:10]

keywords
woman director            16
short film                 5
based on novel or book     4
found footage              3
movie serial               3
orphan                     2
murder                     2
adult animation            2
softcore                   2
wrestling                  2
Name: count, dtype: int64

In [27]:
stemmer = SnowballStemmer('english')

In [45]:
df_first['keywords'] = df_first['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])

In [49]:
df_first['keywords'] = df_first['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [53]:
df_first['all'] = df_first.apply(lambda row: ' '.join(row['keywords']) + ' '.join(row['genres']), axis=1)
df_first['all'] = df_first['all'].apply(lambda x: ' '.join(x))

In [67]:
print(df_first['all'].head(10))
print(df_first['all'].str.len().describe())

0    c   o   n   c   e   r   t   ,     g   l   a   ...
1    h   a   l   l   o   w   e   e   n   ,     p   ...
2    b   i   g   f   o   o   t   ,     m   y   s   ...
3    d   u   c   k   ,     b   a   s   e   d     o ...
4    b   a   s   e   b   a   l   l   ,     m   e   ...
5    f   r   i   e   n   d   s   h   i   p   ,     ...
6    m   a   r   r   i   a   g   e   ,     m   o   ...
7    v   a   m   p   i   r   e H   o   r   r   o   ...
8    f   l   o   r   i   d   a   ,     h   i   t   ...
9    v   o   i   c   e     a   c   t   i   n   g   ...
Name: all, dtype: object
count    1074.000000
mean      307.270019
std       252.209438
min        31.000000
25%       137.000000
50%       227.000000
75%       396.500000
max      1797.000000
Name: all, dtype: float64


In [69]:
df_first['all'] = df_first['all'].str.replace(r'\s+', '', regex=True)

print(df_first['all'].head(10))

0                        concert,glasgow,scotlandMusic
1    halloween,pumpkincarving,drama,indiefilm,suspe...
2    bigfoot,mysterious,shocking,sasquatch,bigfooth...
3    duck,basedoncomic,anthropomorphism,extraterres...
4    baseball,mentalbreakdown,psychiatrist,overbear...
5    friendship,suicide,plan,berlin,germany,husband...
6    marriage,morocco,tangiermorocco,twins,firstlov...
7                         vampireHorror,Comedy,Fantasy
8    florida,hitchhiker,twins,entrepreneur,divorced...
9                   voiceacting,filmhistoryDocumentary
Name: all, dtype: object


In [71]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0.01, stop_words=None)
count_matrix = count.fit_transform(df_first['all'])

In [73]:
print(count.vocabulary_)

{'drama': 13, 'thriller': 49, 'comedy': 8, 'mystery': 34, 'fantasy': 19, 'sciencefiction': 42, 'suicide': 47, 'worldwarii': 55, 'war': 51, 'romance': 41, 'murder': 31, 'music': 32, 'california': 7, 'drama romance': 14, 'horror': 24, 'family': 18, 'action': 0, 'adventure': 2, 'action adventure': 1, 'horror thriller': 25, 'police': 38, 'mystery thriller': 35, 'basedonnovelorbook': 4, 'western': 52, 'newyorkcity': 36, 'sheriff': 45, 'monster': 30, 'documentary': 11, 'revenge': 40, 'womandirectordrama': 54, 'highschool': 22, 'womandirector': 53, 'sequel': 43, 'animation': 3, 'crime': 10, 'history': 23, 'zombie': 56, 'drama thriller': 15, 'dystopia': 16, 'comingofage': 9, 'lgbt': 27, 'filmnoir': 20, 'martialarts': 29, 'love': 28, 'dog': 12, 'england': 17, 'parentchildrelationship': 37, 'tvmovie': 50, 'basedontruestory': 5, 'serialkiller': 44, 'supernatural': 48, 'musical': 33, 'gangster': 21, 'kidnapping': 26, 'biography': 6, 'pre': 39, 'slasher': 46}


In [75]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [77]:
df_first= df_first.reset_index()
titles = df_first['title']
indices = pd.Series(df_first.index, index=df_first['title'])

In [79]:
get_recommendations('Godzilla vs. Kong').head(10)

912                  High as Mike
367    Wristcutters: A Love Story
878               Passage to Mars
656                 Six of a Kind
925                 Man About Dog
483        Mr. Mouse Takes a Trip
798                    Navigation
269              A Five Star Life
77         To get lost on purpose
700              Lost in Thailand
Name: title, dtype: object

In [81]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [83]:
m = df_first['vote_count'].quantile(0.95) 

In [85]:
C = df_first['vote_average'].mean() 

In [87]:
df_first['year'] = pd.to_datetime(df_first['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [93]:
def improved_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = df_first.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year']]
    
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('float')
    
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    
    qualified = movies[
        (movies['vote_count'] >= m) &
        (movies['vote_count'].notnull()) &
        (movies['vote_average'].notnull())
    ].copy()
    
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('float')
    
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified

In [95]:
improved_recommendations('Godzilla vs. Kong')

Unnamed: 0,title,vote_count,vote_average,year,wr
612,The SpongeBob SquarePants Movie,2618,6.995,2004,6.655634
1045,Minuscule 2: Mandibles from Far Away,174,7.103,2019,6.226224
179,Weird: The Al Yankovic Story,292,6.733,2022,6.223941
413,Lego Batman: The Movie - DC Super Heroes Unite,238,6.5,2013,6.181444
640,Citizen Ruth,117,6.564,1996,6.16448
227,Gnome Alone,225,6.084,2017,6.131029
300,Bigger Fatter Liar,67,5.328,2017,6.106758
252,The Big Hit,372,5.843,1998,6.084732
650,The In-Laws,250,5.6,2003,6.068814
115,Asterix & Obelix Take on Caesar,1705,5.985,1999,6.061297


### Основная часть но теперь вместо одного фильма вставляем три

In [None]:
def improved_recommendations_multiple(titles):
    # Проверка, что введён хотя бы один фильм
    if len(titles) == 0 or len(titles) > 3:
        raise ValueError("Введите от одного до трёх названий фильмов.")
    
    # Получение индексов для всех введённых фильмов
    indices_list = [indices[title] for title in titles if title in indices]
    
    if not indices_list:
        raise ValueError("Ни один из введённых фильмов не найден в базе данных.")
    
    # Вычисление суммарного сходства для всех введённых фильмов
    sim_scores = sum([list(enumerate(cosine_sim[idx])) for idx in indices_list], [])
    
    # Группировка сходства по индексу и суммирование их значений
    from collections import defaultdict
    combined_scores = defaultdict(float)
    for idx, score in sim_scores:
        combined_scores[idx] += score
    
    # Сортировка фильмов по комбинированным сходствам
    sorted_scores = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
    sorted_scores = sorted_scores[len(titles):26]  # Исключение исходных фильмов и ограничение на топ-25
    
    movie_indices = [i[0] for i in sorted_scores]
    
    # Извлечение данных о фильмах
    movies = df_first.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year']]
    
    # Расчёт среднего рейтинга и порога голосов
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('float')
    
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    
    # Отбор фильмов с достаточным количеством голосов и хорошими оценками
    qualified = movies[
        (movies['vote_count'] >= m) &
        (movies['vote_count'].notnull()) &
        (movies['vote_average'].notnull())
    ].copy()
    
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('float')
    
    # Расчёт взвешенного рейтинга
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    
    # Сортировка по рейтингу и возврат топ-10 фильмов
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified

# Реализация первого варианта для самых популярных фильмов

In [137]:
qualified = df_second[(df_second['vote_count'] >= m) & (df_second['vote_count'].notnull()) & (df_second['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('int')
qualified.shape

(1215, 6)

In [139]:
qualified['wr'] = qualified.apply(weighted_rating, axis=1)

In [141]:
qualified = qualified.sort_values('wr', ascending=False).head(250)

In [143]:
qualified.head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,wr
0,Inception,2010,34495,8,83.952,"Action, Science Fiction, Adventure",7.911834
1,Interstellar,2014,32571,8,140.241,"Adventure, Drama, Science Fiction",7.906887
2,The Dark Knight,2008,30619,8,130.643,"Drama, Action, Crime, Thriller",7.901265
6,Avengers: Infinity War,2018,27713,8,154.34,"Adventure, Action, Science Fiction",7.891515
14,The Shawshank Redemption,1994,24649,8,122.61,"Drama, Crime",7.878906
15,Avengers: Endgame,2019,23857,8,91.756,"Adventure, Science Fiction, Action",7.875156
18,Joker,2019,23425,8,54.522,"Crime, Thriller, Drama",7.87301
20,The Lord of the Rings: The Return of the King,2003,22334,8,99.276,"Adventure, Fantasy, Action",7.867249
30,Inglourious Basterds,2009,20746,8,59.162,"Drama, Thriller, War",7.857863
38,Harry Potter and the Prisoner of Azkaban,2004,20038,8,124.386,"Adventure, Fantasy",7.853236


#### Выше топ фильмов, ниже по жанрам

In [126]:
df_second['year'] = pd.to_datetime(df_second['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [128]:
s = df_second.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
gen_df_second = df_second.drop('genres', axis=1).join(s)

In [129]:
def build_chart(genre, percentile=0.85):
    df = gen_df_second[gen_df_second['genre'] == genre]
    vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)
    
    qualified = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity']]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    
    qualified['wr'] = qualified.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C), axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(250)
    
    return qualified

In [134]:
build_chart('Horror').head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
1563,The Texas Chain Saw Massacre,1974,2872,7,40.426,6.92227
2309,Dawn of the Dead,1978,1865,7,30.73,6.882386
2319,Climax,2018,1859,7,16.828,6.882025
3455,The Wicker Man,1973,1183,7,25.576,6.819746
4218,Possession,1981,909,7,20.393,6.770678
4220,The Exorcism of God,2022,909,7,70.339,6.770678
6151,The Changeling,1980,531,7,20.266,6.632768
8005,Black Sabbath,1963,358,7,10.778,6.493307
8362,Lights Out,2013,334,7,5.502,6.465128
12890,Ghost Mansion,2021,173,7,21.263,6.146835
