In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn import preprocessing 
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import ipywidgets as widgets
from ast import literal_eval
import numpy as np


# Turn off pandas' chained-assignment check for the whole notebook (pandas' default is 'warn').
pd.set_option('mode.chained_assignment', None)




In [None]:
# Read the movie metadata CSV and preview its first rows.
movie_data = pd.read_csv('movie_metadata2.csv', low_memory=False)

# `df` is a separate DataFrame object built from the same data; the plotting cells read from it.
df = pd.DataFrame(data=movie_data)
df.head(n=5)


In [None]:


# Re-read the metadata, then build a Series whose labels are the raw `genres`
# values and whose values are the matching row labels of `movie_data`.
movie_data = pd.read_csv('movie_metadata2.csv', low_memory=False)
indices = pd.Series(data=movie_data.index, index=movie_data['genres'])
indices = indices.drop_duplicates()

indices[:10]


In [None]:

# tfidf_vector = TfidfVectorizer(stop_words='english')

def _genre_names(raw):
    """Return the list of genre names held in one `genres` cell.

    Parameters
    ----------
    raw : str or list or other
        A string containing a Python-literal list of dicts with a 'name' key
        (parsed with `literal_eval`), or a list that was already parsed.

    Returns
    -------
    list
        The 'name' of every dict entry. Entries that are not dicts are passed
        through unchanged, which is what makes a second run of this cell a
        no-op instead of an error. Anything that is not a list yields [].
    """
    if isinstance(raw, str):
        raw = literal_eval(raw)
    if not isinstance(raw, list):
        return []
    return [entry['name'] if isinstance(entry, dict) else entry for entry in raw]


# Missing cells are treated as an empty list literal before parsing.
movie_data['genres'] = movie_data['genres'].fillna('[]').apply(_genre_names)

#tfidf_matrix = tfidf_vector.fit_transform(movie_data['genres'])


In [None]:
# Inspect the raw vote_count column.
movie_data.loc[:, 'vote_count']

In [None]:
# Keep only the non-missing entries of each column and cast them to int.
vote_count = movie_data['vote_count'].dropna().astype('int')

vote_averages = movie_data['vote_average'].dropna().astype('int')

# C: mean of the int-cast vote averages over all movies that have one.
C = vote_averages.mean()

C

In [None]:
# m: the 0.95 quantile of the vote counts.
m = vote_count.quantile(q=0.95)
m

In [None]:
# Inspect the raw release_date column.
movie_data.loc[:, 'release_date']

In [None]:
# Preview the parsed dates; values that cannot be parsed are coerced to NaT.
movie_data['release_date'].pipe(pd.to_datetime, errors='coerce')

In [None]:
# Four-digit release year as a string; rows whose date cannot be parsed stay missing.
# The previous per-row test `x != np.nan` was always True (NaN never compares
# equal to anything), so unparseable dates ended up as the literal string 'NaT'.
# Formatting through the .dt accessor leaves NaT entries as missing values instead.
movie_data['year'] = pd.to_datetime(movie_data['release_date'], errors='coerce').dt.strftime('%Y')

In [None]:
# Movies with at least `m` votes and both vote fields present, reduced to the
# columns used for ranking; the two vote columns are cast to int.
qualified = (
    movie_data
    .loc[
        (movie_data['vote_count'] >= m)
        & movie_data['vote_count'].notnull()
        & movie_data['vote_average'].notnull(),
        ['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres'],
    ]
    .astype({'vote_count': 'int', 'vote_average': 'int'})
)
qualified.shape

In [None]:
def weight_rate(x):
    """Blend a row's own rating with the overall mean rating.

    Reads 'vote_count' and 'vote_average' from `x` and the notebook globals
    `m` and `C`. The row's rating is weighted by votes / (votes + m) and `C`
    by m / (votes + m), so rows with few votes are pulled towards `C`.

    Parameters
    ----------
    x : mapping-like
        Anything indexable by 'vote_count' and 'vote_average'.

    Returns
    -------
    The weighted rating computed from those two values.
    """
    votes, rating = x['vote_count'], x['vote_average']
    own_share = votes / (votes + m) * rating
    prior_share = m / (m + votes) * C
    return own_share + prior_share

In [None]:
# `weight_rate` only indexes columns and does arithmetic, so it can take the
# whole frame at once: one vectorised pass instead of a Python call per row.
# Unlike row-wise apply, this also yields a plain empty column when
# `qualified` has no rows.
qualified['wr'] = weight_rate(qualified)

In [None]:
# Show the 250 best rows by weighted rating. A bare `qualified.head()` used to
# precede this line, but only a cell's last expression is rendered, so it
# produced no output and has been removed.
# NOTE(review): the sorted frame is only displayed; `qualified` keeps its original row order.
qualified.sort_values('wr', ascending=False).head(250)

In [None]:
# First 15 rows of `qualified`, in its current order.
qualified.iloc[:15]

In [None]:
def update_df_length(limit, path='movie_metadata2.csv'):
    """Load the movie metadata and keep only its first `limit` rows.

    Written to be used as a widget callback: create the widget once, outside
    this function, e.g. `widgets.interactive(update_df_length, limit=...)`.
    The previous version built a new, never-displayed widget on every call,
    printed its message without any number, and discarded the truncated frame.

    Parameters
    ----------
    limit : int
        Number of leading rows to keep (used as the stop of `iloc[0:limit]`).
    path : str, optional
        CSV file to read; defaults to the notebook's metadata file.

    Returns
    -------
    pandas.DataFrame
        The first `limit` rows of the file.
    """
    df = pd.read_csv(path, low_memory=False)

    # Report the size of the whole file, as the message says, before truncating.
    print("Number of rows in file.", len(df))

    return df.iloc[0:limit, :]

In [None]:
# One entry per (movie, genre) pair, indexed by the movie's row label.
# `explode` expands each list in a single vectorised step instead of building a
# Series per row; movies with an empty genre list come out as missing values
# and are dropped.
s = movie_data['genres'].explode().dropna()

In [None]:
# Name the per-genre Series 'genre' and attach it to the metadata in place of
# the list-valued 'genres' column; the join matches on the row index.
s = s.rename('genre')
gen_md = movie_data.drop(columns=['genres']).join(s)



Below is the function that ranks the top movies within a single genre by weighted rating.

In [None]:
def top_movie(genre, n_percentile = .85, n_results = 29, data = None):
    """Rank the movies of one genre by a vote-weighted rating.

    Within the genre, `C` is the mean of the int-cast vote averages and `m` is
    the `n_percentile` quantile of the int-cast vote counts. Movies with at
    least `m` votes are scored as
    ``v / (v + m) * R + m / (m + v) * C`` with ``v`` the vote count and ``R``
    the int-cast vote average, then sorted from highest to lowest score.

    The score is computed column-wise. The former row-wise `apply` returned a
    whole DataFrame when no movie qualified (e.g. an unknown genre), which
    cannot be stored in a single column; an empty selection now simply yields
    an empty result.

    Parameters
    ----------
    genre : str
        Value to match in the 'genre' column.
    n_percentile : float, optional
        Quantile of the genre's vote counts used as the cut-off `m`.
    n_results : int, optional
        Maximum number of rows returned (previously hard-coded to 29).
    data : pandas.DataFrame, optional
        Frame with 'genre', 'title', 'year', 'vote_count', 'vote_average' and
        'popularity' columns. Defaults to the notebook-level `gen_md`.

    Returns
    -------
    pandas.DataFrame
        Columns 'title', 'year', 'vote_count', 'vote_average', 'popularity'
        and 'weighted_average', best-scored rows first.
    """
    source = gen_md if data is None else data
    df = source[source['genre'] == genre]

    vote_count = df['vote_count'].dropna().astype('int')
    vote_avg = df['vote_average'].dropna().astype('int')
    C = vote_avg.mean()
    m = vote_count.quantile(n_percentile)

    keep = (df['vote_count'] >= m) & df['vote_count'].notnull() & df['vote_average'].notnull()
    columns = ['title', 'year', 'vote_count', 'vote_average', 'popularity']
    # .loc with a boolean mask followed by astype gives an independent frame,
    # so adding a column below never writes into `source`.
    qualified = df.loc[keep, columns].astype({'vote_count': 'int', 'vote_average': 'int'})

    v = qualified['vote_count']
    R = qualified['vote_average']
    qualified['weighted_average'] = (v / (v + m) * R) + (m / (m + v) * C)

    return qualified.sort_values('weighted_average', ascending = False).head(n_results)

# Example: the ranking for the 'Romance' genre with the default percentile.
top_movie(genre='Romance')

Unnamed: 0,title,year,vote_count,vote_average,popularity,weighted_average
24,Leaving Las Vegas,1995,365,7,10.332025,6.714638


In [None]:
plt.figure(figsize = (12,4))
plt.rcParams["axes.grid"] = False
plt.style.use('dark_background')
%matplotlib inline
#lt.hist(vote_average['vote_count'], bins= 28, color='tab: purple')
#plt.ylabel('Ratings Count (Scaled)', fontsize=16)
df['vote_average'].plot(kind = "hist")
plt.ylabel('Ratings Average', fontsize=12)




In [None]:
plt.figure(figsize=(10, 8))
plt.rcParams.update({"axes.grid": False})
plt.style.use('dark_background')
# NOTE(review): this draws one wedge per row of `df`; confirm that a pie of raw
# per-movie vote counts is really the intended chart.
df['vote_count'].plot.pie().set_ylabel('Ratings Count', fontsize=16, color='purple')




In [None]:
# Horizontal bar chart of the vote counts of the first 12 rows of `df`,
# labelled with those rows' `genres` values.
fig, ax = plt.subplots(figsize = (16, 9))

# NOTE(review): this rebinds the notebook-level `vote_count` (earlier the full
# int-cast column) to a 12-row slice; re-running an earlier cell that reads
# `vote_count` after this one will see the slice. Kept as-is in case later
# cells rely on these names.
genres = df['genres'].head(12)
vote_count = df['vote_count'].head(12)

ax.barh(genres, vote_count)

# Hide the frame around the plot. The loop variable used to be `s`, which
# overwrote the notebook-level genre Series `s` with the string 'right'.
for side in ['top', 'bottom', 'left', 'right']:
    ax.spines[side].set_visible(False)

# No tick marks, with some padding between the axes and the tick labels.
ax.xaxis.set_ticks_position('none')
ax.yaxis.set_ticks_position('none')

ax.xaxis.set_tick_params(pad = 5)
ax.yaxis.set_tick_params(pad = 10)

ax.grid(visible = True, color = 'grey', linestyle = '-.', linewidth = 0.5, alpha = 0.2)

# First row at the top.
ax.invert_yaxis()

# Write each bar's value just past its end.
for bar in ax.patches:
    ax.text(bar.get_width() + 0.2, bar.get_y() + 0.5, str(round(bar.get_width(), 2)), fontsize = 10, fontweight = 'bold', color = 'grey')

ax.set_title('Genre Collections And Their Vote Counts', loc = 'left')

plt.show()