In [None]:
#Some very important libraries

import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.dates
import seaborn as sns

In [None]:
#Read the joined dataset from disk and preview its first ten rows

movies = pd.read_csv(filepath_or_buffer='Join_TMDB_Filtered.csv')
movies.head(n=10)

In [None]:
#Ratings file: the rating each Movielens user gave to each film they watched

ratings = pd.read_csv("./ml-dataset/ratings.csv").assign(
    #The timestamp column is interpreted as seconds (unit='s') and turned into datetimes
    timestamp=lambda frame: pd.to_datetime(frame["timestamp"], unit='s')
)

ratings.head(n=10)

In [None]:
#Every movielens ID present in the movies dataset, as a plain Python list

list_ids = [movie_id for movie_id in movies["movieId"]]

list_ids

In [None]:
#The tmdb_cast column arrives as one long string instead of a real data structure,
#so actor names are recovered by cutting the string at every "'name': " marker

cast_all = []
cast_main = []
for cast_text in movies["tmdb_cast"]:
    #Everything before the first marker is not a name, hence the [1:]
    name_chunks = cast_text.split("'name': ")[1:]
    #A name runs up to the next comma; surrounding quote characters are stripped
    actors_list = [
        chunk.split(',')[0].replace("'", "").replace('"', '')
        for chunk in name_chunks
    ]
    cast_all.append(actors_list)
    #The first three names listed are treated as the main actors
    cast_main.append(actors_list[:3])

In [None]:
#Attach the parsed cast data to the main dataset: full cast first, then the main actors
for column_name, column_values in (("actors_list", cast_all), ("main_actors", cast_main)):
    movies[column_name] = column_values

In [None]:
#Preview the first five rows, now including the new cast columns
movies.head(n=5)

In [None]:
#Look at the raw tmdb_info string of the first movie to see what has to be parsed
movies["tmdb_info"].iloc[0]

In [None]:
#Parsing the stringified tmdb_info column with the same split-based approach used for the cast.
#Each scalar field is located by its "'<key>': " marker and read up to the next delimiter.

def _tmdb_field(info_str, key, terminator=","):
    """Return the text that follows "'<key>': " in info_str, up to terminator, with quote characters removed.

    Raises IndexError when the key marker does not occur in info_str.
    """
    return info_str.split("'" + key + "': ")[1].split(terminator)[0].replace("'","").replace('"','')

production_companies=[]
release_dates=[]
votes_average=[]
votes_count=[]
revenues=[]
budgets=[]
#Iterating the column directly avoids building a full row object for every movie
for info_str in movies["tmdb_info"]:
    budget=int(_tmdb_field(info_str,"budget"))
    vote_average=float(_tmdb_field(info_str,"vote_average"))
    #vote_count is read up to the closing brace instead of a comma
    vote_count=int(_tmdb_field(info_str,"vote_count","}"))
    release_date=_tmdb_field(info_str,"release_date")
    revenue=int(_tmdb_field(info_str,"revenue"))

    companies_text=info_str.split("'production_companies': ")[1]
    if companies_text.startswith("[]"):
        #An empty list has no "}]" of its own, so splitting on it would run on into
        #whatever follows and pick up names that are not production companies
        production=[]
    else:
        production_spl=companies_text.split("}]")[0].split("'name': ")
        #Each company name runs up to the next comma; quote characters are stripped
        production=[
            chunk.split(',')[0].replace("'","").replace('"','')
            for chunk in production_spl[1:]
        ]

    #Note: this loop only fills the tmdb_info lists; cast_all is left untouched here
    production_companies.append(production)
    release_dates.append(release_date)
    votes_average.append(vote_average)
    votes_count.append(vote_count)
    revenues.append(revenue)
    budgets.append(budget)

In [None]:
#Add one column per field extracted from tmdb_info, in this order, then preview ten rows

parsed_columns = {
    'production_companies': production_companies,
    'release_date': release_dates,
    'tmdb_vote_avg': votes_average,
    'vote_count': votes_count,
    'revenue': revenues,
    'budget': budgets,
}
for column_name, column_values in parsed_columns.items():
    movies[column_name] = column_values
movies.head(n=10)

In [None]:
#Calculating P&L = Revenue - Budget
#Done as one vectorised column subtraction instead of a Python-level loop over rows;
#the result is kept as a plain list, one value per movie in row order

P_L=(movies["revenue"] - movies["budget"]).tolist()

In [None]:
#Store the per-movie P&L values as a new column and preview ten rows
movies["P_L"] = P_L
movies.head(n=10)

In [None]:
#Write the enriched movies dataset (parsed cast, tmdb_info fields and P&L) to a csv file
movies.to_csv(path_or_buf='full_movies.csv')

## Ratings vs Actors

In [None]:
#Creating a list with the distinct main-actor names, in order of first appearance.
#dict.fromkeys de-duplicates while keeping insertion order, which replaces the
#repeated linear "in" scans over an ever-growing list.

act=list(movies["main_actors"])
all_actors=list(dict.fromkeys(actor for sub in act for actor in sub))

In [None]:
#Show the column labels currently present in the movies dataset
movies.columns

In [None]:
# len(all_actors)

In [None]:
#Analysing Movielens Avg Rating, Avg Budget, Avg Revenue, Movie Count and Avg P&L per Movie with actor in main characters:
#  - the rating is weighted by each movie's number of Movielens reviews
#  - budget, revenue and P&L are plain averages over the actor's movies
#The movies table is walked once and totals are accumulated per actor, instead of
#re-scanning every movie (with several row lookups each) for every actor.
ratings_per_actor=[]
budget_per_actor=[]
revenue_per_actor=[]
movies_per_actor=[]
pl_per_actor=[]
counts=0  #not used in this cell; kept in case another cell relies on it

#actor -> [nr_reviews, sum_ratings, nr_movies, revenue, budget, P&L]
actor_totals={actor: [0, 0, 0, 0, 0, 0] for actor in all_actors}

movie_rows=zip(
    movies["main_actors"],
    movies["movielens_number_reviews"],
    movies["movielens_avg_rating"],
    movies["revenue"],
    movies["budget"],
    movies["P_L"],
)
for movie_actors, movie_reviews, movie_rating, movie_revenue, movie_budget, movie_pl in movie_rows:
    #set(): a movie counts once for an actor even if the name is listed twice
    for actor in set(movie_actors):
        totals=actor_totals.get(actor)
        if totals is None:
            #Only actors present in all_actors are analysed
            continue
        totals[0]+=movie_reviews
        totals[1]+=movie_reviews*movie_rating
        totals[2]+=1
        totals[3]+=movie_revenue
        totals[4]+=movie_budget
        totals[5]+=movie_pl

#Emit the results in the same order as all_actors so the lists line up with it
for actor in all_actors:
    nr_reviews, sum_ratings, nr_movies, rev, bud, pl=actor_totals[actor]
    #Guard against division by zero when there are no reviews at all
    if nr_reviews==0:
        ratings_per_actor.append(0)
    else:
        ratings_per_actor.append(sum_ratings/nr_reviews)
    if nr_movies==0:
        budget_per_actor.append(0)
        revenue_per_actor.append(0)
        pl_per_actor.append(0)
        movies_per_actor.append(0)
    else:
        budget_per_actor.append(bud/nr_movies)
        revenue_per_actor.append(rev/nr_movies)
        pl_per_actor.append(pl/nr_movies)
        movies_per_actor.append(nr_movies)

In [None]:
#Start the actor analysis table with a single column holding the actor names
actor_analysis = pd.DataFrame(data=all_actors)

In [None]:
#Give the name column (created with the default label 0) a meaningful label
actor_analysis.rename(mapper={0: 'main_actor'}, axis='columns', inplace=True)

In [None]:
#Attach the per-actor figures computed earlier, one column each, then preview ten rows

actor_columns = {
    "avg_rating_movies": ratings_per_actor,
    "avg_movies_budget": budget_per_actor,
    "avg_movies_revenue": revenue_per_actor,
    "avg_movies_pl": pl_per_actor,
    "nr_movies": movies_per_actor,
}
for column_name, column_values in actor_columns.items():
    actor_analysis[column_name] = column_values

actor_analysis.head(n=10)

In [None]:
#The per-actor computation is slow, so the result is saved to a csv file right away

actor_analysis.to_csv(path_or_buf='actor_analysis.csv')

In [None]:
#Ratings view of the actor table: name, average rating and movie count only

actors_top_rt = actor_analysis[["main_actor", "avg_rating_movies", "nr_movies"]].copy()


#Keep only actors that starred in over 10 movies (so people with few movies do not bias the ranking),
#drop rows with missing values and order from highest to lowest average rating
actors_top_rt_more_10 = (
    actors_top_rt
    .loc[lambda frame: frame["nr_movies"] > 10]
    .dropna()
    .sort_values(by='avg_rating_movies', ascending=False)
)

In [None]:
#Display the filtered and sorted ratings table
actors_top_rt_more_10

In [None]:
#The initial plot for actor vs avg movie ratings
#The axes are created explicitly and handed to pandas: calling plt.figure() first and
#then DataFrame.plot() without ax= leaves a stray empty figure behind, because pandas
#opens a figure of its own when it is not given axes to draw on
fig, ax = plt.subplots(figsize=(25,10))

actors_top_rt_more_10.plot(kind='bar',x='main_actor',y='avg_rating_movies',colormap='rainbow',ax=ax)
plt.show()

In [None]:
#Fancying up the plot: title, axis labels, larger fonts and a y axis limited to 2.5-4.0
#The axes are created explicitly and passed to pandas so that no stray empty figure
#is produced (DataFrame.plot() opens its own figure when it is not given ax=)
fig, ax = plt.subplots(figsize=(25,10))

actors_top_rt_more_10.plot(kind='bar',x='main_actor',y='avg_rating_movies',colormap='tab10',ylim=(2.5,4.0),fontsize=14,title='Average Movie Ratings per Actor',legend=False,ax=ax)
ax.set(xlabel='Actors', ylabel='Average Ratings')
ax.title.set_size(30)
ax.xaxis.label.set_size(20)
ax.yaxis.label.set_size(20)
plt.show()

## Revenue vs Actors

In [None]:
#Revenue view of the actor table: name, average revenue and movie count only
actors_top_rv = actor_analysis[["main_actor", "avg_movies_revenue", "nr_movies"]].copy()


#Keep only actors that starred in over 10 movies (so people with few movies do not bias the ranking),
#drop rows with missing values and order from highest to lowest average revenue
actors_top_rv_more_10 = (
    actors_top_rv
    .loc[lambda frame: frame["nr_movies"] > 10]
    .dropna()
    .sort_values(by='avg_movies_revenue', ascending=False)
)

actors_top_rv_more_10

In [None]:
#The initial plot for Avg Revenue
#The axes are created explicitly and passed to pandas so that no stray empty figure
#is produced (DataFrame.plot() opens its own figure when it is not given ax=)

fig, ax = plt.subplots(figsize=(25,10))

actors_top_rv_more_10.plot(kind='bar',x='main_actor',y='avg_movies_revenue',colormap='summer',fontsize=14,title='Average Revenue per Movies Starred',legend=False,ax=ax)
ax.set(xlabel='Actors', ylabel='Average Revenues')
ax.title.set_size(30)
ax.xaxis.label.set_size(20)
ax.yaxis.label.set_size(20)
plt.show()

## Budget vs Actors

In [None]:
#Budget view of the actor table: name, average budget and movie count only
actors_top_bd = actor_analysis[["main_actor", "avg_movies_budget", "nr_movies"]].copy()

#Keep only actors that starred in over 10 movies (so people with few movies do not bias the ranking),
#drop rows with missing values and order from highest to lowest average budget
actors_top_bd_more_10 = (
    actors_top_bd
    .loc[lambda frame: frame["nr_movies"] > 10]
    .dropna()
    .sort_values(by='avg_movies_budget', ascending=False)
)

actors_top_bd_more_10

In [None]:
#The initial plot for Budget
#The axes are created explicitly and passed to pandas so that no stray empty figure
#is produced (DataFrame.plot() opens its own figure when it is not given ax=)

fig, ax = plt.subplots(figsize=(25,10))

actors_top_bd_more_10.plot(kind='bar',x='main_actor',y='avg_movies_budget',colormap='RdYlGn',fontsize=14,title='Average Budget per Movies Starred',legend=False,ax=ax)
ax.set(xlabel='Actors', ylabel='Average Budgets')
ax.title.set_size(30)
ax.xaxis.label.set_size(20)
ax.yaxis.label.set_size(20)
plt.show()

## P&L vs Actors

In [None]:
#P&L view of the actor table: name, average P&L and movie count only
actors_top_pl = actor_analysis[["main_actor", "avg_movies_pl", "nr_movies"]].copy()

#Keep only actors that starred in over 10 movies (so people with few movies do not bias the ranking),
#drop rows with missing values and order from highest to lowest average P&L
actors_top_pl_more_10 = (
    actors_top_pl
    .loc[lambda frame: frame["nr_movies"] > 10]
    .dropna()
    .sort_values(by='avg_movies_pl', ascending=False)
)

actors_top_pl_more_10

In [None]:
#The initial plot for P&L
#The axes are created explicitly and passed to pandas so that no stray empty figure
#is produced (DataFrame.plot() opens its own figure when it is not given ax=)

fig, ax = plt.subplots(figsize=(25,10))

actors_top_pl_more_10.plot(kind='bar',x='main_actor',y='avg_movies_pl',colormap='Pastel1',fontsize=14,title='Average P&L per Movies Starred',legend=False,ax=ax)
ax.set(xlabel='Actors', ylabel='Average P&L')
ax.title.set_size(30)
ax.xaxis.label.set_size(20)
ax.yaxis.label.set_size(20)
plt.show()

