In [None]:
#Assessing Data

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
%matplotlib inline
# Hand the path to pandas instead of an already-open handle: pandas then opens
# and closes the file itself (the handle from open() was never closed), and the
# utf-8 `encoding` is what actually decodes the bytes rather than the platform
# default used by a bare open().
# NOTE(review): this absolute path only resolves on the original author's machine.
df = pd.read_csv('/home/khrystyne/tmdb-movies.csv', encoding='utf-8', engine='python')

df.head()

In [None]:
# Size of the freshly loaded frame as a (rows, columns) tuple.
df.shape

In [None]:
# Per-column dtype and non-null count summary, printed by pandas.
df.info()

In [None]:
# Total number of missing cells: null count per column, then summed over the columns.
df.isnull().sum().sum()

In [None]:
# Number of rows that repeat an earlier row in every column.
df.duplicated().sum()

In [None]:
# dtype of every column.
df.dtypes

In [None]:
#Cleaning

In [None]:
# Drop nine columns, chosen purely by their position in the loaded file.
# NOTE(review): the positions only make sense for the original column order, and
# because the drop is in place, re-running this cell removes a different set of
# columns (or fails once too few remain) -- run it exactly once per fresh load.
df.drop(df.columns[[0,1, 7, 9, 10, 11, 12, 14, 15]], axis = 1, inplace = True)

In [None]:
# Remove rows that duplicate an earlier row in every remaining column (in place).
df.drop_duplicates(inplace=True)

In [None]:
# Replace missing numeric values with their column mean.
# numeric_only=True is needed because the frame still holds text columns
# (cast, genres, director, ...): on pandas >= 2.0 a bare df.mean() tries to
# average those too and raises TypeError instead of silently skipping them.
df.fillna(df.mean(numeric_only=True), inplace=True)

In [None]:
# Fill gaps in the text columns with a single-space placeholder.
# The result is assigned back through df[...]: calling fillna(inplace=True) on
# `df.cast` operates on an intermediate Series, which emits a chained-assignment
# warning on pandas 2.x and leaves `df` unchanged once Copy-on-Write is enabled.
text_columns = ['cast', 'genres', 'director']
df[text_columns] = df[text_columns].fillna(' ')

In [None]:
# Rows whose recorded budget is exactly 0; their index labels are dropped from df further down.
# presumably 0 stands for "not reported" in this dataset -- TODO confirm
budget_zero = df[df.budget == 0]

In [None]:
# Rows whose recorded revenue is exactly 0; their index labels are dropped from df further down.
# presumably 0 stands for "not reported" in this dataset -- TODO confirm
revenue_zero = df[df.revenue == 0]

In [None]:
# Remove every zero-budget row from df in place, addressed by index label.
# NOTE: running this cell a second time raises KeyError, since the labels are already gone.
df.drop(budget_zero.index, inplace=True)

In [None]:
# Remove every zero-revenue row from df in place.
# `revenue_zero` was captured before the zero-budget rows were dropped, so rows
# that are zero in BOTH columns are no longer present in df; errors='ignore'
# skips those missing labels instead of aborting the run with a KeyError.
df.drop(revenue_zero.index, axis = 0, errors='ignore', inplace=True)

In [None]:
# Profit = revenue - budget, shown as the first column of the frame.
df['profit'] = df.revenue - df.budget
# Build the new order from "every column except profit" rather than "all but the
# last column": the positional form is only right on the first run, and on a
# re-run it duplicates `profit` and silently discards the real last column.
df = df[['profit'] + [name for name in df.columns if name != 'profit']]

In [None]:
#Exploration of data

In [None]:
def data_calc(column, data=None):
    """Show the rows holding the largest and the smallest value of ``column``.

    Parameters
    ----------
    column : str
        Name of the column to rank by.
    data : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level ``df`` is used,
        which is what this helper always read before the parameter existed.

    Returns
    -------
    pandas.DataFrame
        Two columns -- the max row first, the min row second -- each labelled
        with that row's index label; one row per column of the input frame.
    """
    frame = df if data is None else data

    # idxmax / idxmin return index *labels*, hence the .loc lookups below
    high = frame[column].idxmax()
    low = frame[column].idxmin()

    # each looked-up row is a Series; side by side they form the comparison table
    table = pd.concat([frame.loc[high].to_frame(), frame.loc[low].to_frame()], axis=1)

    return table

# Rows with the highest and the lowest profit, side by side.
data_calc('profit')

In [None]:
# Rows with the highest and the lowest budget, side by side.
data_calc('budget')

In [None]:
# Rows with the highest and the lowest revenue, side by side.
data_calc('revenue')

In [None]:
# Movies whose profit is strictly positive, renumbered 0..n-1
made_money = df['profit'] > 0
profit_data = df[made_money].reset_index(drop=True)
# peek at the first rows to confirm the filter and the fresh index
profit_data.head()

In [None]:
# Bar chart of the mean profit per release year, profitable movies only.
# The trailing ';' keeps the Axes repr out of the cell output.
profit_data.groupby('release_year').profit.mean().plot(kind='bar');

In [None]:
# Movies whose profit is zero or negative, renumbered 0..n-1
lost_money = df['profit'] <= 0
loss_data = df[lost_money].reset_index(drop=True)
# peek at the first rows to confirm the filter and the fresh index
loss_data.head()

In [None]:
# Bar chart of the mean profit per release year, movies with profit <= 0 only.
# The trailing ';' keeps the Axes repr out of the cell output.
loss_data.groupby('release_year').profit.mean().plot(kind='bar');

In [None]:
# Number of movies with a strictly positive profit.
p = len(profit_data)
p

In [None]:
# Every row of df that is not in profit_data, i.e. the movies that did not turn a profit.
loss = len(df) - len(profit_data)
loss

In [None]:
# Summed profit of all movies released in each year
year_prof = df.groupby('release_year')['profit'].sum()

# 12 x 6 inch canvas at 130 dpi, driven through the Axes object
fig, ax = plt.subplots(figsize=(12, 6), dpi=130)

# the Series index (release year) supplies the x values
ax.plot(year_prof, label='Profits')

ax.set_xlabel('Release Year', fontsize=12)
ax.set_ylabel('Profits earned by Movies', fontsize=12)
ax.set_title('Movies Profits by Year of their release.')
ax.legend(loc='upper center')

plt.show()


In [None]:
# Distribution of the average rating over 20 bins
ratings = df['vote_average']
plt.hist(ratings, bins=20, color='blue')
plt.xlabel("Vote Average")
plt.show()

In [None]:
# Mean profit of the movies sharing each distinct average rating
vote_prof = df.groupby('vote_average')['profit'].mean()

# 12 x 6 inch canvas at 130 dpi, driven through the Axes object
fig, ax = plt.subplots(figsize=(12, 6), dpi=130)

# the Series index (vote average) supplies the x values
ax.plot(vote_prof, label='Vote Average')

ax.set_xlabel('Vote Average', fontsize=12)
ax.set_ylabel('Profits earned by Movies', fontsize=12)
ax.set_title('Correlation between Profit and Average Vote.')
ax.legend(loc='upper left')

plt.show()

In [None]:
# One dot per movie: rating on x, profit on y; no marker outline, slight transparency
fig, ax = plt.subplots()
ax.scatter(df['vote_average'], df['profit'], lw=0, alpha=0.8)
ax.set_xlabel('Rating')
ax.set_ylabel('profit')
plt.show()

In [None]:
# Correlation coefficient between average rating and profit (Series.corr with its default method).
df['vote_average'].corr(df['profit'])

In [None]:
# Mean profit and mean budget of the movies released in each year.
# Several columns are selected with a list: the bare tuple form
# ['profit', 'budget'] after groupby raises on pandas >= 2.0.
budget_prof = df.groupby('release_year')[['profit', 'budget']].mean()

# 12 x 6 inch canvas at 130 dpi
fig, ax = plt.subplots(figsize=(12, 6), dpi=130)

# Draw one line per column so each gets its own legend entry; passing the whole
# frame with label='Profit' named both the profit and the budget line "Profit".
ax.plot(budget_prof['profit'], label='Profit')
ax.plot(budget_prof['budget'], label='Budget')

# The x values are the groupby key, i.e. the release year (was 'Budget, Profit').
ax.set_xlabel('Release Year', fontsize=12)
ax.set_ylabel('Budgets and Profits of Movies', fontsize=12)
ax.set_title('Correlation between Profit and Budget.')

ax.legend()

plt.show()

In [None]:
# The 20 largest profits, still keyed by df's index labels
prof_high = df['profit'].sort_values(ascending=False)[:20]

# look up the matching titles in one label-based selection, in the same order
titles = df.loc[prof_high.index, 'original_title'].tolist()
profits = prof_high.tolist()

# titles become the index so they label the bars
high_profits = pd.DataFrame({'Titles': titles, 'Profits': profits}).set_index('Titles')

high_profits.plot(kind='bar', figsize=(8, 8))
plt.title('Top 20 highest grossing movies (1960 - 2015) ');
plt.ylabel('Profits in billions ($)');

In [None]:
# The 20 largest budgets, still keyed by df's index labels
budget_high = df['budget'].sort_values(ascending=False)[:20]

# look up the matching titles in one label-based selection, in the same order
titles = df.loc[budget_high.index, 'original_title'].tolist()
budgets = budget_high.tolist()

# The 'Budget' column must hold `budgets`. It used to be filled from `profits`,
# the list built for the top-profit chart, so the bars showed profit figures
# next to the titles of the most expensive movies.
high_budget = pd.DataFrame({'Titles': titles, 'Budget': budgets}).set_index('Titles')

high_budget.plot(kind='bar', figsize=(8, 8))
plt.title('Top 20 movies with the highest Budgets (1960 - 2015) ');
plt.ylabel('Budget in billions ($)');

In [None]:
# Scatter plot with one dot per movie: budget on x, profit on y.
df.plot(x = 'budget', y = 'profit', kind = 'scatter')

In [None]:
# Correlation coefficient between budget and profit (Series.corr with its default method).
df['budget'].corr(df['profit'])

In [None]:
# Profit (blue) and budget (red) of every movie against its release year,
# drawn in that order so the budget dots sit on top
for column, colour in (('profit', 'b'), ('budget', 'r')):
    plt.scatter(df['release_year'], df[column], color=colour, label=column, alpha=0.5)

plt.title('Correlation between Budget and Profit')
plt.xlabel("Year")
plt.ylabel("Dollars")
plt.legend(loc='upper center')
plt.show()

In [None]:
def data_extract(col_name, data=None):
    """Count how often each '|'-separated value occurs in a text column.

    Parameters
    ----------
    col_name : str
        Column whose cells hold values joined by '|' (e.g. 'A|B|C').
    data : pandas.DataFrame, optional
        Frame to read from. When omitted, the notebook-level ``profit_data``
        is used, which is what this helper always read before the parameter
        existed.

    Returns
    -------
    pandas.Series
        Occurrence count per distinct value, most frequent first.
    """
    frame = profit_data if data is None else data

    # glue every cell into one long '|'-separated string, then cut it apart again
    joined = frame[col_name].str.cat(sep='|')

    # Skip empty / whitespace-only tokens: missing cells were filled with ' '
    # during cleaning and would otherwise be counted as a real value.
    tokens = [token for token in joined.split('|') if token.strip()]

    count = pd.Series(tokens).value_counts(ascending=False)
    return count


In [None]:
# How often each genre occurs (data_extract reads profit_data unless told otherwise); top five shown.
genre_count = data_extract('genres')
genre_count.head()

In [None]:
# Ascending order: barh draws the first entry at the bottom, so the most
# frequent genre ends up as the top bar
genre_count = genre_count.sort_values(ascending=True)

# horizontal bars with 15pt tick labels
ax = genre_count.plot.barh(color='#007482', fontsize=15)

# 12 x 10 inch canvas
ax.figure.set_size_inches(12, 10)

ax.set_title('The Most filmed genres')
ax.set_xlabel('Number of Movies', color='b', fontsize='18')

plt.show()


In [None]:
# How often each cast member occurs (data_extract reads profit_data unless told otherwise); top five shown.
cast_count = data_extract('cast')
cast_count.head()

In [None]:
# Every cast credit across ALL movies in df, one name per list entry
all_cast = df['cast'].str.cat(sep='|')
actor = all_cast.split('|')

# ten most frequent names, in ascending order so the busiest actor is the top bar
appearances = pd.Series(actor).value_counts(ascending=True)
table = appearances.tail(10)

graph = table.plot.barh()
graph.set(title='List of actors who appeared in the most movies', xlabel='Number of time casted.')

In [None]:
# How often each director occurs (data_extract reads profit_data unless told otherwise); top five shown.
director_count = data_extract('director')
director_count.head()

In [None]:
# Every director credit across ALL movies in df, one name per list entry
all_directors = df['director'].str.cat(sep='|')
directors = all_directors.split('|')

# ten most frequent names, in ascending order so the busiest director is the top bar
credits = pd.Series(directors).value_counts(ascending=True)
table = credits.tail(10)

graph = table.plot.barh()
graph.set(title='List of most popular directors', xlabel='Number of movies directed.')