In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import FuncFormatter
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [None]:
# Build a per-director summary table from the merged movie data.
# Columns that are not used by the summary are dropped first.
unused_columns = ['start_year', 'original_title', 'id', 'movie']
cleaned_df = merged_df.drop(columns=unused_columns)

# Aggregations applied to each director's rows: a film count, the list of
# titles, and the sum/mean of every financial column.
sum_and_mean = ['sum', 'mean']
director_aggregations = {
    'movie_id': ['count'],
    # Kept as a lambda so the flattened column is named 'primary_title_<lambda>'.
    'primary_title': lambda titles: list(titles),
    'production_budget': sum_and_mean,
    'worldwide_gross': sum_and_mean,
    'Profit:Production Ratio': sum_and_mean,
    'Profit': sum_and_mean,
}
grouped_df = cleaned_df.groupby('director_name').agg(director_aggregations)

# Flatten the two-level column index into "<column>_<statistic>" names.
grouped_df.columns = [
    f"{column}_{statistic}" for column, statistic in grouped_df.columns
]

# Display directors with at least one counted film, best average ratio first.
(
    grouped_df
    .loc[grouped_df['movie_id_count'] > 0]
    .sort_values(by=['Profit:Production Ratio_mean'], ascending=False)
)

In [None]:
# Rank directors by their mean 'Profit:Production Ratio', highest first,
# with director_name restored as a regular column for plotting.
sorted_profit = grouped_df.reset_index().sort_values(
    by='Profit:Production Ratio_mean',
    ascending=False,
)
top_directors = sorted_profit.head(10)

# Horizontal bar chart of the ten highest-ranked directors.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=top_directors,
    x='Profit:Production Ratio_mean',
    y='director_name',
    ax=ax,
)

# Title and axis labels
ax.set_title('Top 10 Directors with the Highest ROI')
ax.set_xlabel('Average ROI')
ax.set_ylabel('Director Name')

# Save before show() so the written file contains the rendered figure.
fig.savefig('top 10 directors by ROI.png')

plt.show()


In [None]:
# Scatter of production budget against worldwide gross with a fitted
# linear regression line drawn by seaborn.
plt.figure(figsize=(10, 6))
sns.regplot(x='production_budget', y='worldwide_gross', data=merged_df, scatter_kws={'alpha':0.5})
# The y-axis is worldwide gross (not profit), so the title says so too.
plt.title('Production Budget vs. Worldwide Gross with Linear Regression')
plt.xlabel('Production Budget ($)')
plt.ylabel('Worldwide Gross ($)')

# Save before show() so the written file contains the rendered figure.
plt.savefig('production vs gross regression.png')
plt.show()

In [None]:
# Split films into three budget tiers using the 33rd and 66th percentiles
# of production_budget as the tier boundaries.
low_budget_threshold = merged_df['production_budget'].quantile(0.33)
medium_budget_threshold = merged_df['production_budget'].quantile(0.66)

# Assigning categories based on these thresholds
budget_categories = pd.cut(merged_df['production_budget'], 
                           bins=[0, low_budget_threshold, medium_budget_threshold, merged_df['production_budget'].max()], 
                           labels=['Low', 'Medium', 'High'])

# Plotting Production Budget vs. Profit:Production Ratio, coloured by budget tier
plt.figure(figsize=(10, 6))
sns.scatterplot(x='production_budget', y='Profit:Production Ratio', hue=budget_categories, data=merged_df)
plt.title('Production Budget vs. Profit:Production Ratio')
plt.xlabel('Production Budget ($)')
plt.ylabel('Profit:Production Ratio')

# Highlighting the point with the maximum ratio.
# The maximum is taken over the same column that is plotted on the y-axis,
# so the highlighted marker is guaranteed to be the top point of this chart.
max_efficiency_point = merged_df.loc[merged_df['Profit:Production Ratio'].idxmax()]
plt.scatter(max_efficiency_point['production_budget'], max_efficiency_point['Profit:Production Ratio'], color='red')
plt.text(max_efficiency_point['production_budget'], max_efficiency_point['Profit:Production Ratio'], 
         f"  Budget: ${int(max_efficiency_point['production_budget'])}\n  Ratio: {max_efficiency_point['Profit:Production Ratio']:.2f}", 
         verticalalignment='top')

# Save before show() so the written file contains the rendered figure.
plt.savefig('production vs ratio scatter.png')

plt.show()

In [None]:
# Fit a single-feature linear regression of worldwide gross on production
# budget and report how much of the variance the fit explains.

# The double-bracket selection already yields a two-dimensional (n, 1) array,
# which is the feature-matrix shape scikit-learn expects.
X = merged_df[['production_budget']].values
y = merged_df['worldwide_gross'].values

model = LinearRegression().fit(X, y)

# In-sample predictions of worldwide gross for the same budgets
y_pred = model.predict(X)

# Coefficient of determination between the observed and predicted gross
r_squared = r2_score(y, y_pred)

print(f'R-squared value: {r_squared}')

In [None]:
# Overall regression plot: every film as a scatter point, with a
# regression line (no extra scatter layer) drawn on the same axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=merged_df, x='production_budget', y='worldwide_gross', ax=ax)
sns.regplot(data=merged_df, x='production_budget', y='worldwide_gross', scatter=False, ax=ax)

ax.set_title('Production Budget vs. Worldwide Gross')
ax.set_xlabel('Production Budget ($)')
ax.set_ylabel('Worldwide Gross ($)')

# Save before show() so the written file contains the rendered figure.
fig.savefig('production vs gross .png')
plt.show()

In [None]:
# Horror Regression Plot
# Flag every film whose genres string mentions 'Horror'; rows with a missing
# genres value are treated as non-horror (na=False). Re-running this cell
# simply recomputes the same column.
merged_df['Is_Horror'] = merged_df['genres'].str.contains('Horror', na=False)

# Scatter coloured by the horror flag, with one regression line fitted over
# all films (horror and non-horror together).
plt.figure(figsize=(10, 6))
sns.scatterplot(x='production_budget', y='worldwide_gross', hue = 'Is_Horror',data=merged_df)
sns.regplot(x='production_budget', y='worldwide_gross', data=merged_df, scatter=False)

plt.title('Production Budget vs. Worldwide Gross with Highlight on Horror Genre')
plt.xlabel('Production Budget ($)')
plt.ylabel('Worldwide Gross ($)')

# Use a file name of its own: 'production vs gross .png' is already written
# by the overall regression plot and would be overwritten here.
plt.savefig('production vs gross horror.png')
plt.show()