In [1]:
# Names of the numeric columns used by the plots in this notebook.
numeric_cols = [
    'budget',
    'popularity',
    'revenue',
    'runtime',
    'vote_average',
    'vote_count',
]

In [None]:
# Histogram of every column listed in `numeric_cols`, one panel per column,
# with the column mean drawn as a dashed red line.
#
# The grid shape is derived from len(numeric_cols) instead of being fixed at
# 2x3, so adding a column no longer indexes past the end of `axes` and
# removing one no longer leaves an empty framed panel behind.
hist_grid_cols = 3
hist_grid_rows = max(1, -(-len(numeric_cols) // hist_grid_cols))  # ceiling division
fig, axes = plt.subplots(
    hist_grid_rows,
    hist_grid_cols,
    figsize=(5 * hist_grid_cols, 5 * hist_grid_rows),  # (15, 10) for six columns
    squeeze=False,  # always hand back a 2-D array, even for a single row
)
axes = axes.flatten()  # Flatten for easy iteration

for ax, col in zip(axes, numeric_cols):
    # Drop missing values up front so the binning does not depend on how the
    # installed matplotlib/numpy versions treat NaN.
    values = df1[col].dropna()

    ax.hist(values, bins=30, color='royalblue', edgecolor='black', alpha=0.7)
    ax.set_title(f"Distribution of {col}", fontsize=14)
    ax.set_xlabel(col, fontsize=12)
    ax.set_ylabel("Frequency", fontsize=12)

    # Mark the mean; the legend entry carries its numeric value.
    mean_value = values.mean()
    ax.axvline(mean_value, color='red', linestyle='dashed', linewidth=2, label=f'Mean: {mean_value:.2f}')
    ax.legend()

# Hide any panels left over when the grid has more slots than columns.
for unused_ax in axes[len(numeric_cols):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Budget vs. revenue scatter plot, with points coloured by popularity.
budget_fig, budget_ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df1,
    x='budget',
    y='revenue',
    hue='popularity',
    palette='coolwarm',
    alpha=0.7,
    edgecolor='black',
    ax=budget_ax,
)
budget_ax.set_title("Budget vs. Revenue", fontsize=16)
budget_ax.set_xlabel("Budget ($)", fontsize=12)
budget_ax.set_ylabel("Revenue ($)", fontsize=12)
# Anchor the legend at the axes' upper-right corner.
budget_ax.legend(title="Popularity", bbox_to_anchor=(1, 1))
budget_ax.grid(True, linestyle='--', alpha=0.5)
plt.show()

In [None]:
# Runtime vs. revenue scatter plot, with points coloured by vote average.
runtime_fig, runtime_ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df1,
    x='runtime',
    y='revenue',
    hue='vote_average',
    palette='viridis',
    alpha=0.7,
    edgecolor='black',
    ax=runtime_ax,
)
runtime_ax.set_title("Runtime vs. Revenue", fontsize=16)
runtime_ax.set_xlabel("Runtime (minutes)", fontsize=12)
runtime_ax.set_ylabel("Revenue ($)", fontsize=12)
# Anchor the legend at the axes' upper-right corner.
runtime_ax.legend(title="Vote Average", bbox_to_anchor=(1, 1))
runtime_ax.grid(True, linestyle='--', alpha=0.5)
plt.show()

In [None]:
# Popularity vs. vote count scatter plot, with points coloured by revenue.
popularity_fig, popularity_ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df1,
    x='popularity',
    y='vote_count',
    hue='revenue',
    palette='magma',
    alpha=0.7,
    edgecolor='black',
    ax=popularity_ax,
)
popularity_ax.set_title("Popularity vs. Vote Count", fontsize=16)
popularity_ax.set_xlabel("Popularity Score", fontsize=12)
popularity_ax.set_ylabel("Vote Count", fontsize=12)
# Anchor the legend at the axes' upper-right corner.
popularity_ax.legend(title="Revenue", bbox_to_anchor=(1, 1))
popularity_ax.grid(True, linestyle='--', alpha=0.5)
plt.show()

In [None]:
# Box plot: vote average per production company, split by revenue tercile.
#
# NOTE: this cell rebinds `df1` (rows with zero/missing revenue are removed and
# the index is reset) and adds a `revenue_category` column to it.

# Treat a revenue of 0 as missing, then keep only rows that still have a value
# so pd.qcut() bins real revenue figures only.
revenue_zero_as_nan = df1['revenue'].replace(0, np.nan)
df1['revenue'] = revenue_zero_as_nan
df1 = df1[df1['revenue'].notna()].reset_index(drop=True)

# Three equal-frequency revenue bins. No `duplicates=` option is passed, so
# pd.qcut() still raises if two quantile edges coincide.
revenue_levels = ["Low", "Medium", "High"]
df1['revenue_category'] = pd.qcut(df1['revenue'], 3, labels=revenue_levels)

# Restrict to the 10 most frequent `production_companies` values.
top_production_companies = df1['production_companies'].value_counts().head(10).index
is_top_company = df1['production_companies'].isin(top_production_companies)
df_filtered = df1.loc[is_top_company]

# Plot
company_fig, company_ax = plt.subplots(figsize=(14, 6))
sns.boxplot(
    data=df_filtered,
    x='production_companies',
    y='vote_average',
    hue='revenue_category',
    palette="viridis",
    ax=company_ax,
)

# Tilt the category labels so long names stay legible.
plt.xticks(rotation=45, ha="right")
company_ax.set_title("Vote Average Distribution by Production Company (Grouped by Revenue)", fontsize=14)
company_ax.set_xlabel("Production Company", fontsize=12)
company_ax.set_ylabel("Vote Average", fontsize=12)
company_ax.legend(title="Revenue Level")

plt.show()

In [None]:
# Box plot: runtime per genre, split by revenue tercile.

# Three equal-frequency revenue bins, stored on df1 as `revenue_category`.
# No `duplicates=` option is passed, so pd.qcut() raises if two quantile
# edges coincide.
revenue_levels = ["Low", "Medium", "High"]
df1['revenue_category'] = pd.qcut(df1['revenue'], 3, labels=revenue_levels)

# Restrict to the 10 most frequent `genres` values for clarity.
top_genres = df1['genres'].value_counts().head(10).index
is_top_genre = df1['genres'].isin(top_genres)
df_filtered = df1.loc[is_top_genre]

genre_fig, genre_ax = plt.subplots(figsize=(14, 6))
sns.boxplot(
    data=df_filtered,
    x='genres',
    y='runtime',
    hue='revenue_category',
    palette="viridis",
    ax=genre_ax,
)

# Tilt the category labels so long names stay legible.
plt.xticks(rotation=45, ha="right")
genre_ax.set_title("Runtime Distribution by Genre (Grouped by Revenue)", fontsize=14)
genre_ax.set_xlabel("Genre", fontsize=12)
genre_ax.set_ylabel("Runtime (minutes)", fontsize=12)
genre_ax.legend(title="Revenue Level")

plt.show()
# Report the (rows, columns) of df1 after the figure.
print(df1.shape)

In [None]:
# Box plot: revenue per original language, split by revenue tercile.
#
# NOTE: this cell rebinds `df1` (rows with zero/missing revenue are removed and
# the index is reset) and adds a `revenue_category` column to it.
# NOTE(review): the hue is computed from the same `revenue` column that is on
# the y-axis, so the three hue groups cover disjoint revenue ranges by
# construction - confirm this split is what the figure is meant to show.

# Treat a revenue of 0 as missing, then keep only rows that still have a value
# so pd.qcut() bins real revenue figures only.
revenue_zero_as_nan = df1['revenue'].replace(0, np.nan)
df1['revenue'] = revenue_zero_as_nan
df1 = df1[df1['revenue'].notna()].reset_index(drop=True)

# Three equal-frequency revenue bins. No `duplicates=` option is passed, so
# pd.qcut() still raises if two quantile edges coincide.
revenue_levels = ["Low", "Medium", "High"]
df1['revenue_category'] = pd.qcut(df1['revenue'], 3, labels=revenue_levels)

# Restrict to the 10 most frequent `original_language` values for clarity.
top_languages = df1['original_language'].value_counts().head(10).index
is_top_language = df1['original_language'].isin(top_languages)
df_filtered = df1.loc[is_top_language]

language_fig, language_ax = plt.subplots(figsize=(14, 6))
sns.boxplot(
    data=df_filtered,
    x='original_language',
    y='revenue',
    hue='revenue_category',
    palette="viridis",
    ax=language_ax,
)

# Tilt the category labels to match the other box plots.
plt.xticks(rotation=45, ha="right")
language_ax.set_title("Revenue Distribution by Original Language (Grouped by Revenue)", fontsize=14)
language_ax.set_xlabel("Original Language", fontsize=12)
language_ax.set_ylabel("Revenue", fontsize=12)
language_ax.legend(title="Revenue Level")

plt.show()

In [None]:
# Pair plot: every numeric feature against every other, coloured by revenue tercile.
#
# NOTE: this cell rebinds `df1` (rows with zero/missing revenue are removed and
# the index is reset) and adds a `revenue_category` column to it.

# Treat a revenue of 0 as missing, then keep only rows that still have a value
# so pd.qcut() bins real revenue figures only.
revenue_zero_as_nan = df1['revenue'].replace(0, np.nan)
df1['revenue'] = revenue_zero_as_nan
df1 = df1[df1['revenue'].notna()].reset_index(drop=True)

# Three equal-frequency revenue bins. No `duplicates=` option is passed, so
# pd.qcut() still raises if two quantile edges coincide.
revenue_levels = ["Low", "Medium", "High"]
df1['revenue_category'] = pd.qcut(df1['revenue'], 3, labels=revenue_levels)

# Numeric columns to compare pairwise.
numeric_features = [
    'budget',
    'popularity',
    'revenue',
    'runtime',
    'vote_average',
    'vote_count',
]

# Keep only rows that received a revenue category before plotting.
has_revenue_category = df1['revenue_category'].notna()
df_filtered = df1[has_revenue_category]

# Scatter panels off the diagonal, KDE curves on it.
sns.pairplot(
    data=df_filtered,
    vars=numeric_features,
    hue='revenue_category',
    palette='viridis',
    diag_kind='kde',
)

plt.show()


In [None]:
# Correlation heat map of the numeric features.
corr_columns = ['budget', 'popularity', 'revenue', 'runtime', 'vote_average', 'vote_count']
df1_corr = df1[corr_columns].corr()

# Draw on an explicit, sized figure so the annotated cells stay readable.
corr_fig, corr_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    df1_corr,
    annot=True,
    fmt=".2f",        # two decimals in each annotated cell
    cmap="coolwarm",  # diverging palette: negative and positive values get distinct hues
    vmin=-1,          # pin the colour scale to the full correlation range
    vmax=1,           # instead of the range that happens to occur in the data
    ax=corr_ax,
)
corr_ax.set_title("Correlation Between Numeric Features", fontsize=14)
# tight_layout() returns None, so the cell no longer ends on the Axes object
# that sns.heatmap() returns and no stray repr is shown as cell output.
plt.tight_layout()