# In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
import warnings
warnings.filterwarnings('ignore')

# --- Load the raw spreadsheet ------------------------------------------------
df = pd.read_excel('superhero_box_office.xlsx')

# --- Clean: force the money/ratio columns to numeric -------------------------
# Any cell that cannot be parsed as a number becomes NaN instead of raising.
numeric_cols = [
    'Domestic Revenue',
    'International Revenue',
    'Worldwide Revenue',
    'Total Budget',
    'Production Budget',
    'Opening Weekend',
    'Percent of Total Gross',
    'Legs',
    'Domestic Share Percentage',
]
df = df.assign(**{
    name: pd.to_numeric(df[name], errors='coerce') for name in numeric_cols
})

# --- Derived metrics ----------------------------------------------------------
# ROI = (worldwide revenue - total budget) / total budget.
# A zero budget produces +/-inf; those are mapped to NaN so the row is
# discarded by the dropna step at the end of this section.
total_budget = df['Total Budget']
raw_roi = df['Worldwide Revenue'].sub(total_budget).div(total_budget)
df['ROI'] = raw_roi.replace([np.inf, -np.inf], np.nan)

# Complement of 'Percent of Total Gross' on a 0-100 scale
# (presumably the opening-weekend share of the total gross - confirm against the data dictionary).
df['Interest Post Opening Weekend'] = df['Percent of Total Gross'].rsub(100)

# Complement of the domestic share on a 0-100 scale.
df['International Box Office Share'] = df['Domestic Share Percentage'].rsub(100)

# Year component of the release date; unparseable dates become NaN.
release_dates = pd.to_datetime(df['Release Date'], errors='coerce')
df['Release Year'] = release_dates.dt.year

# --- Keep only rows usable by the analyses below ------------------------------
required_cols = ['ROI', 'Release Year', 'Lead Sex']
df = df.dropna(subset=required_cols)

# Plotly configuration shared by the exported HTML charts
config = dict(scrollZoom=False, responsive=True, displayModeBar=True)

# Styling inspired by EstimatedSuperheroMovieProfit.html
# Every franchise palette is two signature colours followed by the same two accents.
_accent_colors = ['cyan', 'magenta']
_signature_colors = {
    'MCU': ['red', 'orange'],
    'DC': ['blue', 'navy'],
    'SONY': ['black', 'gray'],
    'FOX': ['yellow', 'gold'],
}
colors = {
    franchise_name: signature + _accent_colors
    for franchise_name, signature in _signature_colors.items()
}

# Plotly marker symbol used for each franchise's data points
markers = dict(MCU='circle', DC='square', SONY='triangle-up', FOX='diamond')

# Grouped bar charts for Lead Sex vs metrics
# Maps each DataFrame column to the human-readable label used on its chart.
metric_labels = {
    'ROI': 'ROI',
    'Domestic Share Percentage': 'Domestic Box Office Share (%)',
    'International Box Office Share': 'International Box Office Share (%)',
    'Legs': 'Movie Legs',
    'Interest Post Opening Weekend': 'Interest Post Opening Weekend (%)',
}
metrics = list(metric_labels)
metric_titles = list(metric_labels.values())

# One HTML file per metric: bar height is the group mean, the error bar the group std.
for metric, title in metric_labels.items():
    by_lead_sex = df.groupby('Lead Sex')[metric]
    summary = pd.DataFrame({
        'mean': by_lead_sex.mean(),
        'std': by_lead_sex.std(),
    }).reset_index()

    bars = go.Bar(
        x=summary['Lead Sex'],
        y=summary['mean'],
        error_y=dict(type='data', array=summary['std'], visible=True),
        marker_color='rgb(26, 118, 255)',
        name=title,
    )
    fig = go.Figure(data=[bars])
    fig.update_layout(
        title=dict(
            text=f'{title} by Lead Sex (Mean ± Std)',
            y=0.95,
            x=0.5,
            xanchor='center',
            yanchor='top',
            pad=dict(t=10),
        ),
        xaxis_title='Lead Sex',
        yaxis_title=title,
        margin=dict(l=50, r=50, t=50, b=50),
        height=600,
        width=1400,
        transition_duration=0,
    )
    fig.write_html(f'barchart_{metric}.html', config=config)

# Linear Regression: ROI vs Release Year by Lead Sex
# Accumulates one summary row (slope, intercept, R-squared) per fitted model.
regression_results = []

# The hover text does not depend on the group, so it is assembled once.
lead_sex_hover = '<br>'.join([
    'Title: %{customdata[0]}',
    'Year: %{x}',
    'ROI: %{y:.2f}',
    'Lead Sex: %{customdata[1]}<extra></extra>',
])

for lead_sex in df['Lead Sex'].unique():
    subset = df[df['Lead Sex'] == lead_sex]
    # Groups with fewer than two movies are skipped: no trend can be fitted.
    if len(subset) <= 1:
        continue

    years = subset[['Release Year']].values  # 2-D feature matrix (one column)
    roi = subset['ROI'].values
    model = LinearRegression().fit(years, roi)
    regression_results.append({
        'Category': 'Lead Sex',
        'Group': lead_sex,
        'Coefficient': model.coef_[0],
        'Intercept': model.intercept_,
        'R-squared': model.score(years, roi),
    })

    # Raw observations, with title and lead sex exposed to the hover template.
    points = go.Scattergl(
        x=subset['Release Year'],
        y=subset['ROI'],
        mode='markers',
        name='Data',
        hovertemplate=lead_sex_hover,
        customdata=subset[['Title', 'Lead Sex']].values,
    )
    # Fitted values evaluated at the observed release years.
    trend = go.Scattergl(
        x=subset['Release Year'],
        y=model.predict(years),
        mode='lines',
        name='Regression Line',
        line=dict(color='red'),
    )

    fig = go.Figure(data=[points, trend])
    fig.update_layout(
        title=dict(
            text=f'ROI vs Release Year (Lead Sex: {lead_sex})',
            y=0.95,
            x=0.5,
            xanchor='center',
            yanchor='top',
            pad=dict(t=10),
        ),
        xaxis_title='Release Year',
        yaxis_title='ROI',
        margin=dict(l=50, r=50, t=50, b=50),
        height=600,
        width=1400,
        transition_duration=0,
    )
    fig.write_html(f'regression_lead_sex_{lead_sex}.html', config=config)

# Combined Linear Regression: ROI vs Release Year (Franchises and Series)
# A single figure holds every franchise-level and series-level trace; the
# bookkeeping dicts below record which trace indices belong to which view.
fig_regression = go.Figure()
franchises = ['MCU', 'DC', 'FOX', 'SONY']

# franchise -> indices (into fig_regression.data) of its franchise-level traces
franchise_traces = {franchise: [] for franchise in franchises}

# franchise -> {series key -> indices of that series' traces}
series_traces = {franchise: {} for franchise in franchises}

# franchise -> sorted series names.
# Missing Series values are dropped before sorting: a NaN mixed with strings
# makes sorted() raise TypeError, and a NaN series can never match any row
# in an equality filter anyway.
unique_series = {
    franchise: sorted(
        df.loc[df['Franchise'] == franchise, 'Series'].dropna().unique()
    )
    for franchise in franchises
}

# Add franchise regression traces
# Hover text is identical for every franchise, so it is assembled once.
franchise_hover = '<br>'.join([
    'Title: %{customdata[0]}',
    'Year: %{x}',
    'ROI: %{y:.2f}',
    'Franchise: %{customdata[1]}',
    'Series: %{customdata[2]}<extra></extra>',
])

for franchise in franchises:
    subset = df[df['Franchise'] == franchise]
    # Franchises with fewer than two movies are skipped: no trend can be fitted.
    if len(subset) < 2:
        continue

    years = subset[['Release Year']].values  # 2-D feature matrix (one column)
    roi = subset['ROI'].values
    model = LinearRegression().fit(years, roi)
    regression_results.append({
        'Category': 'Franchise',
        'Group': franchise,
        'Coefficient': model.coef_[0],
        'Intercept': model.intercept_,
        'R-squared': model.score(years, roi),
    })

    # First palette entry and the franchise's marker symbol (with fallbacks).
    color = colors.get(franchise, ['grey'])[0]
    marker = markers.get(franchise, 'circle')

    points = go.Scattergl(
        x=subset['Release Year'],
        y=subset['ROI'],
        mode='markers',
        name=f'{franchise} Data',
        hovertemplate=franchise_hover,
        customdata=subset[['Title', 'Franchise', 'Series']].values,
        marker=dict(symbol=marker, color=color),
        visible=True,
    )
    trend = go.Scattergl(
        x=subset['Release Year'],
        y=model.predict(years),
        mode='lines',
        name=f'{franchise} Regression',
        line=dict(color=color, dash='solid'),
        visible=True,
    )

    # Record the positions the two traces occupy in fig_regression.data.
    first_index = len(fig_regression.data)
    fig_regression.add_trace(points)
    fig_regression.add_trace(trend)
    franchise_traces[franchise].extend([first_index, first_index + 1])

# Add series regression traces with composite keys
# Hover text is identical for every series, so it is assembled once.
series_hover = '<br>'.join([
    'Title: %{customdata[0]}',
    'Year: %{x}',
    'ROI: %{y:.2f}',
    'Franchise: %{customdata[1]}',
    'Series: %{customdata[2]}<extra></extra>',
])

for franchise in franchises:
    df_franchise = df[df['Franchise'] == franchise]
    franchise_colors = colors.get(franchise, ['grey', 'darkgrey', 'cyan', 'magenta'])
    marker = markers.get(franchise, 'circle')

    for i, series in enumerate(unique_series[franchise]):
        subset = df_franchise[df_franchise['Series'] == series]
        # Series with fewer than two movies are skipped: no trend can be fitted.
        if len(subset) < 2:
            continue

        # Composite key keeps same-named series in different franchises apart.
        series_key = f'{franchise}_{series}'

        years = subset[['Release Year']].values  # 2-D feature matrix (one column)
        roi = subset['ROI'].values
        model = LinearRegression().fit(years, roi)
        regression_results.append({
            'Category': 'Series',
            'Group': series_key,
            'Coefficient': model.coef_[0],
            'Intercept': model.intercept_,
            'R-squared': model.score(years, roi),
        })

        # Cycle through the franchise palette by the series' position.
        color = franchise_colors[i % len(franchise_colors)]

        # Series traces start hidden; the dropdown menus toggle them on.
        points = go.Scattergl(
            x=subset['Release Year'],
            y=subset['ROI'],
            mode='markers',
            name=f'{series} Data',
            hovertemplate=series_hover,
            customdata=subset[['Title', 'Franchise', 'Series']].values,
            marker=dict(symbol=marker, color=color),
            visible=False,
        )
        trend = go.Scattergl(
            x=subset['Release Year'],
            y=model.predict(years),
            mode='lines',
            name=f'{series} Regression',
            line=dict(color=color, dash='solid'),
            visible=False,
        )

        # Record the positions the two traces occupy in fig_regression.data.
        first_index = len(fig_regression.data)
        fig_regression.add_trace(points)
        fig_regression.add_trace(trend)
        series_traces[franchise][series_key] = [first_index, first_index + 1]

# Create franchise dropdown
def _visibility_mask(visible_indices, n_traces):
    """Return one boolean per trace: True where the trace index is in *visible_indices*."""
    shown = set(visible_indices)
    return [i in shown for i in range(n_traces)]


def _series_label(franchise, series_key):
    """Return the series name from a '<franchise>_<series>' composite key.

    Only the leading '<franchise>_' prefix is removed, so series names that
    themselves contain underscores are kept whole.
    """
    return series_key[len(franchise) + 1:]


n_traces = len(fig_regression.data)

# Mask showing every franchise-level trace and hiding every series-level trace.
all_franchise_mask = _visibility_mask(
    (index for indices in franchise_traces.values() for index in indices),
    n_traces,
)

# 'ALL' shows all franchise-level traces and hides the second (series) menu,
# replacing its buttons with a single placeholder entry.
franchise_dropdown = [
    dict(
        label='ALL',
        method='update',
        args=[
            {'visible': list(all_franchise_mask)},
            {
                'updatemenus[1].buttons': [
                    dict(
                        label='None',
                        method='update',
                        args=[{'visible': list(all_franchise_mask)}],
                    )
                ],
                'updatemenus[1].visible': False,
            },
        ],
    )
]

# One button per franchise, in the order of `franchises`. Selecting a franchise
# shows only its franchise-level traces and repopulates the series menu with an
# 'ALL' entry (back to the franchise view) plus one entry per fitted series.
for franchise in franchises:
    franchise_mask = _visibility_mask(franchise_traces[franchise], n_traces)

    series_buttons = [
        dict(label='ALL', method='update', args=[{'visible': list(franchise_mask)}])
    ]
    series_buttons.extend(
        dict(
            label=_series_label(franchise, series_key),
            method='update',
            args=[{'visible': _visibility_mask(trace_indices, n_traces)}],
        )
        for series_key, trace_indices in series_traces[franchise].items()
    )

    franchise_dropdown.append(
        dict(
            label=franchise,
            method='update',
            args=[
                {'visible': list(franchise_mask)},
                {
                    'updatemenus[1].buttons': series_buttons,
                    'updatemenus[1].visible': True,
                },
            ],
        )
    )

# Default series dropdown (hidden initially)
# Its single placeholder button shows the franchise-level traces only.
franchise_level_indices = {
    index for indices in franchise_traces.values() for index in indices
}
series_dropdown = [
    {
        'label': 'None',
        'method': 'update',
        'args': [
            {
                'visible': [
                    i in franchise_level_indices
                    for i in range(len(fig_regression.data))
                ]
            }
        ],
    }
]

# Update layout for regression chart
# NOTE: the in-figure title ('ROI vs Release Year by Franchise and Series',
# centred at y=0.95) is intentionally left disabled and the top margin is 0;
# presumably the heading of the wrapping HTML page serves as the title.

# First menu: franchise selector (always visible).
franchise_menu = dict(
    buttons=franchise_dropdown,
    direction='down',
    showactive=True,
    x=0.35,
    xanchor='left',
    y=1.08,
    yanchor='top',
    active=0,
)
# Second menu: series selector, hidden until a single franchise is chosen.
series_menu = dict(
    buttons=series_dropdown,
    direction='down',
    showactive=True,
    x=0.45,
    xanchor='left',
    y=1.08,
    yanchor='top',
    active=0,
    visible=False,
)

fig_regression.update_layout(
    xaxis_title='Release Year',
    yaxis_title='ROI',
    margin=dict(l=50, r=50, t=0, b=50),
    height=600,
    width=1400,
    transition_duration=0,
    legend=dict(orientation='h', yanchor='top', y=0.99, xanchor='left', x=0.01),
    # 'closest' highlights the individual point under the cursor.
    hovermode='closest',
    updatemenus=[franchise_menu, series_menu],
)

# Save regression chart with Tailwind styling
# The figure is rendered as an HTML fragment (plotly.js loaded from the CDN)
# and embedded in a hand-written page. The shared `config` is passed here as
# well so this chart gets the same scroll-zoom / responsive / mode-bar settings
# as the charts exported with write_html.
plotly_html = fig_regression.to_html(full_html=False, include_plotlyjs='cdn', config=config)

# Page template; doubled braces are literal CSS braces inside the f-string.
custom_html = f"""
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Superhero Movie ROI Linear Regression</title>
    <script src="https://cdn.tailwindcss.com"></script>
    <style>
        body {{ font-family: 'Inter', sans-serif; }}
        .back-button {{ transition: background-color 0.2s, transform 0.2s; }}
        .back-button:hover {{ background-color: #2563eb; transform: translateY(-2px); }}
        .chart-container {{ width: 100%; max-width: 1400px; margin: 0 auto; }}
    </style>
</head>
<body class="bg-gray-100">
    <header class="bg-gradient-to-r from-blue-600 to-indigo-600 text-white py-3">
        <div class="container mx-auto px-4 text-center">
            <h1 class="text-3xl md:text-4xl font-bold mb-2">Superhero Movie ROI Regression Analysis</h1>
            <p class="text-lg md:text-lg">Interactive chart exploring ROI trends for superhero movies (2025 USD)</p>
        </div>
    </header>
    <main class="container mx-auto px-4 py-6">
        <section class="mb-6">
            <a href="index.html" class="back-button inline-block bg-blue-600 text-white font-semibold py-2 px-4 rounded-lg shadow-md hover:shadow-lg">
                Back to Index
            </a>
        </section>
        <section class="bg-white p-8 rounded-lg shadow-md mb-12">
            <p class="text-gray-700 mb-6">
                This interactive chart displays ROI regression trends by franchise and series. Hover over points to see movie details. Use the dropdown menus to filter by franchise or series.
            </p>
            <div class="chart-container">
                {plotly_html}
            </div>
        </section>
    </main>
    <footer class="bg-gray-800 text-white py-6">
        <div class="container mx-auto px-4 text-center">
            <p>Licensed under <a href="https://creativecommons.org/licenses/by-nc/4.0/" class="underline hover:text-blue-300" target="_blank">Creative Commons Attribution-NonCommercial 4.0</a></p>
            <p class="mt-2">Created by Marcus Wesley | Last Updated 8/20/2025 | <a href="https://www.linkedin.com/in/marcus-wesley15/" class="underline hover:text-blue-300" target="_blank">LinkedIn</a></p>
        </div>
    </footer>
</body>
</html>
"""
# Written as UTF-8 to match the charset declared in the page's <meta> tag.
with open('SuperheroMovieROILinearRegression.html', 'w', encoding='utf-8') as f:
    f.write(custom_html)