In [22]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
import numpy as np

# --- LOAD DATA ---
# Read the homicide CSV into `df`. The file is decoded as ISO-8859-1, and
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
file_path = 'SHR65_23.csv'
try:
    df = pd.read_csv(file_path, encoding='ISO-8859-1', low_memory=False)
except FileNotFoundError:
    print("‚ö†Ô∏è File not found.")
    # Re-raise instead of swallowing the error: everything below needs `df`,
    # so continuing would either crash with a confusing NameError or silently
    # reuse a stale `df` left in the kernel by an earlier run.
    raise
else:
    print("‚úÖ Dataset Loaded!")


# We define a list of terms that mean "Anonymous" or "Hard to Solve".
# Matching is a case-insensitive substring test on the stringified value, so a
# missing relationship (which astype(str) renders as 'nan') is counted too.
anonymous_terms = [
    'stranger',
    'relationship not determined',
    'unknown',
    'nan'
]

# Create a clean flag: 1 if it's Stranger/Unknown, 0 if it's Family/Friend.
# Vectorized equivalent of testing every term against every row in Python.
relationship_text = df['Relationship'].astype(str).str.lower()
anonymous_mask = pd.Series(False, index=df.index)
for term in anonymous_terms:
    # regex=False: the terms are plain substrings, not patterns.
    anonymous_mask |= relationship_text.str.contains(term, regex=False)
df['Is_Anonymous'] = anonymous_mask.astype(int)

print("‚úÖ Data Cleaned: 'Relationship not determined' is now counted as Anonymous.")

‚úÖ Dataset Loaded!
‚úÖ Data Cleaned: 'Relationship not determined' is now counted as Anonymous.


In [23]:
# --- ENSURE SOLVED FLAG EXISTS ---
# A case is flagged as solved (1) unless the offender age is the sentinel 999
# or the text 'unknown'. The numeric comparison also catches 999 stored as a
# float (999.0) or as a padded string, which a plain str(x) == '999' test
# would miss and wrongly count as solved.
if 'Solved_Flag' not in df.columns:
    offender_age_text = df['OffAge'].astype(str).str.strip().str.lower()
    offender_age_number = pd.to_numeric(df['OffAge'], errors='coerce')
    offender_unknown = offender_age_number.eq(999) | offender_age_text.eq('unknown')
    df['Solved_Flag'] = (~offender_unknown).astype(int)

# --- CALCULATE YEARLY STATS ---
# Named aggregation yields one row per Year with the same columns as before
# (Year, Total, Solved_Count, Anonymous_Count) without a Python call per group.
stats = (
    df.groupby('Year')
      .agg(
          Total=('Solved_Flag', 'size'),
          Solved_Count=('Solved_Flag', 'sum'),
          Anonymous_Count=('Is_Anonymous', 'sum'),
      )
      .reset_index()
)

# Calculate Percentages (share of each year's cases, rounded to one decimal)
stats['Clearance_Rate'] = (stats['Solved_Count'] / stats['Total'] * 100).round(1)
stats['Anonymous_Pct'] = (stats['Anonymous_Count'] / stats['Total'] * 100).round(1)

# --- CALCULATE CORRELATION ---
# Series.corr defaults to Pearson correlation; here it is computed across
# years between the two yearly percentages (one observation per year).
corr_score = stats['Clearance_Rate'].corr(stats['Anonymous_Pct'])

print(f"üìä NEW STATISTICAL PROOF:")
print(f"Correlation Score: {corr_score:.2f}")
if corr_score < -0.5:
    print("‚úÖ SUCCESS: Strong Inverse Relationship found!")
else:
    # Also reached when corr_score is NaN (e.g. a constant series), because
    # any comparison with NaN is False.
    print("‚ö†Ô∏è Still weak? Check if 'Is_Anonymous' is capturing all rows.")

üìä NEW STATISTICAL PROOF:
Correlation Score: -0.73
‚úÖ SUCCESS: Strong Inverse Relationship found!






In [24]:
# --- PLOT THE CORRECTED CHART ---
# Dual-axis line chart of the yearly clearance rate against the yearly share
# of stranger/undetermined cases, followed by a printed verdict on their
# correlation. Reads `stats` and `corr_score` from the previous cell.
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Trace 1: Clearance Rate (RED)
fig.add_trace(
    go.Scatter(
        x=stats['Year'], y=stats['Clearance_Rate'],
        name="Clearance Rate (Solved %)",
        mode='lines+markers',
        line=dict(color='#d62728', width=4),
        marker=dict(size=6),
        hovertemplate='Year: %{x}<br>Solved: <b>%{y}%</b><extra></extra>'
    ),
    secondary_y=False
)

# Trace 2: Anonymous Crimes (GREEN)
fig.add_trace(
    go.Scatter(
        x=stats['Year'], y=stats['Anonymous_Pct'],
        name="Stranger/Undetermined %",
        mode='lines',
        line=dict(color='#2ca02c', width=3, dash='dot'),
        hovertemplate='Year: %{x}<br>Anonymous Cases: <b>%{y}%</b><extra></extra>'
    ),
    secondary_y=True
)

# --- FORMATTING ---
# The subtitle wording follows the computed coefficient (same +/-0.5 cut-offs
# as the printed verdict below) so the chart can never claim a strong inverse
# relationship that the data does not show.
if corr_score < -0.5:
    relationship_label = "Strong Inverse Relationship"
elif corr_score > 0.5:
    relationship_label = "Strong Direct Relationship"
else:
    relationship_label = "Weak Relationship"
insight = f"Correlation: {corr_score:.2f} ({relationship_label})"

fig.update_layout(
    title_text=f"<b>The Real Reason:</b> As Crimes Became Anonymous, They Became Harder to Solve<br><sup>{insight}</sup>",
    title_font_size=20,
    hovermode="x unified",
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
    template="plotly_white",
    height=600
)

# Set Axis Titles (both axes share the 0-100 range so the two percentages are
# drawn on the same scale)
fig.update_yaxes(title_text="<b>Clearance Rate (%)</b>", color="#d62728", secondary_y=False, range=[0, 100])
fig.update_yaxes(title_text="<b>% of Stranger/Undetermined</b>", color="#2ca02c", secondary_y=True, range=[0, 100])

fig.show()

# --- PRINT FINAL RESULT ---
# The year span in the header is read from the data rather than typed in, so
# it stays correct if the dataset is refreshed or filtered.
first_year = int(stats['Year'].min())
last_year = int(stats['Year'].max())

print("="*60)
print(f"üìä FINAL STATISTICAL PROOF ({first_year}-{last_year})")
print("="*60)
print(f"‚Ä¢ Correlation Coefficient: {corr_score:.2f}")
print("-" * 30)

if corr_score < -0.5:
    print("‚úÖ SUCCESS: Strong Negative Correlation Found.")
    print("   Interpretation: As 'Stranger' crimes go UP, the 'Clearance Rate' goes DOWN.")
elif corr_score > 0.5:
    print("‚ö†Ô∏è WARNING: Positive Correlation.")
    print("   This means Solved Rates and Stranger Crimes are moving in the SAME direction.")
else:
    print("‚ö†Ô∏è WEAK CORRELATION.")
    print("   There is no strong statistical link between these two variables.")
print("="*60)

üìä FINAL STATISTICAL PROOF (1976-2023)
‚Ä¢ Correlation Coefficient: -0.73
------------------------------
‚úÖ SUCCESS: Strong Negative Correlation Found.
   Interpretation: As 'Stranger' crimes go UP, the 'Clearance Rate' goes DOWN.


In [25]:
# --- 1. PREPARE DECADE DATA ---
# Create Decade Column (floor each year to the start of its decade, e.g. 1987 -> 1980)
df['Decade'] = (df['Year'] // 10) * 10

# Group by Decade and Calculate Stats.
# Named aggregation replaces a per-group Python callback that built a Series
# for every decade; the resulting columns and their order are unchanged.
decade_stats = (
    df.groupby('Decade')
      .agg(**{
          'Total Cases': ('Solved_Flag', 'size'),
          'Solved Cases': ('Solved_Flag', 'sum'),
      })
      .reset_index()
)
decade_stats['Unsolved Cases'] = decade_stats['Total Cases'] - decade_stats['Solved Cases']
decade_stats['Clearance Rate'] = decade_stats['Solved Cases'] / decade_stats['Total Cases'] * 100

# Format the Output (Round numbers and add commas).
# Work on a copy so `decade_stats` keeps its numeric values.
decade_table = decade_stats.copy()
decade_table['Clearance Rate'] = decade_table['Clearance Rate'].round(1).astype(str) + '%'
for count_column in ('Total Cases', 'Solved Cases', 'Unsolved Cases'):
    decade_table[count_column] = decade_table[count_column].apply(lambda x: f"{int(x):,}")

# --- 2. PRINT THE TABLE ---
print("="*60)
print("üìä HOMICIDE TRENDS BY DECADE (Summary Table)")
print("="*60)
# Adjust spacing for a clean look
print(decade_table.to_string(index=False, justify='center'))
print("="*60)

üìä HOMICIDE TRENDS BY DECADE (Summary Table)
 Decade Total Cases Solved Cases Unsolved Cases Clearance Rate
  1970     77,684      57,427        20,257         73.9%     
  1980    198,531     139,894        58,637         70.5%     
  1990    204,032     130,764        73,268         64.1%     
  2000    163,338     103,891        59,447         63.6%     
  2010    162,454     106,640        55,814         65.6%     
  2020     88,597      59,725        28,872         67.4%     






In [26]:
# --- 1. PREPARE 5-YEAR PERIOD DATA ---
# Bucket every case into a 5-year window labelled "<start>-<start+4>", where
# <start> is the year floored to a multiple of five.
bin_start = (df['Year'] // 5) * 5
df['Bin_Start'] = bin_start
df['Period'] = bin_start.astype(str) + "-" + (bin_start + 4).astype(str)

# One row per period: number of flagged rows and how many of them are solved.
period_stats = df.groupby('Period').agg(
    Total_Cases=('Solved_Flag', 'count'),
    Solved_Cases=('Solved_Flag', 'sum'),
).reset_index()

# Derived columns: unsolved count and clearance rate in percent (one decimal).
period_stats['Unsolved_Cases'] = period_stats['Total_Cases'].sub(period_stats['Solved_Cases'])
period_stats['Clearance_Rate'] = (
    period_stats['Solved_Cases'].div(period_stats['Total_Cases']).mul(100).round(1)
)

# The labels start with the bin's first year, so sorting the text keeps the
# rows in chronological order.
period_stats = period_stats.sort_values('Period')

# --- 2. BUILD THE PRINTABLE TABLE ---
# Formatting is applied to a copy; `period_stats` stays numeric.
display_table = period_stats.copy()
display_table['Clearance_Rate'] = display_table['Clearance_Rate'].astype(str) + '%'
for count_column in ('Total_Cases', 'Solved_Cases', 'Unsolved_Cases'):
    # Thousands separators, e.g. 102590 -> "102,590"
    display_table[count_column] = display_table[count_column].apply(lambda value: f"{int(value):,}")

# Friendlier headers for the printed table
pretty_headers = {
    'Period': '5-Year Period',
    'Total_Cases': 'Total Cases',
    'Solved_Cases': 'Solved',
    'Unsolved_Cases': 'Unsolved',
    'Clearance_Rate': 'Rate'
}
display_table = display_table.rename(columns=pretty_headers)

# --- 3. PRINT THE TABLE ---
banner = "="*60
print(banner)
print("üìä HOMICIDE TRENDS (Every 5 Years)")
print(banner)
# to_string gives fixed-width columns in plain-text output
print(display_table.to_string(index=False, justify='center'))
print(banner)

üìä HOMICIDE TRENDS (Every 5 Years)
5-Year Period Total Cases Solved Unsolved  Rate
  1975-1979      77,684   57,427  20,257  73.9%
  1980-1984     102,590   73,051  29,539  71.2%
  1985-1989      95,941   66,843  29,098  69.7%
  1990-1994     115,279   73,867  41,412  64.1%
  1995-1999      88,753   56,897  31,856  64.1%
  2000-2004      79,578   50,231  29,347  63.1%
  2005-2009      83,760   53,660  30,100  64.1%
  2010-2014      75,185   49,414  25,771  65.7%
  2015-2019      87,269   57,226  30,043  65.6%
  2020-2024      88,597   59,725  28,872  67.4%


In [27]:
import plotly.express as px

# --- 1. FEATURE ENGINEERING ---
# Correlation needs numeric inputs, so the text columns become 0/1 flags.

# A. "Handgun" flag: 1 when the weapon text contains 'handgun' (any casing).
weapon_text = df['Weapon'].astype(str).str.lower()
df['Is_Handgun'] = weapon_text.str.contains('handgun', regex=False).astype(int)

# B. Rebuild "Is_Anonymous" only if an earlier cell has not created it.
if 'Is_Anonymous' not in df.columns:
    anonymous_terms = ['stranger', 'relationship not determined', 'unknown', 'nan']
    df['Is_Anonymous'] = df['Relationship'].astype(str).map(
        lambda rel: int(any(term in rel.lower() for term in anonymous_terms))
    )

# --- 2. CALCULATE CORRELATION MATRIX ---
# Pairwise correlation between the selected row-level columns, rounded to
# two decimals for display.
features = ['Solved_Flag', 'Is_Anonymous', 'Year', 'Is_Handgun']
corr_matrix = df[features].corr().round(2)

# Human-readable axis labels for the chart
labels = {
    'Solved_Flag': 'Solved?',
    'Is_Anonymous': 'Stranger/Unknown',
    'Year': 'Time (Year)',
    'Is_Handgun': 'Weapon: Handgun'
}
corr_matrix = corr_matrix.rename(index=labels, columns=labels)

# --- 3. PLOT HEATMAP ---
# zmin/zmax pin the colour scale to the full -1..1 correlation range.
heatmap_options = dict(
    text_auto=True,
    aspect="auto",
    color_continuous_scale='RdBu',
    zmin=-1, zmax=1,
)
fig = px.imshow(corr_matrix, **heatmap_options)

# --- 4. FORMATTING ---
fig.update_layout(
    title="<b>Correlation Heatmap:</b> What Drives a Case to be Solved?",
    title_font_size=20,
    height=500,
    width=700,
    template="plotly_white"
)

fig.show()

# Reading guide printed under the chart
reading_guide = (
    "üìä HOW TO READ THIS:",
    "‚Ä¢ Blue Box (Positive): These factors move TOGETHER.",
    "‚Ä¢ Red Box (Negative): These factors move OPPOSITE.",
    "‚Ä¢ Look at the 'Solved?' row:",
    "  - If 'Stranger' is RED, it means Stranger Crimes = Less Likely to Solve.",
)
for guide_line in reading_guide:
    print(guide_line)

üìä HOW TO READ THIS:
‚Ä¢ Blue Box (Positive): These factors move TOGETHER.
‚Ä¢ Red Box (Negative): These factors move OPPOSITE.
‚Ä¢ Look at the 'Solved?' row:
  - If 'Stranger' is RED, it means Stranger Crimes = Less Likely to Solve.


In [28]:
import plotly.express as px
import pandas as pd

# --- 1. PREPARE DATA ---
# Yearly percentages of solved cases and of each relationship group.
# Each column below is a per-row 0/1 (or boolean) indicator; its mean within
# a year times 100 is that year's percentage.
family_relationships = [
    'Husband', 'Wife', 'Mother', 'Father', 'Son', 'Daughter', 'Brother',
    'Sister', 'In-law', 'Stepfather', 'Stepmother', 'Stepson',
    'Stepdaughter', 'Other family'
]

case_indicators = pd.DataFrame({
    'Year': df['Year'],
    'Clearance Rate': df['Solved_Flag'],

    # Family (Known)
    'Family %': df['Relationship'].isin(family_relationships),

    # Acquaintance (Known)
    'Acquaintance %': df['Relationship'].eq('Acquaintance'),

    # ANONYMOUS (The Combined Group: Stranger + Unknown + Not Determined).
    # Reuses the notebook's `Is_Anonymous` flag so this cell and the yearly
    # correlation cell share one definition. The previous exact-match list
    # included the string 'nan', which can never equal a real missing value.
    'Anonymous %': df['Is_Anonymous'],
})

yearly_breakdown = case_indicators.groupby('Year').mean().mul(100).reset_index()

# --- 2. CALCULATE CORRELATION ---
# Correlation across years between the yearly percentages (one row per year).
features = ['Clearance Rate', 'Family %', 'Acquaintance %', 'Anonymous %']
corr_matrix = yearly_breakdown[features].corr().round(2)

# --- 3. PLOT HEATMAP ---
fig = px.imshow(
    corr_matrix,
    text_auto=True,
    aspect="auto",
    color_continuous_scale='RdBu',
    zmin=-1, zmax=1
)

fig.update_layout(
    title="<b>Impact of 'Anonymous' Crime on Clearance Rate</b>",
    height=500, width=700,
    template="plotly_white"
)
fig.show()

# --- 4. PRINT DETAILED ANALYSIS ---
# Values come from the rounded matrix, so the thresholds below are applied to
# two-decimal coefficients. The "possible reasons" are narrative hypotheses
# printed when a threshold is crossed; the correlation itself does not
# establish them.
anon_corr = corr_matrix.loc['Anonymous %', 'Clearance Rate']
family_corr = corr_matrix.loc['Family %', 'Clearance Rate']

print("="*80)
print("üìä AUTOMATED ANALYSIS & REASONING")
print("="*80)

# --- ANALYSIS OF NEGATIVE CORRELATION ---
print(f"1. ANONYMOUS CRIMES (Stranger + Unknown): {anon_corr}")
if anon_corr < -0.7:
    print(f"   üî¥ STRONG NEGATIVE RELATION: As anonymity rises, solved rates crash.")
    print(f"   ‚ùì WHY IS THIS HAPPENING? (Possible Reasons):")
    print(f"      1. Lack of Motive: Without a relationship, detectives cannot trace 'Why' the crime happened.")
    print(f"      2. No Witness Links: Bystanders cannot identify a stranger, whereas they can identify a family member.")
    print(f"      3. Reliance on Forensics: These cases require DNA/Fingerprints to solve, which are often not found.")

print("-" * 40)

# --- ANALYSIS OF POSITIVE CORRELATION ---
print(f"2. FAMILY CRIMES: {family_corr}")
if family_corr > 0.7:
    print(f"   üîµ STRONG POSITIVE RELATION: Domestic crimes help the solved rate.")
    print(f"   ‚ùì WHY IS THIS HAPPENING? (Possible Reasons):")
    print(f"      1. Obvious Suspects: The spouse/partner is always the first person investigated.")
    print(f"      2. Crimes of Passion: These often happen in the home, leaving massive amounts of physical evidence.")
    print(f"      3. High Confession Rate: Emotional guilt often leads to quicker confessions compared to calculated stranger crimes.")

print("="*80)





üìä AUTOMATED ANALYSIS & REASONING
1. ANONYMOUS CRIMES (Stranger + Unknown): -0.73
   üî¥ STRONG NEGATIVE RELATION: As anonymity rises, solved rates crash.
   ‚ùì WHY IS THIS HAPPENING? (Possible Reasons):
      1. Lack of Motive: Without a relationship, detectives cannot trace 'Why' the crime happened.
      2. No Witness Links: Bystanders cannot identify a stranger, whereas they can identify a family member.
      3. Reliance on Forensics: These cases require DNA/Fingerprints to solve, which are often not found.
----------------------------------------
2. FAMILY CRIMES: 0.86
   üîµ STRONG POSITIVE RELATION: Domestic crimes help the solved rate.
   ‚ùì WHY IS THIS HAPPENING? (Possible Reasons):
      1. Obvious Suspects: The spouse/partner is always the first person investigated.
      2. Crimes of Passion: These often happen in the home, leaving massive amounts of physical evidence.
      3. High Confession Rate: Emotional guilt often leads to quicker confessions compared to calcu

In [31]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score, mean_absolute_error

# --- 1. PREPARE THE DATA ---
# Fits a degree-3 polynomial of Year to the yearly clearance rate and extends
# the fitted curve to 2035.
# NOTE(review): `yearly_stats` is not created in any cell visible here (the
# execution counts jump from 28 to 31); confirm it is defined earlier so the
# notebook survives Restart & Run All.
# Sort data by year chronologically
yearly_stats = yearly_stats.sort_values('Year')

X = yearly_stats[['Year']]
y = yearly_stats['Clearance_Rate']

# --- 2. CREATE THE POLYNOMIAL MODEL (Degree 3 for a flexible curve) ---
poly = PolynomialFeatures(degree=3)
X_poly = poly.fit_transform(X)

# Train the model on the curved features
poly_model = LinearRegression()
poly_model.fit(X_poly, y)

# --- 3. MAKE PREDICTIONS (Historical + Future) ---
# The timeline starts at the first observed year and runs to 2035; the
# boundary between history and forecast is the last observed year, read from
# the data instead of being typed in.
first_year = int(yearly_stats['Year'].min())
last_observed_year = int(yearly_stats['Year'].max())
forecast_end_year = 2035

future_years = pd.DataFrame({'Year': np.arange(first_year, forecast_end_year + 1)})
# transform(), not fit_transform(): the transformer was already fitted on the
# training years and must only be applied to new inputs.
future_years_poly = poly.transform(future_years[['Year']])

# Predict
future_predictions = poly_model.predict(future_years_poly)
future_years['Predicted_Rate'] = future_predictions.round(1)

# R^2 and MAE are computed on the same rows the model was trained on, so they
# describe goodness of fit to history, not accuracy of the forecast.
historical_predictions = poly_model.predict(X_poly)
r2_poly = r2_score(y, historical_predictions)
mae_poly = mean_absolute_error(y, historical_predictions)

# --- 4. PLOT THE CURVED TREND ---
fig = go.Figure()

# Trace 1: Historical Data Points (Scatter)
fig.add_trace(go.Scatter(
    x=yearly_stats['Year'], y=yearly_stats['Clearance_Rate'],
    mode='markers', name='Actual Data',
    marker=dict(color='#d62728', size=8, opacity=0.7),
    hovertemplate='Year: %{x}<br>Actual Rate: <b>%{y}%</b><extra></extra>'
))

# Trace 2: Polynomial Trend Line (Smooth Curve)
fig.add_trace(go.Scatter(
    x=future_years['Year'], y=future_years['Predicted_Rate'],
    mode='lines', name='Polynomial Forecast',
    line=dict(color='#1f77b4', width=4),
    hovertemplate='Year: %{x}<br>Predicted Rate: <b>%{y}%</b><extra></extra>'
))

# Add a vertical line to show where the future begins
fig.add_vline(x=last_observed_year, line_width=2, line_dash="dash", line_color="gray", annotation_text=" Future Forecast ->")

# --- 5. FORMATTING ---
fig.update_layout(
    title=f"<b>Polynomial Regression:</b> Capturing the Plateau<br><sup>Model Accuracy (R¬≤): {r2_poly:.2f} | Average Error: {mae_poly:.2f}%</sup>",
    xaxis_title="Year",
    yaxis_title="Clearance Rate (%)",
    yaxis_range=[30, 100],
    hovermode="x unified",
    template="plotly_white",
    height=600,
    legend=dict(x=0.02, y=0.1)
)

fig.show()

# --- 6. PRINT ACCURACY ---
print("="*60)
print("üéØ POLYNOMIAL REGRESSION ACCURACY")
print("="*60)
print(f"‚Ä¢ New R-Squared (R¬≤): {r2_poly:.2f}")
print(f"‚Ä¢ Mean Absolute Error (MAE): {mae_poly:.2f}%")
print("-" * 60)
# NOTE(review): the message compares against "a straight line", but this cell
# only checks r2_poly against a fixed 0.6 cut-off; no linear baseline is
# fitted here.
if r2_poly > 0.6:
    print("‚úÖ Massive Improvement! The curve fits the data much better than a straight line.")
else:
    print("‚ö†Ô∏è Still struggling to find a pattern. The data might be highly volatile.")
print("="*60)

üéØ POLYNOMIAL REGRESSION ACCURACY
‚Ä¢ New R-Squared (R¬≤): 0.81
‚Ä¢ Mean Absolute Error (MAE): 1.32%
------------------------------------------------------------
‚úÖ Massive Improvement! The curve fits the data much better than a straight line.


In [32]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_absolute_error

# --- 1. PREPARE THE DATA ---
# Chronological yearly series: Year is the single feature, Clearance_Rate the target.
yearly_stats = yearly_stats.sort_values('Year')
X, y = yearly_stats[['Year']], yearly_stats['Clearance_Rate']

# --- 2. FIT THE THREE MODELS ---
# Every model is fitted on, and then predicts, the same historical rows.

# A. Polynomial Regression (Degree 3): linear regression on expanded features
poly = PolynomialFeatures(degree=3)
X_poly = poly.fit_transform(X)
model_poly = LinearRegression().fit(X_poly, y)
pred_poly = model_poly.predict(X_poly)

# B. Random Forest Regressor (fixed seed so reruns give the same forest)
model_rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X, y)
pred_rf = model_rf.predict(X)

# C. Support Vector Regression (SVR) with an RBF kernel
model_svr = SVR(kernel='rbf', C=100, gamma=0.1).fit(X, y)
pred_svr = model_svr.predict(X)

# --- 3. CALCULATE METRICS ---
# Both scores compare each model's predictions with the rows it was trained on.
r2_column = 'R-Squared (R¬≤)'
mae_column = 'MAE (%)'
predictions_by_model = {
    'Polynomial (Degree 3)': pred_poly,
    'Random Forest': pred_rf,
    'Support Vector (SVR)': pred_svr,
}

results = pd.DataFrame({
    'Model': list(predictions_by_model),
    r2_column: [r2_score(y, fitted) for fitted in predictions_by_model.values()],
    mae_column: [mean_absolute_error(y, fitted) for fitted in predictions_by_model.values()],
})

# Round for display
results[r2_column] = results[r2_column].round(3)
results[mae_column] = results[mae_column].round(2)

# --- 4. PRINT THE COMPARISON ---
banner = "="*65
print(banner)
print("üèÜ MACHINE LEARNING MODEL SHOWDOWN")
print(banner)
print(results.to_string(index=False))
print("-" * 65)

# The winner is the row with the highest (rounded) R-squared
winner = results.loc[results[r2_column].idxmax()]
print(f"ü•á THE WINNER: {winner['Model']} with an R¬≤ of {winner[r2_column]} and MAE of {winner[mae_column]}%")
print(banner)

üèÜ MACHINE LEARNING MODEL SHOWDOWN
                Model  R-Squared (R¬≤)  MAE (%)
Polynomial (Degree 3)           0.814     1.32
        Random Forest           0.976     0.44
 Support Vector (SVR)           0.956     0.45
-----------------------------------------------------------------
ü•á THE WINNER: Random Forest with an R¬≤ of 0.976 and MAE of 0.44%


In [33]:
import plotly.graph_objects as go
import pandas as pd
import numpy as np

# --- 1. PREPARE FUTURE TIMELINE ---
# One row per year from 1976 through 2035 (np.arange excludes the stop value).
future_years = pd.DataFrame({'Year': np.arange(1976, 2036)})
future_X = future_years[['Year']]

# --- 2. GENERATE PREDICTIONS ---
# Uses the transformer and the three models fitted in the comparison cell.

# A. Polynomial: expand the years with the already-fitted transformer, then predict
future_poly = poly.transform(future_X)
pred_future_poly = model_poly.predict(future_poly)

# B. Random Forest
pred_future_rf = model_rf.predict(future_X)

# C. SVR
pred_future_svr = model_svr.predict(future_X)

# --- 3. PLOT THE COMPARISON ---
fig = go.Figure()

# Observed yearly clearance rates as faint black markers
fig.add_trace(go.Scatter(
    x=yearly_stats['Year'], y=yearly_stats['Clearance_Rate'],
    mode='markers', name='Actual Data',
    marker=dict(color='black', size=6, opacity=0.5)
))

# One line per model, added in this order: (legend name, predictions, line style)
model_lines = [
    # Polynomial Line (The true forecast) - Red
    ('Polynomial (Degree 3)', pred_future_poly, dict(color='#d62728', width=4)),
    # Random Forest Line (The Trap) - Green
    ('Random Forest (Flatlines)', pred_future_rf, dict(color='#2ca02c', width=3, dash='dot')),
    # SVR Line - Blue
    ('SVR (Reverts to mean)', pred_future_svr, dict(color='#1f77b4', width=3, dash='dash')),
]
for legend_name, predicted_rates, line_style in model_lines:
    fig.add_trace(go.Scatter(
        x=future_years['Year'], y=predicted_rates,
        mode='lines', name=legend_name,
        line=line_style
    ))

# Dashed marker at 2023, labelled as the start of the forecast horizon
fig.add_vline(x=2023, line_width=2, line_dash="dash", line_color="gray", annotation_text=" Future ->")

# --- 4. FORMATTING ---
layout_options = dict(
    title="<b>Why Polynomial Wins:</b> Random Forest Cannot Forecast the Future",
    xaxis_title="Year",
    yaxis_title="Clearance Rate (%)",
    yaxis_range=[30, 100],
    hovermode="x unified",
    template="plotly_white",
    height=600,
    legend=dict(x=0.02, y=0.1),
)
fig.update_layout(**layout_options)

fig.show()