# Notebook cell In [8]:
"""
Casestudies Trend Analysis and Visualization Script

This script identifies the books that best represent the average trendlines for specific groups in normalized metrics
(e.g., Staging_z, PlotProgression_z) using various filters (e.g., Gender, Genre). It generates visual comparisons between
these books and broader group trends.

Key Features:
- Filters dataset by Gender, Genre, or other attributes to identify specific groups.
- Uses GLM to fit quadratic models for average trends within groups and individual books.
- Identifies books closest to group trends based on combined Mean Squared Error (MSE).
- Visualizes comparisons between individual book trends and group trends.

Folder Structure:
- Results are saved in a directory named `Results Casestudies_{execution_date}`.
- Graphs are saved with descriptive filenames, showing the metric, group, and book title.

Dependencies:
- Third-party libraries: pandas, seaborn, statsmodels, numpy, matplotlib.
- Standard library: os, re, datetime.

Usage:
- Ensure the dataset is in CSV format and update the `df` path.
- Modify `filters` and `metrics` as needed for specific analyses.
- Run the script to generate statistical summaries and visual comparisons.

Author: Maaike de Jongh
Date: 2025-01-06
"""

import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import statsmodels.api as sm
import numpy as np
from numpy.polynomial.polynomial import Polynomial
import re
from datetime import datetime

# Human-readable labels for the normalized metric columns
metric_titles = dict(
    Staging_z='Staging',
    PlotProgression_z='Plot Progression',
    CognitiveTension_z='Cognitive Tension',
)

# Output folder, stamped with the day the script is executed
execution_date = f"{datetime.now():%Y-%m-%d}"
result_directory = "Results Casestudies_" + execution_date
os.makedirs(result_directory, exist_ok=True)

# Load the dataset
df = pd.read_csv('1000_segments_casestudies.csv')

# Function to clean book titles
def clean_book_title(filename):
    """
    Turn a raw book filename into a human-readable title.

    Everything up to and including the first underscore is dropped (the whole
    name is kept when it contains no underscore), every '.txt' occurrence is
    removed, and a space is inserted wherever a lowercase letter is directly
    followed by an uppercase letter.
    """
    _prefix, separator, remainder = filename.partition('_')
    stem = remainder if separator else filename
    stem = stem.replace('.txt', '')
    # Split run-together words, e.g. "VoorAltijd" -> "Voor Altijd"
    return re.sub(r'([a-z])([A-Z])', r'\1 \2', stem)

# Function to find the best representative book for a group
def _fit_quadratic_glms(data, metrics):
    """
    Fit one Gaussian GLM per metric, quadratic in Relative_Segment, on `data`.

    Returns a dict mapping each metric name to its fitted results object.
    """
    fitted = {}
    for metric in metrics:
        model = smf.glm(
            formula=f'{metric} ~ np.power(Relative_Segment, 1) + np.power(Relative_Segment, 2)',
            data=data, family=sm.families.Gaussian()
        )
        fitted[metric] = model.fit()
    return fitted


def find_best_book(df, group_filters, metrics):
    """
    Pick the book whose fitted trendlines lie closest to its group's trendlines.

    The rows of `df` are narrowed by every `column == value` pair in
    `group_filters`. A quadratic GLM per metric is fitted on the whole group
    and on each book ('Filename') separately; all curves are evaluated on the
    same 100-point grid spanning the group's Relative_Segment range. The
    per-metric mean squared differences between book curve and group curve
    are summed, and the book with the smallest sum wins (the first one
    encountered wins ties).

    Returns a tuple (best_book, lowest_combined_mse, group_df). When no rows
    match the filters a message is printed and (None, None, group_df) is
    returned, with group_df empty.
    """
    selection = df.copy()
    for column in group_filters:
        selection = selection.loc[selection[column] == group_filters[column]]

    if selection.empty:
        print(f"No books found for filters: {group_filters}")
        return None, None, selection

    group_models = _fit_quadratic_glms(selection, metrics)

    # Common evaluation grid covering the group's segment range
    grid = pd.DataFrame({
        'Relative_Segment': np.linspace(
            selection['Relative_Segment'].min(),
            selection['Relative_Segment'].max(),
            100,
        )
    })
    for metric in metrics:
        grid[f'{metric}_pred'] = group_models[metric].predict(grid)

    winner = None
    winner_score = float('inf')

    for candidate in selection['Filename'].unique():
        candidate_rows = selection[selection['Filename'] == candidate]
        candidate_models = _fit_quadratic_glms(candidate_rows, metrics)

        # Sum of per-metric MSEs between the book curve and the group curve
        score = 0
        for metric in metrics:
            candidate_curve = candidate_models[metric].predict(grid)
            deviation = grid[f'{metric}_pred'] - candidate_curve
            score += (deviation ** 2).mean()

        if score < winner_score:
            winner, winner_score = candidate, score

    return winner, winner_score, selection

# Function to plot trends for a book and group
def plot_trends(group_df, best_book_filename, trend_column, clean_title, metric):
    """
    Plot one book's quadratic trend for `metric` against per-category trends.

    The book curve is a degree-2 least-squares polynomial fitted to the rows
    of `group_df` whose 'Filename' equals `best_book_filename`. The comparison
    curves come from a Gaussian GLM fitted on the module-level `df`, predicted
    for every category of `trend_column` over the Relative_Segment range of
    `group_df`. The figure is written as a 300-dpi PNG into the module-level
    `result_directory` and its path is printed.

    Parameters:
    - group_df: filtered rows that contain the book; also defines the x-range
      used for the category trendlines.
    - best_book_filename: value of the 'Filename' column identifying the book.
    - trend_column: column of `df` whose categories get one dashed line each.
    - clean_title: display title of the book (legend, plot title, file name).
    - metric: name of the metric column to plot; `metric_titles` supplies the
      display label when it has an entry for it.

    Returns None.
    """
    accent = "#0209ef"
    metric_label = metric_titles.get(metric, metric)

    best_book_data = group_df[group_df['Filename'] == best_book_filename]

    # Quadratic fit for the book, evaluated directly on the fitted polynomial
    # instead of expanding the converted coefficients by hand.
    book_curve = Polynomial.fit(
        best_book_data['Relative_Segment'],
        best_book_data[metric],
        2
    )
    x_vals = np.linspace(
        best_book_data['Relative_Segment'].min(),
        best_book_data['Relative_Segment'].max(),
        100
    )
    y_vals = book_curve(x_vals)

    # Missing categories cannot be predicted for, so leave them out of the grid.
    global_groups = df[trend_column].dropna().unique()

    # NOTE(review): `*` binds tighter than `+` in the formula, so only the
    # quadratic term interacts with the trend column while the linear term is
    # shared by all categories; confirm this is the intended model.
    glm_group = smf.glm(
        formula=f'{metric} ~ np.power(Relative_Segment, 1) + np.power(Relative_Segment, 2) * {trend_column}',
        data=df,
        family=sm.families.Gaussian()
    )
    results_group = glm_group.fit()

    # One block of 100 segment values per category
    segment_values = np.linspace(group_df['Relative_Segment'].min(), group_df['Relative_Segment'].max(), 100)
    pred_data_group = pd.DataFrame({
        'Relative_Segment': np.tile(segment_values, len(global_groups)),
        trend_column: np.repeat(global_groups, len(segment_values))
    })
    pred_data_group[f'{metric}_pred'] = results_group.predict(pred_data_group)

    graph_filename = os.path.join(
        result_directory,
        f'Casestudy_{metric_label}_{trend_column}_{clean_title.replace(" ", "")}.png'
    )

    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        for group in global_groups:
            subset = pred_data_group[pred_data_group[trend_column] == group]
            ax.plot(
                subset['Relative_Segment'],
                subset[f'{metric}_pred'],
                linestyle='--',
                alpha=0.6,
                linewidth=1,
                label=f'{group}'
            )

        ax.plot(
            x_vals,
            y_vals,
            color='red',
            linestyle='-',
            linewidth=2,
            label=clean_title
        )

        ax.set_xlabel('Relative Segment', fontsize=14, color=accent)
        ax.set_ylabel(metric_label, fontsize=14, color=accent)
        ax.set_title(
            f'{metric_label} scores of {clean_title} compared to {trend_column} trends',
            fontsize=16,
            color=accent
        )

        # Customize ticks, spines, and legend
        ax.tick_params(axis='both', colors=accent)
        for spine in ax.spines.values():
            spine.set_color(accent)

        legend = ax.legend()
        for text in legend.get_texts():
            text.set_color(accent)
        ax.grid(True)

        fig.savefig(graph_filename, dpi=300)
    finally:
        # Always release the figure, even when plotting or saving fails.
        plt.close(fig)

    print(f"Graph saved as {graph_filename}")


# Combine finding the best book and plotting trends
# Each case study pairs the filters that select a group with the column whose
# per-category trendlines the representative book is compared against.
# Keeping both in one place removes the need for a string-keyed lookup with a
# fallback column ('Genre') that did not match the 'genre' column used here.
case_studies = [
    ({'Gender': 'male', 'genre': 'Suspense'}, 'Gender'),
    ({'Gender': 'female', 'genre': 'Suspense'}, 'Gender'),
    ({'Gender': 'female', 'genre': 'Romantic'}, 'genre'),
]

# Derived views, kept under their original names.
filters = [group_filters for group_filters, _ in case_studies]
trend_column_mapping = {
    str(group_filters): column for group_filters, column in case_studies
}

metrics = ['Staging_z', 'PlotProgression_z']

for filter_set, trend_column in case_studies:
    best_book, mse, group_df = find_best_book(df, group_filters=filter_set, metrics=metrics)

    if best_book is None:
        print(f"No book found for {filter_set}")
        continue

    clean_title = clean_book_title(best_book)
    print(f"Best book for {filter_set}: {clean_title}, with MSE: {mse}")

    # One graph per metric for the representative book
    for metric in metrics:
        plot_trends(
            group_df=group_df,
            best_book_filename=best_book,
            trend_column=trend_column,
            clean_title=clean_title,
            metric=metric
        )


# Output of the run above:
# Best book for {'Gender': 'male', 'genre': 'Suspense'}: Gebroken, with MSE: 0.001765352722553802
# Graph saved as Results Casestudies_2025-01-07/Casestudy_Staging_Gender_Gebroken.png
# Graph saved as Results Casestudies_2025-01-07/Casestudy_Plot Progression_Gender_Gebroken.png
# Best book for {'Gender': 'female', 'genre': 'Suspense'}: Schaduwkant, with MSE: 0.00342113612477849
# Graph saved as Results Casestudies_2025-01-07/Casestudy_Staging_Gender_Schaduwkant.png
# Graph saved as Results Casestudies_2025-01-07/Casestudy_Plot Progression_Gender_Schaduwkant.png
# Best book for {'Gender': 'female', 'genre': 'Romantic'}: Zussen Voor Altijd, with MSE: 0.00668089126284889
# Graph saved as Results Casestudies_2025-01-07/Casestudy_Staging_genre_ZussenVoorAltijd.png
# Graph saved as Results Casestudies_2025-01-07/Casestudy_Plot Progression_genre_ZussenVoorAltijd.png
