In [10]:
from pathlib import Path

import polars as pl
from statsmodels.api import MixedLM

from plot_functions import bar_plot, distribution_plot, line_plot_variables, violin_plot, scatter_with_regression
from utils.metrics_process import read_metrics, get_assistant_data, aggregate_df

In [11]:
# Metrics/plots version; used to select the metrics to read and to name the plots sub-folder.
version = 3.0

In [12]:
# All locations are resolved relative to the parent of the current working
# directory, so no absolute paths are hard-coded.
project_root = Path.cwd().parents[0]

metrics_dir = project_root / "metrics"

# Plot outputs live in a per-version sub-folder, e.g. plots/v3.0.
plots_dir = project_root / "plots" / f"v{version}"

curves_dir = plots_dir / "curves"
barplots_dir = plots_dir / "barplots"
postags_dir = plots_dir / "postags"

# The loop variable is intentionally not named `dir`: in a notebook the name
# would stay bound in the kernel namespace and shadow the builtin `dir()`.
for output_dir in (curves_dir, barplots_dir, postags_dir):
    # parents=True also creates plots_dir itself; exist_ok=True makes re-runs safe.
    output_dir.mkdir(parents=True, exist_ok=True)

## Text Stats

In [13]:
# Load the "text_stats" metrics for the configured version from the metrics folder.
df = read_metrics(
    metrics_path=metrics_dir,
    version=version,
    metric_types=["text_stats"],
)

# Frame used by the analysis below (presumably only the assistant-role rows —
# confirm against utils.metrics_process.get_assistant_data).
assistant_df = get_assistant_data(df)

In [14]:
# Display the frame returned by get_assistant_data to inspect its columns and rows.
assistant_df

role,content,model,group,id,fernandez_huerta,szigriszt_pazos,gutierrez_polini,crawford,flesch_kincaid_grade,total_message_number
str,str,str,enum,str,f64,f64,f64,f64,f64,i64
"""assistant""","""Hola! ¿Cómo estás hoy? ¡Bienve…","""Mistral 7B Instruct v0.3""","""A1""","""20250322-203105.json""",95.97,91.03,46.98,2.7,7.9,1
"""assistant""","""Hola Delia, un placer conocert…","""Mistral 7B Instruct v0.3""","""A1""","""20250322-203105.json""",95.35,93.7,45.53,2.6,8.1,2
"""assistant""","""Hola Carlos, un placer conocer…","""Mistral 7B Instruct v0.3""","""A1""","""20250322-203105.json""",102.27,98.9,47.92,2.0,6.6,3
"""assistant""","""Hola Carlos, encantado de cono…","""Mistral 7B Instruct v0.3""","""A1""","""20250322-203105.json""",88.33,85.36,44.72,3.3,9.7,4
"""assistant""","""Hola Carlos, me parece que est…","""Mistral 7B Instruct v0.3""","""A1""","""20250322-203105.json""",92.5,86.05,45.08,3.4,9.2,5
…,…,…,…,…,…,…,…,…,…,…
"""assistant""","""¡Qué emocionante! Los senderos…","""Qwen 2.5 7B Instruct""","""C1""","""20250323-021522.json""",85.07,81.23,39.05,3.9,10.9,5
"""assistant""","""Estoy totalmente de acuerdo, l…","""Qwen 2.5 7B Instruct""","""C1""","""20250323-021522.json""",79.77,75.41,38.41,4.4,12.9,6
"""assistant""","""Su análisis del personaje de J…","""Qwen 2.5 7B Instruct""","""C1""","""20250323-021522.json""",70.81,68.96,37.12,4.8,15.3,7
"""assistant""","""Gracias por tu análisis detall…","""Qwen 2.5 7B Instruct""","""C1""","""20250323-021522.json""",73.66,71.08,37.44,4.7,14.2,8


In [19]:
# Keep only the rows produced by one model; the cells below reuse this filtered frame.
assistant_df = assistant_df.filter(pl.col("model") == "Qwen 2.5 7B Instruct")

# Mixed linear model: fixed effect of `group`, random intercept (re_formula="~1")
# for each value of `id`.
model = MixedLM.from_formula(
    formula="fernandez_huerta ~ group",
    data=assistant_df.to_pandas(),
    groups="id",
    re_formula="~1",
)
results = model.fit()

print("\nSummary:", results.summary(), sep="\n")

# The summary table shows three decimals; print the p-values rounded to six.
p_values = results.pvalues
print("\nP-values with more precision:", p_values.round(6), sep="\n")




Summary:
            Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: fernandez_huerta
No. Observations: 405     Method:             REML            
No. Groups:       45      Scale:              78.2878         
Min. group size:  9       Log-Likelihood:     -1479.9665      
Max. group size:  9       Converged:          Yes             
Mean group size:  9.0                                         
---------------------------------------------------------------
              Coef.   Std.Err.    z     P>|z|   [0.025   0.975]
---------------------------------------------------------------
Intercept     99.106     1.406  70.483  0.000   96.350  101.861
group[T.B1]   -5.768     1.989  -2.901  0.004   -9.665   -1.870
group[T.C1]  -10.498     1.989  -5.280  0.000  -14.396   -6.601
id Var        20.958     1.542                                 


P-values with more precision:
Intercept      0.000000
group[T.B1]    0.003725
group[T.C1]    0.000000
id Var    

Interpretation
- Intercept: Estimated mean fernandez_huerta for the reference group (A1).
- group[T.B1]: Estimated difference in fernandez_huerta between B1 and A1 (B1 minus A1).
- group[T.C1]: Estimated difference in fernandez_huerta between C1 and A1 (C1 minus A1).



In [16]:
# Re-print the p-values of the most recently fitted model, rounded to six decimals.
p_values = results.pvalues
print("\nP-values with more precision:", p_values.round(6), sep="\n")


P-values with more precision:
Intercept      0.000000
group[T.B1]    0.003725
group[T.C1]    0.000000
id Var         0.124499
dtype: float64


In [17]:
# Same random-intercept model as before, with total_message_number added as a
# fixed-effect covariate.
# NOTE(review): the intercept then refers to total_message_number == 0; centring
# or shifting the covariate would make it interpretable — confirm before changing.
model = MixedLM.from_formula(
    formula="fernandez_huerta ~ group + total_message_number",
    data=assistant_df.to_pandas(),
    groups="id",
    re_formula="~1",
)
results = model.fit()

print(results.summary())

               Mixed Linear Model Regression Results
Model:               MixedLM  Dependent Variable:  fernandez_huerta
No. Observations:    405      Method:              REML            
No. Groups:          45       Scale:               63.3743         
Min. group size:     9        Log-Likelihood:      -1432.7962      
Max. group size:     9        Converged:           Yes             
Mean group size:     9.0                                           
-------------------------------------------------------------------
                      Coef.  Std.Err.    z    P>|z|  [0.025  0.975]
-------------------------------------------------------------------
Intercept            106.945    1.356  78.858 0.000 104.287 109.603
group[T.B1]           -5.768    1.583  -3.644 0.000  -8.870  -2.666
group[T.C1]          -10.498    1.583  -6.633 0.000 -13.600  -7.396
total_message_number  -1.568    0.153 -10.234 0.000  -1.868  -1.268
id Var                11.745    0.544                          

Interpretation
- Intercept: Estimated fernandez_huerta for group A1 when total_message_number is 0, a value that never occurs in our data. **TODO:** fix this (e.g. shift or centre total_message_number) so the intercept is interpretable.
