# Annotated Pylint Radon

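Correlation:
- join the Pylint/Radon code-quality metrics with the majority objectives score
- compute the prompt length (characters and words)
- compute the `inv_inf` score (ratio of the involved score to the informational score)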

Comparison:
- direct/indirect

In [12]:
import sqlite3
import pandas as pd
import os

# Location of the SQLite database, relative to the notebook's working directory.
db_path = "../../giicg.db"

# sqlite3.connect() creates a new, empty database when the file is missing,
# which would only surface later as a "no such table" error. Fail early with
# a clear message instead.
if not os.path.exists(db_path):
    raise FileNotFoundError(f"Database file does not exist: {db_path}")

conn = sqlite3.connect(db_path)

# Load the pylint/radon metrics from pylint_scores and attach, via message_id,
# the objectives annotation, the manually split "conversational" text, the
# prompt-category flags and the involved/informational scores. All joins are
# inner joins, so a row missing from any of the joined tables is dropped.
code = pd.read_sql(
    sql="""
    SELECT ps.conversation_id, ps. message_id, ps.message_text, ps.gender, ps.user_id, ps.model_version,  ps.radon_complexity,ps.radon_maintainability_index, ps.radon_loc, ps.radon_sloc, ps.radon_lloc, ps.radon_comments, ps.pylint_score,
           ap.maj_objectives_score,
           p.conversational,
           cp.can_you_question, cp.i_statement, cp.unpersonal_command, cp.you_command, cp.we_command, cp.unpersonal_question, cp.i_question, cp.we_question,
        iis.involved, iis.informational, iis.inv_inf
    FROM pylint_scores ps
    JOIN annotated_prompts ap ON ps.message_id = ap.message_id
    JOIN manually_split_prompts p ON ps.message_id = p.message_id
    JOIN categorized_prompts cp ON ps.message_id = cp.message_id
    JOIN inv_inf_scores iis ON ps.message_id = iis.message_id;
""",
    con=conn,
)

# Display the joined frame as the cell output.
code

Unnamed: 0,conversation_id,message_id,message_text,gender,user_id,model_version,radon_complexity,radon_maintainability_index,radon_loc,radon_sloc,...,i_statement,unpersonal_command,you_command,we_command,unpersonal_question,i_question,we_question,involved,informational,inv_inf
0,6,5,I want to use Dummy Hot encoding to replace th...,Woman (cisgender),16,chatgpt-4o-latest,0.000000,100.000000,19.0,8.0,...,1,0,0,0,0,1,0,18.181818,18.181818,1.000000
1,8,47,I have a pandas dataframe like this:\ndata\tpe...,Woman (cisgender),28,chatgpt-4o-latest,0.000000,100.000000,22.0,9.0,...,1,0,0,0,0,0,0,29.411765,14.705882,2.000000
2,10,57,"as a NLP and LLM researcher, I am recently dow...",Non-binary,30,chatgpt-4o-latest,,,36.0,32.0,...,1,0,0,0,0,0,0,16.304348,19.565217,0.833333
3,12,65,Blender and Python. I have a collection of hun...,Man (cisgender),34,chatgpt-4o-latest,6.000000,76.810412,62.0,38.0,...,0,0,1,0,0,0,0,15.384615,20.512821,0.750000
4,13,126,"how to run a Python future without blocking, i...",Man (cisgender),46,chatgpt-4o-latest,1.333333,90.912037,21.0,16.0,...,0,0,0,0,1,0,0,15.151515,18.181818,0.833333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
490,70,1524,can you write a test that tests whether parse ...,Woman (cisgender),90,deepseek-chat,5.000000,87.789161,30.0,22.0,...,0,0,0,0,0,0,0,25.000000,16.666667,1.500000
491,72,1534,can you write me some python code to count the...,Woman (cisgender),11,deepseek-chat,1.000000,82.315095,25.0,17.0,...,0,0,0,0,0,0,0,18.750000,18.750000,1.000000
492,74,1538,I want to merge these two dataframes: \ndf_tra...,Woman (cisgender),28,deepseek-chat,0.000000,100.000000,1.0,1.0,...,1,0,0,0,0,0,1,16.666667,27.777778,0.600000
493,78,1598,"0,tcp,http,SF,215,45076,0,0,0,0,0,1,0,0,0,0,0,...",Woman (cisgender),73,deepseek-chat,0.000000,100.000000,21.0,14.0,...,1,0,0,0,0,0,0,16.666667,20.000000,0.833333


In [13]:
import pandas as pd

# Code-quality metrics that are summarised per group with mean and std.
agg_cols = [
    "radon_complexity",
    "radon_maintainability_index",
    "radon_loc",
    "radon_sloc",
    "radon_lloc",
    "radon_comments",
    "pylint_score",
]

# Every metric gets the same two statistics. pandas skips NaN values when it
# computes the group-wise mean and std, so no explicit filtering is needed.
agg_dict = {metric: ["mean", "std"] for metric in agg_cols}

# Grouping keys: everything that identifies a prompt/model combination plus
# the prompt-level annotations, so that they are carried through to the
# aggregated frame unchanged.
group_cols = [
    "conversation_id",
    "message_id",
    "message_text",
    "conversational",
    "gender",
    "user_id",
    "model_version",
    "maj_objectives_score",
    "can_you_question",
    "i_statement",
    "unpersonal_command",
    "you_command",
    "we_command",
    "unpersonal_question",
    "i_question",
    "we_question",
    "informational",
    "involved",
    "inv_inf",
]

# dropna=False keeps groups whose key columns contain NaN; the default would
# silently drop those rows from the result.
agg_df = code.groupby(group_cols, dropna=False).agg(agg_dict).reset_index()

# Flatten the (column, statistic) MultiIndex produced by .agg():
#   ("radon_loc", "mean") -> "mean_radon_loc"
#   ("gender", "")        -> "gender"   (group keys have an empty statistic)
# The names are already in their final "<stat>_<column>" form after this
# step, so no further renaming is required.
agg_df.columns = [
    f"{stat}_{col}" if stat else col
    for col, stat in agg_df.columns.to_flat_index()
]

agg_df


Unnamed: 0,conversation_id,message_id,message_text,conversational,gender,user_id,model_version,maj_objectives_score,can_you_question,i_statement,...,mean_radon_loc,std_radon_loc,mean_radon_sloc,std_radon_sloc,mean_radon_lloc,std_radon_lloc,mean_radon_comments,std_radon_comments,mean_pylint_score,std_pylint_score
0,6,5,I want to use Dummy Hot encoding to replace th...,I want to use Dummy Hot encoding to replace th...,Woman (cisgender),16,chatgpt-4o-latest,5,0,1,...,18.000000,1.732051,8.000000,0.000000,9.000000,0.000000,4.666667,0.577350,10.000000,0.000000
1,6,5,I want to use Dummy Hot encoding to replace th...,I want to use Dummy Hot encoding to replace th...,Woman (cisgender),16,claude-3-7-sonnet-20250219,5,0,1,...,24.333333,2.309401,10.666667,0.577350,8.333333,0.577350,8.666667,1.527525,9.166667,0.721688
2,6,5,I want to use Dummy Hot encoding to replace th...,I want to use Dummy Hot encoding to replace th...,Woman (cisgender),16,claude-sonnet-4-20250514,5,0,1,...,15.000000,1.000000,7.666667,0.577350,7.000000,1.000000,3.666667,0.577350,10.000000,0.000000
3,6,5,I want to use Dummy Hot encoding to replace th...,I want to use Dummy Hot encoding to replace th...,Woman (cisgender),16,deepseek-chat,5,0,1,...,15.000000,0.000000,7.000000,0.000000,7.000000,0.000000,5.000000,0.000000,8.570000,0.000000
4,6,5,I want to use Dummy Hot encoding to replace th...,I want to use Dummy Hot encoding to replace th...,Woman (cisgender),16,gpt-4.1-2025-04-14,5,0,1,...,18.333333,2.081666,8.333333,0.577350,9.333333,0.577350,4.666667,0.577350,7.776667,3.850926
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162,86,1664,how to merge multiple xarray under a new categ...,wie kann man mehrere xarray unter einer neuen ...,Woman (cisgender),60,claude-sonnet-4-20250514,3,0,0,...,9.666667,4.041452,6.333333,2.886751,4.333333,1.154701,3.666667,2.309401,10.000000,0.000000
163,86,1664,how to merge multiple xarray under a new categ...,wie kann man mehrere xarray unter einer neuen ...,Woman (cisgender),60,deepseek-chat,3,0,0,...,10.000000,0.000000,4.000000,0.000000,4.000000,0.000000,4.000000,0.000000,10.000000,0.000000
164,86,1664,how to merge multiple xarray under a new categ...,wie kann man mehrere xarray unter einer neuen ...,Woman (cisgender),60,gpt-4.1-2025-04-14,3,0,0,...,7.000000,1.000000,4.000000,1.000000,4.000000,1.000000,1.333333,0.577350,10.000000,0.000000
165,86,1664,how to merge multiple xarray under a new categ...,wie kann man mehrere xarray unter einer neuen ...,Woman (cisgender),60,gpt-5-chat-latest,3,0,0,...,11.000000,4.358899,6.000000,2.645751,7.000000,4.358899,2.333333,0.577350,10.000000,0.000000


In [14]:
# Prompt-length features for the full message and for its manually split
# "conversational" part. Both columns are cast to str first, so a missing
# value is measured by its string representation rather than raising.
_message_text = agg_df["message_text"].astype(str)
_conversational = agg_df["conversational"].astype(str)


def _word_count(text):
    """Return the number of whitespace-separated tokens in *text*."""
    return len(text.split())


# Character count
agg_df["length_message_text_chars"] = _message_text.map(len)
agg_df["length_conversational_chars"] = _conversational.map(len)

# Word count
agg_df["length_message_text_words"] = _message_text.map(_word_count)
agg_df["length_conversational_words"] = _conversational.map(_word_count)


In [20]:
import pandas as pd
from scipy.stats import spearmanr, pearsonr

# Model identifiers of interest. The loop below does not read this list; it
# iterates over whatever values occur in agg_df["model_version"].
models = [
    "chatgpt-4o-latest",
    "o3-2025-04-16",
    "gpt-5-chat-latest",
    "gpt-4.1-2025-04-14",
    "deepseek-chat",
    "claude-3-7-sonnet-20250219",
    "claude-sonnet-4-20250514",
]

# Aggregated code-quality metrics (one side of each correlation).
code_quality_cols = [
    "mean_radon_complexity",
    "mean_radon_maintainability_index",
    "mean_radon_loc",
    "mean_radon_sloc",
    "mean_radon_lloc",
    "mean_radon_comments",
    "mean_pylint_score",
]

# Prompt-level markers (the other side of each correlation).
prompt_marker_cols = [
    "length_message_text_words",
    "length_conversational_words",
    "inv_inf",
    "informational",
    "involved",
    "maj_objectives_score",
]


def _pearson_row(frame, quality_col, marker_col, model_name):
    """Correlate one prompt marker with one code-quality metric.

    Only rows where both values are present are used. With fewer than three
    such rows the correlation and p-value are reported as NaN.

    Returns a dict that forms one row of the per-model result table.
    """
    marker = frame[marker_col]
    quality = frame[quality_col]
    both_present = marker.notna() & quality.notna()
    n_pairs = both_present.sum()

    if n_pairs < 3:
        corr = pval = float("nan")
    else:
        corr, pval = pearsonr(marker[both_present], quality[both_present])

    return {
        "code_quality_metric": quality_col,
        "prompt_marker": marker_col,
        "correlation": corr,
        "p_value": pval,
        "model": model_name,
        "n": n_pairs,
    }


# Maps model_version -> DataFrame with one row per (metric, marker) pair.
model_corrs = {}

for model, df_sub in agg_df.groupby("model_version"):
    model_corrs[model] = pd.DataFrame(
        [
            _pearson_row(df_sub, cq, pm, model)
            for cq in code_quality_cols
            for pm in prompt_marker_cols
        ]
    )

# Example usage: show the correlation table for the 'deepseek-chat' model.
model_corrs["deepseek-chat"]


Unnamed: 0,code_quality_metric,prompt_marker,correlation,p_value,model,n
0,mean_radon_complexity,length_message_text_words,0.061277,0.776076,deepseek-chat,24
1,mean_radon_complexity,length_conversational_words,0.263141,0.21411,deepseek-chat,24
2,mean_radon_complexity,inv_inf,-0.165955,0.438335,deepseek-chat,24
3,mean_radon_complexity,informational,0.156599,0.464932,deepseek-chat,24
4,mean_radon_complexity,involved,-0.058427,0.786251,deepseek-chat,24
5,mean_radon_complexity,maj_objectives_score,0.178952,0.402781,deepseek-chat,24
6,mean_radon_maintainability_index,length_message_text_words,-0.122521,0.568446,deepseek-chat,24
7,mean_radon_maintainability_index,length_conversational_words,-0.625821,0.001072,deepseek-chat,24
8,mean_radon_maintainability_index,inv_inf,0.278464,0.187636,deepseek-chat,24
9,mean_radon_maintainability_index,informational,-0.202963,0.341513,deepseek-chat,24
