In [None]:
#gemma-3-27b-it_NC8002_MS_african_all







import pandas as pd
import os
import re

model_name = 'gemma-3-27b-it'
cultures = ['arabic', 'african', 'turkish', 'indian', 'persian', 'korean', 'chinese', 'japanese', 'english']
base_input_folder = f'/home/maz2h/MoralStory/{model_name}-NC8002-MS'

def to_int(val):
    """Coerce *val* to an ``int``, or return ``None`` when that is impossible.

    Strings are stripped of surrounding whitespace first. The value is routed
    through ``float`` so numeric text such as ``"2.0"`` is accepted; any
    fractional part is truncated toward zero by ``int``.

    Args:
        val: Any value (string, number, ``None``, NaN, ...).

    Returns:
        The integer value, or ``None`` for non-numeric text, unsupported
        types, NaN and infinities.
    """
    if isinstance(val, str):
        val = val.strip()
    try:
        return int(float(val))
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / unsupported type.
        # ValueError: non-numeric text, or int(nan).
        # OverflowError: int(+/-inf).
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return None

def classify_gender(extracted_names):
    """Summarise the genders found in an ``ExtractedNames`` cell.

    The cell is converted to ``str`` and scanned for ``(<name>, Male)`` /
    ``(<name>, Female)`` pairs (case-insensitive).

    Returns:
        ``'unknown'`` when no pair is found, ``'male'`` or ``'female'`` when
        every pair carries that gender, and ``'mixed'`` otherwise.
    """
    pair_pattern = r'\(\s*[^,]+,\s*(Male|Female)\s*\)'
    matches = re.findall(pair_pattern, str(extracted_names), flags=re.IGNORECASE)

    # Only two labels are possible, so the set of distinct labels decides it.
    distinct = {label.lower() for label in matches}
    if not distinct:
        return 'unknown'
    if len(distinct) > 1:
        return 'mixed'
    return distinct.pop()

# Step 1: Load the English file and tag every row with a gender group
file_english = os.path.join(base_input_folder, f"{model_name}_NC8002_MS_english_all.csv")
df_english_full = pd.read_csv(file_english)
df_english_full['gender_group'] = df_english_full['ExtractedNames'].apply(classify_gender)

# Row selection for the reference (English) answers. All rows are kept here;
# swap in one of the filters below to restrict the analysis to one group.
# df_english_female = df_english_full[df_english_full['gender_group'] == 'female']
# df_english_female = df_english_full[df_english_full['gender_group'] == 'male']
df_english_female = df_english_full

# Build an independent two-column frame. Renaming / assigning in place on a
# column slice is what raised SettingWithCopyWarning; rename() without
# inplace plus an explicit copy() makes the ownership unambiguous.
df_english_female = (
    df_english_female[['ID', 'GPT_Answer']]
    .rename(columns={'GPT_Answer': 'GPT_Answer_english'})
    .copy()
)

# Normalise answers so later comparisons against 'yes' ignore case/whitespace.
df_english_female['GPT_Answer_english'] = (
    df_english_female['GPT_Answer_english'].astype(str).str.strip().str.lower()
)

# Step 2: Merge the normalised answers of every culture onto the English rows
merged_df = df_english_female.copy()
for culture in cultures:
    answer_col = f'GPT_Answer_{culture}'

    # 'english' is part of `cultures`, but its answers are already in
    # merged_df from Step 1. Merging them again would only produce duplicated
    # GPT_Answer_english_x / _y columns, so skip anything already present.
    if answer_col in merged_df.columns:
        continue

    file_culture = os.path.join(base_input_folder, f"{model_name}_NC8002_MS_{culture}_all.csv")
    if not os.path.exists(file_culture):
        print(f"[Warning] File not found: {file_culture}")
        continue

    # Only ID and GPT_Answer are needed; rename() returns a new frame, so the
    # assignment below does not write into a slice of the loaded file.
    df_culture = (
        pd.read_csv(file_culture)[['ID', 'GPT_Answer']]
        .rename(columns={'GPT_Answer': answer_col})
    )
    df_culture[answer_col] = df_culture[answer_col].astype(str).str.strip().str.lower()

    # Inner join: keep only IDs that exist in every culture merged so far.
    merged_df = pd.merge(merged_df, df_culture, on='ID', how='inner')

# Step 3: Create the new consistency_flag
# A row is consistent (1) only when every GPT_Answer* column equals 'yes';
# otherwise the flag is 0.
answer_columns = [name for name in merged_df.columns if name.startswith('GPT_Answer')]
all_yes = (merged_df[answer_columns] == 'yes').all(axis=1)
merged_df['consistency_flag'] = all_yes.astype(int)

# Step 4: Write the new consistency_flag back to each culture file
# The ID -> flag table is the same for every culture, so project it once.
new_flags = merged_df[['ID', 'consistency_flag']]

for culture in cultures:
    file_culture = os.path.join(base_input_folder, f"{model_name}_NC8002_MS_{culture}_all.csv")
    if os.path.exists(file_culture):
        df_culture = pd.read_csv(file_culture)

        # Remove any consistency_flag* columns left over from earlier runs
        # before attaching the freshly computed one.
        stale_columns = [name for name in df_culture.columns if name.startswith('consistency_flag')]
        df_culture = df_culture.drop(columns=stale_columns)

        # Left join keeps every row of the culture file; IDs missing from
        # merged_df end up with an empty flag.
        df_culture = df_culture.merge(new_flags, on='ID', how='left')
        df_culture.to_csv(file_culture, index=False)

print("Consistency flags successfully updated for all cultures.")

# Step 5: Generate final comparison report
# One row per culture; the metric columns start empty and are filled in by
# the per-culture loop that follows.
final_df = pd.DataFrame({'Culture': cultures}).assign(
    **{metric: None for metric in ('PM', 'PMR', 'WBR', '#REC')}
)

# Prepare the English reference: integer stance plus whitespace-trimmed answer.
df_english_full = df_english_full.rename(columns={'moral_action_selected': 'stance_english'})
df_english_full['stance_english'] = df_english_full['stance_english'].apply(to_int)
df_english_full['GPT_Answer'] = df_english_full['GPT_Answer'].astype(str).str.strip()

# The English projections do not change inside the loop, so take them once.
english_answers = df_english_full[['ID', 'GPT_Answer']]
english_stances = df_english_full[['ID', 'stance_english']]

for culture in cultures:
    file_culture = os.path.join(base_input_folder, f"{model_name}_NC8002_MS_{culture}_all.csv")
    if not os.path.exists(file_culture):
        print(f"[Warning] File not found: {file_culture}")
        continue

    # consistency_flag was written in Step 4 and is required below, so the
    # file is read as-is (nothing is dropped here).
    df_culture = pd.read_csv(file_culture).rename(columns={'moral_action_selected': 'stance_other'})
    df_culture['stance_other'] = df_culture['stance_other'].apply(to_int)
    df_culture['GPT_Answer'] = df_culture['GPT_Answer'].astype(str).str.strip()

    merged = english_answers.merge(
        df_culture[['ID', 'GPT_Answer', 'consistency_flag']],
        on='ID',
        suffixes=('_eng', '_other'),
    )

    # Keep rows answered 'Yes' on both sides and flagged consistent.
    both_yes = merged['GPT_Answer_eng'].eq('Yes') & merged['GPT_Answer_other'].eq('Yes')
    filtered = merged[both_yes & merged['consistency_flag'].eq(1)]
    if filtered.empty:
        # Nothing comparable: this culture's report row stays empty.
        continue

    # Attach both stances, then keep only rows where each is 1 or 2.
    filtered = filtered.merge(english_stances, on='ID')
    filtered = filtered.merge(df_culture[['ID', 'stance_other']], on='ID')
    filtered = filtered.dropna(subset=['stance_english', 'stance_other'])
    valid_stances = filtered['stance_english'].isin([1, 2]) & filtered['stance_other'].isin([1, 2])
    filtered = filtered[valid_stances]

    total_records = len(filtered)
    mismatches = filtered[filtered['stance_english'] != filtered['stance_other']]
    mismatch_count = len(mismatches)

    # PMR: share of compared records whose stances differ.
    if total_records:
        pmr = mismatch_count / total_records
    else:
        pmr = 0

    # WBR: share of mismatches where English chose 1 and the other culture 2.
    if mismatch_count:
        english1_other2 = (mismatches['stance_english'] == 1) & (mismatches['stance_other'] == 2)
        wbr = int(english1_other2.sum()) / mismatch_count
    else:
        wbr = 0

    culture_row = final_df['Culture'] == culture
    row_values = (
        ('PM', mismatch_count),
        ('PMR', f"{pmr:.2%}"),
        ('WBR', f"{wbr:.2%}"),
        ('#REC', total_records),
    )
    for column, value in row_values:
        final_df.loc[culture_row, column] = value

# save the final report
report_name = f'{model_name}_comparison_report_2_allgender.csv'
output_file = os.path.join(base_input_folder, report_name)
final_df.to_csv(output_file, index=False)
print(f" Report successfully saved to: {output_file}")




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_english_female.rename(columns={'GPT_Answer': 'GPT_Answer_english'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_english_female['GPT_Answer_english'] = df_english_female['GPT_Answer_english'].astype(str).str.strip().str.lower()


Consistency flags successfully updated for all cultures.
 Report successfully saved to: /home/maz2h/MoralStory/gemma-3-27b-it-NC8002-MS/gemma-3-27b-it_comparison_report_2_allgender.csv
