In [5]:
import pandas as pd
import numpy as np
from scipy import stats
import os
import logging
import time

def compute_confidence_interval_mean(mean, std, n, confidence=0.95):
    """Return the two-sided Student-t confidence interval of a sample mean.

    Parameters
    ----------
    mean : float
        Sample mean; used as the centre (``loc``) of the interval.
    std : float
        Sample standard deviation.
    n : int
        Number of observations behind ``mean`` and ``std``.
    confidence : float, default 0.95
        Confidence level handed to ``scipy.stats.t.interval``.

    Returns
    -------
    tuple
        ``(lower, upper)``. ``(nan, nan)`` is returned when ``n < 2``, when
        ``std`` is missing, or when ``std`` equals zero.
    """
    interval_undefined = n < 2 or pd.isna(std) or std == 0
    if interval_undefined:
        return np.nan, np.nan

    # Standard error of the mean is the scale of the t distribution.
    standard_error = std / np.sqrt(n)
    lower, upper = stats.t.interval(
        confidence, df=n - 1, loc=mean, scale=standard_error
    )
    return lower, upper

def analyze_and_save_group(df, group_by_cols, output_filename):
    """Summarise ``ai_word_freq`` per group and write the result to a CSV file.

    For every combination of ``group_by_cols`` the number of observations,
    the mean and the 95 % confidence interval of the mean (via
    ``compute_confidence_interval_mean``) are computed. The standard
    deviation is only used to build the interval and is not written out.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``ai_word_freq`` and every column in ``group_by_cols``.
        The flag columns ``has_ai_policy`` / ``is_oa`` are re-typed in place
        when present, so pass a copy if the caller's frame must stay intact.
    group_by_cols : list of str
        Column names to group by.
    output_filename : str or path-like
        Destination CSV (written with ``utf-8-sig`` encoding, no index).
    """
    for col in ('has_ai_policy', 'is_oa'):
        if col not in df.columns:
            continue
        if df[col].isna().any():
            # A plain ``bool`` cast raises on nullable-boolean columns that
            # hold <NA> and turns NaN into True on object columns. Keep the
            # gaps as <NA> instead; groupby then leaves those rows out of any
            # grouping that uses the column as a key.
            df[col] = df[col].astype('boolean')
        else:
            df[col] = df[col].astype(bool)

    agg_results = (
        df.groupby(group_by_cols)['ai_word_freq']
        .agg(['count', 'mean', 'std'])
        .reset_index()
    )

    # Build the interval bounds column-wise. Unlike a row-wise
    # ``apply(..., result_type='expand')`` this also works when there are
    # no groups at all.
    bounds = [
        compute_confidence_interval_mean(group_mean, group_std, group_count)
        for group_mean, group_std, group_count in zip(
            agg_results['mean'], agg_results['std'], agg_results['count']
        )
    ]
    ci_results = pd.DataFrame(
        bounds,
        columns=['ci_lower', 'ci_upper'],
        index=agg_results.index,
        dtype=float,
    )
    agg_results[['ci_lower', 'ci_upper']] = ci_results

    agg_results = agg_results.drop(columns=['std'])

    agg_results.to_csv(output_filename, index=False, encoding='utf-8-sig')

def main(input_file=None, output_dir=None):
    """Produce the half-year ``ai_word_freq`` summaries and save them as CSV.

    The input CSV is read, rows without a ``date`` are discarded, ``year``
    and ``half_year`` (``'H1'`` for months 1-6, ``'H2'`` otherwise) are
    derived, and one summary file per scenario is written through
    ``analyze_and_save_group``.

    Parameters
    ----------
    input_file : str or path-like, optional
        CSV to analyse. Defaults to the location originally hard-coded here.
    output_dir : str or path-like, optional
        Directory receiving the result files; created when missing.
        Defaults to the location originally hard-coded here.
    """
    logger = logging.getLogger(__name__)
    start_time = time.time()

    if input_file is None:
        input_file = r'C:\Users\ningji\Desktop\ai_policy\ai_policy\data\full_text_keyword_count.csv'
    if output_dir is None:
        output_dir = r'C:\Users\ningji\Desktop\ai_policy\ai_policy\results\full_text'
    os.makedirs(output_dir, exist_ok=True)

    dtype_spec = {
        'ai_word_freq': 'int32',
        'has_ai_policy': 'boolean',
        'is_oa': 'boolean'
    }
    df = pd.read_csv(input_file, dtype=dtype_spec, parse_dates=['date'])

    # Rows without a date cannot be assigned to a half-year.
    df = df.dropna(subset=['date'])
    df['year'] = df['date'].dt.year
    df['half_year'] = np.where(df['date'].dt.month <= 6, 'H1', 'H2')

    scenarios = [
        # 1. Overall trend by half-year
        {'group_cols': ['year', 'half_year'], 'filename': 'by_half_year.csv'},
        # 2. Trend by AI policy status
        {'group_cols': ['year', 'half_year', 'has_ai_policy'], 'filename': 'by_half_year_policy.csv'},
        # 3. Trend by OA status
        {'group_cols': ['year', 'half_year', 'is_oa'], 'filename': 'by_half_year_oa.csv'},
    ]

    for scenario in scenarios:
        output_path = os.path.join(output_dir, scenario['filename'])
        # The helper re-types columns in place, so every scenario gets its own copy.
        analyze_and_save_group(df.copy(), scenario['group_cols'], output_path)

    total_time = time.time() - start_time
    # Silent unless the caller has configured logging.
    logger.info("Half-year analysis finished in %.2f s", total_time)

if __name__ == "__main__":
    main()


In [34]:
import pandas as pd

# Paper-level table (paper_id, date, has_ai_policy, journal_name are used below).
df = pd.read_csv(
    filepath_or_buffer=r"C:\Users\ningji\Desktop\ai_policy\ai_policy\data\full_text_keyword_count.csv"
)

# Disclosure table (doc_id, location, ai_tool, usage_purpose are used below).
data = pd.read_csv(
    filepath_or_buffer=r"C:\Users\ningji\Desktop\ai_policy\ai_policy\results\full_text\disclosure_ai.csv"
)

In [35]:
df.head(2)  # not the last expression of the cell, so this preview is not rendered
# Keep only the paper-level columns needed for the merge.
df = df.loc[:, ['paper_id', 'date', 'has_ai_policy', 'journal_name']]

In [36]:
# Keep only the disclosure columns needed for the merge.
data = data.loc[:, ['doc_id', 'location', 'ai_tool', 'usage_purpose']]

In [None]:
# ID conversion helper
def normalize_openalex_id(paper_id):
    """Rewrite an OpenAlex URL into its underscore-separated spelling.

    Every ``@`` is removed, ``://`` becomes ``___`` and each remaining ``/``
    becomes ``_``; for example ``@https://openalex.org/W4225400651`` turns
    into ``https___openalex.org_W4225400651``. Missing values are returned
    unchanged.
    """
    if pd.isna(paper_id):
        return paper_id

    normalized = paper_id
    # Order matters: '://' has to be handled before the single '/'.
    for old, new in (('@', ''), ('://', '___'), ('/', '_')):
        normalized = normalized.replace(old, new)
    return normalized

# Add the join key: paper_id rewritten into the doc_id spelling
df['paper_id_normalized'] = df['paper_id'].apply(normalize_openalex_id)

# Right join: every disclosure row is kept, paper columns are attached where the key matches
merged_df = df.merge(
    data,
    how='right',
    left_on='paper_id_normalized',
    right_on='doc_id',
)

# The join key is only a helper; remove it from the result
merged_df = merged_df.drop(columns=['paper_id_normalized'])

In [38]:
# Fix the column order of the merged table, then preview it.
merged_df = merged_df.loc[:, [
    'paper_id',
    'date',
    'has_ai_policy',
    'location',
    'ai_tool',
    'journal_name',
    'usage_purpose',
]]
merged_df.head(2)

Unnamed: 0,paper_id,date,has_ai_policy,location,ai_tool,journal_name,usage_purpose
0,https://openalex.org/W4400975186,2024-07-25,True,Declaration of Interests,ChatGPT,PLoS ONE,evaluate the capabilities of AI systems
1,https://openalex.org/W4404702677,2024-11-25,False,Acknowledgements,Claude,Psychological Medicine,English language editing assistance


In [32]:
# Persist the filtered disclosure table.
# NOTE(review): `filtered_df` is assigned in a cell further down in this notebook,
# so this cell fails on a fresh kernel run top-to-bottom — confirm intended cell order.
# `index=False` is not passed, so the row index is written as an extra first column.
filtered_df.to_csv(r'C:\Users\ningji\Desktop\ai_policy\ai_policy\results\full_text\disclosure_df.csv')

In [30]:
# Keep the analysis columns; the helper columns journal_name_std,
# usage_purpose_items, usage_purpose_categories and journal_usage_items
# are left out.
filtered_df = filtered_df.loc[:, [
    'paper_id',
    'date',
    'has_ai_policy',
    'location',
    'ai_tool',
    'journal_name',
    'usage_purpose',
    'journal_location',
    'journal_category',
    'journal_usage',
    'journal_usage_categories',
    'usage_purpose_categories_filtered',
]]

In [29]:
# Inspect which columns `filtered_df` currently carries.
filtered_df.columns

Index(['paper_id', 'date', 'has_ai_policy', 'location', 'ai_tool',
       'journal_name', 'usage_purpose', 'journal_location', 'journal_category',
       'journal_usage', 'journal_name_std', 'usage_purpose_items',
       'usage_purpose_categories', 'journal_usage_items',
       'journal_usage_categories', 'usage_purpose_categories_filtered'],
      dtype='object')

In [None]:
import re
import unicodedata

def standardize_journal_name(name):
    """Return a normalized key for matching journal names across sources.

    Steps, in order: Unicode NFKD normalization with combining marks
    removed (so accented letters collapse to their base letter), lower
    casing, ``&`` spelled out as ``and``, every remaining non-word character
    replaced by a space, whitespace collapsed, and a fixed list of plain
    substring abbreviations applied (e.g. ``journal of`` -> ``j``,
    ``medicine`` -> ``med``).

    Parameters
    ----------
    name : object
        Journal name; converted with ``str`` when not missing.

    Returns
    -------
    str
        The standardized name, or ``''`` for a missing value.
    """
    if pd.isna(name):
        return ''

    # Decompose accented characters, then drop the combining marks. If they
    # stayed, the punctuation regex below would turn them into spaces and
    # split the word ("Española" -> "espan ola").
    name = unicodedata.normalize('NFKD', str(name))
    name = ''.join(ch for ch in name if not unicodedata.combining(ch))

    name = name.lower()

    # Spell out '&' before punctuation is stripped; afterwards the
    # character no longer exists and "X & Y" would not match "X and Y".
    name = name.replace('&', ' and ')

    # Replace punctuation and special characters with spaces
    name = re.sub(r'[^\w\s]', ' ', name)

    # Collapse runs of whitespace and trim
    name = re.sub(r'\s+', ' ', name).strip()

    # Abbreviations are plain substring replacements applied in this order
    # (longer phrases such as 'international' come before 'national').
    replacements = {
        'journal of': 'j',
        'international journal': 'int j',
        'proceedings of': 'proc',
        'transactions on': 'trans',
        'communications': 'comm',
        'conference': 'conf',
        'society': 'soc',
        'association': 'assoc',
        'research': 'res',
        'science': 'sci',
        'technology': 'tech',
        'medicine': 'med',
        'medical': 'med',
        'engineering': 'eng',
        'computer': 'comp',
        'international': 'int',
        'american': 'am',
        'european': 'eur',
        'national': 'nat'
    }

    for old, new in replacements.items():
        name = name.replace(old, new)

    # Clean spaces again
    return re.sub(r'\s+', ' ', name).strip()

# Rebuild the lookup tables, keyed by the standardized journal name
# NOTE(review): `journal_info` and `usage_data` are not defined in the visible
# part of this notebook — confirm they are loaded in an earlier cell.
print("Standardizing journal names...")

# Lookups built from the journal_info records (a later record with the same key overwrites)
journal_location_map_std = {}
journal_category_map_std = {}

for journal in journal_info:
    journal_name = journal.get('journal_name')
    if not journal_name:
        continue
    std_name = standardize_journal_name(journal_name)
    journal_location_map_std[std_name] = journal.get('location')
    journal_category_map_std[std_name] = journal.get('policy_category')

# Lookup built from usage_data (the first row seen for a key wins)
journal_usage_map_std = {}
for journal_name, allowed_usage in zip(usage_data['journal_name'], usage_data['Allowed_Usage']):
    if pd.isna(journal_name):
        continue
    std_name = standardize_journal_name(journal_name)
    journal_usage_map_std.setdefault(std_name, allowed_usage)

print("Standardized mapping dictionary sizes:")
for label, lookup in (
    ("Location", journal_location_map_std),
    ("Category", journal_category_map_std),
    ("Usage", journal_usage_map_std),
):
    print(f"{label} mapping: {len(lookup)}")

# Standardize the journal names of merged_df and look each one up
merged_df['journal_name_std'] = merged_df['journal_name'].apply(standardize_journal_name)

for target_col, lookup in (
    ('journal_location', journal_location_map_std),
    ('journal_category', journal_category_map_std),
    ('journal_usage', journal_usage_map_std),
):
    merged_df[target_col] = merged_df['journal_name_std'].map(lookup)

# Report how many rows found a match
print("\nStatistics after standardized matching:")
for target_col in ('journal_location', 'journal_category', 'journal_usage'):
    print(f"{target_col} non-null values: {merged_df[target_col].notna().sum()}")

print("\nMatching rates:")
for target_col in ('journal_location', 'journal_category', 'journal_usage'):
    print(f"{target_col} matching rate: {merged_df[target_col].notna().sum() / len(merged_df) * 100:.1f}%")

# Side-by-side view of original and standardized names
print("\nFirst 5 rows comparison:")
comparison = merged_df[['journal_name', 'journal_name_std', 'journal_location', 'journal_category']].head()
print(comparison)

# List journals that still have no location match
unmatched_location = merged_df[merged_df['journal_location'].isna()]
if len(unmatched_location) > 0:
    print(f"\nJournals still unmatched for location: {len(unmatched_location)}")
    print("First 10 unmatched journals:")
    preview = unmatched_location[['journal_name', 'journal_name_std']].head(10)
    for original_name, standardized_name in zip(preview['journal_name'], preview['journal_name_std']):
        print(f"Original: '{original_name}' -> Standardized: '{standardized_name}'")

正在标准化期刊名称...
标准化后的映射字典大小:
Location映射: 5113
Category映射: 5113
Usage映射: 3555

标准化匹配后的统计：
journal_location 非空值: 113
journal_category 非空值: 113
journal_usage 非空值: 81

匹配率：
journal_location 匹配率: 96.6%
journal_category 匹配率: 96.6%
journal_usage 匹配率: 69.2%

前5行对比：
             journal_name        journal_name_std journal_location  \
0                PLoS ONE                plos one          Methods   
1  Psychological Medicine       psychological med    Not Specified   
2  The Historical Journal  the historical journal              NaN   
3  JMIR Medical Education      jmir med education    Not Specified   
4   BMC Medical Education       bmc med education          Methods   

      journal_category  
0  Disclosure Required  
1        Not Mentioned  
2                  NaN  
3        Not Mentioned  
4  Disclosure Required  

仍未匹配location的期刊数量: 4
前10个未匹配的期刊:
原名: 'The Historical Journal' -> 标准化: 'the historical journal'
原名: 'Journal of the Medical Library Association JMLA' -> 标准化: 'j the med libra

In [24]:
# Display the merged table after journal matching (pandas abbreviates long frames).
merged_df

Unnamed: 0,paper_id,date,has_ai_policy,location,ai_tool,journal_name,usage_purpose,journal_location,journal_category,journal_usage,journal_name_std
0,https://openalex.org/W4400975186,2024-07-25,True,Declaration of Interests,ChatGPT,PLoS ONE,evaluate the capabilities of AI systems,Methods,Disclosure Required,Text content generation,plos one
1,https://openalex.org/W4404702677,2024-11-25,False,Acknowledgements,Claude,Psychological Medicine,English language editing assistance,Not Specified,Not Mentioned,,psychological med
2,https://openalex.org/W4404494880,2024-11-18,False,Acknowledgements,Claude,The Historical Journal,revision process,,,,the historical journal
3,https://openalex.org/W4376866715,2023-05-17,False,Acknowledgements,GPT-4,JMIR Medical Education,case study to discuss opportunities and challe...,Not Specified,Not Mentioned,,jmir med education
4,https://openalex.org/W4408155423,2025-03-04,True,Contributor Section,GPT-4,BMC Medical Education,assistance,Methods,Disclosure Required,AI assisted copy editing; Formatting changes; ...,bmc med education
...,...,...,...,...,...,...,...,...,...,...,...
112,https://openalex.org/W4323050332,2023-03-04,True,Acknowledgements,ChatGPT,Journal of Medical Systems,answering medical questions,Methods,Disclosure Required,General writing assistance,j med systems
113,https://openalex.org/W4398223368,2024-05-22,True,Acknowledgements,"Grammarly, ChatGPT",PLoS ONE,"language improvement, advanced grammar and sty...",Methods,Disclosure Required,Text content generation,plos one
114,https://openalex.org/W4404968551,2024-12-03,True,Acknowledgements,ChatGPT-3.5,International Journal of Cancer,language and readability improvement,Methods,Disclosure Required,General editing; Grammar improvement; Spelling...,int j cancer
115,https://openalex.org/W4376611400,2023-05-15,True,Acknowledgements,ChatGPT,International Journal for Educational Integrity,assessing quality by comparing seminal ideas o...,Methods,Disclosure Required,AI assisted copy editing; Formatting changes; ...,int j for educational integrity


In [None]:
import re
import pandas as pd

# Mapping from a journal_usage term to one of 8 major categories.
# The terms are listed per category and flattened into a {term: category}
# dictionary; insertion order follows the listing below.
comprehensive_merge_mapping = {
    term: category
    for category, terms in {
        # 1. Language & Grammar Support
        "Language & Grammar Support": [
            "Grammar checking",
            "Grammar correction",
            "Grammar improvement",
            "Grammar enhancement",
            "Grammar suggestions",
            "Grammar",
            "Grammar and spelling check",
            "Grammar and style improvement",
            "Spelling and grammar improvement",
            "Spelling checking",
            "Spelling correction",
            "Spelling improvement",
            "Spelling assistance",
            "Spelling",
            "Spell checking",
            "Spelling check",
            "Spelling checks",
            "Language enhancement",
            "Language improvement",
            "Language polishing",
            "Language editing",
            "Language accuracy enhancement",
            "Improving language",
            "English improvement",
            "Linguistic quality enhancement",
            "Language barrier overcoming",
            "Language proofreading",
        ],
        # 2. Writing & Editing Support
        "Writing & Editing Support": [
            "General writing assistance",
            "Writing assistance",
            "Medical writing assistance",
            "General editing",
            "Editing",
            "AI assisted copy editing",
            "Text editing",
            "Content editing",
            "Editing optimization",
            "Editing of the manuscript",
            "Editing of the text",
            "Editing text",
            "General text editing",
            "Light editing",
            "Writing editing",
            "Copyediting",
            "Editing tool usage",
            "Readability improvement",
            "Readability enhancement",
            "Clarity improvement",
            "Quality improvement",
            "Structure improvement",
            "Structure enhancement",
            "Sentence structure improvement",
            "Phrase suggestion",
            "Synonym suggestions",
            "Personalized feedback",
            "Accessibility enhancement",
        ],
        # 3. Style & Formatting
        "Style & Formatting": [
            "Style enhancement",
            "Style improvement",
            "Style and tone suggestions",
            "Tone adjustment",
            "Wording changes",
            "Formatting changes",
            "Formatting",
            "Punctuation correction",
            "Text formatting",
        ],
        # 4. Content Creation & Generation
        "Content Creation & Generation": [
            "Content generation",
            "Content creation",
            "Text generation",
            "Text content generation",
            "Text creation",
            "Abstract writing",
            "Drafting text",
            "Drafting of the manuscript",
            "Drafting of scientific manuscripts",
            "Writing of the manuscript",
            "Writing of a manuscript",
            "Manuscript preparation",
            "Plain-language summaries",
            "Literature review assistance",
            "Reporting standards",
        ],
        # 5. Reference & Citation Support
        "Reference & Citation Support": [
            "Reference checking",
            "Reference generation",
            "Reference management",
            "Reference organizing",
            "Reference clean up",
            "Reference list preparation",
            "Compilation of references",
        ],
        # 6. Translation Services
        "Translation Services": [
            "Translation",
            "Language translation",
            "Translation of own words",
        ],
        # 7. Text Processing & Refinement
        "Text Processing & Refinement": [
            "Text correcting",
            "Text refining",
            "Text revision",
            "Text incorporation",
            "Text snippet inclusion",
            "Paraphrasing",
            "Summarizing",
            "Condensing writing",
            "Polishing writing",
            "Revising",
            "Reviewing",
        ],
        # 8. Error Analysis & Others
        "Error Analysis & Others": [
            "Error in analysis",
        ],
    }.items()
    for term in terms
}


# Exact-match mapping from a usage_purpose statement to its category.
# Only statements describing an author actually using AI are listed.
# Statements are grouped per category and flattened into a
# {statement: category} dictionary in the order shown.
usage_purpose_mapping_filtered = {
    statement: category
    for category, statements in {
        # 1. Language & Grammar Support
        "Language & Grammar Support": [
            "English language editing assistance",
            "grammatical editing of the manuscript",
            "correcting grammatical errors",
            "improve the grammar and style of this paper",
            "polishing English language and grammatical check",
            "improve readability and grammar",
            "text editing (English language fluency, grammar, word choice, sentence structure)",
            "language checking in selected sentences",
            "proofreading to improve readability and language",
            "language and readability improvement",
            "language improvement",
            "checking and correcting English expressions",
            "improve writing",
            "English editing and checking the grammar",
        ],
        # 2. Writing & Editing Support
        "Writing & Editing Support": [
            "proofreading and editing the manuscript",
            "proofreading",
            "text editing",
            "revising the authors' written texts",
            "revision process",
            "improve the readability and conciseness of the manuscript",
            "correction and improvement of scientific medical writing",
            "language editing and rephrasing",
            "language editing",
            "language revisions of intellectual content",
            "refine the written content of this publication",
            "enhance the language and readability of the manuscript",
            "improve the clarity and language of the manuscript",
            "improve the readability and language of some paragraphs in the text",
            "language editing and improving the clarity of the manuscript",
            "improve language",
            "identify improvements in the writing style",
            "editorial purposes in improving the clarity and language of the manuscript",
            "enhance the clarity of select portions of the text",
            "proofreading and copyediting",
            "revising sentences, improving grammar and enriching the vocabulary",
            "improve the clarity, coherence, and overall presentation of the manuscript",
            "improving readability and fitting the length of the article",
            "language improvement, advanced grammar and style checks, refining manuscript's language and ensuring readability",
        ],
        # 3. Style & Formatting
        "Style & Formatting": [
            "formatting",
            "creating the graphical abstract",
            "refining phrasing and brainstorming alternative title suggestions",
            "phrasing of the manuscript",
        ],
        # 4. Content Creation & Generation
        "Content Creation & Generation": [
            "text generation",
            "content generation",
            "poem generation",
            "TOC generation",
            "creating a convincing looking scientific abstract or article",
            "co-authorship",
            "summarized content",
            "proposing titles, structuring papers, crafting abstracts, and summarizing research",
            "creation of this manuscript",
            "writing certain parts of the article",
        ],
        # 5. Reference & Citation Support
        "Reference & Citation Support": [
            "classifying citation statements",
            "grammar, structure, citations, and adherence to disciplinary standards",
        ],
        # 6. Translation Services
        "Translation Services": [
            "English text translation and extensive manuscript proofreading and revision",
        ],
        # 7. Text Processing & Refinement
        "Text Processing & Refinement": [
            "paraphrasing",
            "summarizing",
            "condensing writing",
            "text refining",
            "text revision",
        ],
        # 8. Error Analysis & Others - only real "author using AI" scenarios
        "Error Analysis & Others": [
            "assistance",  # ambiguous but possibly usage
            "assisting in the preparation of Python codes",  # AI used to write code
            "assist CAD design for microfluidic devices",  # AI used for design
            "designing true–false questions",  # AI used to generate questions
            "generating SCTs for comparison with clinical experts",  # AI used to generate content
            "debugging certain codes",  # AI used to debug code
            "custom fine-tuning",  # author using AI
            "writing a 210 multi choice questions-MCQs examination",  # AI used to write questions
            "assigning categories to student responses",  # AI used to classify
            "produce answers for each type of exam",  # AI used to generate answers
            "support the writing of R and Python3 codes and edits throughout the manuscript",  # AI used to write code
            "Generating artificial persona",  # AI used to generate personas
            "Experiments",  # AI used in experiments
        ],
    }.items()
    for statement in statements
}

# usage_purpose statements to discard: they describe research about AI, or AI as the
# research subject, rather than an author using AI.
# categorize_usage_purpose_filtered compares the stripped text against this list with
# an exact, case-sensitive membership test.
excluded_usage_purposes = [
    "evaluate the capabilities of AI systems",  # evaluating AI system capabilities
    "case study to discuss opportunities and challenges in medical education",  # using AI as case study
    "processing and integrating complex data",  # AI functionality description
    "Demonstration video",  # demonstration video
    "capturing nuanced biases in peer review",  # researching AI capabilities
    "Providing a quote",  # AI providing quotes, not author usage
    "data analysis",  # possibly AI in research
    "tagging and retrieval capabilities for efficient and thorough analysis",  # researching AI capabilities
    "Enhancing social role and humaneness in interactions",  # research content
    "creating more natural conversations by logging input-response history",  # research content
    "querying about amblyopia and comparing answers",  # research content
    "analyzing political statements",  # research content
    "Identifying healthcare needs",  # research content
    "response to the Home Blood Pressure Monitoring (HBPM) knowledge checklist",  # research content
    "supporting the oncological workflow by helping confirm the absence or presence of lesions",  # AI tool functionality description
    "Quantifying confidence shifts",  # research content
    "tackle the side effects of PLEs from the learner and learning perspective",  # research content
    "personalised learning",  # research content
    "integrating AI-driven components into curricula",  # research content
    "development of conversational technologies",  # research content
    "answering medical questions",  # research content
    "assessing quality by comparing seminal ideas of assignments",  # research content
    "Unspecified",  # unspecified (the categorizer also rejects this case-insensitively before consulting the list)
]

def categorize_usage_purpose_filtered(text):
    """Map a usage_purpose statement to a category, or ``None`` to drop it.

    Resolution order:

    1. Missing, empty or ``unspecified`` (any case) text -> ``None``.
    2. Text listed in ``excluded_usage_purposes`` -> ``None``.
    3. Text that is a key of ``usage_purpose_mapping_filtered`` -> its value.
    4. Otherwise the first keyword rule whose keyword occurs as a substring
       of the lower-cased text decides; with no hit the result is ``None``.

    Steps 2 and 3 compare the stripped text exactly (case-sensitive).
    """
    if pd.isna(text):
        return None  # nothing to classify

    cleaned = str(text).strip()
    if cleaned == '' or cleaned.lower() == 'unspecified':
        return None

    if cleaned in excluded_usage_purposes:
        return None

    if cleaned in usage_purpose_mapping_filtered:
        return usage_purpose_mapping_filtered[cleaned]

    # Fallback rules, evaluated top to bottom; the first matching rule wins.
    keyword_rules = (
        ("Language & Grammar Support", (
            'grammar', 'spelling', 'language editing', 'english', 'linguistic',
            'grammatical', 'language improvement', 'language check', 'proofreading',
        )),
        ("Writing & Editing Support", (
            'editing', 'readability', 'clarity', 'writing assistance',
            'revision', 'manuscript editing', 'text editing', 'language revision',
        )),
        ("Style & Formatting", (
            'formatting', 'style', 'phrasing', 'tone', 'graphical', 'title',
        )),
        ("Content Creation & Generation", (
            'generation', 'creation', 'writing content', 'content creation', 'abstract writing',
            'manuscript writing', 'text generation', 'co-author',
        )),
        ("Reference & Citation Support", (
            'citation', 'reference', 'bibliography',
        )),
        ("Translation Services", (
            'translation', 'translate',
        )),
        # Other clear AI usage scenarios
        ("Error Analysis & Others", (
            'assist', 'help', 'support', 'aid', 'code', 'programming', 'debug',
        )),
    )

    lowered = cleaned.lower()
    for category, keywords in keyword_rules:
        if any(keyword in lowered for keyword in keywords):
            return category

    return None  # uncertain items are dropped

# Classify every usage_purpose statement and keep only rows that describe real AI usage
print("Starting to filter and standardize usage_purpose field...")

merged_df['usage_purpose_categories_filtered'] = merged_df['usage_purpose'].apply(
    categorize_usage_purpose_filtered
)

# Rows the classifier returned None for are dropped; copy so later edits do not touch merged_df
filtered_df = merged_df.loc[merged_df['usage_purpose_categories_filtered'].notna()].copy()

print(f"\nTotal rows before filtering: {len(merged_df)}")
print(f"Total rows after filtering: {len(filtered_df)}")
print(f"Rows filtered out: {len(merged_df) - len(filtered_df)}")

# Category distribution of the rows that were kept
print("\n=== Filtered usage_purpose classification statistics ===")
purpose_counts_filtered = filtered_df['usage_purpose_categories_filtered'].value_counts()
print(purpose_counts_filtered)

# Statements that were dropped, with their frequencies
excluded_items = merged_df.loc[
    merged_df['usage_purpose_categories_filtered'].isna(), 'usage_purpose'
].value_counts()
if len(excluded_items) > 0:
    print("\n=== Filtered out usage_purpose items ===")
    print(excluded_items)

# Continue processing journal_usage field (keep original logic)
# filtered_df['journal_usage_categories'] = filtered_df['journal_usage_items'].apply(categorize_journal_usage)

# print("\n=== First 5 rows after filtering ===")
# example_cols = ['usage_purpose', 'usage_purpose_categories_filtered', 'journal_usage', 'journal_usage_categories']
# print(filtered_df[example_cols].head())

# Write the kept rows to the working directory
filtered_df.to_csv('filtered_ai_usage_disclosure.csv', index=False, encoding='utf-8-sig')
print("\nFiltered results saved to filtered_ai_usage_disclosure.csv")

def split_and_clean_text(text, separators=[';', ',', ' and ', '&']):
    """Break ``text`` into trimmed, non-empty items.

    The text is split on each separator in turn (every pass operates on the
    already trimmed pieces of the previous pass). Items equal to
    ``unspecified`` (any case) are discarded. A missing or blank ``text``
    yields an empty list.
    """
    if pd.isna(text):
        return []

    whole = str(text).strip()
    if whole == '':
        return []

    pieces = [whole]
    for separator in separators:
        pieces = [
            fragment.strip()
            for piece in pieces
            for fragment in piece.split(separator)
            if fragment.strip()
        ]

    return [piece for piece in pieces if piece and piece.lower() != 'unspecified']

def categorize_usage_purpose(text):
    """Map a usage_purpose statement to one category (never ``None``).

    Missing, empty or ``unspecified`` (any case) text falls into
    ``"Error Analysis & Others"``. Otherwise an exact hit in
    ``usage_purpose_mapping`` decides; failing that, the first keyword rule
    whose keyword occurs as a substring of the lower-cased text decides, and
    ``"Error Analysis & Others"`` is the final fallback.

    NOTE(review): ``usage_purpose_mapping`` is not defined in the visible part
    of this notebook — confirm it exists at module level before this runs.
    """
    fallback = "Error Analysis & Others"

    if pd.isna(text):
        return fallback

    cleaned = str(text).strip()
    if cleaned == '' or cleaned.lower() == 'unspecified':
        return fallback

    if cleaned in usage_purpose_mapping:
        return usage_purpose_mapping[cleaned]

    # Fallback rules, evaluated top to bottom; the first matching rule wins.
    keyword_rules = (
        ("Language & Grammar Support", (
            'grammar', 'spelling', 'language editing', 'english', 'linguistic',
            'grammatical', 'language improvement', 'language check',
        )),
        ("Writing & Editing Support", (
            'proofreading', 'editing', 'readability', 'clarity', 'writing',
            'revision', 'manuscript', 'text editing', 'language revision',
        )),
        ("Style & Formatting", (
            'formatting', 'style', 'phrasing', 'tone', 'graphical', 'title',
        )),
        ("Content Creation & Generation", (
            'generation', 'creation', 'writing', 'content', 'abstract',
            'manuscript', 'text generation', 'co-author',
        )),
        ("Reference & Citation Support", (
            'citation', 'reference', 'bibliography',
        )),
        ("Translation Services", (
            'translation', 'translate',
        )),
    )

    lowered = cleaned.lower()
    for category, keywords in keyword_rules:
        if any(keyword in lowered for keyword in keywords):
            return category

    return fallback

def categorize_journal_usage(usages):
    """Categorize journal_usage items into standardized categories.

    Each item is stripped and looked up verbatim in
    ``comprehensive_merge_mapping``. Items not found there are classified by
    the ordered keyword rules below (first rule with any keyword occurring as
    a substring of the lower-cased item wins); items matching no rule fall
    into "Error Analysis & Others".

    Parameters
    ----------
    usages : iterable of str, or any falsy value
        The individual usage strings for one row.

    Returns
    -------
    list of str
        The distinct categories, in order of first appearance. An empty list
        is returned for falsy input.

    Notes
    -----
    Categories were previously collected in a ``set`` and returned via
    ``list(set)``, whose order depends on string hashing and can differ from
    one interpreter session to the next. A dict is used as an ordered set so
    the result is reproducible across runs.
    """
    if not usages:
        return []

    # Ordered fallback rules: (category, substrings that trigger it)
    keyword_rules = (
        ("Language & Grammar Support", ('grammar', 'spelling', 'language')),
        ("Writing & Editing Support", ('writing', 'editing', 'quality', 'clarity', 'readability')),
        ("Style & Formatting", ('style', 'tone', 'format', 'punctuation')),
        ("Content Creation & Generation", ('content', 'generation', 'creation', 'drafting', 'manuscript')),
        ("Reference & Citation Support", ('reference', 'citation')),
        ("Translation Services", ('translation', 'translate')),
        ("Text Processing & Refinement", ('text', 'revision', 'refining', 'polishing', 'paraphras', 'summar')),
    )

    # dict keys keep insertion order, giving de-duplication with a stable order
    categories = {}
    for usage in usages:
        usage = usage.strip()
        if usage in comprehensive_merge_mapping:
            category = comprehensive_merge_mapping[usage]
        else:
            # Automatic classification logic
            usage_lower = usage.lower()
            category = next(
                (
                    label
                    for label, keywords in keyword_rules
                    if any(word in usage_lower for word in keywords)
                ),
                "Error Analysis & Others",
            )
        categories[category] = None

    return list(categories)

# Apply standardization
# Adds four derived columns to merged_df (split items + categories for both
# usage_purpose and journal_usage), prints summary counts, and optionally
# writes the enriched frame to CSV.
print("Starting to standardize usage_purpose and journal_usage fields...")

# 1. Standardize usage_purpose field
merged_df['usage_purpose_items'] = merged_df['usage_purpose'].apply(
    lambda x: split_and_clean_text(x, [',', ' and ', '&', ';'])
)

# Categories are derived from the raw usage_purpose text, not from the split items
merged_df['usage_purpose_categories'] = merged_df['usage_purpose'].apply(categorize_usage_purpose)

# 2. Standardize journal_usage field
merged_df['journal_usage_items'] = merged_df['journal_usage'].apply(
    lambda x: split_and_clean_text(x, [';']) if pd.notna(x) else []
)

merged_df['journal_usage_categories'] = merged_df['journal_usage_items'].apply(categorize_journal_usage)

# View results
print("\n=== usage_purpose standardization results ===")
purpose_counts = merged_df['usage_purpose_categories'].value_counts()
print(purpose_counts)

print("\n=== journal_usage standardization results ===")
# Flatten the per-row category lists so each (row, category) pair is counted once
all_journal_categories = [
    category
    for categories in merged_df['journal_usage_categories']
    for category in categories
]

journal_usage_counts = pd.Series(all_journal_categories).value_counts()
print(journal_usage_counts)

# View first few rows example
print("\n=== First 5 rows standardization example ===")
example_cols = ['usage_purpose', 'usage_purpose_categories', 'journal_usage', 'journal_usage_categories']
print(merged_df[example_cols].head())

# Save results
# Writing the CSV is opt-in (it was previously commented out while the cell
# still printed "Results saved ..."). The message now reflects what happened.
SAVE_STANDARDIZED_USAGE = False
standardized_usage_path = 'merged_df_with_standardized_usage.csv'
if SAVE_STANDARDIZED_USAGE:
    merged_df.to_csv(standardized_usage_path, index=False, encoding='utf-8-sig')
    print(f"\nResults saved to {standardized_usage_path}")
else:
    print(
        f"\nResults NOT saved (set SAVE_STANDARDIZED_USAGE = True to write {standardized_usage_path})"
    )

开始过滤和标准化usage_purpose字段...

过滤前总行数: 117
过滤后总行数: 77
被过滤掉的行数: 40

=== 过滤后usage_purpose分类统计 ===
usage_purpose_categories_filtered
Writing & Editing Support        28
Language & Grammar Support       15
Error Analysis & Others          14
Content Creation & Generation    13
Style & Formatting                4
Reference & Citation Support      2
Translation Services              1
Name: count, dtype: int64

=== 被过滤掉的usage_purpose项目 ===
usage_purpose
Unspecified                                                                                  17
evaluate the capabilities of AI systems                                                       1
case study to discuss opportunities and challenges in medical education                       1
processing and integrating complex data                                                       1
Demonstration video                                                                           1
capturing nuanced biases in peer review                                