In [5]:
import pandas as pd
import numpy as np
import re

def smart_id_matching(filter_ids, data_id_series):
    """Smart ID matching function that tries multiple strategies"""
    
    # Clean filter IDs
    filter_ids_clean = [x for x in filter_ids if pd.notna(x)]
    print(f'\nSmart matching analysis - Filter IDs count: {len(filter_ids_clean)}')
    
    # Define matching strategies
    strategies = [
        ('Direct Match', lambda x: x.isin(filter_ids_clean)),
        ('String Match', lambda x: x.astype(str).str.strip().isin([str(num).strip() for num in filter_ids_clean])),
        ('Numeric Only Match', lambda x: x.astype(str).str.replace(r'[^0-9]', '', regex=True).isin([re.sub(r'[^0-9]', '', str(num)) for num in filter_ids_clean])),
        ('Lowercase String Match', lambda x: x.astype(str).str.strip().str.lower().isin([str(num).strip().lower() for num in filter_ids_clean]))
    ]
    
    best_strategy = None
    best_result = 0
    match_indices = None
    
    for strategy_name, match_func in strategies:
        try:
            result = data_id_series[match_func(data_id_series)]
            match_count = len(result)
            print(f'  {strategy_name}: {match_count} records')
            
            if match_count > best_result:
                best_result = match_count
                best_strategy = strategy_name
                match_indices = result.index
        except Exception as e:
            print(f'  {strategy_name}: Error - {str(e)}')
    
    if best_strategy:
        print(f'  Selected strategy: {best_strategy} (matched {best_result} records)')
        return match_indices
    else:
        print('  No suitable matching strategy found')
        return pd.Index([])

# Main program
if __name__ == '__main__':
    # Configure file paths
    filter_file_path = 'C:/Users/Administrator/Downloads/骑手筛选/filter.xlsx'
    file_1_1_path = 'C:/Users/Administrator/Downloads/骑手筛选/1-1.csv'
    file_1_2_path = 'C:/Users/Administrator/Downloads/骑手筛选/1-2.csv'
    output_file_path = 'C:/Users/Administrator/Downloads/骑手筛选/筛选结果.xlsx'
    
    print('='*60)
    print('Filtering Tool')
    print('='*60)
    
    try:
        # Read filter file
        print('\n【1. Reading Filter File】')
        filter_df = pd.read_excel(filter_file_path)
        print(f'Filter file shape: {filter_df.shape}')
        print(f'Filter file columns: {list(filter_df.columns)}')
        
        # Extract filter IDs
        filter_ids_1_1 = filter_df['1-1编号'].dropna().unique() if '1-1编号' in filter_df.columns else []
        filter_ids_1_2 = filter_df['1-2编号'].dropna().unique() if '1-2编号' in filter_df.columns else []
        
        print(f'\nFilter IDs Statistics:')
        print(f'  1-1 IDs: {len(filter_ids_1_1)}')
        print(f'  1-2 IDs: {len(filter_ids_1_2)}')
        
        # Process 1-1.csv
        print('\n' + '='*40)
        print('【2. Processing 1-1.csv】')
        df1_1 = pd.read_csv(file_1_1_path)
        print(f'File shape: {df1_1.shape}')
        
        if '编号' in df1_1.columns and len(filter_ids_1_1) > 0:
            print('\nPerforming smart ID matching...')
            match_indices_1_1 = smart_id_matching(filter_ids_1_1, df1_1['编号'])
            df1_1_filtered = df1_1.loc[match_indices_1_1].copy()
            print(f'\n1-1.csv Final Filtered Results: {len(df1_1_filtered)} records')
        else:
            print('\nWarning: 1-1.csv has no "编号" column or no filter IDs')
            df1_1_filtered = pd.DataFrame()
        
        # Process 1-2.csv
        print('\n' + '='*40)
        print('【3. Processing 1-2.csv】')
        df1_2 = pd.read_csv(file_1_2_path)
        print(f'File shape: {df1_2.shape}')
        
        if '编号' in df1_2.columns and len(filter_ids_1_2) > 0:
            print('\nPerforming smart ID matching...')
            match_indices_1_2 = smart_id_matching(filter_ids_1_2, df1_2['编号'])
            df1_2_filtered = df1_2.loc[match_indices_1_2].copy()
            print(f'\n1-2.csv Final Filtered Results: {len(df1_2_filtered)} records')
        else:
            print('\nWarning: 1-2.csv has no "编号" column or no filter IDs')
            df1_2_filtered = pd.DataFrame()
        
        # Save results
        print('\n' + '='*40)
        print('【4. Saving Results】')
        with pd.ExcelWriter(output_file_path, engine='openpyxl') as writer:
            # Save 1-1 results
            if len(df1_1_filtered) > 0:
                df1_1_filtered.to_excel(writer, sheet_name='1-1', index=False)
                print(f'✓ Saved 1-1 sheet ({len(df1_1_filtered)} records)')
            else:
                # Create empty reference table
                empty_table_1_1 = pd.DataFrame({
                    'Filtered_IDs': filter_ids_1_1[:10] if len(filter_ids_1_1) > 0 else [],
                    'Status': ['No match found' for _ in range(min(10, len(filter_ids_1_1)))]
                })
                empty_table_1_1.to_excel(writer, sheet_name='1-1', index=False)
                print(f'⚠ 1-1 sheet is empty, created reference table')
            
            # Save 1-2 results
            if len(df1_2_filtered) > 0:
                df1_2_filtered.to_excel(writer, sheet_name='1-2', index=False)
                print(f'✓ Saved 1-2 sheet ({len(df1_2_filtered)} records)')
            else:
                empty_table_1_2 = pd.DataFrame({
                    'Filtered_IDs': filter_ids_1_2[:10] if len(filter_ids_1_2) > 0 else [],
                    'Status': ['No match found' for _ in range(min(10, len(filter_ids_1_2)))]
                })
                empty_table_1_2.to_excel(writer, sheet_name='1-2', index=False)
                print(f'⚠ 1-2 sheet is empty, created reference table')
        
        # Generate report
        print('\n' + '='*60)
        print('【Filtering Report】')
        print('='*60)
        print(f'Filter File: {filter_file_path}')
        print(f'1-1.csv File: {file_1_1_path}')
        print(f'1-2.csv File: {file_1_2_path}')
        print(f'Output File: {output_file_path}')
        print('-' * 60)
        print(f'1-1.csv Results: {len(df1_1_filtered)} / {len(filter_ids_1_1)} records')
        print(f'1-2.csv Results: {len(df1_2_filtered)} / {len(filter_ids_1_2)} records')
        print('-' * 60)
        
        # Calculate match rates
        if len(filter_ids_1_1) > 0:
            match_rate_1_1 = (len(df1_1_filtered) / len(filter_ids_1_1)) * 100
            print(f'1-1.csv Match Rate: {match_rate_1_1:.1f}%')
        if len(filter_ids_1_2) > 0:
            match_rate_1_2 = (len(df1_2_filtered) / len(filter_ids_1_2)) * 100
            print(f'1-2.csv Match Rate: {match_rate_1_2:.1f}%')
        
        print('\n✓ Operation completed successfully!')
        
    except Exception as e:
        print(f'\n✗ Program execution error: {str(e)}')
        import traceback
        traceback.print_exc()

Smart Data Filtering Tool (Final Version)

【1. Reading Filter File】
Filter file shape: (95, 2)
Filter file columns: ['1-1编号', '1-2编号']

Filter IDs Statistics:
  1-1 IDs: 31
  1-2 IDs: 95

【2. Processing 1-1.csv】
File shape: (129, 72)

Performing smart ID matching...

Smart matching analysis - Filter IDs count: 31
  Direct Match: 31 records
  String Match: 0 records
  Numeric Only Match: 1 records
  Lowercase String Match: 0 records
  Selected strategy: Direct Match (matched 31 records)

1-1.csv Final Filtered Results: 31 records

【3. Processing 1-2.csv】
File shape: (263, 74)

Performing smart ID matching...

Smart matching analysis - Filter IDs count: 95
  Direct Match: 95 records
  String Match: 95 records
  Numeric Only Match: 95 records
  Lowercase String Match: 95 records
  Selected strategy: Direct Match (matched 95 records)

1-2.csv Final Filtered Results: 95 records

【4. Saving Results】
✓ Saved 1-1 sheet (31 records)
✓ Saved 1-2 sheet (95 records)

【Filtering Report】
Filter File