In [None]:
import pandas as pd
import numpy as np

# 1. Prepare Data
# Load the tender database. The workbook name must be the same one used by the
# feature-analysis cell further down (the leading character was missing here).
file_path = "傳真社製首個樓宇維修公開資料庫.xlsx"
df = pd.read_excel(file_path)

# Clean bid amount and calculate log bid
# Keep only digits and the decimal point (drops currency symbols, thousands
# separators and free text); anything that still fails to parse becomes NaN.
df['bid_amount'] = pd.to_numeric(
    df['涉及費用'].astype(str).str.replace(r'[^\d.]', '', regex=True),
    errors='coerce',
)
# log(0) is -inf, which dropna() does NOT remove; a single -inf would turn the
# whole project's mean into -inf and its residuals into inf/NaN. Mask
# non-positive amounts to NaN first so they are dropped with the other gaps.
df['log_bid'] = np.log(df['bid_amount'].where(df['bid_amount'] > 0))
# Rows without a usable bid, company name or project key cannot be analysed.
df = df.dropna(subset=['log_bid', '公司名稱', '大廈/屋苑名稱(入標年份)'])

# 2. Calculate Residuals (De-centering)
# Express every log bid relative to the average log bid of its own project,
# so that project-level cost differences drop out of the comparison.
project_key = '大廈/屋苑名稱(入標年份)'
df['project_mean'] = df.groupby(project_key)['log_bid'].transform('mean')
df['residual'] = df['log_bid'].sub(df['project_mean'])

# 3. Filter Configuration
CORRELATION_THRESHOLD = 0.8  # High risk threshold
MIN_JOINT_PROJECTS = 5       # Minimum joint bids required to calculate correlation

# 4. Build Pivot Matrix (Index=Project, Columns=Company, Values=Residual)
# With the default aggregation, several rows for the same company/project
# pair are averaged; combinations with no bid are left as NaN.
pivot_matrix = pd.pivot_table(
    df,
    values='residual',
    index=project_key,
    columns='公司名稱',
)

# 5. Detection Algorithm
# Flag company pairs whose de-meaned bids move together across the projects
# they both tendered for.
companies = pivot_matrix.columns
n_companies = len(companies)

print(f"Scanning interactions for {n_companies} companies...")

# One vectorised pass instead of a Python loop over every pair: Pearson
# correlation is computed per pair on the rows where both firms have a value,
# and pairs with fewer than MIN_JOINT_PROJECTS shared rows come back as NaN.
corr_values = pivot_matrix.corr(
    method='pearson', min_periods=MIN_JOINT_PROJECTS
).to_numpy(dtype=float)

# joint_counts[a, b] = number of projects where both firm a and firm b bid.
has_bid = pivot_matrix.notna().to_numpy().astype(np.int64)
joint_counts = has_bid.T @ has_bid

# The strict upper triangle visits each unordered pair once, in the same
# (i, then j > i) order a nested loop would.
idx_a, idx_b = np.triu_indices(n_companies, k=1)
pair_corr = corr_values[idx_a, idx_b]
with np.errstate(invalid='ignore'):
    # NaN (too few joint projects, or zero variance) compares False -> skipped.
    flagged = pair_corr > CORRELATION_THRESHOLD

suspicious_pairs = [
    {
        'Firm_A': companies[a],
        'Firm_B': companies[b],
        'Joint_Projects': int(joint_counts[a, b]),
        'Correlation': float(value),
    }
    for a, b, value in zip(idx_a[flagged], idx_b[flagged], pair_corr[flagged])
]

# 6. Output Results
# Turn the flagged pairs into a table (empty when nothing was flagged).
suspicious_df = pd.DataFrame(suspicious_pairs)

if suspicious_df.empty:
    print("No pairs found exceeding the correlation threshold.")
else:
    # Strongest correlations first.
    suspicious_df = suspicious_df.sort_values('Correlation', ascending=False)
    print("\n[High Risk Syndicates Found]")
    # Only the top 100 rows are printed; to_markdown relies on the optional
    # `tabulate` package being installed.
    top_rows = suspicious_df.head(100)
    print(top_rows.to_markdown(index=False))

    # Save to CSV (the full table, not just the printed rows)
    suspicious_df.to_csv('suspicious_syndicates.csv', index=False)
    print("\nSaved to 'suspicious_syndicates.csv'")

In [None]:
import pandas as pd
import numpy as np
import re

# ==========================================
# 1. Data Loading & Cleaning
# ==========================================
file_path = "傳真社製首個樓宇維修公開資料庫.xlsx"
df = pd.read_excel(file_path)

# --- Cleaning A: Win Status ---
# Standardize: 1 = Winner, 0 = Lost
# Values that do not parse as numbers (blanks, text markers) are coerced to 0.
# NOTE(review): assumes winners are marked with the number 1 - confirm against the sheet.
df['is_winner'] = pd.to_numeric(df['中標'], errors='coerce').fillna(0).astype(int)

# --- Cleaning B: Rank ---
# 1 = Lowest price, higher number = more expensive
df['rank_num'] = pd.to_numeric(df['排名(平至貴)'], errors='coerce')

# --- Cleaning C: Building Features ---
# Keyword flags (1/0) taken from the free-text facilities column; missing
# cells become the string 'nan', which matches neither pattern.
facilities = df['其他設施'].astype(str)
df['has_mall'] = facilities.str.contains('商場|mall', case=False, na=False).astype(int)
df['has_club'] = facilities.str.contains('會所|club', case=False, na=False).astype(int)
# Extract unit count
# Remove thousands separators first: otherwise "1,200" would be read as 1 and
# large estates would silently fall below any unit-count threshold.
units_text = df['單位'].astype(str).str.replace(
    r'(?<=\d)[,，](?=\d{3}(?!\d))', '', regex=True
)
# First run of digits in the cell; cells without digits become NaN.
df['units_num'] = units_text.str.extract(r'(\d+)', expand=False).astype(float)

print(f"Data cleaning complete. Total records: {len(df)}")
print("-" * 30)

# ==========================================
# 2. Core Analysis Logic
# ==========================================

# Keep only the rows flagged as winning bids
winning_bids = df.loc[df['is_winner'].eq(1)].copy()

# One row per company: how often it won and what its winning projects look like
profile_spec = {
    'total_wins': ('大廈/屋苑名稱(入標年份)', 'count'),  # Win Count
    'rank_num': ('rank_num', 'mean'),                    # Avg Winning Rank
    'has_mall': ('has_mall', 'mean'),                    # Mall Ratio
    'has_club': ('has_club', 'mean'),                    # Club Ratio
    'units_num': ('units_num', 'mean'),                  # Avg Project Size (Units)
    '公司性質': ('公司性質', 'first'),                    # Company Type
}
company_profile = winning_bids.groupby('公司名稱').agg(**profile_spec)

# Benchmarks across all winning bids, for comparison with per-company figures
global_avg_rank = winning_bids['rank_num'].mean()
global_avg_units = winning_bids['units_num'].mean()
print(f"Global Benchmarks: Avg Rank={global_avg_rank:.2f}, Avg Units={global_avg_units:.0f}")

# ==========================================
# 3. Define Risk Indicators
# ==========================================

# Tunable thresholds, kept together so the screening rules can be adjusted
# without hunting for literals in the expressions below.
MIN_WINS = 2                 # Minimum wins for a company to be screened
HIGH_PRICE_RANK = 3.0        # Avg winning rank above this = winning despite high prices
BIG_ESTATE_UNITS = 1000      # Avg units above this = large estates
CLUB_RATIO_THRESHOLD = 0.3   # Share of wins with a clubhouse above this

# Filter: Companies with at least MIN_WINS wins
suspects = company_profile[company_profile['total_wins'] >= MIN_WINS].copy()

# --- Indicator 1: Price Manipulation ---
# A high average winning rank means the company tends to win without being
# among the cheapest bidders. Missing ranks compare False and are not flagged.
suspects['Risk_HighPriceWin'] = suspects['rank_num'] > HIGH_PRICE_RANK

# --- Indicator 2: Predatory Targeting ---
# Wins concentrated on large estates that frequently have a clubhouse.
suspects['Risk_BigTarget'] = (
    (suspects['units_num'] > BIG_ESTATE_UNITS)
    & (suspects['has_club'] > CLUB_RATIO_THRESHOLD)
)

# Combined High Risk Flag: either indicator is enough
suspects['High_Risk_Flag'] = suspects['Risk_HighPriceWin'] | suspects['Risk_BigTarget']

# ==========================================
# 4. Output Results
# ==========================================

# Flagged companies only, highest average winning rank first
flagged_mask = suspects['High_Risk_Flag']
top_suspects = suspects.loc[flagged_mask].sort_values('rank_num', ascending=False)

columns_to_show = [
    'total_wins',
    'rank_num',
    'units_num',
    'has_club',
    'Risk_HighPriceWin',
    'Risk_BigTarget',
    '公司性質',
]
report = top_suspects[columns_to_show]
print("\n[High Risk Companies (Based on Feature Analysis)]")
print(report)

# Save to CSV (the company name index is written as the first column)
report.to_csv('high_risk_bidders_analysis.csv')
print("\nResults saved to 'high_risk_bidders_analysis.csv'")