In [34]:
import re

import chardet
import pandas as pd
from langdetect import DetectorFactory, LangDetectException, detect

# Function to detect encoding of a file
def detect_encoding(file_path):
    """Guess the text encoding of the file at ``file_path``.

    The whole file is read as bytes and handed to ``chardet.detect``.

    Returns:
        The encoding name reported by chardet, or ``'ISO-8859-1'`` when
        chardet reports no encoding (``None`` or an empty string).
    """
    with open(file_path, 'rb') as handle:
        raw_bytes = handle.read()

    guessed = chardet.detect(raw_bytes)['encoding']
    if guessed:
        return guessed
    # chardet could not decide; fall back to a fixed default encoding name
    return 'ISO-8859-1'

# Function to clean text: trims whitespace and invisible characters from both
# ends and maps missing-value markers / placeholder dashes to the empty string
_INVISIBLE_EDGE_CHARS = '\u200b\u200c\u200d\u2060\ufeff'  # zero-width space / non-joiner / joiner, word joiner, BOM
_EMPTY_PLACEHOLDERS = frozenset({'--', '-', '...', 'N/A'})


def clean_text(text):
    """Normalise a raw cell value to a trimmed string, or ``''`` if it is effectively empty.

    Args:
        text: Any value; it is converted with ``str()`` before cleaning.
            ``None`` is treated as missing.

    Returns:
        ``''`` when the value is ``None``, spells ``nan`` in any letter case
        (which is how a pandas NaN looks once stringified), or is one of the
        placeholders ``--``, ``-``, ``...``, ``N/A``. Otherwise the text with
        leading/trailing whitespace and zero-width characters removed.
        Characters inside the text are never altered.
    """
    if text is None:
        return ''

    # Convert the input to a string so numbers and NaN floats can be handled uniformly
    text = str(text)

    # Whitespace and zero-width characters can be interleaved at the edges
    # (e.g. " \ufeff \u200b "), so keep trimming until nothing changes.
    previous = None
    while text != previous:
        previous = text
        text = text.strip().strip(_INVISIBLE_EDGE_CHARS)

    # pandas represents null as NaN, which str() turns into the non-empty
    # string 'nan'; check after trimming so ' NaN ' is caught as well.
    if text.lower() == 'nan':
        return ''

    if text in _EMPTY_PLACEHOLDERS:
        return ''
    return text

# Enhanced function to check if a field is non-empty after cleaning
def is_non_empty(field):
    """Return 1 if ``field`` still has content after ``clean_text``, otherwise 0."""
    if clean_text(field) == '':
        return 0
    return 1
    


In [35]:
# Sniff the encoding of every input CSV before parsing any of them
user_ids_lookup_encoding, individual_user_encoding, individual_user_raw_encoding = [
    detect_encoding(csv_name)
    for csv_name in ('user_ids_lookup.csv', 'individual_user.csv', 'individual_user_raw.csv')
]

# Parse each CSV with its sniffed encoding. Only the two individual_user files
# are read with encoding_errors='ignore'; the lookup file is read without it.
user_ids_lookup = pd.read_csv(
    'user_ids_lookup.csv',
    encoding=user_ids_lookup_encoding,
)
individual_user = pd.read_csv(
    'individual_user.csv',
    encoding=individual_user_encoding,
    encoding_errors='ignore',
)
individual_user_raw = pd.read_csv(
    'individual_user_raw.csv',
    encoding=individual_user_raw_encoding,
    encoding_errors='ignore',
)

In [36]:
# Add every flag column to the lookup table, all starting at 0
for flag_column in (
    'in_user',
    'in_user_raw',
    'location_is_US',
    'country_is_US',
    'raw_title_non_empty',
    'raw_summary_non_empty',
    'english_raw',
):
    user_ids_lookup[flag_column] = 0

In [37]:
# Set of User IDs in individual_user and individual_user_raw for fast lookup
user_ids_in_user = set(individual_user['user_id'])
user_ids_in_user_raw = set(individual_user_raw['user_id'])

# langdetect is non-deterministic unless its RNG is seeded; pin it so that
# re-running the notebook yields the same `english_raw` flags.
DetectorFactory.seed = 0

# US markers must stand alone as words. A plain substring test on 'usa' or
# 'america' also fires on "Jerusalem", "Busan", "Lausanne", etc.
# Applied to lower-cased text, hence the lowercase-only character classes.
# NOTE(review): 'america' still matches "South America" / "Latin America";
# confirm whether continent names should count as US.
US_LOCATION_PATTERN = re.compile(
    r'(?<![a-z])(?:united states|usa|u\.s\.a\.?|u\.s\.|america)(?![a-z])'
)


def _first_row_per_user(frame):
    """Index ``frame`` by ``user_id``, keeping only the first row for each id."""
    return frame.drop_duplicates(subset='user_id', keep='first').set_index('user_id')


def _detect_english(text):
    """Return 1 if ``text`` is non-empty and langdetect labels it 'en', else 0."""
    if not text:
        return 0
    try:
        return 1 if detect(text) == 'en' else 0
    except LangDetectException:
        # langdetect could not classify the text; count it as not English
        return 0


# Index both tables once so each lookup below is a hash probe instead of a
# full boolean scan of the table for every row of user_ids_lookup.
user_rows_by_id = _first_row_per_user(individual_user)
raw_rows_by_id = _first_row_per_user(individual_user_raw)

flag_columns = [
    'in_user',
    'in_user_raw',
    'location_is_US',
    'country_is_US',
    'raw_title_non_empty',
    'raw_summary_non_empty',
    'english_raw',
]
flag_values = {column: [] for column in flag_columns}

# Processing each user_id in user_ids_lookup. Iterating the column directly
# (rather than iterrows) keeps each user_id in its own dtype.
for user_id in user_ids_lookup['user_id']:
    # Every flag defaults to 0 and is recomputed from scratch, so re-running
    # this cell never leaves a stale 1 behind.
    row_flags = dict.fromkeys(flag_columns, 0)

    # 1. Check if user_id is in individual_user.csv
    row_flags['in_user'] = 1 if user_id in user_ids_in_user else 0

    # 2. Check if user_id is in individual_user_raw.csv
    row_flags['in_user_raw'] = 1 if user_id in user_ids_in_user_raw else 0

    # If the user is found in individual_user, extract location and country data
    if row_flags['in_user']:
        user_data = user_rows_by_id.loc[user_id]

        # 3. Check if `user_location` indicates US
        location = str(user_data['user_location']).lower()
        row_flags['location_is_US'] = 1 if US_LOCATION_PATTERN.search(location) else 0

        # 4. Check if `user_country` is exactly "United States"
        row_flags['country_is_US'] = 1 if user_data['user_country'] == 'United States' else 0

    # If the user is found in individual_user_raw, check profile title and summary
    if row_flags['in_user_raw']:
        raw_data = raw_rows_by_id.loc[user_id]

        # 5. Check if `profile_title` is non-empty after cleaning
        row_flags['raw_title_non_empty'] = is_non_empty(raw_data['profile_title'])

        # 6. Check if `profile_summary` is non-empty after cleaning
        profile_summary = clean_text(raw_data['profile_summary'])
        row_flags['raw_summary_non_empty'] = 1 if profile_summary != '' else 0

        # 7. Check if `profile_summary` is in English
        row_flags['english_raw'] = _detect_english(profile_summary)

    for column in flag_columns:
        flag_values[column].append(row_flags[column])

# Write each flag column in one assignment instead of cell-by-cell `.at` updates
for column in flag_columns:
    user_ids_lookup[column] = flag_values[column]
    


In [38]:
# Write the flagged lookup table to disk, leaving the DataFrame index out of the file
user_ids_lookup.to_csv(path_or_buf='processed_user_ids_lookup.csv', index=False)

# Confirm completion and name the output file
print("Processing complete. Results saved to 'processed_user_ids_lookup.csv'.")

Processing complete. Results saved to 'processed_user_ids_lookup.csv'.
