In [None]:
!pip install gender-guesser

# Full pipeline: load CSVs -> compute c_user (HHA-style) -> build pairs -> mixed-logit
# Requirements: pandas, numpy, scipy, sklearn, statsmodels
import time, random
import numpy as np
import pandas as pd
from itertools import combinations
from scipy import sparse, optimize, stats
from scipy.stats import qmc
from scipy.special import expit
from sklearn.linear_model import LogisticRegression
import re
import gender_guesser.detector as gd

random.seed(1); np.random.seed(1)

# -------------------------
# 0. Load data (adjust filenames if necessary)
# -------------------------
users = pd.read_csv('Users.csv')
orgs  = pd.read_csv('UserOrganizations.csv')
ach   = pd.read_csv('UserAchievements.csv')
teams = pd.read_csv('Teams.csv')
tm    = pd.read_csv('TeamMemberships.csv')

# normalize id columns
if 'Id' in users.columns and 'UserId' not in users.columns:
    users = users.rename(columns={'Id':'UserId'})
if 'Id' in teams.columns and 'TeamId' not in teams.columns:
    teams = teams.rename(columns={'Id':'TeamId'})

# Keep competition achievements only
if 'AchievementType' in ach.columns:
    ach = ach[ach['AchievementType'] == 'Competitions'].copy()

# Merge users -> organizations -> achievements
users_org = users.merge(orgs, how='left', on='UserId')
users_org_ach = users_org.merge(ach, how='left', on='UserId')

# Merge team memberships -> teams -> user info to create 'merged'
team_member = tm.merge(teams, how='left', on='TeamId')
merged = team_member.merge(users_org_ach, how='left', on='UserId')

# Ensure string IDs
for col in ['UserId','TeamId','CompetitionId']:
    if col in merged.columns:
        merged[col] = merged[col].astype(str)


In [2]:
# -------------------------
# 1. Build users_comp (one row per user-competition) and compute Solo indicator
# -------------------------
# team sizes per (TeamId, CompetitionId)
mask_team = merged['TeamId'].notnull() & (merged['TeamId'] != 'nan')
team_sizes = (merged[mask_team]
              .groupby(['TeamId','CompetitionId'], dropna=False)
              .size()
              .reset_index(name='team_size'))

merged = merged.merge(team_sizes, on=['TeamId','CompetitionId'], how='left')
merged['team_size'] = merged['team_size'].fillna(1).astype(int)  # default 1 for solo/missing team

users_comp = (merged[['CompetitionId','UserId','TeamId','team_size']]
              .drop_duplicates()
              .reset_index(drop=True))
users_comp['Solo'] = ((users_comp['team_size'] == 1) | (users_comp['TeamId'].isna()) | (users_comp['TeamId']=='nan')).astype(int)

# compute n_competitions per user
ncomp = (merged[['UserId','CompetitionId']].drop_duplicates()
         .groupby('UserId')['CompetitionId'].nunique().reset_index().rename(columns={'CompetitionId':'n_competitions'}))
ncomp['UserId'] = ncomp['UserId'].astype(str)
users_comp = users_comp.merge(ncomp, on='UserId', how='left')
users_comp['n_competitions'] = users_comp['n_competitions'].fillna(0).astype(int)

# pull HighestRanking into users_comp if available from users_org_ach
if 'HighestRanking' in users_org_ach.columns:
    tmp = users_org_ach[['UserId','HighestRanking']].drop_duplicates('UserId')
    tmp['UserId'] = tmp['UserId'].astype(str)
    users_comp = users_comp.merge(tmp, on='UserId', how='left')
else:
    users_comp['HighestRanking'] = np.nan

print("users_comp: rows =", len(users_comp), "unique users =", users_comp['UserId'].nunique())



users_comp: rows = 8615864 unique users = 3119546


In [None]:
# --- Extract first name and infer gender using gender-guesser ---


# Load users if not yet loaded
try:
    users
except NameError:
    users = pd.read_csv("Users.csv")

# Extract first name from DisplayName
def extract_first_name(name):
    if pd.isna(name): return ''
    s = str(name).strip()
    s = re.sub(r'\(.*?\)', '', s)
    if ',' in s:  # handle "Last, First"
        parts = [p.strip() for p in s.split(',') if p.strip()]
        if len(parts) > 1: return parts[1].split()[0].capitalize()
    return s.split()[0].capitalize() if s else ''

users['first_name'] = users['DisplayName'].fillna('').astype(str).apply(extract_first_name)

# Predict gender
detector = gd.Detector(case_sensitive=False)
users['Gender'] = users['first_name'].apply(lambda x: (
    'male' if detector.get_gender(x) in ['male','mostly_male'] else
    'female' if detector.get_gender(x) in ['female','mostly_female'] else
    'unknown'
))
users['is_male'] = (users['Gender'] == 'male').astype(int)

# Quick check
print(users[['DisplayName','first_name','Gender']].head())
print(users['Gender'].value_counts(dropna=False))

# Merge into users_org_ach if it exists
if 'users_org_ach' in globals():
    users['UserId'] = users['UserId'].astype(str)
    users_org_ach['UserId'] = users_org_ach['UserId'].astype(str)
    users_org_ach = users_org_ach.merge(users[['UserId','first_name','Gender','is_male']], on='UserId', how='left')


         DisplayName first_name   Gender
0    Kaggle Team Bot     Kaggle  unknown
1  Anthony Goldbloom    Anthony     male
2           Isabelle   Isabelle   female
3      David Stephan      David     male
4        Gabe Warren       Gabe     male
Gender
unknown    17147341
male        6639358
female      2901679
Name: count, dtype: int64


In [4]:
# -------------------------
# 2. HHA-style person fixed effects c_user
#    (fit penalized logistic with ONLY user dummies; no intercept or pooled regressors)
# -------------------------
# Keep users with >=2 observations for FE estimation (practical)
counts = users_comp['UserId'].value_counts()
users_keep = counts[counts >= 2].index.astype(str).tolist()
uc_fe = users_comp[users_comp['UserId'].isin(users_keep)].reset_index(drop=True)
print("Estimating FE for users with >=2 obs:", len(users_keep), "users; obs:", len(uc_fe))

if len(uc_fe) == 0:
    raise RuntimeError("No users with >=2 observations — cannot estimate person FE. Consider lowering threshold.")

# build sparse user-dummy matrix
users_unique = uc_fe['UserId'].unique()
n_obs = len(uc_fe); n_users = len(users_unique)
rows = np.arange(n_obs)
cols = uc_fe['UserId'].map({u:i for i,u in enumerate(users_unique)}).values
X_user = sparse.csr_matrix((np.ones(n_obs),(rows,cols)), shape=(n_obs,n_users))
y = uc_fe['Solo'].astype(int).values

# fit penalized logistic (no intercept)
clf = LogisticRegression(penalty='l2', C=0.5, solver='saga', max_iter=1000, tol=1e-3, fit_intercept=False)
t0 = time.time()
clf.fit(X_user, y)
t1 = time.time()
print("FE fit time (s):", round(t1-t0,2))

coef_user = clf.coef_.ravel()
user_c = pd.DataFrame({'UserId': users_unique.astype(str), 'c_user': coef_user})
mean_c = user_c['c_user'].mean()
print("c_user mean:", round(mean_c,4), "sd:", round(user_c['c_user'].std(),4))

# assign mean to users not estimated
all_users = users_comp['UserId'].unique().astype(str)
missing = set(all_users) - set(users_unique)
if missing:
    user_c = pd.concat([user_c, pd.DataFrame({'UserId':list(missing),'c_user':mean_c})], ignore_index=True)

# also include users in users_org_ach not in user_c for safety
extra = set(users_org_ach['UserId'].astype(str).unique()) - set(user_c['UserId'].astype(str))
if extra:
    user_c = pd.concat([user_c, pd.DataFrame({'UserId':list(extra),'c_user':mean_c})], ignore_index=True)

# merge into users_org_ach for lookup
users_org_ach['UserId'] = users_org_ach['UserId'].astype(str)
users_org_ach = users_org_ach.merge(user_c[['UserId','c_user']], on='UserId', how='left')
users_org_ach['c_user'] = users_org_ach['c_user'].fillna(mean_c)


Estimating FE for users with >=2 obs: 1307717 users; obs: 6804035
FE fit time (s): 68.05
c_user mean: 0.604 sd: 0.3841


In [5]:
# --- ensure un-interacted fields exist in users_org_ach ---
users_org_ach['HighestRanking'] = pd.to_numeric(
    users_org_ach['HighestRanking'] if 'HighestRanking' in users_org_ach.columns else np.nan,
    errors='coerce'
)

# Safe handling for n_competitions (avoid .fillna() on int)
if 'n_competitions' in users_org_ach.columns:
    users_org_ach['n_competitions'] = pd.to_numeric(users_org_ach['n_competitions'], errors='coerce').fillna(0)
else:
    users_org_ach['n_competitions'] = 0

# is_male exists from earlier gender inference
if 'is_male' not in users_org_ach.columns:
    users_org_ach['is_male'] = 0

In [6]:
# -------------------------
# 3. Build pair dataset: positives (team pairs) + sampled negatives (K per positive)
# -------------------------
# Build positives
pos_rows = []
for team, g in merged.groupby('TeamId'):
    mems = sorted(g['UserId'].astype(str).unique())
    if len(mems) < 2: 
        continue
    comp = str(g['CompetitionId'].iloc[0])
    m = len(mems)
    for i in range(m):
        for j in range(i+1, m):
            pos_rows.append((comp, mems[i], mems[j], m))
pos_df = pd.DataFrame(pos_rows, columns=['CompetitionId','User_i','User_j','team_size'])
print("Positives:", len(pos_df))

# participant lists
comp_users = merged.groupby('CompetitionId')['UserId'].unique().to_dict()
for c in list(comp_users.keys()):
    comp_users[c] = [str(x) for x in comp_users[c]]

# sample negatives: K per positive (or 200 if no positives)
K = 5
neg_list = []
for comp, users in comp_users.items():
    users = sorted(users)
    n = len(users)
    if n < 2: continue
    pos_c = pos_df[pos_df['CompetitionId']==comp]
    P = len(pos_c)
    target = K*P if P>0 else min(200, n*(n-1)//2)
    sampled = set(); tries = 0
    while len(sampled) < target and tries < target*20:
        a,b = random.sample(users,2)
        if a == b:
            tries += 1; continue
        if a > b: a,b = b,a
        key = (a,b)
        if key in sampled: tries += 1; continue
        if ((pos_c['User_i']==a)&(pos_c['User_j']==b)).any():
            tries += 1; continue
        sampled.add(key)
        neg_list.append((comp,a,b))
        tries += 1

neg_df = pd.DataFrame(neg_list, columns=['CompetitionId','User_i','User_j'])
neg_df['team_size'] = 0
print("Negatives sampled:", len(neg_df))

# combine
pos_df['label'] = 1
neg_df['label'] = 0
pairs = pd.concat([pos_df, neg_df], ignore_index=True)
pairs = pairs.drop_duplicates(subset=['CompetitionId','User_i','User_j']).reset_index(drop=True)
print("Total pairs:", len(pairs), "Positives:", int(pairs['label'].sum()))


Positives: 366726
Negatives sampled: 2433121
Total pairs: 2780004 Positives: 346883


In [7]:
# -------------------------
# 4. Merge user-level info (c_user and un-interacted terms) into pairs
# -------------------------
lookup = users_org_ach[['UserId','c_user','HighestRanking','n_competitions','is_male']].drop_duplicates('UserId').copy()
lookup['UserId'] = lookup['UserId'].astype(str)

pairs['User_i'] = pairs['User_i'].astype(str); pairs['User_j'] = pairs['User_j'].astype(str)
pairs = pairs.merge(lookup.rename(columns={'UserId':'User_i','c_user':'ci','HighestRanking':'i_HighestRanking','n_competitions':'i_n_competitions','is_male':'i_is_male'}), on='User_i', how='left')
pairs = pairs.merge(lookup.rename(columns={'UserId':'User_j','c_user':'cj','HighestRanking':'j_HighestRanking','n_competitions':'j_n_competitions','is_male':'j_is_male'}), on='User_j', how='left')

# fill missing
pairs['ci'] = pairs['ci'].fillna(mean_c)
pairs['cj'] = pairs['cj'].fillna(mean_c)
pairs['i_HighestRanking'] = pd.to_numeric(pairs['i_HighestRanking'], errors='coerce').fillna(-1)
pairs['j_HighestRanking'] = pd.to_numeric(pairs['j_HighestRanking'], errors='coerce').fillna(-1)
pairs['i_n_competitions'] = pd.to_numeric(pairs['i_n_competitions'], errors='coerce').fillna(0)
pairs['j_n_competitions'] = pd.to_numeric(pairs['j_n_competitions'], errors='coerce').fillna(0)
pairs['i_is_male'] = pairs['i_is_male'].fillna(0).astype(int)
pairs['j_is_male'] = pairs['j_is_male'].fillna(0).astype(int)

# pair covariates
pairs['same_org'] = (pairs['ci'].notnull() & (pairs['ci'].notnull()) & (pairs['ci']*0==0)).astype(int)  # placeholder -> we'll compute properly below
# Proper same_org: compare Organization if exists
if 'Organization' in users_org_ach.columns:
    u_org = users_org_ach[['UserId','Organization']].drop_duplicates('UserId').copy()
    u_org['UserId'] = u_org['UserId'].astype(str)
    pairs = pairs.merge(u_org.rename(columns={'UserId':'User_i','Organization':'i_Organization'}), on='User_i', how='left')
    pairs = pairs.merge(u_org.rename(columns={'UserId':'User_j','Organization':'j_Organization'}), on='User_j', how='left')
    pairs['same_org'] = (pairs['i_Organization'] == pairs['j_Organization']).astype(int)
else:
    # if Organization not available, set same_org=0
    pairs['same_org'] = 0

# same_country if available
if 'Country' in users_org_ach.columns:
    u_ct = users_org_ach[['UserId','Country']].drop_duplicates('UserId').copy()
    u_ct['UserId'] = u_ct['UserId'].astype(str)
    pairs = pairs.merge(u_ct.rename(columns={'UserId':'User_i','Country':'i_Country'}), on='User_i', how='left')
    pairs = pairs.merge(u_ct.rename(columns={'UserId':'User_j','Country':'j_Country'}), on='User_j', how='left')
    pairs['same_country'] = (pairs['i_Country'] == pairs['j_Country']).astype(int)
else:
    pairs['same_country'] = 0

pairs['abs_rank_diff'] = (pairs['i_HighestRanking'] - pairs['j_HighestRanking']).abs().fillna(0)
pairs['avg_rank'] = (pairs['i_HighestRanking'] + pairs['j_HighestRanking'])/2

# avg_c_neg HHA-style
pairs['avg_c_neg'] = -0.5 * (pairs['ci'] + pairs['cj'])

# standardize HRank for i/j if present
pairs['i_HRank_s'] = (pairs['i_HighestRanking'] - pairs['i_HighestRanking'].mean()) / max(1e-6, pairs['i_HighestRanking'].std())
pairs['j_HRank_s'] = (pairs['j_HighestRanking'] - pairs['j_HighestRanking'].mean()) / max(1e-6, pairs['j_HighestRanking'].std())

# ensure label numeric
pairs['label'] = pairs['label'].astype(int)

print("Pairs with covariates ready. Example:")
print(pairs[['CompetitionId','User_i','User_j','label','same_org','same_country','abs_rank_diff','avg_c_neg']].head())


Pairs with covariates ready. Example:
  CompetitionId    User_i    User_j  label  same_org  same_country  \
0         45040  14029285   5176723      1         0             0   
1         46105   1465504   9847588      1         0             0   
2         37794  10291473  14029206      1         0             0   
3         46801   4735773   8671882      1         0             0   
4          7372    599610    611059      1         0             0   

   abs_rank_diff  avg_c_neg  
0            0.0  -0.258157  
1            0.0  -1.029243  
2            0.0  -0.047643  
3            0.0  -0.534327  
4            0.0  -0.181955  


In [8]:
# -------------------------
# 5. Mixed-logit simulated MLE (random coef on same_org) - memory-friendly
# -------------------------
# Fixed and random covariates
fixed_vars = ['same_country','abs_rank_diff','avg_rank','avg_c_neg','i_HRank_s','j_HRank_s','i_n_competitions','j_n_competitions','i_is_male','j_is_male']
random_vars = ['same_org']

df = pairs.reset_index(drop=True)
y = df['label'].astype(float).values
w = np.ones_like(y)   # uniform weights

# build Xf, Xr
Xf = df[fixed_vars].fillna(0).astype(float).values
Xf = np.hstack([np.ones((Xf.shape[0],1)), Xf])  # intercept included
Kf = Xf.shape[1]
if len(random_vars) > 0:
    Xr = df[random_vars].fillna(0).astype(float).values
    Kr = Xr.shape[1]
else:
    Xr = np.zeros((Xf.shape[0],0)); Kr = 0
N = Xf.shape[0]

# Halton draws
R_draws = 50
if Kr > 0:
    sampler = qmc.Halton(d=Kr, scramble=True, seed=0)
    z = stats.norm.ppf(sampler.random(n=R_draws))
else:
    z = np.zeros((1,0))

def pack(bf, mu, logsd):
    return np.concatenate([bf, mu, logsd])

def unpack(p):
    bf = p[:Kf]
    mu = p[Kf:Kf+Kr] if Kr>0 else np.array([])
    logsd = p[Kf+Kr:Kf+2*Kr] if Kr>0 else np.array([])
    return bf, mu, np.exp(logsd)

batch_size = 20000
def sim_probs_fast(p):
    bf, mu, sd = unpack(p)
    fixed_u = Xf.dot(bf)
    if Kr == 0:
        return np.clip(expit(fixed_u),1e-12,1-1e-12)
    P = np.empty(N, dtype=np.float64)
    R = z.shape[0]
    for start in range(0, N, batch_size):
        end = min(N, start + batch_size)
        fixed_b = fixed_u[start:end]
        Xr_b = Xr[start:end,:]
        Psum = np.zeros(end-start, dtype=np.float64)
        for r in range(R):
            beta_r = mu + sd * z[r,:]
            Psum += expit(fixed_b + Xr_b.dot(beta_r))
        P[start:end] = Psum / float(R)
    return np.clip(P,1e-12,1-1e-12)

def negll(p):
    P = sim_probs_fast(p)
    return -float(np.sum(w * (y * np.log(P) + (1-y) * np.log(1-P))))

# init & optimize (light)
bf0 = np.zeros(Kf)
mu0 = np.zeros(Kr) if Kr>0 else np.array([])
logsd0 = np.log(0.2 * np.ones(Kr)) if Kr>0 else np.array([])
init = pack(bf0, mu0, logsd0)

print("Quick sim test (init) mean P:", float(sim_probs_fast(init).mean()))
t0 = time.time()
res = optimize.minimize(negll, init, method='L-BFGS-B', options={'maxiter':100, 'disp':True})
t1 = time.time()
print("Optimization finished in {:.1f}s; success={}, msg={}".format(t1-t0, res.success, res.message))

bf_hat, mu_hat, sd_hat = unpack(res.x)
print("Fixed coeffs (incl intercept):", bf_hat)
if Kr>0:
    print("Random mean/sd:", mu_hat, sd_hat)

# attach predicted probability
df['pred_prob'] = sim_probs_fast(res.x)
print("Example predictions:")
print(df[['label','pred_prob']].head())

# Save results if desired
# df.to_csv('pairs_with_preds.csv', index=False)


Quick sim test (init) mean P: 0.5
Optimization finished in 3716.4s; success=False, msg=STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT
Fixed coeffs (incl intercept): [ 7.58842674e-02  2.68745732e+00 -1.80502806e-03  4.93915469e-03
  3.24828412e+00  7.88416565e-02 -3.83680216e-01  0.00000000e+00
  0.00000000e+00 -3.04751280e-01 -2.81826564e-01]
Random mean/sd: [0.] [0.2]
Example predictions:
   label  pred_prob
0      1   0.212599
1      1   0.038145
2      1   0.490266
3      1   0.127352
4      1   0.383381
