In [9]:
!pip install datasets scikit-learn pandas numpy joblib




In [12]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


file_path = "/content/legacy_pantheon.tsv.bz2"
df = pd.read_csv(file_path, sep="\t", dtype=str)
print("Dataset loaded, shape:", df.shape)
print(df.head())

features = ["gender", "birthyear", "countryCode"]
target = "domain"

df2 = df.dropna(subset=features + [target])
df2["birthyear"] = pd.to_numeric(df2["birthyear"], errors="coerce")
df2 = df2.dropna(subset=["birthyear"])
df2 = df2.sample(n=5000, random_state=42)
print("After cleaning & sampling:", df2.shape)

le_target = LabelEncoder()
df2["target_label"] = le_target.fit_transform(df2[target])


le_gender = LabelEncoder()
df2["gender_enc"] = le_gender.fit_transform(df2["gender"])

le_country = LabelEncoder()
df2["country_enc"] = le_country.fit_transform(df2["countryCode"])

df2["birthyear_norm"] = (df2["birthyear"] - df2["birthyear"].mean()) / df2["birthyear"].std()

X = df2[["gender_enc", "country_enc", "birthyear_norm"]].values.astype("float32")
y = df2["target_label"].values.astype("int32")


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)


rf = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
rf.fit(X_train, y_train)


y_pred = rf.predict(X_test)
top1_acc = accuracy_score(y_test, y_pred)
print(f"Top-1 Accuracy: {top1_acc:.4f}")

y_proba = rf.predict_proba(X_test)

top3_acc = np.mean([
    y_test[i] in np.argsort(y_proba[i])[-3:]
    for i in range(len(y_test))
])
print(f"Top-3 Accuracy: {top3_acc:.4f}")


def akinator_guess(model, le_target, gender, countryCode, birthyear):
    g = le_gender.transform([gender])[0] if gender in le_gender.classes_ else -1
    c = le_country.transform([countryCode])[0] if countryCode in le_country.classes_ else -1
    by_norm = (birthyear - df2["birthyear"].mean()) / df2["birthyear"].std()
    x = np.array([[g, c, by_norm]], dtype="float32")
    probs = model.predict_proba(x)[0]
    idxs = np.argsort(probs)[::-1]
    print(" Model guesses:")
    for rank, idx in enumerate(idxs[:3]):
        print(f"{rank+1}. {le_target.inverse_transform([idx])[0]} (p={probs[idx]:.3f})")


akinator_guess(rf, le_target, gender="male", countryCode="US", birthyear=1970)


Dataset loaded, shape: (11341, 23)
  en_curid             name numlangs         birthcity birthstate  \
0      307  Abraham Lincoln      131       Hodgenville         KY   
1      308        Aristotle      152          Stageira        NaN   
2      339         Ayn Rand       55  Saint Petersburg        NaN   
3      595     Andre Agassi       69         Las Vegas         NV   
4      628    Aldous Huxley       62         Godalming        NaN   

      countryName countryCode countryCode3        LAT          LON  ...  \
0   UNITED STATES          US          USA  37.571111   -85.738611  ...   
1          Greece          GR          GRC  40.333333         23.5  ...   
2          Russia          RU          RUS      59.95         30.3  ...   
3   UNITED STATES          US          USA  36.121514  -115.173851  ...   
4  UNITED KINGDOM          GB          GBR     51.185        -0.61  ...   

      occupation           industry        domain TotalPageViews       L_star  \
0     POLITICIAN  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2["birthyear"] = pd.to_numeric(df2["birthyear"], errors="coerce")


Top-1 Accuracy: 0.5253
Top-3 Accuracy: 0.8620
🔮 Model guesses:
1. ARTS (p=0.991)
2. SPORTS (p=0.004)
3. PUBLIC FIGURE (p=0.002)
