In [1]:
# Ελένη Γιακουμάκη, ΑΜ: 4651
# Μαρία Δημητροπούλου, ΑΜ: 4664
# Ομάδα Kaggle: *ChatGPTeam*

In [2]:
# ------- Imports ------- #
import os
import pandas as pd
import numpy as np
import networkx as nx
import random
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util

import xgboost as xgb

In [3]:
# ----- Path declarations & folder creation (if they do not exist) ----- #

FEATURE_DIR = "features"
MODEL_DIR = "model_data"
DATA_DIR = "data"
SUBMISSION_DIR = "submissions"

# Only the three output folders are created; DATA_DIR is not created here.
for output_dir in (FEATURE_DIR, MODEL_DIR, SUBMISSION_DIR):
    os.makedirs(output_dir, exist_ok=True)

In [4]:
# ------- Check whether a GPU is available (for sentence-transformers) ------- #
import torch

# Report the first CUDA device when one is visible, otherwise say CPU is used.
if not torch.cuda.is_available():
    print("Χρήση CPU")
else:
    print("GPU name:", torch.cuda.get_device_name(0))

Χρήση CPU


In [5]:
# ------- Test set preview ------- #

# Load the pairs to predict (two comma-separated columns, no header row)
df_test = pd.read_csv(f"{DATA_DIR}/test.txt", sep=",", header=None, names=["id1", "id2"])
# Treat both id columns as strings
for id_column in ("id1", "id2"):
    df_test[id_column] = df_test[id_column].astype(str)

print("Δείγμα test set:")
print(df_test.head())

print("Συνολικός αριθμός ζευγών για πρόβλεψη:", len(df_test))


FileNotFoundError: [Errno 2] No such file or directory: 'data/test.txt'

In [None]:
# ------- Build training pairs (positive & negative) from edgelist.txt ------- #

# Dedicated, seeded generator so the sampled pairs are identical on every run.
PAIR_SAMPLING_SEED = 42
pair_rng = random.Random(PAIR_SAMPLING_SEED)

# Load the citation graph
edgelist_df = pd.read_csv(f"{DATA_DIR}/edgelist.txt", header=None, names=["source", "target"])
edgelist_df["source"] = edgelist_df["source"].astype(str)
edgelist_df["target"] = edgelist_df["target"].astype(str)

# Collect every paper ID that appears in at least one edge
all_papers = set(edgelist_df["source"]).union(set(edgelist_df["target"]))

# Positive citation pairs (an edge exists)
positive_pairs = set(zip(edgelist_df["source"], edgelist_df["target"]))

# Random sample of positives.
# The set is sorted first: iteration order of a set of strings can change
# between interpreter sessions (hash randomisation), which would defeat the seed.
n_pos = min(20000, len(positive_pairs))
positive_samples = pair_rng.sample(sorted(positive_pairs), n_pos)

# Build negative pairs (no edge), twice as many as positives
n_neg = 2 * n_pos
negative_samples = set()   # fast membership test
negative_list = []         # same pairs in draw order, so the output is reproducible

# Materialise the candidate list ONCE; rebuilding it inside the loop costs
# a full copy of all paper ids for every single draw.
paper_list = sorted(all_papers)

# NOTE(review): only the (a, b) direction is checked against the edge list, so a
# pair whose reverse (b, a) is a real citation can still be labelled negative —
# confirm this is intended.
with tqdm(total=n_neg, desc="Sampling negative pairs") as pbar:
    while len(negative_samples) < n_neg:
        a, b = pair_rng.sample(paper_list, 2)
        candidate = (a, b)
        if candidate in positive_pairs or candidate in negative_samples:
            continue
        negative_samples.add(candidate)
        negative_list.append(candidate)
        # Advance the bar only when a genuinely new pair was stored
        pbar.update(1)

# Convert to DataFrames
df_pos = pd.DataFrame(positive_samples, columns=["id1", "id2"])
df_pos["label"] = 1

df_neg = pd.DataFrame(negative_list, columns=["id1", "id2"])
df_neg["label"] = 0

# Concatenate and shuffle
train_df = pd.concat([df_pos, df_neg], ignore_index=True)
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Save
train_df.to_csv(f"{DATA_DIR}/train_pairs.csv", index=False)
print("Δημιουργήθηκε το train_pairs.csv με shape:", train_df.shape)


Sampling negative pairs: 100%|██████████| 40000/40000 [05:52<00:00, 113.38it/s]


Δημιουργήθηκε το train_pairs.csv με shape: (60000, 3)


In [None]:
# ------- TF-IDF similarity ------- #

# Load the abstracts ("|--|" separates paper id from abstract text),
# drop rows with a missing abstract and use string paper ids.
abstracts_df = (
    pd.read_csv(f"{DATA_DIR}/abstracts.txt", sep="\\|--\\|", engine="python", names=["paper_id", "abstract"])
    .dropna(subset=["abstract"])
    .assign(paper_id=lambda frame: frame["paper_id"].astype(str))
)

# TF-IDF vectorization (lower-cased, English stop words removed)
vectorizer = TfidfVectorizer(lowercase=True, stop_words="english")
tfidf_matrix = vectorizer.fit_transform(abstracts_df["abstract"])

# Map paper_id -> row position in the TF-IDF matrix
paper_index = {}
for position, pid in enumerate(abstracts_df["paper_id"]):
    paper_index[pid] = position

# Similarity function
def tfidf_similarity(id1, id2):
    """Cosine similarity between the TF-IDF rows of two papers.

    Returns 0.0 when either id has no entry in ``paper_index``.
    """
    if id1 not in paper_index or id2 not in paper_index:
        return 0.0
    row_a = tfidf_matrix[paper_index[id1]]
    row_b = tfidf_matrix[paper_index[id2]]
    # cosine_similarity returns a 1x1 matrix for two single-row inputs
    return cosine_similarity(row_a, row_b)[0][0]

# -------- TRAIN -------- #
tqdm.pandas(desc="TF-IDF similarity (train)")
train_df = pd.read_csv(f"{DATA_DIR}/train_pairs.csv").astype(str)
# One similarity value per (id1, id2) row
train_text_scores = train_df.progress_apply(
    lambda pair: tfidf_similarity(pair["id1"], pair["id2"]), axis=1
)
train_df["text_similarity"] = train_text_scores
train_text_out = train_df[["id1", "id2", "text_similarity"]]
train_text_out.to_csv(f"{FEATURE_DIR}/text_similarity_train.csv", index=False)
print("Αποθηκεύτηκε το text_similarity_train.csv")

# -------- TEST -------- #
tqdm.pandas(desc="TF-IDF similarity (test)")
test_df = pd.read_csv(f"{DATA_DIR}/test.txt", names=["id1", "id2"]).astype(str)
test_text_scores = test_df.progress_apply(
    lambda pair: tfidf_similarity(pair["id1"], pair["id2"]), axis=1
)
test_df["text_similarity"] = test_text_scores
test_text_out = test_df[["id1", "id2", "text_similarity"]]
test_text_out.to_csv(f"{FEATURE_DIR}/text_similarity_test.csv", index=False)
print("Αποθηκεύτηκε το text_similarity_test.csv")


TF-IDF similarity (train): 100%|██████████| 60000/60000 [00:41<00:00, 1437.09it/s]


Αποθηκεύτηκε το text_similarity_train.csv


TF-IDF similarity (test): 100%|██████████| 106692/106692 [01:23<00:00, 1280.85it/s]


Αποθηκεύτηκε το text_similarity_test.csv


In [None]:
# ------- BERT similarity ------- #

# Load the abstracts, drop rows with a missing abstract, use string paper ids
abstracts_df = (
    pd.read_csv(f"{DATA_DIR}/abstracts.txt", sep="\\|--\\|", engine="python", names=["paper_id", "abstract"])
    .dropna(subset=["abstract"])
    .assign(paper_id=lambda frame: frame["paper_id"].astype(str))
)

# Load the sentence-embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Embed every abstract; row i of the result corresponds to the i-th abstract
abstract_texts = abstracts_df["abstract"].tolist()
abstract_embeddings = model.encode(abstract_texts, show_progress_bar=True)

# Map paper_id -> row position in abstract_embeddings
paper_index = {}
for position, pid in enumerate(abstracts_df["paper_id"]):
    paper_index[pid] = position

# Similarity function
def bert_similarity(id1, id2):
    """Cosine similarity between the abstract embeddings of two papers.

    Returns 0.0 when either id has no entry in ``paper_index``.
    """
    if id1 not in paper_index or id2 not in paper_index:
        return 0.0
    first_embedding = abstract_embeddings[paper_index[id1]]
    second_embedding = abstract_embeddings[paper_index[id2]]
    # .item() unwraps the single value returned by util.cos_sim into a Python float
    return util.cos_sim(first_embedding, second_embedding).item()

# -------- TRAIN -------- #
tqdm.pandas(desc="BERT similarity (train)")
train_df = pd.read_csv(f"{DATA_DIR}/train_pairs.csv").astype(str)
# One similarity value per (id1, id2) row
train_bert_scores = train_df.progress_apply(
    lambda pair: bert_similarity(pair["id1"], pair["id2"]), axis=1
)
train_df["bert_similarity"] = train_bert_scores
train_bert_out = train_df[["id1", "id2", "bert_similarity"]]
train_bert_out.to_csv(f"{FEATURE_DIR}/bert_similarity_train.csv", index=False)
print("Αποθηκεύτηκε το bert_similarity_train.csv")

# -------- TEST -------- #
tqdm.pandas(desc="BERT similarity (test)")
test_df = pd.read_csv(f"{DATA_DIR}/test.txt", names=["id1", "id2"]).astype(str)
test_bert_scores = test_df.progress_apply(
    lambda pair: bert_similarity(pair["id1"], pair["id2"]), axis=1
)
test_df["bert_similarity"] = test_bert_scores
test_bert_out = test_df[["id1", "id2", "bert_similarity"]]
test_bert_out.to_csv(f"{FEATURE_DIR}/bert_similarity_test.csv", index=False)
print("Αποθηκεύτηκε το bert_similarity_test.csv")


Batches:   0%|          | 0/4102 [00:00<?, ?it/s]

BERT similarity (train): 100%|██████████| 60000/60000 [00:06<00:00, 8593.19it/s] 


Αποθηκεύτηκε το bert_similarity_train.csv


BERT similarity (test): 100%|██████████| 106692/106692 [00:10<00:00, 9801.77it/s]


Αποθηκεύτηκε το bert_similarity_test.csv


In [None]:
# ------- Jaccard similarity over author sets ------- #

# Parse authors.txt. Lines that do not split into exactly two parts on "|--|"
# are skipped; the second part is a comma-separated list of author names.
author_rows = []
with open(f"{DATA_DIR}/authors.txt", "r", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split("|--|")
        if len(parts) != 2:
            continue
        paper_id = parts[0].strip()
        authors = {name.strip() for name in parts[1].split(",")}
        # Drop the empty "name" produced by an empty author field or by
        # stray/trailing commas; otherwise two papers without authors would
        # share the author "" and get a Jaccard similarity of 1.0.
        authors.discard("")
        author_rows.append((paper_id, authors))

# Dictionary paper_id -> set of authors (a later line for the same id overrides an earlier one)
author_dict = dict(author_rows)

# Jaccard similarity function
def jaccard_authors(u, v):
    """Jaccard similarity |A(u) ∩ A(v)| / |A(u) ∪ A(v)| of two papers' author sets.

    Author sets are looked up in ``author_dict``; an unknown paper counts as
    having no authors. Returns 0.0 when either set is empty.
    """
    authors_u = author_dict.get(u, set())
    authors_v = author_dict.get(v, set())
    if authors_u and authors_v:
        return len(authors_u & authors_v) / len(authors_u | authors_v)
    return 0.0

# -------- TRAIN -------- #
tqdm.pandas(desc="Author Jaccard (train)")
train_df = pd.read_csv(f"{DATA_DIR}/train_pairs.csv").astype(str)
# One author-overlap value per (id1, id2) row
train_author_scores = train_df.progress_apply(
    lambda pair: jaccard_authors(pair["id1"], pair["id2"]), axis=1
)
train_df["author_jaccard"] = train_author_scores
train_author_out = train_df[["id1", "id2", "author_jaccard"]]
train_author_out.to_csv(f"{FEATURE_DIR}/author_similarity_train.csv", index=False)
print("Αποθηκεύτηκε το author_similarity_train.csv")

# -------- TEST -------- #
tqdm.pandas(desc="Author Jaccard (test)")
test_df = pd.read_csv(f"{DATA_DIR}/test.txt", names=["id1", "id2"]).astype(str)
test_author_scores = test_df.progress_apply(
    lambda pair: jaccard_authors(pair["id1"], pair["id2"]), axis=1
)
test_df["author_jaccard"] = test_author_scores
test_author_out = test_df[["id1", "id2", "author_jaccard"]]
test_author_out.to_csv(f"{FEATURE_DIR}/author_similarity_test.csv", index=False)
print("Αποθηκεύτηκε το author_similarity_test.csv")


Author Jaccard (train): 100%|██████████| 60000/60000 [00:00<00:00, 95616.25it/s] 


Αποθηκεύτηκε το author_similarity_train.csv


Author Jaccard (test): 100%|██████████| 106692/106692 [00:01<00:00, 104687.80it/s]


Αποθηκεύτηκε το author_similarity_test.csv


In [None]:
# ------- Graph-based features ------- #

# Load the citation graph as a directed graph, plus an undirected view of it
G = nx.read_edgelist(
    f"{DATA_DIR}/edgelist.txt",
    delimiter=",",
    create_using=nx.DiGraph(),
    nodetype=str,
)
G_undirected = G.to_undirected()

n_nodes = G.number_of_nodes()
n_edges = G.number_of_edges()
print("Nodes:", n_nodes, "Edges:", n_edges)

# Graph metrics function
def compute_graph_features(u, v):
    """Compute four link-prediction features for the node pair (u, v).

    Returns a ``pd.Series`` ``[cn, jacc, aa, spl]``:
      * cn   - number of common neighbours in ``G_undirected`` (0 on failure)
      * jacc - Jaccard coefficient |N(u) ∩ N(v)| / |N(u) ∪ N(v)| (0.0 on failure)
      * aa   - Adamic-Adar index in ``G_undirected`` (0.0 on failure)
      * spl  - shortest path length from u to v in the directed ``G``
               (-1 when no path exists or a node is missing)

    Only the failures expected from the graph lookups are turned into
    fallback values. A bare ``except`` would also swallow KeyboardInterrupt,
    making this long-running loop impossible to interrupt, and would hide
    genuine programming errors.

    NOTE(review): for training positives the edge (u, v) is itself present in
    the graph these features are computed on — confirm this is intended.
    """
    # NetworkXException is the base class of NetworkXError, NodeNotFound and
    # NetworkXNoPath; KeyError / ZeroDivisionError cover plain lookup and
    # degenerate-degree failures inside the library helpers.
    expected_failures = (nx.NetworkXException, KeyError, ZeroDivisionError)

    try:
        # Common Neighbors: number of shared neighbours in the undirected graph
        cn = len(list(nx.common_neighbors(G_undirected, u, v)))
    except expected_failures:
        cn = 0
    try:
        # Jaccard Coefficient: |N(u) ∩ N(v)| / |N(u) ∪ N(v)|
        jacc = list(nx.jaccard_coefficient(G_undirected, [(u, v)]))[0][2]
    except expected_failures:
        jacc = 0.0
    try:
        # Adamic-Adar Index: gives more weight to "rarer" common neighbours
        aa = list(nx.adamic_adar_index(G_undirected, [(u, v)]))[0][2]
    except expected_failures:
        aa = 0.0
    try:
        # Shortest Path Length: measured on the directed graph
        spl = nx.shortest_path_length(G, u, v)
    except expected_failures:
        spl = -1  # no path available
    return pd.Series([cn, jacc, aa, spl])

# Column names, in the order compute_graph_features returns its values
graph_feature_columns = ["common_neighbors", "jaccard", "adamic_adar", "shortest_path"]

# -------- TRAIN -------- #
tqdm.pandas(desc="Graph features (train)")
train_df = pd.read_csv(f"{DATA_DIR}/train_pairs.csv").astype(str)
train_graph_values = train_df.progress_apply(
    lambda pair: compute_graph_features(pair["id1"], pair["id2"]), axis=1
)
train_df[["common_neighbors", "jaccard", "adamic_adar", "shortest_path"]] = train_graph_values
train_graph_out = train_df[["id1", "id2", "common_neighbors", "jaccard", "adamic_adar", "shortest_path"]]
train_graph_out.to_csv(f"{FEATURE_DIR}/graph_features_train.csv", index=False)
print("Αποθηκεύτηκε το graph_features_train.csv")

# -------- TEST -------- #
tqdm.pandas(desc="Graph features (test)")
test_df = pd.read_csv(f"{DATA_DIR}/test.txt", names=["id1", "id2"]).astype(str)
test_graph_values = test_df.progress_apply(
    lambda pair: compute_graph_features(pair["id1"], pair["id2"]), axis=1
)
test_df[["common_neighbors", "jaccard", "adamic_adar", "shortest_path"]] = test_graph_values
test_graph_out = test_df[["id1", "id2", "common_neighbors", "jaccard", "adamic_adar", "shortest_path"]]
test_graph_out.to_csv(f"{FEATURE_DIR}/graph_features_test.csv", index=False)
print("Αποθηκεύτηκε το graph_features_test.csv")


Nodes: 138499 Edges: 1091955


Graph features (train): 100%|██████████| 60000/60000 [07:19<00:00, 136.58it/s]


Αποθηκεύτηκε το graph_features_train.csv


Graph features (test): 100%|██████████| 106692/106692 [18:50<00:00, 94.40it/s] 


Αποθηκεύτηκε το graph_features_test.csv


In [None]:
# ------- shared_word_count & shared_word_ratio ------- #

# Load the abstracts, drop rows with a missing abstract, use string paper ids
abstracts_df = (
    pd.read_csv(f"{DATA_DIR}/abstracts.txt", sep="\\|--\\|", engine="python", names=["paper_id", "abstract"])
    .dropna(subset=["abstract"])
    .assign(paper_id=lambda frame: frame["paper_id"].astype(str))
)
abstracts_dict = dict(zip(abstracts_df["paper_id"], abstracts_df["abstract"]))

# Per-paper lookups:
#   word_sets - distinct lower-cased whitespace-separated tokens
#   lengths   - total number of whitespace-separated tokens (case untouched)
word_sets = {}
lengths = {}
for pid, text in abstracts_dict.items():
    word_sets[pid] = set(text.lower().split())
    lengths[pid] = len(text.split())

def compute_shared_features(id1, id2):
    """Count the words two abstracts share and normalise by the shorter abstract.

    Looks both ids up in ``word_sets`` / ``lengths``; an unknown id
    contributes an empty word set and a length of 1.

    Returns a ``pd.Series`` ``[shared, ratio]`` where ``shared`` is the number
    of common distinct words and ``ratio`` is ``shared`` divided by the smaller
    of the two lengths (0.0 when that length is not positive).
    """
    common_words = word_sets.get(id1, set()) & word_sets.get(id2, set())
    shorter_length = min(lengths.get(id1, 1), lengths.get(id2, 1))

    shared = len(common_words)
    if shorter_length > 0:
        ratio = shared / shorter_length
    else:
        ratio = 0.0
    return pd.Series([shared, ratio])

# -------- TRAIN -------- #
tqdm.pandas(desc="Υπολογισμός shared features (train)")
train_df = pd.read_csv(f"{DATA_DIR}/train_pairs.csv").astype(str)
train_shared_values = train_df.progress_apply(
    lambda pair: compute_shared_features(pair["id1"], pair["id2"]), axis=1
)
train_df[["shared_word_count", "shared_word_ratio"]] = train_shared_values


# Save: one combined file plus one file per feature
train_shared_both = train_df[["id1", "id2", "shared_word_count", "shared_word_ratio"]]
train_shared_both.to_csv(f"{FEATURE_DIR}/shared_features_train.csv", index=False)
train_shared_count = train_df[["id1", "id2", "shared_word_count"]]
train_shared_count.to_csv(f"{FEATURE_DIR}/shared_word_count_train.csv", index=False)
train_shared_ratio = train_df[["id1", "id2", "shared_word_ratio"]]
train_shared_ratio.to_csv(f"{FEATURE_DIR}/shared_word_ratio_train.csv", index=False)
print("Αποθηκεύτηκαν τα εξής αρχεία για train:")
print("- shared_features_train.csv")
print("- shared_word_count_train.csv")
print("- shared_word_ratio_train.csv")

# -------- TEST -------- #
test_df = pd.read_csv(f"{DATA_DIR}/test.txt", names=["id1", "id2"]).astype(str)
# Re-register the progress bar so it is labelled "(test)"; without this call
# the bar keeps the description set for the train pass.
tqdm.pandas(desc="Υπολογισμός shared features (test)")
test_df[["shared_word_count", "shared_word_ratio"]] = test_df.progress_apply(
    lambda row: compute_shared_features(row["id1"], row["id2"]), axis=1
)

# Save: one combined file plus one file per feature
test_df[["id1", "id2", "shared_word_count", "shared_word_ratio"]].to_csv(f"{FEATURE_DIR}/shared_features_test.csv", index=False)
test_df[["id1", "id2", "shared_word_count"]].to_csv(f"{FEATURE_DIR}/shared_word_count_test.csv", index=False)
test_df[["id1", "id2", "shared_word_ratio"]].to_csv(f"{FEATURE_DIR}/shared_word_ratio_test.csv", index=False)
print("Αποθηκεύτηκαν τα εξής αρχεία για test:")
print("- shared_features_test.csv")
print("- shared_word_count_test.csv")
print("- shared_word_ratio_test.csv")


Υπολογισμός shared features (train): 100%|██████████| 60000/60000 [00:09<00:00, 6249.73it/s] 


Αποθηκεύτηκαν τα εξής αρχεία για train:
- shared_features_train.csv
- shared_word_count_train.csv
- shared_word_ratio_train.csv


Υπολογισμός shared features (train): 100%|██████████| 106692/106692 [00:17<00:00, 6204.37it/s] 


Αποθηκεύτηκαν τα εξής αρχεία για test:
- shared_features_test.csv
- shared_word_count_test.csv
- shared_word_ratio_test.csv


In [None]:
# ------- text_ratio = text_similarity * shared_word_ratio ------- #

# === TRAIN ===
text_df = pd.read_csv(f"{FEATURE_DIR}/text_similarity_train.csv")
shared_df = pd.read_csv(f"{FEATURE_DIR}/shared_word_ratio_train.csv")

# Align the two feature tables on the pair ids, then multiply element-wise
merged = pd.merge(text_df, shared_df, on=["id1", "id2"])
merged["text_ratio"] = merged["text_similarity"].mul(merged["shared_word_ratio"])
train_ratio_out = merged[["id1", "id2", "text_ratio"]]
train_ratio_out.to_csv(f"{FEATURE_DIR}/text_ratio_train.csv", index=False)
print("Αποθηκεύτηκε το text_ratio_train.csv")

# === TEST ===
text_df = pd.read_csv(f"{FEATURE_DIR}/text_similarity_test.csv")
shared_df = pd.read_csv(f"{FEATURE_DIR}/shared_word_ratio_test.csv")

merged = pd.merge(text_df, shared_df, on=["id1", "id2"])
merged["text_ratio"] = merged["text_similarity"].mul(merged["shared_word_ratio"])
test_ratio_out = merged[["id1", "id2", "text_ratio"]]
test_ratio_out.to_csv(f"{FEATURE_DIR}/text_ratio_test.csv", index=False)
print("Αποθηκεύτηκε το text_ratio_test.csv")


Αποθηκεύτηκε το text_ratio_train.csv
Αποθηκεύτηκε το text_ratio_test.csv


In [None]:
# ------- Build X_train, y_train, X_test ------- #

# === TRAIN SET ===

# Base pairs (ids as strings, label as int)
train_df = pd.read_csv(f"{DATA_DIR}/train_pairs.csv").astype(str)
train_df["label"] = train_df["label"].astype(int)

# Load the per-feature tables
text_df = pd.read_csv(f"{FEATURE_DIR}/text_similarity_train.csv")
bert_df = pd.read_csv(f"{FEATURE_DIR}/bert_similarity_train.csv")
author_df = pd.read_csv(f"{FEATURE_DIR}/author_similarity_train.csv")
graph_df = pd.read_csv(f"{FEATURE_DIR}/graph_features_train.csv")
shared_df = pd.read_csv(f"{FEATURE_DIR}/shared_features_train.csv")
textratio_df = pd.read_csv(f"{FEATURE_DIR}/text_ratio_train.csv")

# log(1 + x) scaling of adamic_adar to dampen outliers
graph_df["log_adamic_adar"] = np.log1p(graph_df["adamic_adar"])

# Cast the ids to string in every table so the merge keys have matching types
for df in [train_df, text_df, bert_df, author_df, graph_df, shared_df, textratio_df]:
    for id_column in ("id1", "id2"):
        df[id_column] = df[id_column].astype(str)

# Merge everything on the pair ids, one feature table at a time
merged = train_df
for feature_table in (text_df, bert_df, author_df, graph_df, shared_df, textratio_df):
    merged = merged.merge(feature_table, on=["id1", "id2"])

# Columns kept in the model matrices (the pair ids are kept as well)
final_features = [
    "id1", "id2",
    "text_similarity", "bert_similarity", "author_jaccard",
    "common_neighbors", "jaccard", "log_adamic_adar",
    "shared_word_count", "shared_word_ratio", "text_ratio"
]

X_train = merged[final_features]
y_train = merged[["label"]]

X_train.to_csv(f"{MODEL_DIR}/X_train.csv", index=False)
y_train.to_csv(f"{MODEL_DIR}/y_train.csv", index=False)
print("Αποθηκεύτηκαν τα X_train.csv και y_train.csv")

# === TEST SET ===

test_df = pd.read_csv(f"{DATA_DIR}/test.txt", names=["id1", "id2"]).astype(str)

# Load the per-feature tables
text_df     = pd.read_csv(f"{FEATURE_DIR}/text_similarity_test.csv")
bert_df     = pd.read_csv(f"{FEATURE_DIR}/bert_similarity_test.csv")
author_df   = pd.read_csv(f"{FEATURE_DIR}/author_similarity_test.csv")
graph_df    = pd.read_csv(f"{FEATURE_DIR}/graph_features_test.csv")
shared_df   = pd.read_csv(f"{FEATURE_DIR}/shared_features_test.csv")
textratio_df = pd.read_csv(f"{FEATURE_DIR}/text_ratio_test.csv")

# log(1 + x) scaling of adamic_adar, same transform as for the train set
graph_df["log_adamic_adar"] = np.log1p(graph_df["adamic_adar"])

# Cast the ids to string in every table so the merge keys have matching types
for df in [test_df, text_df, bert_df, author_df, graph_df, shared_df, textratio_df]:
    df["id1"] = df["id1"].astype(str)
    df["id2"] = df["id2"].astype(str)

# The submission identifies predictions by row position, so X_test must keep
# exactly one row per line of test.txt, in the same order. A chain of inner
# merges multiplies rows whenever a pair occurs more than once in the test
# file; a left merge against de-duplicated feature tables cannot change the
# row count or order of test_df.
merged_test = test_df
for feature_table in [text_df, bert_df, author_df, graph_df, shared_df, textratio_df]:
    unique_features = feature_table.drop_duplicates(subset=["id1", "id2"])
    merged_test = merged_test.merge(unique_features, on=["id1", "id2"], how="left")

# Invariant the submission relies on
assert len(merged_test) == len(test_df), "X_test must have one row per test pair"

X_test = merged_test[final_features]
X_test.to_csv(f"{MODEL_DIR}/X_test.csv", index=False)
print("Αποθηκεύτηκε το X_test.csv")


Αποθηκεύτηκαν τα X_train.csv και y_train.csv
Αποθηκεύτηκε το X_test.csv


In [None]:
# ------- Final submission: submission_best.csv ------- #
# Reproduces Submission 13 from submission_experiments.ipynb
# Uses every available column, without feature selection
# Model: XGBClassifier() with default parameters
# Kaggle Public Score: 0.13758
#
# NOTE(review): the model is fitted on every column of X_train.csv, which
# includes the pair ids id1 and id2, so the ids act as model inputs here.
# Confirm this is intended.

# === Load data ===
y_train = pd.read_csv(f"{MODEL_DIR}/y_train.csv")
X_train = pd.read_csv(f"{MODEL_DIR}/X_train.csv")
X_test = pd.read_csv(f"{MODEL_DIR}/X_test.csv")

# === Train the model ===
model = xgb.XGBClassifier()
model.fit(X_train, y_train)

# === Predict and build the submission ===
# Column 1 of predict_proba is the probability of the positive class (label 1)
class_probabilities = model.predict_proba(X_test)
y_pred = class_probabilities[:, 1]
# IDs are simply the row positions of the predictions
submission = pd.DataFrame({"ID": range(len(y_pred)), "Label": y_pred})
submission.to_csv(f"{SUBMISSION_DIR}/submission_best.csv", index=False)

print("Δημιουργήθηκε το submission_best.csv (Score: 0.13758)")


Δημιουργήθηκε το submission_best.csv (Score: 0.13758)


In [None]:
# ------- Alternative submission: submission_2nd.csv ------- #
# Reproduces Submission 12 from submission_experiments.ipynb
# Uses selected features with tuned hyperparameters
# Model: XGBClassifier with n_estimators=100, max_depth=5, learning_rate=0.1
# Kaggle Public Score: 0.14199

# === Load data ===
y_train = pd.read_csv(f"{MODEL_DIR}/y_train.csv")
X_train = pd.read_csv(f"{MODEL_DIR}/X_train.csv")
X_test = pd.read_csv(f"{MODEL_DIR}/X_test.csv")

# === Feature selection (the pair ids id1/id2 are left out) ===
selected_features = [
    "text_similarity", "bert_similarity", "author_jaccard",
    "common_neighbors", "jaccard", "log_adamic_adar",
    "shared_word_count", "shared_word_ratio", "text_ratio"
]
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# === Train the model ===
model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    eval_metric="logloss",
    random_state=42
)
model.fit(X_train_selected, y_train)

# === Predict and build the submission ===
# Column 1 of predict_proba is the probability of the positive class (label 1)
class_probabilities = model.predict_proba(X_test_selected)
y_pred = class_probabilities[:, 1]
# IDs are simply the row positions of the predictions
submission = pd.DataFrame({"ID": range(len(y_pred)), "Label": y_pred})
submission.to_csv(f"{SUBMISSION_DIR}/submission_2nd.csv", index=False)

print("Δημιουργήθηκε το submission_2nd.csv (Score: 0.14199)")


Δημιουργήθηκε το submission_2nd.csv (Score: 0.14199)
