In [3]:
from abc import ABC
import numpy as np

class Embedder(ABC):
    """Common interface for models that turn a contract string into a vector.

    Subclasses override :meth:`transform`. The base implementation raises
    instead of silently returning ``None``, so a subclass that forgets to
    override it fails at the call site rather than later in the pipeline.
    """

    def transform(self, contract: str) -> np.ndarray:
        """Return the embedding vector for ``contract``.

        Args:
            contract: Text of the contract to embed.

        Returns:
            The embedding vector produced by the concrete subclass.

        Raises:
            NotImplementedError: Always, when called on a class that did not
                override this method.
        """
        raise NotImplementedError(
            f"{type(self).__name__} must implement transform()"
        )

In [4]:
from openai import OpenAI
import json

class OpenAIEmbedder(Embedder):
    """Embedder backed by the OpenAI ``text-embedding-3-small`` model."""

    def __init__(self, api_key_file : str = None, api_key : str = None):
        """Create the OpenAI client from a key file or an explicit key.

        Args:
            api_key_file: Path to a JSON file whose ``OpenAI_key`` entry holds
                the API key. Takes precedence over ``api_key`` when both are
                given.
            api_key: The API key itself; used only when no file is given.

        Raises:
            ValueError: If neither ``api_key_file`` nor ``api_key`` is given.
        """
        if api_key_file:
            with open(api_key_file, "r") as file:
                key = json.load(file)['OpenAI_key']
        elif api_key:
            key = api_key
        else:
            raise ValueError("You must provide your Open AI Account Key to use this embedder!")

        self.client = OpenAI(api_key = key)

    def transform(self, contract : str) -> np.ndarray:
        """Request the embedding of ``contract`` from the OpenAI API.

        Args:
            contract: Text of the contract to embed.

        Returns:
            The embedding of the first (only) item in the response, converted
            to a numpy array so the result matches the return annotation and
            the other embedders in this notebook.
        """
        response = self.client.embeddings.create(
            input=contract,
            model="text-embedding-3-small",
        )

        # The SDK hands back a plain list of floats; wrap it in an ndarray.
        return np.asarray(response.data[0].embedding)
    
# Build the OpenAI-backed embedder; the key is read from the local JSON file.
embedder = OpenAIEmbedder(api_key_file="API_KEY.json")

In [20]:
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
import torch

class InstructionEmbedder(Embedder):
    """Embedder that encodes an (instruction, contract) pair with ``hkunlp/instructor-xl``."""

    def __init__(self, instruction : str = None):
        """Load the model and remember the instruction to pair with each contract.

        Args:
            instruction: Instruction text passed to the model alongside every
                contract. When omitted (or empty), a default instruction for
                semantic matching of contracts is used.
        """
        # Always assign the attribute. Previously it was only set when no
        # instruction was passed, so a custom instruction caused an
        # AttributeError inside transform().
        self.instruction = instruction or "Represent the contract for semantic matching:"

        self.model = SentenceTransformer("hkunlp/instructor-xl", trust_remote_code=True)

    def transform(self, contract : str) -> np.ndarray:
        """Encode ``contract`` together with the stored instruction.

        Args:
            contract: Text of the contract to embed.

        Returns:
            The first (only) row of the model's output for the single
            ``[instruction, contract]`` input, requested with
            ``normalize_embeddings=True``.
        """
        return self.model.encode([[self.instruction, contract]], normalize_embeddings=True)[0]
    
class BAAIEmbedder(Embedder):
    """Embedder built on the ``BAAI/bge-small-en-v1.5`` checkpoint."""

    def __init__(self):
        """Load the tokenizer and the model weights for the checkpoint."""
        self.tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5")
        self.model = AutoModel.from_pretrained("BAAI/bge-small-en-v1.5")

    def transform(self, contract : str) -> np.array:
        """Embed ``contract`` using the hidden state of the first token.

        Args:
            contract: Text of the contract to embed.

        Returns:
            The last-layer hidden state at token position 0, with
            singleton dimensions squeezed away, as a numpy array.
        """
        encoded = self.tokenizer(
            contract,
            return_tensors="pt",
            padding=True,
            truncation=True,
        )

        # Inference only, so gradient tracking is switched off.
        with torch.no_grad():
            hidden_states = self.model(**encoded).last_hidden_state

        first_token = hidden_states[:, 0]
        return first_token.squeeze().numpy()

In [33]:
from time import time
from sklearn.metrics.pairwise import cosine_similarity

def compute_similarity(embedding1, embedding2):
    """Return the cosine similarity between two embedding vectors.

    Args:
        embedding1: First embedding vector.
        embedding2: Second embedding vector.

    Returns:
        The single entry of the 1x1 similarity matrix computed for the pair.
    """
    # cosine_similarity works on 2-D inputs, so each vector becomes a one-row matrix.
    similarity_matrix = cosine_similarity([embedding1], [embedding2])
    return similarity_matrix[0][0]

def compare_models(models, pairs):
    """Print each model's similarity score and elapsed time for every pair.

    Args:
        models: Sequence of embedders exposing ``transform(text)``.
        pairs: Iterable of two-item sequences of contract strings.

    The timed section covers both ``transform`` calls and the similarity
    computation for the pair.
    """
    for contract1, contract2 in pairs:
        header = f"Testing {contract1=} {contract2=}"
        print(header)

        for embedder in models:
            start = time()
            score = compute_similarity(
                embedder.transform(contract1),
                embedder.transform(contract2),
            )
            end = time()

            report = f"\t{embedder.__class__.__name__} predicted similarity score of: {score:.5f} in {end - start:.5f} seconds"
            print(report)

        # A blank line separates the results of consecutive pairs.
        print()
            
# Both local embedders are instantiated once and reused for every pair.
models = [InstructionEmbedder(), BAAIEmbedder()]

# Hand-written contract pairs used to compare the embedders' similarity scores.
pairs = [
    # Pardon question phrased two ways.
    [
        "Will Trump pardon Daniel Penny in 2025?",
        "Will Daniel Penny be pardoned in 2025",
    ],
    # Weather questions with different outcomes.
    [
        "Will the weather tomorrow be sunny?",
        "Will the google forecast for tomorrow say cloudy?",
    ],
    # Same fight, opposite fighters.
    [
        "Will Daniel Rodriguez win the Holland vs Rodriguez professional MMA fight scheduled for Jul 19, 2025?",
        "Will Kevin Holland win the Holland vs Rodriguez professional MMA fight scheduled for Jul 19, 2025?",
    ],
    # Different fights on the same date.
    [
        "Will Patricio Pitbull win the Ige vs Pitbull professional MMA fight scheduled for Jul 19, 2025?",
        "Will Kevin Holland win the Holland vs Rodriguez professional MMA fight scheduled for Jul 19, 2025?",
    ],
    # Unrelated topics.
    [
        "Will the weather tomorrow be sunny?",
        "Will Daniel Rodriguez win the Holland vs Rodriguez professional MMA fight scheduled for Jul 19, 2025?",
    ],
]

compare_models(models=models, pairs=pairs)

Testing contract1='Will Trump pardon Daniel Penny in 2025?' contract2='Will Daniel Penny be pardoned in 2025'
	InstructionEmbedder predicted similarity score of: 0.96547 in 3.88478 seconds
	BAAIEmbedder predicted similarity score of: 0.95304 in 0.10458 seconds

Testing contract1='Will the weather tomorrow be sunny?' contract2='Will the google forecast for tomorrow say cloudy?'
	InstructionEmbedder predicted similarity score of: 0.94277 in 2.64780 seconds
	BAAIEmbedder predicted similarity score of: 0.82267 in 0.05662 seconds

Testing contract1='Will Daniel Rodriguez win the Holland vs Rodriguez professional MMA fight scheduled for Jul 19, 2025?' contract2='Will Kevin Holland win the Holland vs Rodriguez professional MMA fight scheduled for Jul 19, 2025?'
	InstructionEmbedder predicted similarity score of: 0.97825 in 2.65138 seconds
	BAAIEmbedder predicted similarity score of: 0.92181 in 0.07294 seconds

Testing contract1='Will Patricio Pitbull win the Ige vs Pitbull professional MMA fi

In [49]:
import pandas as pd
from itertools import combinations

kalshi_data = pd.read_csv("kalshi_markets.csv")
polymarket_data = pd.read_csv("polymarket_markets.csv")

# Only the first 10 titles from each platform are compared in this experiment.
kalshi_titles = kalshi_data["Title"][:10]
polymarket_titles = polymarket_data["Title"][:10]

# Each record is [platform, position within the slice, title]; the embedding
# is appended below as a fourth element.
all_data = [["kalshi", idx, title] for idx, title in enumerate(kalshi_titles)]
all_data += [["polymarket", idx, title] for idx, title in enumerate(polymarket_titles)]

model = BAAIEmbedder()
for data in all_data:
    data.append(model.transform(data[2]))

# Score every unordered pair of records, including pairs from the same platform.
processed = [
    (compute_similarity(item1[3], item2[3]), item1, item2)
    for item1, item2 in combinations(all_data, 2)
]

# Sort on the score alone. Sorting the raw tuples would, on tied scores, fall
# through to comparing the record lists themselves, which end in numpy arrays.
processed.sort(key=lambda entry: entry[0], reverse=True)
for score, item1, item2 in processed[:5]:
    print(f"With {score=:.3f} match found between {item1[2]} and {item2[2]}")
    print(f"Coming from {item1[0]} and {item2[0]}")
    print()

With score=0.954 match found between Will Roman Kopylov win the Costa vs Kopylov professional MMA fight scheduled for Jul 19, 2025? and Will Paulo Henrique Costa win the Costa vs Kopylov professional MMA fight scheduled for Jul 19, 2025?
Coming from kalshi and kalshi

With score=0.938 match found between Will Dustin Poirier win the Poirier vs Holloway professional MMA fight scheduled for Jul 19, 2025? and Will Max Holloway win the Poirier vs Holloway professional MMA fight scheduled for Jul 19, 2025?
Coming from kalshi and kalshi

With score=0.922 match found between Doge ETF approved by July 31? and Litecoin ETF approved by July 31?
Coming from polymarket and polymarket

With score=0.922 match found between Will Daniel Rodriguez win the Holland vs Rodriguez professional MMA fight scheduled for Jul 19, 2025? and Will Kevin Holland win the Holland vs Rodriguez professional MMA fight scheduled for Jul 19, 2025?
Coming from kalshi and kalshi

With score=0.919 match found between Ripple ET

In [54]:
# Show the first three rows of the Kalshi table.
kalshi_data.head(n=3)

Unnamed: 0,ID,Title,Option 1,Option 2,Option 1 Ask (¢),Option 2 Ask (¢),Status,Expires
0,KXATPMATCH-25JUL16CAZETC-ETC,Will Tomas Martin Etcheverry be a winner of th...,Yes,No,93,93,active,2025-07-16T11:30:00Z
1,KXATPMATCH-25JUL16CAZETC-CAZ,Will Arthur Cazaux be a winner of the round of...,Yes,No,93,93,active,2025-07-16T11:30:00Z
2,KXMAXWELLTESTIFY-26JAN,Will the CEO of State Farm testify in front of...,Yes,No,10,100,active,2026-01-01T15:00:00Z


In [52]:
# Show the first three rows of the Polymarket table.
polymarket_data.head(n=3)

Unnamed: 0,ID,Title,Option 1,Option 2,Option 1 Ask (¢),Option 2 Ask (¢),Status,Expires
0,502517,ARCH Will the match be a draw?,Yes,No,,,active,2024-06-17T12:00:00Z
1,513304,Will Trump impose large tariffs in his first 6...,Yes,No,97.3,3.5,active,2025-08-31T12:00:00Z
2,513317,Bird flu pandemic before August 2025?,Yes,No,0.9,99.3,active,2025-07-31T12:00:00Z


In [9]:
# (rows, columns) of each table, Kalshi first.
(kalshi_data.shape, polymarket_data.shape)

((3401, 8), (3066, 8))

In [22]:
def filterdf(df, keywords=("trump", "dan")):
    """Return the rows of ``df`` whose ``Title`` contains every keyword.

    Args:
        df: DataFrame with a ``Title`` column.
        keywords: Substrings that must all appear in a title. Matching is
            case-insensitive and literal (keywords are not treated as regular
            expressions). Defaults to the pair that used to be hard-coded
            here. An empty collection keeps every row.

    Returns:
        The subset of ``df`` whose titles contain all keywords. A missing
        title never matches a keyword.
    """
    titles = df["Title"]
    # Start from an all-True mask so that zero keywords is a valid input.
    mask = pd.Series(True, index=df.index, dtype=bool)
    for kw in keywords:
        mask &= titles.str.contains(kw, case=False, na=False, regex=False)
    return df[mask]

In [23]:
# Polymarket rows whose title passes the keyword filter.
filterdf(df=polymarket_data)


Unnamed: 0,ID,Title,Option 1,Option 2,Option 1 Ask (¢),Option 2 Ask (¢),Status,Expires
924,537910,Will Trump pardon Daniel Penny in 2025?,Yes,No,5.0,96.0,active,2025-12-31T12:00:00Z


In [24]:
# Kalshi rows whose title passes the keyword filter.
filterdf(df=kalshi_data)

Unnamed: 0,ID,Title,Option 1,Option 2,Option 1 Ask (¢),Option 2 Ask (¢),Status,Expires
3299,KXLEAVEADMIN-26-DBON,Will Dan Bongino leave the Trump Administratio...,Yes,No,37,65,active,2026-01-01T15:00:00Z


In [18]:
# Polymarket rows whose title mentions "trump" in any casing; missing titles
# count as non-matches.
polymarket_data.loc[
    polymarket_data["Title"].str.contains("trump", case=False, na=False)
]

Unnamed: 0,ID,Title,Option 1,Option 2,Option 1 Ask (¢),Option 2 Ask (¢),Status,Expires
1,513304,Will Trump impose large tariffs in his first 6...,Yes,No,97.3,3.5,active,2025-08-31T12:00:00Z
5,512644,Will Trump repeal Presidential term limits?,Yes,No,1.1,99.1,active,2025-07-31T12:00:00Z
89,516941,Trump divorce in 2025?,Yes,No,4.0,96.1,active,2025-12-31T12:00:00Z
92,516944,Will Donald Trump win Nobel Peace Prize in 2025?,Yes,No,6.2,94.0,active,2025-12-31T12:00:00Z
140,517310,"Will Trump deport less than 250,000?",Yes,No,21.8,79.3,active,2025-12-31T12:00:00Z
...,...,...,...,...,...,...,...,...
2964,558549,Will Trump agree to a tariff agreement with Fr...,Yes,No,3.0,97.6,active,2025-08-01T00:00:00Z
2965,558550,Will Trump agree to a tariff agreement with Br...,Yes,No,26.0,81.0,active,2025-08-01T00:00:00Z
2966,558551,Will Trump agree to a tariff agreement with Ar...,Yes,No,51.0,65.0,active,2025-08-01T00:00:00Z
2967,558552,Will Trump agree to a tariff agreement with Is...,Yes,No,40.0,63.0,active,2025-08-01T00:00:00Z
