# Pulling Price Data

Intro here ***

In [None]:
import requests as rq

data = []

# The markets endpoint is read one batch at a time by stepping the offset in
# increments of 20. Going batch by batch is slow, but it keeps us under the
# API limit.

for i in range(0, 1):
    offset = i * 20  # Offset sent to the API for this batch
    url = f"https://gamma-api.polymarket.com/markets?offset={offset}"
    print(i)

    response = rq.get(url)  # Request this batch from the API

    # Anything other than 200 OK ends the pull
    if response.status_code != 200:
        print(f"Failed to retrieve data at offset {offset}")
        break

    results = response.json()

    # Keep only the fields we need from each market entry
    data.extend(
        {
            "id": entry.get("id"),
            "question": entry.get("question"),
            'clobTokenIds': entry.get("clobTokenIds"),
            "createdAt": entry.get("createdAt"),
            "closedTime": entry.get("closedTime"),
            "volume": entry.get('volume'),
        }
        for entry in results
    )

print(f"Total entries: {len(data)}")

In [None]:
# Number of market records collected so far
len(data)
# Look at one record by position (raises IndexError if data has fewer than 21075 entries)
data[21074]

{'id': '512467',
 'question': 'Trump transgender sports ban?',
 'clobTokenIds': '["58668210401587939289020765000543806275224107926473315229845019791676629013454", "81936995067248997323251800413823769523809167682153265802173595241649787471091"]',
 'createdAt': '2024-11-08T21:31:27.673555Z',
 'closedTime': None}

In [None]:
# Bind a second name to the collected records. This is an alias, not a copy:
# good_data_save and data refer to the same list object.
good_data_save = data
good_data_save

[{'id': '253241',
  'question': "Will the Kyberswap exploiter's demands be met?",
  'clobTokenIds': '["94215300813633684393979707084001590078516032145107157692225454426960846402510", "18422839170192700589157522418896210198686471407724165685620516772499958137036"]',
  'createdAt': '2023-11-30T16:48:21.52Z',
  'closedTime': '2023-12-16 07:05:17+00',
  'volume': '2703.991728'},
 {'id': '253242',
  'question': 'Will Shein IPO at a $90b valuation?',
  'clobTokenIds': '["88656357696372913125084001877058757403296440646573406149718344363425859314242", "114153085104998354124539851117038947242822317394788066472003897451599877058315"]',
  'createdAt': '2023-11-30T16:58:15.728Z',
  'closedTime': '2024-07-01 08:23:54+00',
  'volume': '7311.887402'},
 {'id': '253243',
  'question': 'Will Kraken IPO by June?',
  'clobTokenIds': '["71591500442209525776652596431402678366355726049348183326767906760151176610830", "6929593591033460707675959441849941911465378234415791355273462580630358096049"]',
  'created

In [None]:
import csv
import pandas as pd

# Build a DataFrame from the list of market dicts (one row per dict)
data_df_good = pd.DataFrame(good_data_save)
print(data_df_good.columns)  # Column names, to confirm every field came through
len(data_df_good)  # Number of rows in the table

Index(['id', 'question', 'clobTokenIds', 'createdAt', 'closedTime', 'volume'], dtype='object')


14026

In [None]:
# Save the market table to CSV; index=False leaves the row index out of the file
data_df_good.to_csv('new_polydata.csv', index=False)

REPLACE EVERYTHING BEFORE THIS POINT WITH AN EARLIER SAVE WHERE WE ACTUALLY PULLED THE DATA

In [None]:
import pandas as pd

# Reload the saved market table from disk
data_df_good = pd.read_csv(r"C:\Users\oogim\Downloads\new_polydata.csv")

# True for rows that have CLOB token IDs, False where the value is NaN
bool_list = data_df_good['clobTokenIds'].notna()

# Keep only the rows with token IDs and renumber them from 0
data_df_best = data_df_good[bool_list].reset_index(drop=True)

There are two CLOB tokens per market, one for each outcome. For example, a token for "yes" and a token for "no" on "Will X Happen?". After saving the data as a CSV, the pair of tokens was stored as a single unseparated string rather than as two separate values, so we had to convert it. 

In [None]:
import json

cleaned_CLOBs = []

# Each clobTokenIds value is a JSON-style list of two quoted numbers, like
# '["num", "num"]'. The numbers are not uniform in length, so instead of slicing
# at fixed positions we parse the string as JSON and convert each ID to an int.

for raw_ids in data_df_best['clobTokenIds']:
    # str() guards against non-string cells; [:2] keeps the first two IDs, and the
    # unpacking raises ValueError if a row holds fewer than two.
    token_a, token_b = json.loads(str(raw_ids))[:2]
    cleaned_CLOBs.append((int(token_a), int(token_b)))  # appending new int CLOBs to list

# Checking nothing was lost
if len(cleaned_CLOBs) != len(data_df_best):
    raise ValueError(
        f"Expected {len(data_df_best)} cleaned CLOB pairs, got {len(cleaned_CLOBs)}"
    )

data_df_best['clobTokenIds'] = cleaned_CLOBs  # replacing old CLOBs with cleaned values
print(data_df_best['clobTokenIds'])

0        (531350724629078801914001407064408677530449899...
1        (239578856151154309223841856612944839895212124...
2        (719297353092590301528700481322625696108672726...
3        (448546879784327080536190768858503310506013419...
4        (362723741995362299664450596440940506732699213...
                               ...                        
20999    (537855030533494087400152786648172286927910530...
21000    (928620714814449982069306488872738618597308474...
21001    (532598916201761901898544512095113341996657747...
21002    (265278293478220398941737479227880899891131149...
21003    (586682104015879392890207650005438062752241079...
Name: clobTokenIds, Length: 21004, dtype: object


Once we had the CLOB token IDs, we were able to pull price data from Polymarket's API. However, after looking at the data, we realized price history didn't exist until market 4720 in our dataset. After that point, the data was filled with .5, sometimes interspersed with actual price data; we're not sure why. The long chains of .5 disappeared at around market 8479 in our dataset. We discovered this through trial and error with the following block of code. 

In [None]:
import requests as rq

# Exploratory probe: fetch the price history for both tokens of a single market and
# print one price point. The market index below was changed by hand between runs.
a, b = data_df_best['clobTokenIds'][4900]  # Market index being probed (good data turned out to start at 8479)
print(a)
url_a = f"https://clob.polymarket.com/prices-history?market={a}&startTs=0&endTs=10000000000"  # Price-history URL for token a
print(url_a)
response_a = rq.get(url_a) # Gets data from the API
print(response_a.status_code)
results_a = response_a.json()
url_b = f"https://clob.polymarket.com/prices-history?market={b}&startTs=0&endTs=10000000000"  # Price-history URL for token b
print(url_b)
response_b = rq.get(url_b) # Gets data from the API
print(response_b.status_code)
results_b = response_b.json()
# Print the 'p' field of the 101st history point for token a
# (raises IndexError if the history has 100 or fewer points)
print(results_a['history'][100]['p'])

38533708059183496482396436337628663016977781241069578366354949231739219895465
https://clob.polymarket.com/prices-history?market=38533708059183496482396436337628663016977781241069578366354949231739219895465&startTs=0&endTs=10000000000
200
https://clob.polymarket.com/prices-history?market=7383078199576823886643038741729069867468150381224481526293786173706803650616&startTs=0&endTs=10000000000
200
0.5


The following block of code is how we pulled the price data for each token. We batched the data into sets of 250 markets, as it was too large to keep in one file. The price data was stored as midpoints, so we only needed to pull the price history for one token ID per pair, since each token's price is equal to 1 minus the other's. 

In [None]:
import requests as rq
import pandas as pd
import math
import orjson

price_data = []

# Chunked download of price histories.
# Markets from `startpoint` to the end of data_df_best are fetched in chunks of
# `chunk_size`, and each chunk is written to its own JSON file.

startpoint = 8479  # Index of the first market to fetch
chunk_size = 250   # Number of markets per output file

total_markets = len(data_df_best)
length = total_markets - startpoint # Calculating the number of markets we will have data for
chunk_total = int(math.ceil(length/chunk_size)) # The total number of files we will have
print(f'You shall receieve {chunk_total} files, get excited!')

# Chunks are numbered from 0 so that the first file starts exactly at `startpoint`
# and the last file ends at the last row of the table.
for loopcount in range(chunk_total):
    chunk_start = startpoint + chunk_size * loopcount
    # Exclusive end of this chunk, clipped so the final chunk stops at the last market
    chunk_end = min(chunk_start + chunk_size, total_markets)

    # The loop inside a loop is slow, but it avoided getting rate limited, which was
    # breaking our loop. It avoided us having to add a wait timer between calls.
    for i in range(chunk_start, chunk_end):
        # Only the first token of each pair is requested; b is not used here
        a, b = data_df_best['clobTokenIds'][i]
        url_a = f"https://clob.polymarket.com/prices-history?market={a}&startTs=0&endTs=10000000000"
        response_a = rq.get(url_a) # Gets data from the API

        # If the call fails, break.
        # NOTE: this only ends the current chunk. Whatever was collected so far is
        # still saved under the full-range filename, and the next chunk is attempted.
        if response_a.status_code != 200:
            print(f"Failed to retrieve data at {i}")
            break

        results_a = response_a.json()
        print(f'Success at entry {i}!')

        # Collecting the specific data we need
        price_data.append({
            'identification': int(data_df_best['id'][i]),
            'CLOB_a': results_a['history']})

    # Title the file with the first and last market index covered by this chunk
    filename = f'price_data_from_{chunk_start}_to_{chunk_end - 1}.json'

    # Saving the file with orjson as it was ~6 times faster than json
    with open(filename, 'wb') as json_file:
        json_file.write(orjson.dumps(price_data, option=orjson.OPT_INDENT_2))
    print(f"Saved: {filename}")

    # Start the next chunk with an empty list
    price_data.clear()

print("Data collection complete")

NameError: name 'data_df_best' is not defined

COPY THE OUTPUT FROM THIS CELL FROM A PREVIOUS FILE