In [1]:
import os
import json
import pandas

## Situated QA

In [2]:
PATH = '/mnt/hlilabshare/jjunshim/data/'
files = (_ for _ in os.listdir(PATH) if 'json' == _.split('.')[-1])


In [None]:
data = []

for file in files:
    with open(
        os.path.join(PATH, file),
        'r'
    ) as fp:
        df = pandas.read_json(fp, lines=True)
    df['snap'] = file.split('_')[1]
    data.append(df)

df = pandas.concat(
    data,
    axis=0,
    ignore_index=True
)

del data
df.info()

In [4]:
print(df.index, df.size, df.memory_usage().sum())


RangeIndex(start=0, stop=10097860, step=1) 70685020 565480292


In [5]:
df.sample(5)


Unnamed: 0,id,timestamp,text,title,infobox,wikitable,snap
4101265,33491790,2021-01-14,"the church of st michael in alnham, in the eng...","Church of St Michael, Alnham","infobox church |name = st michael's church, al...",,20211220
9124390,61290903,2023-01-09,"staroyantuzovo (; , ""iske yandız"") is a rural ...",Staroyantuzovo,,,20231220
9375082,70384279,2023-10-12,kalfou danjere is an album by the haitian band...,Kalfou Danjere,,,20231220
1661574,418595,2018-09-04,The Hugo Award for Best Novelette is one of th...,Hugo Award for Best Novelette,,{| wikitable 1em auto 1em auto |- ! ! ! ! ...,20181220
3425126,20173750,2021-12-05,Balado railway station served the villages of ...,Balado railway station,Infobox station \n name = Balado \n status = D...,,20211220


In [6]:
df.groupby('snap').size()


snap
20181220    2807425
20211220    3493050
20231220    3797385
dtype: int64

In [9]:
df['snap'] = pandas.to_datetime(df.snap)

df.sort_values(
    by=['id', 'snap'],
    ascending=True,
    inplace=True
)

In [8]:
df.to_parquet(
    os.path.join(PATH, 'situated_qa/situated_qa_cleansed.parquet'),
    index=False
)

In [3]:
df = pandas.read_parquet(os.path.join(PATH, 'situated_qa/situated_qa_cleansed.parquet'))

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10097860 entries, 0 to 10097859
Data columns (total 7 columns):
 #   Column     Dtype         
---  ------     -----         
 0   id         int64         
 1   timestamp  datetime64[ns]
 2   text       object        
 3   title      object        
 4   infobox    object        
 5   wikitable  object        
 6   snap       datetime64[ns]
dtypes: datetime64[ns](2), int64(1), object(4)
memory usage: 539.3+ MB


In [5]:
from rapidfuzz import fuzz

In [6]:
from itertools import combinations

In [7]:
def compare_within_group(group):
    docs = group[["id", "text"]].values
    results = []
    for (id, text1), (_, text2) in combinations(docs, 2):
        score = fuzz.token_set_ratio(text1, text2)
        results.append({
            "group_id": id,
            "similarity": score
        })
    return pandas.DataFrame(results)

In [8]:
similarity_df = df.groupby("id").apply(compare_within_group).reset_index(drop=True)

  similarity_df = df.groupby("id").apply(compare_within_group).reset_index(drop=True)


In [9]:
similarity_df.to_json(
    'situated_qa_similarity.json',
    orient='records',
    lines=True
)

In [8]:
fuzz.ratio("this is a test", "this is a test!")


97

In [6]:
data = df[df.timestamp.dt.year == df.snap.dt.year]
temp = data.groupby('id').count()['timestamp'] >= 3
temp = temp[temp].sample(1000).index
data = df[df['id'].isin(temp)]

data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3000 entries, 4113 to 8760558
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   id         3000 non-null   int64         
 1   timestamp  3000 non-null   datetime64[ns]
 2   text       3000 non-null   object        
 3   title      3000 non-null   object        
 4   infobox    3000 non-null   object        
 5   wikitable  3000 non-null   object        
 6   snap       3000 non-null   datetime64[ns]
dtypes: datetime64[ns](2), int64(1), object(4)
memory usage: 187.5+ KB


In [7]:
data.to_parquet(
    os.path.join(PATH, 'situated_qa/situated_qa_sample_id1000.parquet'),
    index=False
)

## Templama

In [2]:
PATH = '/mnt/hlilabshare/jjunshim/data/temporal-alignment-qa'
df = pandas.read_json(os.path.join(PATH, 'test.jsonl'), lines=True, nrows=100)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   question           100 non-null    object 
 1   answer             100 non-null    object 
 2   title              100 non-null    object 
 3   section            100 non-null    object 
 4   caption            100 non-null    object 
 5   table              100 non-null    object 
 6   column             100 non-null    object 
 7   time_col           100 non-null    object 
 8   id                 100 non-null    int64  
 9   pageview           100 non-null    object 
 10  avg_pageview       100 non-null    float64
 11  numerical_density  100 non-null    float64
 12  date_density       100 non-null    float64
 13  year_density       100 non-null    int64  
 14  answer_per_year    100 non-null    float64
 15  word_per_answer    100 non-null    float64
 16  answer_frequency   100 non-

In [3]:
df.sample()

Unnamed: 0,question,answer,title,section,caption,table,column,time_col,id,pageview,avg_pageview,numerical_density,date_density,year_density,answer_per_year,word_per_answer,answer_frequency,answer_option_num,answer_set_num
12,What is the original title of the most recent ...,"{'2000': ['Merci pour le chocolat'], '2001': [...",Louis Delluc Prize,"Winners, Louis Delluc Prize for Best Film, 201...",,"Year,Original title,English title,Director(s),...",[Original title],[Year],9174,"[{'timestamp': '2015070100', 'views': 342}, {'...",299.048544,0.02381,0.0,0,1.083333,3.153846,4.769231,26,24


In [16]:
i = 15
data = [
    {
        'id': df.iloc[i].id,
        'year': year,
        'text': f"In {year}, {df.iloc[i].question} {', '.join(answer)}"
    } for year, answer in df.iloc[i].answer.items()
]
pandas.DataFrame(data).to_parquet(
    os.path.join(PATH, 'sample.parquet'),
    index=False
)

In [12]:
i = 15
df.iloc[i].question

"Who is the leading scorer for the Eastern Michigan Eagles men's basketball team this season?"

In [13]:
df.iloc[i].answer

{'2000': ['Melvin Hicks', 'Calvin Warner'],
 '2001': ['Melvin Hicks', 'Ricky Cottrill'],
 '2002': ['Ryan Prillman', 'Ricky Cottrill'],
 '2003': ['Ryan Prillman', 'Markus Austin'],
 '2004': ['Markus Austin'],
 '2005': ['Markus Austin', 'John Bowler'],
 '2006': ['John Bowler', 'Jesse Bunkley'],
 '2007': ['Carlos Medlock', 'Jesse Bunkley'],
 '2008': ['Brandon Bowdry', 'Carlos Medlock'],
 '2009': ['Brandon Bowdry', 'Carlos Medlock'],
 '2010': ['Brandon Bowdry', 'Carlos Medlock'],
 '2011': ['Darrell Lampley', 'Brandon Bowdry'],
 '2012': ['Darrell Lampley', 'Derek Thompson'],
 '2013': ['Karrington Ward', 'Derek Thompson'],
 '2014': ['Raven Lee', 'Karrington Ward'],
 '2015': ['Raven Lee'],
 '2016': ['Raven Lee', 'Ray Lee'],
 '2017': ['Ray Lee', 'Elijah Minnie'],
 '2018': ['Elijah Minnie', 'Paul Jackson'],
 '2019': ['Ty Groce', 'Paul Jackson'],
 '2020': ['Ty Groce'],
 '2021': ['Ty Groce', 'Noah Farrakhan'],
 '2022': ['Emoni Bates', 'Noah Farrakhan'],
 '2023': ['Emoni Bates']}

In [14]:
[
    {
        'id': df.iloc[i].id,
        'year': year,
        'text': f"In {year}, {df.iloc[i].question} {''.join(answer)}"
    } for year, answer in df.iloc[i].answer.items()
][0]

{'id': np.int64(18609),
 'year': '2000',
 'text': "In 2000, Who is the leading scorer for the Eastern Michigan Eagles men's basketball team this season? Melvin HicksCalvin Warner"}