In [None]:
import os
import pandas as pd
import pyarrow.parquet as pq

def merge_parquet_files(input_folder, output_file):
    # 입력 폴더 내의 모든 parquet 파일 목록 가져오기
    parquet_files = [os.path.join(input_folder, f) for f in os.listdir(input_folder) if f.endswith('.parquet')]
    
    # 모든 parquet 파일 읽어서 데이터프레임 리스트에 저장
    dataframes = []
    for file in parquet_files:
        df = pd.read_parquet(file)
        dataframes.append(df)
    
    # 데이터프레임 병합
    merged_df = pd.concat(dataframes, ignore_index=True)
    
    # 병합된 데이터프레임을 parquet 파일로 저장
    merged_df.to_parquet(output_file)
    print(f"Merged {len(parquet_files)} files into {output_file}")

# 예시 사용법
input_folder = 'parquets'
output_file = 'T5_results.parquet'
merge_parquet_files(input_folder, output_file)


In [None]:
target = r'C:\Users\wjdrb\vscode_code\MathBridge_new\data\until_gpt_not_len_5_cleaned_unique_eq.parquet'
df = pd.read_parquet(target)
df

In [None]:
T5 = pd.read_parquet('T5_results.parquet')

In [None]:
input = pd.read_parquet('T5_allocate.parquet')

In [None]:
input

In [None]:
dataset = pd.read_parquet('T5_allocate.parquet',engine="fastparquet")

In [None]:
dataset

In [None]:
import pandas as pd
import os
import glob

def get_start_index(batch_num, part_num, batch_size, prev_parts_counts):
    # 이전 파트들의 개수를 합산하여 시작 인덱스 계산
    start_index = batch_num * batch_size + sum(prev_parts_counts[:part_num])
    return start_index

def load_and_process_file(filename, dataset, batch_size, prev_parts_counts):
    # 파일 로드
    df_spoken = pd.read_parquet(filename, engine='fastparquet')
    
    # 파일명에서 배치 번호와 파트 번호 추출
    batch_num = int(filename.split('batch')[1].split('_')[0])
    part_num = int(filename.split('part')[1].split('.')[0])
    
    # 시작 인덱스 계산
    start_idx = get_start_index(batch_num, part_num, batch_size, prev_parts_counts[batch_num])
    
    # `equation` 데이터 추가
    df_spoken['equation'] = dataset['equation'].iloc[start_idx:start_idx + len(df_spoken)].reset_index(drop=True)
    
    # 결과 반환
    return df_spoken

def main(dataset):
    # 모든 parquet 파일을 찾아 처리
    files = glob.glob('parquets/*.parquet')
    num_models = 25  # 모델 수
    num_samples = len(dataset)
    batch_size = num_samples // num_models
    
    # 각 배치의 파트 수 저장 (파트 수는 파일 수에서 추론)
    prev_parts_counts = [[] for _ in range(num_models)]
    for file in files:
        batch_num = int(file.split('batch')[1].split('_')[0])
        part_num = int(file.split('part')[1].split('.')[0])
        if len(prev_parts_counts[batch_num]) <= part_num:
            prev_parts_counts[batch_num].extend([0] * (part_num + 1 - len(prev_parts_counts[batch_num])))
        prev_parts_counts[batch_num][part_num] = pd.read_parquet(file, engine='fastparquet').shape[0]
    
    results = [load_and_process_file(file, dataset, batch_size, prev_parts_counts) for file in files]
    
    # 모든 결과를 하나의 데이터프레임으로 합치기
    final_result = pd.concat(results, ignore_index=True)
    return final_result

# 가정: dataset이 이미 로드되어 있음
final_df = main(dataset)
print(final_df.head())


In [None]:
final_df

In [45]:
target = r'C:\Users\wjdrb\vscode_code\MathBridge_new\data\until_gpt_not_len_5_cleaned_unique_eq.parquet'
df = pd.read_parquet(target, engine='fastparquet')
df

Unnamed: 0,context_before,equation,context_after,eq_type,paper_number,paper_type,equation_len,spoken_English
0,The horizontal axis represents the exponent range,"$ \in [ 3 , 7 ] $",. We selected those categorical colors from Co...,0,2301.00002,arxiv2023,17,Belongs to the interval from 3 to 7.
1,Here,$ d\eta $,"denotes the system noise , modeled as a Wiener...",0,2301.00005,arxiv2023,9,d eta.
2,The agent 's actions,$ a ( t ) $,are modeled by a stochastic control process wi...,0,2301.00005,arxiv2023,11,a function of t
3,are modeled by a stochastic control process wi...,$ \sigma^2_t $,controlled by the agent and with a mean of zer...,0,2301.00005,arxiv2023,14,sigma squared sub t.
4,Here,$ \deltat $,"denotes the physical time step , and we adopte...",0,2301.00005,arxiv2023,11,delta t.
...,...,...,...,...,...,...,...,...
12052503,The coordinates of a projected demeaned data p...,$ \Gamma_ { k } =\Sigma_ { k } V_ { k } ^ { T } $,,0,Understanding Linear Algebra,textbook,49,
12052504,Let 's now study the,$ 48^ { th } $,"case , which is represented by the column of",0,Understanding Linear Algebra,textbook,14,
12052505,fitting the points,"$ $ ( 0,1 ) , ( 1,0 ) , ( 2,1.5 ) , ( 3,4 ) , ...",1,1,Understanding Linear Algebra,textbook,59,
12052506,Suppose that,$ \mathbf { u } =\left [ \begin { array } { c ...,and,0,Understanding Linear Algebra,textbook,79,


In [46]:
df['spoken_English'].isna().sum()

11142308

In [47]:
final_df

Unnamed: 0,spoken_English,equation
0,succ zero.,$ \succ \0 $
1,comma j in J plus,"$ , \quad j \in J^+ \ , , $"
2,Minus one times X sub d.,$ -\1 \odot X_d $
3,comma j in J minus,"$ , \quad j \in J^- \ , , $"
4,"Open parenthesis, one box plus negative one, c...",$ ( \1 \boxplus -\1 ) \odot X_d $
...,...,...
2216395,P times the derivative of Phi sub 1 times Max ...,$ \PWord \Max_1 \Max_2 = \PWord' \Max_1 \Max_2 $
2216396,P WORd equals QWord C multiplied by Witness of...,$ \PWord ' = \QWord \Conc \Witness { u } { u }...
2216397,P a r P open parenthesis i d close parenthesis...,"$ \Par { \P ( \Monoid ) , \CoveringRT_1 } $"
2216398,P a r P open parenthesis i d close parenthesis...,"$ \Par { \P ( \Monoid ) , \CoveringRT_2 } $"


In [48]:
# 'equation' 열을 기준으로 병합
merged_df = df.merge(final_df[['equation', 'spoken_English']], on='equation', suffixes=('', '_temp'), how='left')

# 'spoken_English' 열 업데이트
df['spoken_English'] = merged_df['spoken_English_temp'].combine_first(df['spoken_English'])


In [49]:
df['spoken_English'].isna().sum()

8925908

In [50]:
df

Unnamed: 0,context_before,equation,context_after,eq_type,paper_number,paper_type,equation_len,spoken_English
0,The horizontal axis represents the exponent range,"$ \in [ 3 , 7 ] $",. We selected those categorical colors from Co...,0,2301.00002,arxiv2023,17,Belongs to the interval from 3 to 7.
1,Here,$ d\eta $,"denotes the system noise , modeled as a Wiener...",0,2301.00005,arxiv2023,9,d eta.
2,The agent 's actions,$ a ( t ) $,are modeled by a stochastic control process wi...,0,2301.00005,arxiv2023,11,a function of t
3,are modeled by a stochastic control process wi...,$ \sigma^2_t $,controlled by the agent and with a mean of zer...,0,2301.00005,arxiv2023,14,sigma squared sub t.
4,Here,$ \deltat $,"denotes the physical time step , and we adopte...",0,2301.00005,arxiv2023,11,delta t.
...,...,...,...,...,...,...,...,...
12052503,The coordinates of a projected demeaned data p...,$ \Gamma_ { k } =\Sigma_ { k } V_ { k } ^ { T } $,,0,Understanding Linear Algebra,textbook,49,
12052504,Let 's now study the,$ 48^ { th } $,"case , which is represented by the column of",0,Understanding Linear Algebra,textbook,14,
12052505,fitting the points,"$ $ ( 0,1 ) , ( 1,0 ) , ( 2,1.5 ) , ( 3,4 ) , ...",1,1,Understanding Linear Algebra,textbook,59,
12052506,Suppose that,$ \mathbf { u } =\left [ \begin { array } { c ...,and,0,Understanding Linear Algebra,textbook,79,


In [55]:
df_filtered = df.filter(['equation', 'spoken_English'])
df_filtered

Unnamed: 0,equation,spoken_English
0,"$ \in [ 3 , 7 ] $",Belongs to the interval from 3 to 7.
1,$ d\eta $,d eta.
2,$ a ( t ) $,a function of t
3,$ \sigma^2_t $,sigma squared sub t.
4,$ \deltat $,delta t.
...,...,...
12052503,$ \Gamma_ { k } =\Sigma_ { k } V_ { k } ^ { T } $,
12052504,$ 48^ { th } $,
12052505,"$ $ ( 0,1 ) , ( 1,0 ) , ( 2,1.5 ) , ( 3,4 ) , ...",
12052506,$ \mathbf { u } =\left [ \begin { array } { c ...,


In [51]:
target = r'C:\Users\wjdrb\vscode_code\MathBridge_new\data\df_not_len_5_cleaned.parquet'
df_origin_without_5 = pd.read_parquet(target, engine='fastparquet')
df_origin_without_5

Unnamed: 0,context_before,equation,context_after,eq_type,paper_number,paper_type,equation_len
0,The horizontal axis represents the exponent range,"$ \in [ 3 , 7 ] $",. We selected those categorical colors from Co...,0,2301.00002,arxiv2023,17
1,Here,$ d\eta $,"denotes the system noise , modeled as a Wiener...",0,2301.00005,arxiv2023,9
2,The agent 's actions,$ a ( t ) $,are modeled by a stochastic control process wi...,0,2301.00005,arxiv2023,11
3,are modeled by a stochastic control process wi...,$ \sigma^2_t $,controlled by the agent and with a mean of zer...,0,2301.00005,arxiv2023,14
4,Here,$ \deltat $,"denotes the physical time step , and we adopte...",0,2301.00005,arxiv2023,11
...,...,...,...,...,...,...,...
33014810,What does the product,$ A\mathbf { v } $,represent ? Use the following cell to evaluate...,0,Understanding Linear Algebra,textbook,18
33014811,gives matrices so that,$ PA=LU $,,0,Understanding Linear Algebra,textbook,9
33014812,The,$ QR $,factorization of A is A,0,Understanding Linear Algebra,textbook,6
33014813,,$ RGB $,"color model , 179",0,Understanding Linear Algebra,textbook,7


In [56]:
df_merged_without_5 = pd.merge(df_origin_without_5, df_filtered, on='equation', how='left')
df_merged_without_5

Unnamed: 0,context_before,equation,context_after,eq_type,paper_number,paper_type,equation_len,spoken_English
0,The horizontal axis represents the exponent range,"$ \in [ 3 , 7 ] $",. We selected those categorical colors from Co...,0,2301.00002,arxiv2023,17,Belongs to the interval from 3 to 7.
1,Here,$ d\eta $,"denotes the system noise , modeled as a Wiener...",0,2301.00005,arxiv2023,9,d eta.
2,The agent 's actions,$ a ( t ) $,are modeled by a stochastic control process wi...,0,2301.00005,arxiv2023,11,a function of t
3,are modeled by a stochastic control process wi...,$ \sigma^2_t $,controlled by the agent and with a mean of zer...,0,2301.00005,arxiv2023,14,sigma squared sub t.
4,Here,$ \deltat $,"denotes the physical time step , and we adopte...",0,2301.00005,arxiv2023,11,delta t.
...,...,...,...,...,...,...,...,...
33014810,What does the product,$ A\mathbf { v } $,represent ? Use the following cell to evaluate...,0,Understanding Linear Algebra,textbook,18,
33014811,gives matrices so that,$ PA=LU $,,0,Understanding Linear Algebra,textbook,9,P A equals L times U.
33014812,The,$ QR $,factorization of A is A,0,Understanding Linear Algebra,textbook,6,QR
33014813,,$ RGB $,"color model , 179",0,Understanding Linear Algebra,textbook,7,RGB: None


In [57]:
target = r'C:\Users\wjdrb\vscode_code\MathBridge_new\data\df_len_5.parquet'
df_5 = pd.read_parquet(target, engine='fastparquet')
df_5

Unnamed: 0,context_before,equation,context_after,eq_type,paper_number,paper_type,equation_len,spoken_English
1,Figures on the top row are magnified views of ...,$ 1 $,", marked by orange-box on the left image , and...",0,2301.00002,arxiv2023,5,one
2,and the bottom row shows region,$ 2 $,". With white background , the white cylinders ...",0,2301.00002,arxiv2023,5,two
4,Experiment II : examples of selected exponent ...,$ 3 $,",",0,2301.00002,arxiv2023,5,three
5,,$ 5 $,", and",0,2301.00002,arxiv2023,5,five
6,and,$ 7 $,( from the second left to right ) . We could s...,0,2301.00002,arxiv2023,5,seven
...,...,...,...,...,...,...,...,...
48780808,T `` ` * Find the singular values of,$ A $,and use them to determine the variance in the ...,0,Understanding Linear Algebra,textbook,5,A
48780809,* For what fraction of the variance do the fir...,$ A $,and construct the,0,Understanding Linear Algebra,textbook,5,A
48780811,matrix,$ B $,whose entries are the coordinates of the demea...,0,Understanding Linear Algebra,textbook,5,B
48780814,Evaluating the following cell will load the vo...,$ A $,,0,Understanding Linear Algebra,textbook,5,A


In [64]:
df_combined = pd.concat([df_merged_without_5, df_5], ignore_index=True)
df_combined

Unnamed: 0,context_before,equation,context_after,eq_type,paper_number,paper_type,equation_len,spoken_English
0,The horizontal axis represents the exponent range,"$ \in [ 3 , 7 ] $",. We selected those categorical colors from Co...,0,2301.00002,arxiv2023,17,Belongs to the interval from 3 to 7.
1,Here,$ d\eta $,"denotes the system noise , modeled as a Wiener...",0,2301.00005,arxiv2023,9,d eta.
2,The agent 's actions,$ a ( t ) $,are modeled by a stochastic control process wi...,0,2301.00005,arxiv2023,11,a function of t
3,are modeled by a stochastic control process wi...,$ \sigma^2_t $,controlled by the agent and with a mean of zer...,0,2301.00005,arxiv2023,14,sigma squared sub t.
4,Here,$ \deltat $,"denotes the physical time step , and we adopte...",0,2301.00005,arxiv2023,11,delta t.
...,...,...,...,...,...,...,...,...
40127014,T `` ` * Find the singular values of,$ A $,and use them to determine the variance in the ...,0,Understanding Linear Algebra,textbook,5,A
40127015,* For what fraction of the variance do the fir...,$ A $,and construct the,0,Understanding Linear Algebra,textbook,5,A
40127016,matrix,$ B $,whose entries are the coordinates of the demea...,0,Understanding Linear Algebra,textbook,5,B
40127017,Evaluating the following cell will load the vo...,$ A $,,0,Understanding Linear Algebra,textbook,5,A


In [65]:
df_cleaned = df_combined[df_combined['spoken_English'].notna() & 
        (df_combined['spoken_English'] != 'None') & 
        (df_combined['spoken_English'] != 'None.')]

In [66]:
df_cleaned

Unnamed: 0,context_before,equation,context_after,eq_type,paper_number,paper_type,equation_len,spoken_English
0,The horizontal axis represents the exponent range,"$ \in [ 3 , 7 ] $",. We selected those categorical colors from Co...,0,2301.00002,arxiv2023,17,Belongs to the interval from 3 to 7.
1,Here,$ d\eta $,"denotes the system noise , modeled as a Wiener...",0,2301.00005,arxiv2023,9,d eta.
2,The agent 's actions,$ a ( t ) $,are modeled by a stochastic control process wi...,0,2301.00005,arxiv2023,11,a function of t
3,are modeled by a stochastic control process wi...,$ \sigma^2_t $,controlled by the agent and with a mean of zer...,0,2301.00005,arxiv2023,14,sigma squared sub t.
4,Here,$ \deltat $,"denotes the physical time step , and we adopte...",0,2301.00005,arxiv2023,11,delta t.
...,...,...,...,...,...,...,...,...
40127014,T `` ` * Find the singular values of,$ A $,and use them to determine the variance in the ...,0,Understanding Linear Algebra,textbook,5,A
40127015,* For what fraction of the variance do the fir...,$ A $,and construct the,0,Understanding Linear Algebra,textbook,5,A
40127016,matrix,$ B $,whose entries are the coordinates of the demea...,0,Understanding Linear Algebra,textbook,5,B
40127017,Evaluating the following cell will load the vo...,$ A $,,0,Understanding Linear Algebra,textbook,5,A


In [67]:
df_cleaned.to_parquet('MathBridge_ver1', engine='pyarrow')

## 재시작

In [15]:
import pandas as pd
df_cleaned = pd.read_parquet('MathBridge_ver1', engine='fastparquet')

In [16]:
df_cleaned["paper_number"]

0                             2301.00002
1                             2301.00005
2                             2301.00005
3                             2301.00005
4                             2301.00005
                        ...             
40127014    Understanding Linear Algebra
40127015    Understanding Linear Algebra
40127016    Understanding Linear Algebra
40127017    Understanding Linear Algebra
40127018    Understanding Linear Algebra
Name: paper_number, Length: 23392383, dtype: object

In [17]:
df_cleaned['arxiv_number'] = df_cleaned['paper_number'].str.extract('(\d+\.\d+)')
df_cleaned['textbook_title'] = df_cleaned['paper_number'].str.extract('([^\d.]+)')


  df_cleaned['arxiv_number'] = df_cleaned['paper_number'].str.extract('(\d+\.\d+)')
  df_cleaned['textbook_title'] = df_cleaned['paper_number'].str.extract('([^\d.]+)')


In [18]:
df_cleaned

Unnamed: 0,context_before,equation,context_after,eq_type,paper_number,paper_type,equation_len,spoken_English,arxiv_number,textbook_title
0,The horizontal axis represents the exponent range,"$ \in [ 3 , 7 ] $",. We selected those categorical colors from Co...,0,2301.00002,arxiv2023,17,Belongs to the interval from 3 to 7.,2301.00002,
1,Here,$ d\eta $,"denotes the system noise , modeled as a Wiener...",0,2301.00005,arxiv2023,9,d eta.,2301.00005,
2,The agent 's actions,$ a ( t ) $,are modeled by a stochastic control process wi...,0,2301.00005,arxiv2023,11,a function of t,2301.00005,
3,are modeled by a stochastic control process wi...,$ \sigma^2_t $,controlled by the agent and with a mean of zer...,0,2301.00005,arxiv2023,14,sigma squared sub t.,2301.00005,
4,Here,$ \deltat $,"denotes the physical time step , and we adopte...",0,2301.00005,arxiv2023,11,delta t.,2301.00005,
...,...,...,...,...,...,...,...,...,...,...
40127014,T `` ` * Find the singular values of,$ A $,and use them to determine the variance in the ...,0,Understanding Linear Algebra,textbook,5,A,,Understanding Linear Algebra
40127015,* For what fraction of the variance do the fir...,$ A $,and construct the,0,Understanding Linear Algebra,textbook,5,A,,Understanding Linear Algebra
40127016,matrix,$ B $,whose entries are the coordinates of the demea...,0,Understanding Linear Algebra,textbook,5,B,,Understanding Linear Algebra
40127017,Evaluating the following cell will load the vo...,$ A $,,0,Understanding Linear Algebra,textbook,5,A,,Understanding Linear Algebra


In [19]:
df_cleaned = df_cleaned.drop('paper_number', axis=1)


In [20]:
df_cleaned

Unnamed: 0,context_before,equation,context_after,eq_type,paper_type,equation_len,spoken_English,arxiv_number,textbook_title
0,The horizontal axis represents the exponent range,"$ \in [ 3 , 7 ] $",. We selected those categorical colors from Co...,0,arxiv2023,17,Belongs to the interval from 3 to 7.,2301.00002,
1,Here,$ d\eta $,"denotes the system noise , modeled as a Wiener...",0,arxiv2023,9,d eta.,2301.00005,
2,The agent 's actions,$ a ( t ) $,are modeled by a stochastic control process wi...,0,arxiv2023,11,a function of t,2301.00005,
3,are modeled by a stochastic control process wi...,$ \sigma^2_t $,controlled by the agent and with a mean of zer...,0,arxiv2023,14,sigma squared sub t.,2301.00005,
4,Here,$ \deltat $,"denotes the physical time step , and we adopte...",0,arxiv2023,11,delta t.,2301.00005,
...,...,...,...,...,...,...,...,...,...
40127014,T `` ` * Find the singular values of,$ A $,and use them to determine the variance in the ...,0,textbook,5,A,,Understanding Linear Algebra
40127015,* For what fraction of the variance do the fir...,$ A $,and construct the,0,textbook,5,A,,Understanding Linear Algebra
40127016,matrix,$ B $,whose entries are the coordinates of the demea...,0,textbook,5,B,,Understanding Linear Algebra
40127017,Evaluating the following cell will load the vo...,$ A $,,0,textbook,5,A,,Understanding Linear Algebra


In [21]:
df_cleaned.to_parquet('MathBridge_ver2', engine='pyarrow')

In [9]:
# 'arxiv_number' 열에서 고유한 값만 추출
unique_arxiv_numbers = df_cleaned['arxiv_number'].drop_duplicates()

# 새 데이터프레임 생성
df_arxiv_number = pd.DataFrame(unique_arxiv_numbers)

# 데이터프레임 인덱스 재설정 (선택적)
df_arxiv_number.reset_index(drop=True, inplace=True)
df_arxiv_number

Unnamed: 0,arxiv_number
0,2301.00002
1,2301.00005
2,2301.00006
3,2301.00007
4,2301.00008
...,...
141027,2312.17641
141028,2312.1766
141029,2312.17674
141030,2312.17712


## 분야

In [77]:
import requests

def fetch_arxiv_category(arxiv_id):
    url = f'http://export.arxiv.org/api/query?id_list={arxiv_id}'
    response = requests.get(url)
    if response.status_code == 200:
        # API로부터 응답 받기
        data = response.text
        # 응답에서 분야 정보 추출하기
        start = data.find('<arxiv:primary_category')
        start = data.find('term="', start) + 6
        end = data.find('"', start)
        category = data[start:end]
        return category
    else:
        return "API 요청에 실패하였습니다."

# 예시 사용법
arxiv_id = "1707.01495"
category = fetch_arxiv_category(arxiv_id)
print(f'논문 분야: {category}')


논문 분야: cs.LG


In [78]:
import pandas as pd
import requests

def fetch_arxiv_category(arxiv_id):
    url = f'http://export.arxiv.org/api/query?id_list={arxiv_id}'
    response = requests.get(url)
    if response.status_code == 200:
        # API로부터 응답 받기
        data = response.text
        # 응답에서 분야 정보 추출하기
        start = data.find('<arxiv:primary_category')
        start = data.find('term="', start) + 6
        end = data.find('"', start)
        category = data[start:end]
        print(category)
        return category
    else:
        return None  # API 요청 실패 시 None 반환


# 각 논문 번호에 대해 분야를 조회하고 새 열로 저장
df_arxiv_number['category'] = df_arxiv_number['arxiv_number'].apply(fetch_arxiv_category)



cs.HC
cs.AI
cs.HC
cs.LG
cs.LG
cs.LG
cs.NE


KeyboardInterrupt: 

In [11]:
import pandas as pd
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

def fetch_arxiv_category(arxiv_id):
    url = f'http://export.arxiv.org/api/query?id_list={arxiv_id}'
    try:
        response = requests.get(url)
        response.raise_for_status()  # 응답 코드가 오류를 나타내면 예외 발생
        data = response.text
        start = data.find('<arxiv:primary_category')
        start = data.find('term="', start) + 6
        end = data.find('"', start)
        category = data[start:end]
        print(category)
        return category
    except requests.RequestException as e:
        print(f"Request failed for {arxiv_id}: {e}")
        return None

def fetch_categories(paper_ids):
    # 최대 스레드 수를 지정하여 ThreadPoolExecutor 생성
    with ThreadPoolExecutor(max_workers=20) as executor:
        # 각 paper_id에 대한 fetch_arxiv_category 함수 호출 스케줄링
        future_to_id = {executor.submit(fetch_arxiv_category, pid): pid for pid in paper_ids}
        # 완료되면 결과 수집
        for future in as_completed(future_to_id):
            arxiv_id = future_to_id[future]
            try:
                yield arxiv_id, future.result()
            except Exception as exc:
                print(f'{arxiv_id} generated an exception: {exc}')

# DataFrame에 있는 paper_number에 대해 분야를 조회
categories = {pid: cat for pid, cat in fetch_categories(df_arxiv_number['arxiv_number'])}

# DataFrame에 category 열 추가
df_arxiv_number['category'] = df_arxiv_number['arxiv_number'].map(categories)
df_arxiv_number

cs.HC
cs.AI
cs.LG
cs.NE
cs.LG
cs.HC
cs.LG
physics.ed-ph
cs.LG
cs.LG
gr-qc
astro-ph.GA
quant-ph
astro-ph.HE
cs.HC
cond-mat.mes-hall
cs.CV
gr-qc
astro-ph.HE
hep-th
physics.flu-dyn
cs.LG
math-ph
cs.LG
math.QA
hep-lat
hep-th
stat.OT
hep-ph
quant-ph
quant-ph
cs.FL
gr-qc
quant-ph
cs.SE
stat.AP
cs.NI
cs.HC
math.OC
astro-ph.SR
stat.ME
cs.CL
cs.CL
cs.OS
cs.CC
math.NA
cs.IT
hep-ph
physics.flu-dyn
stat.AP
math.PR
hep-ph
physics.soc-ph
cond-mat.supr-con
cs.LG
math.OC
stat.ML
math.ATmath.GT

cond-mat.quant-gas
cond-mat.str-elcond-mat.mtrl-sci

cond-mat.mes-hall
cs.CR
cs.LG
math.ST
math.CO
cs.SI
cs.LG
astro-ph.HE
cond-mat.mtrl-sci
math.AP
physics.app-ph
quant-ph
eess.IV
cond-mat.str-el
physics.ed-ph
cond-mat.mtrl-sci
math.CO
cs.CC
cond-mat.mtrl-sci
quant-ph
math.ST
cs.HC
cond-mat.mtrl-sci
cs.LG
astro-ph.SR
cs.CV
cs.CV
q-bio.NC
cs.CV
physics.soc-ph
cs.LG
cs.CL
math.AG
cond-mat.stat-mech
cs.IT
physics.plasm-ph
eess.SP
math.AP
cs.SI
astro-ph.SR
math.OC
math.NA
cond-mat.mtrl-sci
cond-mat.soft
cs.LG
math

In [9]:
import requests
import feedparser
import pandas as pd
import time

def fetch_arxiv_metadata(start_index, max_results):
    base_url = 'http://export.arxiv.org/api/query?'
    year = 2023
    query = f'search_query=submittedDate:[{year}01010000+TO+{year}12312359]&sortBy=submittedDate&sortOrder=ascending'
    url = f'{base_url}{query}&start={start_index}&max_results={max_results}'
    
    response = requests.get(url)
    feed = feedparser.parse(response.content)
    
    papers = []
    for entry in feed.entries:
        paper_id = entry.id.split('/abs/')[-1]
        category = entry.arxiv_primary_category['term']
        papers.append({'id': paper_id, 'category': category})
    
    return papers

# 160,000개 데이터를 수집하기 위한 함수
def collect_all_papers(total_papers, max_per_request):
    all_papers = []
    for start in range(0, total_papers, max_per_request):
        print(f"Fetching papers {start + 1} to {start + max_per_request}...")
        current_papers = fetch_arxiv_metadata(start, max_per_request)
        all_papers.extend(current_papers)
        time.sleep(3)  # API 요청 사이에 약간의 지연을 두어 서버에 부하를 주지 않도록 합니다.
        if len(current_papers) < max_per_request:
            break  # 모든 데이터를 가져온 경우 루프를 종료합니다.
    
    return all_papers

# 데이터 수집 실행
total_required_papers = 160000
max_papers_per_request = 1600
papers_2023 = collect_all_papers(total_required_papers, max_papers_per_request)

# 데이터프레임으로 변환
papers_df = pd.DataFrame(papers_2023)
papers_df


Fetching papers 1 to 1600...
Fetching papers 1601 to 3200...


Unnamed: 0,id,category
0,2301.00309v1,math.CO
1,2301.00310v2,cs.SI
2,2301.00311v2,gr-qc
3,2301.00312v2,cs.SI
4,2301.00313v2,astro-ph.SR
...,...,...
1595,2301.01936v1,math.PR
1596,2301.02631v3,gr-qc
1597,2301.01768v1,cs.CL
1598,2301.01937v1,astro-ph.GA


In [2]:
import matplotlib.pyplot as plt
csfont = {'fontname':'Times New Roman'}

plt.rcParams['font.family'] = 'Times New Roman' # 예: 'serif', 'sans-serif', 'monospace' 등
plt.rcParams['font.serif'] = 'Times New Roman' # 특정 글꼴로 지정
plt.rcParams['font.size'] = 30  # 글꼴 크기를 12pt로 설정4
fontsize = plt.rcParams['font.size']





# 'category'에서 주 분야 추출
papers_df['primary_category'] = papers_df['category'].apply(lambda x: x.split('.')[0])

# 각 주 분야의 개수 계산
category_counts = papers_df['primary_category'].value_counts()

# 전체 논문 수 계산
total_papers = category_counts.sum()

# 3% 이하인 카테고리를 'etc'로 통합
threshold = 0.06 * total_papers
category_counts = category_counts[category_counts >= threshold]
category_counts['etc'] = papers_df['primary_category'].value_counts()[papers_df['primary_category'].value_counts() < threshold].sum()

# 원 그래프 그리기
plt.figure(figsize=(8, 8))
plt.pie(category_counts, labels=category_counts.index, autopct='%1.1f%%', startangle=140)
#plt.title('Primary Category Distribution as Pie Chart')
plt.savefig('AAAI_Pie Chart.pdf', format='pdf', dpi=300)  # DPI 설정으로 해상도 조정
plt.show()
plt.close()

NameError: name 'papers_df' is not defined

In [22]:
df = pd.read_parquet('MathBridge_ver2', engine='fastparquet')

In [23]:
df.columns

Index(['context_before', 'equation', 'context_after', 'eq_type', 'paper_type',
       'equation_len', 'spoken_English', 'arxiv_number', 'textbook_title'],
      dtype='object')

In [24]:
df = df.drop(columns=['eq_type','paper_type','arxiv_number','textbook_title'])

In [26]:
df.columns

Index(['context_before', 'equation', 'context_after', 'equation_len',
       'spoken_English'],
      dtype='object')

In [27]:
order = ['context_before', 'equation', 'context_after', 'spoken_English', 'equation_len']
df = df[order]

In [28]:
df

Unnamed: 0,context_before,equation,context_after,spoken_English,equation_len
0,The horizontal axis represents the exponent range,"$ \in [ 3 , 7 ] $",. We selected those categorical colors from Co...,Belongs to the interval from 3 to 7.,17
1,Here,$ d\eta $,"denotes the system noise , modeled as a Wiener...",d eta.,9
2,The agent 's actions,$ a ( t ) $,are modeled by a stochastic control process wi...,a function of t,11
3,are modeled by a stochastic control process wi...,$ \sigma^2_t $,controlled by the agent and with a mean of zer...,sigma squared sub t.,14
4,Here,$ \deltat $,"denotes the physical time step , and we adopte...",delta t.,11
...,...,...,...,...,...
40127014,T `` ` * Find the singular values of,$ A $,and use them to determine the variance in the ...,A,5
40127015,* For what fraction of the variance do the fir...,$ A $,and construct the,A,5
40127016,matrix,$ B $,whose entries are the coordinates of the demea...,B,5
40127017,Evaluating the following cell will load the vo...,$ A $,,A,5


In [29]:
df.to_parquet('MathBridge_ver3.parquet', engine='pyarrow')