In [1]:
import gzip
import json
import random
import pandas as pd
# Path to the gzipped JSON Lines file (one JSON record per line)
file_path = 'Arxiv_data/arxiv-abstracts.jsonl.gz'

# Seed the RNG so the same random subset is drawn on every run
random.seed(42)

# Upper bound on the number of records to draw (adjust as needed)
sample_size = 500000

# Read every line of the archive as text
with gzip.open(file_path, 'rt', encoding='utf-8') as file:
    lines = file.readlines()

# random.sample raises ValueError when asked for more items than the
# population holds, so cap the request at the number of lines available.
# When the file has at least sample_size lines, the draw is unchanged.
sampled_lines = random.sample(lines, min(sample_size, len(lines)))

# Parse only the sampled lines into dictionaries
data = [json.loads(line) for line in sampled_lines]

# Release the raw text; only the parsed records are needed from here on
del lines, sampled_lines

# Convert the list of dictionaries into a pandas DataFrame
df = pd.DataFrame(data)

# Report the shape, then end the cell on the preview: a notebook only renders
# a bare expression when it is the last statement of the cell
print(df.shape)
df.head()

(500000, 11)


In [3]:
import sys

# Size budget for the subset, in megabytes
target_size_mb = 25

# The same budget expressed in bytes
target_size_bytes = target_size_mb * (1024 ** 2)

# Running total of the measured sizes of the rows accepted so far
total_size_bytes = 0

# Number of leading rows of df that fit within the budget
n_rows_kept = 0

# Walk the rows in order and stop at the first one that would push the
# running total past the budget.
# NOTE(review): the size is what sys.getsizeof reports for a one-row
# DataFrame, i.e. an in-memory measure; the file written from this subset
# may end up a different size than target_size_mb.
for _, row in df.iterrows():
    row_size_bytes = sys.getsizeof(pd.DataFrame([row]))

    if total_size_bytes + row_size_bytes > target_size_bytes:
        break

    total_size_bytes += row_size_bytes
    n_rows_kept += 1

# Take the accepted rows in a single slice instead of concatenating one
# DataFrame per row: pd.concat raises ValueError on an empty list (no row
# fits, or df is empty), whereas slicing yields an empty frame with the same
# columns. The index is renumbered from 0.
subset_df = df.iloc[:n_rows_kept].reset_index(drop=True)

# Display the subset DataFrame
print(subset_df.head())
print(subset_df.shape)


           id         submitter  \
0  2008.13253      Edgar Galvan   
1   1012.4110       Eoin Butler   
2   0803.0691  Paul E. Gunnells   
3  2111.01041          Tong Liu   
4   1411.6806     Huy Pham Cong   

                                             authors  \
0  Edgar Galv\'an, Oxana Gorshkova, Peter Mooney,...   
1  Gorm B. Andresen, Mohammad D. Ashkezari, Marce...   
2                 Gautam Chinta and Paul E. Gunnells   
3                 Shuang-Xi Yi, Mei Du, and Tong Liu   
4                     C. Huy Pham and V. Lien Nguyen   

                                               title  \
0  Statistical Tree-based Population Seeding for ...   
1                    Search For Trapped Antihydrogen   
2  Constructing Weyl group multiple Dirichlet series   
3  Statistical analyses on the energies of X-ray ...   
4  Tunnelling through finite graphene superlattic...   

                                            comments  \
0                                 14 Pages, 5 Tables   
1  

In [5]:
# Destination of the exported subset (JSON Lines format).
# NOTE(review): this is an absolute, machine-specific path; consider making it
# relative to the notebook, as the input path is.
output_jsonl_path = 'C:\\Users\\kimlu\\3. Semester\\News and Market sentiment\\Exam\\Arxiv_data\\subset_data.jsonl'

# Write one JSON object per row, one row per line
subset_df.to_json(
    output_jsonl_path,
    lines=True,
    orient='records',
)

# Report where the file was written
print(f"Subset DataFrame saved to: {output_jsonl_path}")


Subset DataFrame saved to: C:\Users\kimlu\3. Semester\News and Market sentiment\Exam\Arxiv_data\subset_data.jsonl
