In [9]:
import os
os.getcwd()

'/Users/ming/Desktop/COM_SCI_245/cs245_project/AgentSocietyChallenge/rec_agent_experiment'

In [13]:
import json
import pandas as pd
from tqdm import tqdm
import os

In [27]:
pd.set_option('display.max_columns', None)

In [14]:
def load_jsonlines(file_path, n_rows=None):
    """
    Load a JSON Lines (.json or .jsonl) file into a Pandas DataFrame safely.

    Args:
        file_path (str): Path to the JSON Lines file
        n_rows (int, optional): Limit number of rows to load (for large files)

    Returns:
        pd.DataFrame: Parsed dataframe
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(tqdm(f, desc=f"Loading {os.path.basename(file_path)}")):
            try:
                record = json.loads(line.strip())
                data.append(record)
            except json.JSONDecodeError:
                continue  # skip malformed lines
            if n_rows and i >= n_rows:
                break
    df = pd.DataFrame(data)
    print(f"✅ Loaded {len(df):,} rows × {len(df.columns)} columns.")
    return df

In [33]:
user_df = load_jsonlines('../data/user.json')
item_df = load_jsonlines('../data/item.json')
review_df = load_jsonlines('../data/review.json')

Loading user.json: 889698it [00:04, 202468.27it/s]


✅ Loaded 889,698 rows × 23 columns.


Loading item.json: 358923it [00:11, 30693.34it/s]


✅ Loaded 358,923 rows × 55 columns.


Loading review.json: 5171890it [00:31, 163865.51it/s]


✅ Loaded 5,171,890 rows × 23 columns.


In [35]:
sampled = (
    item_df
    .groupby(['source', 'type'], group_keys=False)
    .apply(lambda x: x.sample(min(len(x), 2), random_state=42))
    .reset_index(drop=True)
)
display(sampled[['source', 'type']].head(10))

  .apply(lambda x: x.sample(min(len(x), 2), random_state=42))


Unnamed: 0,source,type
0,amazon,product
1,amazon,product
2,goodreads,book
3,goodreads,book
4,yelp,business
5,yelp,business


In [62]:
sources = ['yelp','amazon', 'goodreads']
data = {'item': item_df, 'user': user_df, 'review': review_df}

### item df eda

In [129]:
for name, df in data.items():
    for source in sources:
        not_nan_col = df[df.source == source].columns[~df[df.source == source].isna().all()]
        print(f'{name}_{source} columns: {not_nan_col}')

item_yelp columns: Index(['item_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'attributes', 'categories', 'hours', 'source', 'type'],
      dtype='object')
item_amazon columns: Index(['item_id', 'categories', 'source', 'type', 'main_category', 'title',
       'average_rating', 'rating_number', 'features', 'description', 'price',
       'images', 'videos', 'store', 'details', 'subtitle', 'author'],
      dtype='object')
item_goodreads columns: Index(['item_id', 'source', 'type', 'title', 'average_rating', 'description',
       'isbn', 'text_reviews_count', 'series', 'country_code', 'language_code',
       'popular_shelves', 'asin', 'is_ebook', 'kindle_asin', 'similar_books',
       'format', 'link', 'authors', 'publisher', 'num_pages',
       'publication_day', 'isbn13', 'publication_month', 'edition_information',
       'publication_year', 'url', 'image_url', 'ratings_count', 'work_id',
       'title_w

In [105]:
item_df['type']

0         business
1         business
2         business
3         business
4         business
            ...   
358918        book
358919        book
358920        book
358921        book
358922        book
Name: type, Length: 358923, dtype: object

In [None]:
#in items.json, amazon's categories data is list and yelp's categories data is str
cols_to_check = ['type', 'source', 'categories']

for col in cols_to_check:
    mask = item_df[col].apply(lambda x: isinstance(x, (list, dict, set)))
    if mask.any():  # 如果这一列有问题
        print(f"\n🚨 Column '{col}' has {mask.sum()} unhashable elements:")
        for idx, val in item_df.loc[mask, col].items():
            print(f"  Row {idx}: {val}")


🚨 Column 'categories' has 76047 unhashable elements:
  Row 32869: ['Industrial & Scientific', 'Test, Measure & Inspect', 'Dimensional Measurement', 'Calipers', 'Dial Calipers']
  Row 32870: ['Industrial & Scientific', 'Industrial Electrical', 'Passive Components', 'Resistors', 'Variable Resistors', 'Potentiometers']
  Row 32871: ['Industrial & Scientific', 'Food Service Equipment & Supplies', 'Disposables', 'Take Out Containers', 'Bakery Take Out Containers']
  Row 32872: ['Industrial & Scientific', 'Material Handling Products', 'Industrial Magnets', 'Rare Earth Magnets']
  Row 32873: ['Industrial & Scientific', 'Abrasive & Finishing Products', 'Finishing Products', 'Manual Sanding Products', 'Sanding Sponges']
  Row 32874: ['Industrial & Scientific', 'Professional Medical Supplies', 'Diagnostics & Screening', 'Stethoscopes']
  Row 32875: ['Industrial & Scientific', 'Hydraulics, Pneumatics & Plumbing', 'Fittings', 'Pipe Fittings']
  Row 32876: ['Industrial & Scientific', 'Fasteners', 

In [121]:
yelp_categories = item_df.loc[item_df['source'] == 'yelp',['type','source','categories']].drop_duplicates()
yelp_categories['categories_large'] = yelp_categories['categories'].str.split(', ').str[1]

In [128]:
print('item data: yelp different categories; yelp task-"candidate_category" is type + categories_large')
display(yelp_categories[['type','source','categories_large']].drop_duplicates())
print('item data: amazon,goodreads type and source')
display(item_df.loc[item_df['source'] != 'yelp',['type','source']].drop_duplicates())
print('user data: source')
display(user_df[['source']].drop_duplicates())
print('review data: type and source')
display(review_df[['type','source']].drop_duplicates())

item data: yelp different categories; yelp task-"candidate_category" is type + categories_large


Unnamed: 0,type,source,categories_large
0,business,yelp,Shopping
1,business,yelp,Food
2,business,yelp,Restaurants
3,business,yelp,Auto Parts & Supplies
5,business,yelp,Bars
...,...,...,...
32041,business,yelp,Shoe Shine
32117,business,yelp,Basketball Courts
32472,business,yelp,Food Banks
32720,business,yelp,Osteopathic Physicians


item data: amazon,goodreads type and source


Unnamed: 0,type,source
32869,product,amazon
108916,book,goodreads


user data: source


Unnamed: 0,source
0,yelp
558111,amazon
752327,goodreads


review data: type and source


Unnamed: 0,type,source
0,business,yelp
1827321,product,amazon
3740357,book,goodreads
