# LLM summarization

In [1]:
import pandas as pd
import torch
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
from transformers import pipeline

  torch.utils._pytree._register_pytree_node(


In [2]:
# -------------------------
# Step 1: Load the data
# -------------------------
# Read the review CSV and convert its "date" column to datetimes in one chained
# expression, then preview the first rows as the cell output.
df = pd.read_csv('archive/yelp_2020.csv').assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)
df.head()

Unnamed: 0,review_id,user_id,business_id,stars_x,useful,funny,cool,text,date,name,...,state,postal_code,latitude,longitude,stars_y,review_count,is_open,attributes,categories,hours
0,J1LZjzbs5bFubvS135SD2g,5TE19zTjTIPq1HANACN7sw,dChRGpit9fM_kZK5pafNyA,5.0,1,0,1,Had a great big meal with family and we loved ...,2020-01-20 00:36:44,The Love,...,PA,19103.0,39.950656,-75.170899,4.0,618,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Restaurants, American (New), Breakfast & Brunc...","{'Monday': '0:0-0:0', 'Tuesday': '17:0-21:0', ..."
1,ecMiAOFucDM3zwXYfY-Q6A,5Z8S9OsHWCnE8wbxk1poQQ,s3Q1J4XEVOBiZy9dYUpqpg,5.0,0,0,0,Many locations. All have lines so be prepared...,2020-02-16 22:52:13,Green Eggs Cafe,...,PA,19125.0,39.976974,-75.124114,3.5,93,1,"{'BusinessAcceptsCreditCards': 'False', 'HasTV...","Breakfast & Brunch, Restaurants","{'Monday': '9:0-15:0', 'Tuesday': '9:0-15:0', ..."
2,yuFQRhHo3z4TgE6drPXSgg,hcw7ndQKWGEH4P7BYAlG9w,JUlsvVAvZvGHWFfkKm0nlg,5.0,1,0,0,Compliments to the chef and to the rest of the...,2020-01-12 00:55:58,El Camino Real,...,PA,19123.0,39.9673,-75.140398,3.5,1014,1,"{'BikeParking': 'True', 'RestaurantsPriceRange...","Mexican, Tex-Mex, Restaurants, Barbeque","{'Monday': '0:0-0:0', 'Tuesday': '11:0-23:0', ..."
3,Zdh0_HtE724MnohLOrB5Iw,OYaEBYLBrLY4mla8bOMbnA,9b0Mrvs6uJu2jJqet_Jwew,4.0,0,0,0,I decided to try this spot out -- and it didn'...,2020-01-15 19:29:25,Asia Nail 2 & Spa,...,PA,19146.0,39.938156,-75.172322,3.5,26,1,"{'BusinessAcceptsCreditCards': 'True', 'Busine...","Waxing, Nail Salons, Beauty & Spas, Health & M...","{'Monday': '9:30-20:0', 'Tuesday': '9:30-20:0'..."
4,y_XYEZk2Cin-q4N0czeaYw,_9VhEn9zaB-6txE3STNfLw,PYUI1OJVksGUbCrteU68bw,3.0,0,0,0,"First off, finding parking is atrocious. Your ...",2020-02-17 13:53:51,Bourbon & Branch,...,PA,19123.0,39.96207,-75.14104,4.0,392,1,"{'Alcohol': ""u'full_bar'"", 'BusinessParking': ...","Bars, American (Traditional), Breakfast & Brun...","{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'..."


In [3]:
# Define the dataset: keep only the identifier columns and the review text.
# Take an explicit copy so that columns added to `reviews` later are written to
# an independent frame instead of a slice of `df` (this is what triggers
# pandas' SettingWithCopyWarning).
reviews = df[['user_id', 'business_id', 'text']].copy()

In [4]:
# -------------------------
# Step 2: Define the aspect keywords
# -------------------------
# Maps each aspect name to the lowercase keywords associated with it.
aspect_keywords = dict(
    Food=["food", "taste", "flavor", "delicious", "fresh", "quality", "poor"],
    Service=["staff", "friendly", "rude", "attitude", "service", "slow", "amazing"],
    Pricing=["price", "cheap", "expensive", "value", "high", "reasonable"],
)

In [5]:
# -------------------------
# Step 3: Load the sentiment-analysis model
# -------------------------
# Run on the first GPU when CUDA is available; otherwise fall back to the CPU
# (device=-1) so the notebook still runs on machines without a GPU.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    device=0 if torch.cuda.is_available() else -1,
)



In [6]:
# -------------------------
# Step 4: Text vectorization and LDA
# -------------------------
# Turn the review texts into TF-IDF vectors (English stop words removed,
# vocabulary capped at 5000 features).
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
review_vectors = vectorizer.fit_transform(reviews['text'])

# Fit a 3-topic LDA model using all available cores.
lda = LatentDirichletAllocation(n_components=3, n_jobs=-1, random_state=42)
lda.fit(review_vectors)

# Print, per topic, the 10 highest-weighted vocabulary terms
# (in ascending weight order, as produced by argsort).
words = vectorizer.get_feature_names_out()
topics = dict(enumerate(
    [words[idx] for idx in weights.argsort()[-10:]]
    for weights in lda.components_
))
print("LDA Topics:", topics)

LDA Topics: {0: ['friendly', 'amazing', 'philly', 'service', 'love', 'best', 'good', 'place', 'food', 'great'], 1: ['like', 'sauce', 'delicious', 'got', 'cheese', 'pizza', 'ordered', 'food', 'good', 'chicken'], 2: ['people', 'work', 'don', 'told', 'customer', 'order', 'just', 'did', 'service', 'time']}


In [7]:
# -------------------------
# Step 5: Define the analysis functions
# -------------------------

def classify_aspects(text, sentiment_pipeline, aspect_keywords):
    """Score each aspect by the sentiment of the text chunks that mention it.

    The text is cut into consecutive 512-character chunks. Each chunk is passed
    to ``sentiment_pipeline``; a chunk labelled ``'POSITIVE'`` contributes +1
    and any other label contributes -1 to every aspect that has at least one
    keyword occurring (as a case-insensitive substring) in that chunk.

    Args:
        text: The review text. Non-string values (e.g. ``None`` or NaN from a
            missing CSV cell) are treated as an empty review.
        sentiment_pipeline: Callable taking a string and returning a sequence
            whose first element is a mapping with a ``'label'`` key.
        aspect_keywords: Mapping of aspect name -> iterable of lowercase
            keywords.

    Returns:
        dict mapping aspect name -> integer score. It always contains "Food",
        "Service" and "Pricing", plus every other key of ``aspect_keywords``.
    """
    # A missing review has nothing to score; avoid len()/slicing on non-strings.
    if not isinstance(text, str):
        text = ""

    # Split the text into pieces of at most 512 characters each.
    max_length = 512
    chunks = [text[i:i + max_length] for i in range(0, len(text), max_length)]

    # Keep the three historical keys, and add any further aspects supplied by
    # the caller so custom aspect sets do not raise KeyError below.
    aspect_scores = {"Food": 0, "Service": 0, "Pricing": 0}
    for aspect in aspect_keywords:
        aspect_scores.setdefault(aspect, 0)

    for chunk in chunks:
        sentiment = sentiment_pipeline(chunk)[0]  # sentiment polarity of the chunk
        score = 1 if sentiment['label'] == 'POSITIVE' else -1

        # Lower-case once per chunk instead of once per keyword.
        lowered = chunk.lower()
        for aspect, keywords in aspect_keywords.items():
            if any(word in lowered for word in keywords):  # keyword match
                aspect_scores[aspect] += score

    return aspect_scores

# Combined scoring function
def analyze_review(text, lda, vectorizer, sentiment_pipeline, aspect_keywords):
    """Return the aspect scores of a review together with its LDA topic mix.

    Args:
        text: The review text.
        lda: Fitted model exposing ``transform``.
        vectorizer: Fitted vectorizer exposing ``transform``.
        sentiment_pipeline: Forwarded to ``classify_aspects``.
        aspect_keywords: Forwarded to ``classify_aspects``.

    Returns:
        The dict produced by ``classify_aspects`` with one extra key,
        ``"Topic_Distribution"``, holding the first row of
        ``lda.transform`` for this review.
    """
    # Keyword/sentiment based aspect scores.
    result = classify_aspects(text, sentiment_pipeline, aspect_keywords)

    # Topic distribution of this single review, attached to the same dict.
    vectorized = vectorizer.transform([text])
    result["Topic_Distribution"] = lda.transform(vectorized)[0]
    return result

In [None]:
# -------------------------
# Step 6: Apply to the DataFrame
# -------------------------

# Number of reviews per progress-bar batch.
batch_size = 5000

# Per-review scores, collected in row order.
food_scores = []
service_scores = []
pricing_scores = []

# Process the reviews batch by batch (the batching only drives the nested
# progress bars; every review is still scored individually).
for i in tqdm(range(0, len(reviews), batch_size), desc="Processing batches"):
    batch = reviews['text'].iloc[i:i+batch_size]

    for text in tqdm(batch, desc=f"Processing batch {i//batch_size + 1}", leave=False):
        scores = analyze_review(text, lda, vectorizer, sentiment_pipeline, aspect_keywords)
        # NOTE(review): scores["Topic_Distribution"] is computed but not stored here.
        food_scores.append(scores["Food"])
        service_scores.append(scores["Service"])
        pricing_scores.append(scores["Pricing"])

# Store the results as new columns. `assign` builds a new frame, so this never
# writes into a slice of another DataFrame (no SettingWithCopyWarning) and the
# cell stays idempotent when re-run.
reviews = reviews.assign(
    food=food_scores,
    service=service_scores,
    price=pricing_scores,
)

Processing batches:   0%|          | 0/11 [00:00<?, ?it/s]
Processing batch 1:   0%|          | 0/5000 [00:00<?, ?it/s][A

Processing batch 1:   0%|          | 8/5000 [00:00<02:47, 29.82it/s][A
Processing batch 1:   0%|          | 16/5000 [00:00<01:46, 46.61it/s][A
Processing batch 1:   0%|          | 25/5000 [00:00<01:22, 60.53it/s][A
Processing batch 1:   1%|          | 34/5000 [00:00<01:15, 65.85it/s][A
Processing batch 1:   1%|          | 42/5000 [00:00<01:13, 67.84it/s][A
Processing batch 1:   1%|          | 52/5000 [00:00<01:04, 76.37it/s][A
Processing batch 1:   1%|          | 62/5000 [00:00<01:00, 81.73it/s][A
Processing batch 1:   1%|▏         | 72/5000 [00:01<00:57, 85.80it/s][A
Processing batch 1:   2%|▏         | 82/5000 [00:01<00:55, 88.57it/s][A
Processing batch 1:   2%|▏         | 91/5000 [00:01<00:55, 88.78it/s][A
Processing batch 1:   2%|▏         | 101/5000 [00:01<00:54, 90.68it/s][A
Processing batch 1:   2%|▏         | 111/5000 [00:01<00:54, 89.47it/s][A

In [9]:
# -------------------------
# Step 7: Save the results
# -------------------------
# Write the reviews frame to CSV without the index column, then print a confirmation.
reviews.to_csv(path_or_buf="processed_yelp_reviews.csv", index=False)
print("Processed reviews saved to 'processed_yelp_reviews.csv'.")

Processed reviews saved to 'processed_yelp_reviews.csv'.


In [10]:
# Show the reviews frame as the cell output.
reviews

Unnamed: 0,user_id,business_id,text,food,service,price
0,5TE19zTjTIPq1HANACN7sw,dChRGpit9fM_kZK5pafNyA,Had a great big meal with family and we loved ...,1,1,0
1,5Z8S9OsHWCnE8wbxk1poQQ,s3Q1J4XEVOBiZy9dYUpqpg,Many locations. All have lines so be prepared...,1,0,0
2,hcw7ndQKWGEH4P7BYAlG9w,JUlsvVAvZvGHWFfkKm0nlg,Compliments to the chef and to the rest of the...,1,1,0
3,OYaEBYLBrLY4mla8bOMbnA,9b0Mrvs6uJu2jJqet_Jwew,I decided to try this spot out -- and it didn'...,0,1,1
4,_9VhEn9zaB-6txE3STNfLw,PYUI1OJVksGUbCrteU68bw,"First off, finding parking is atrocious. Your ...",-1,0,0
...,...,...,...,...,...,...
50198,zQSW0_fi46U_066lo88GXQ,ujsFpiiGu92nUsjlFc9pew,I only come to this location because the parki...,-1,-1,0
50199,M5ogJkrrJEtR6Ao3rtQqIA,cu-s8VMSSUKPosPS9ktGaw,What a selection! WOWand not just a wide range...,0,0,0
50200,RSh_2CaUDojKJBxL-pVdNQ,-_0w82rV27l51N3ABkCI0A,Definitely the best tacos in philly. Being fro...,0,0,0
50201,wKX_BuB3qihHeBJQme_tYg,oD3mqAFsT04wPeKvOU7JaA,I contacted Epic and received an almost immedi...,0,1,0
