In [36]:
import pandas as pd 
import numpy as np 
import tensorflow as tf 
from textblob import TextBlob
from transformers import T5Tokenizer, T5ForConditionalGeneration
import language_tool_python
import re

In [4]:
# Load the raw Yelp review and business tables from the working directory.
df = pd.read_csv("yelp_review.csv")
df1 = pd.read_csv("yelp_business.csv")

# Outer-join businesses to their reviews on business_id, then discard the
# columns the rest of the notebook never reads. The drop is a reassignment
# rather than inplace=True so the statement reads as a plain transformation.
df3 = pd.merge(df1, df, on='business_id', how="outer")
df3 = df3.drop(
    columns=["neighborhood", "latitude", "longitude", "is_open", "review_id", "user_id", "date", "useful", "funny", "cool", "review_count", "stars_y"]
)

# Pennsylvania rows only, reduced to the business name and the review text.
df4 = df3.loc[df3["state"] == "PA", ['name', 'text']]
# The outer join leaves `text` missing for any business without a matching
# review; str.join raises TypeError on a missing value, so drop those rows.
df4 = df4.dropna(subset=['text'])
# Keep at most the first 10 review rows per business name.
df4 = df4.groupby('name').head(10)
# One row per business: its reviews joined with ' + '.
df5 = df4.groupby('name')['text'].apply(lambda rows: ' + '.join(rows)).reset_index()
# Strip leading/trailing double quotes from every name in one vectorized call
# instead of a Python-level loop over .loc[i, 'name'].
df5['name'] = df5['name'].str.strip('"')

In [19]:
# Display the per-business frame of joined review text (columns: name, text).
df5

Unnamed: 0,name,text
0,#1 Cochran Buick GMC of Monroeville,I bought my car at Cochran and the salesman wa...
1,#1 Cochran Buick GMC of Robinson,Bought a new truck and was lied to from the st...
2,#1 Cochran Cadillac - Monroeville,My parents have been buying cars off of Donna ...
3,#1 Cochran Hyundai - Monroeville,Always have a great experience here whether it...
4,#1 Cochran Hyundai of South Hills,"Hi,\n\nThis is to detail my June of 2017 buyin..."
...,...,...
8184,pet valu,I absolutely love this place. I just recently ...
8185,terraFITNESS,I have been working out with Sarah at terraFit...
8186,täkō,This place is YUMMY! A friend and I made our o...
8187,uBreakiFix,This place is pretty awesome. Had my iPhone sc...


In [10]:
# Pennsylvania rows of the merged frame, keeping name, text and city.
pa_mask = df3["state"] == "PA"
df6 = df3.loc[pa_mask, ["name", "text", "city"]]

In [13]:
# Narrow the selection to rows whose city is exactly 'Pittsburgh'.
in_pittsburgh = df6['city'].eq('Pittsburgh')
df6 = df6.loc[in_pittsburgh]

In [20]:
# Keep at most the first five review rows per business name, then discard the
# `city` column, which is no longer needed once the city filter has been applied.
# The drop is chained (no inplace mutation) and uses errors='ignore' so that
# re-running this cell after `city` is already gone does not raise KeyError.
df6 = (
    df6.groupby('name')
    .head(5)
    .drop(columns='city', errors='ignore')
)

In [21]:
# Collapse each business's review rows into a single ' + '-separated string,
# yielding one row per name with columns `name` and `text`.
reviews_by_name = df6.groupby('name')['text']
df6 = reviews_by_name.apply(' + '.join).reset_index()


In [27]:
# Work with the first 500 rows of the aggregated frame only.
df7 = df6.iloc[:500]

In [28]:
def get_business_reviews(business_name, reviews_df=None):
    """Return all review text stored for ``business_name`` as one string.

    Parameters
    ----------
    business_name : str
        Compared for exact equality against the ``name`` column.
    reviews_df : pandas.DataFrame, optional
        Frame with ``name`` and ``text`` columns to search. When omitted, the
        notebook-level ``df7`` is used, which keeps existing one-argument
        calls working unchanged.

    Returns
    -------
    str
        The matching ``text`` values joined by single spaces, or ``""`` when
        no row matches.
    """
    if reviews_df is None:
        # Fall back to the notebook's global frame, resolved at call time.
        reviews_df = df7
    matching_text = reviews_df.loc[reviews_df['name'] == business_name, 'text']
    # A missing value (NaN) would make str.join raise TypeError, so skip it.
    return " ".join(matching_text.dropna().astype(str))

# Tokenizer/model pairs already loaded in this kernel, keyed by checkpoint name.
_T5_CACHE = {}


def _load_t5(model_name="google-t5/t5-base"):
    """Return the (tokenizer, model) pair for ``model_name``, loading it at most once."""
    if model_name not in _T5_CACHE:
        _T5_CACHE[model_name] = (
            T5Tokenizer.from_pretrained(model_name),
            T5ForConditionalGeneration.from_pretrained(model_name),
        )
    return _T5_CACHE[model_name]


def generate_summary(text, bullet_points):
    """Summarize review text with T5 and return it as a list of bullet strings.

    Parameters
    ----------
    text : str
        Review text to summarize. It is appended to a fixed prompt and the
        encoded prompt is truncated to 512 tokens.
    bullet_points : int
        Maximum number of bullets to return.

    Returns
    -------
    list of str
        Up to ``bullet_points`` strings, each prefixed with ``"- "``. The
        decoded summary is split on ``". "``; repeated sentences are kept
        only once. Returns ``[]`` when ``text`` is empty or whitespace, in
        which case the model is not loaded or run.
    """
    # Nothing to summarize: without this guard the model would be asked to
    # summarize the instruction prompt alone.
    if not text or not text.strip():
        return []

    # Loading from the checkpoint on every call is wasted work when this
    # function runs once per business, so reuse the cached pair.
    tokenizer, model = _load_t5()

    # Prompt for more emphasis on certain words.
    prompt = (
        "summarize: Summarize the following reviews into 3 distinct bullet points that are grammatically correct and provide clear context.\n\n"
        "Reviews:\n" + text
    )
    input_ids = tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)
    output = model.generate(
        input_ids,
        max_length=500,
        min_length=50,
        num_beams=6,
        early_stopping=True
    )

    summary = tokenizer.decode(output[0], skip_special_tokens=True)
    sentences = [s.strip() for s in summary.split('. ') if s.strip()]
    # The model can emit the same sentence more than once; keep the first
    # occurrence of each so the bullets are distinct (dict preserves order).
    distinct_sentences = list(dict.fromkeys(sentences))

    return ["- " + sen for sen in distinct_sentences[:bullet_points]]

# Lazily created LanguageTool instance shared by every call in this kernel.
_GRAMMAR_TOOL = None


def correct_grammer(bullet_list):
    """Run each string in ``bullet_list`` through LanguageTool's auto-correct.

    Parameters
    ----------
    bullet_list : list of str
        Strings to correct.

    Returns
    -------
    list of str
        The corrected strings, in the same order. An empty input returns
        ``[]`` without creating a LanguageTool instance.
    """
    global _GRAMMAR_TOOL
    if not bullet_list:
        return []
    # Constructing LanguageTool on every call is costly and the instances
    # were never closed (per the library docs each one starts a local
    # LanguageTool server), so create a single 'en-US' instance and reuse it.
    if _GRAMMAR_TOOL is None:
        _GRAMMAR_TOOL = language_tool_python.LanguageTool('en-US')
    return [_GRAMMAR_TOOL.correct(point) for point in bullet_list]

def get_prediction(business_name):
    """Produce grammar-corrected summary bullets for one business.

    Fetches the business's review text, summarizes it into at most three
    bullets, prints the uncorrected bullets prefixed with "Original:", and
    returns the bullets after grammar correction.
    """
    raw_bullets = generate_summary(
        get_business_reviews(business_name),
        bullet_points=3,
    )
    print("Original:", raw_bullets)
    return correct_grammer(raw_bullets)


In [45]:
# One dict per business: sentiment percentages plus the summary bullets.
records = []
# Iterate over the names directly. The previous `df7['name'].loc[i]` lookup
# with i from range(len(df7)) only works while df7's index labels happen to
# be exactly 0..n-1; enumerate gives the same order without that assumption.
for i, business_name in enumerate(df7['name']):
    sentiment = TextBlob(get_business_reviews(business_name)).sentiment

    # Map polarity onto a 0-100 "positive" share (the formula assumes the
    # TextBlob polarity range of -1..1); "negative" is the complement.
    positive_percent = ((sentiment.polarity + 1) / 2) * 100
    negative_percent = 100 - positive_percent
    # Subjectivity is scaled to a percentage; objectivity is its complement.
    subjectivity_percent = sentiment.subjectivity * 100
    objectivity_percent = 100 - subjectivity_percent

    final_summary = get_prediction(business_name)
    # Pad with empty strings so there are always exactly three bullet slots.
    review_1, review_2, review_3 = (list(final_summary) + ["", "", ""])[:3]

    records.append({
        "Business_name": business_name,
        "positive_percent": positive_percent,
        "negative_percent": negative_percent,
        "subjectivity_percent": subjectivity_percent,
        "objectivity_percent": objectivity_percent,
        "review_1": review_1,
        "review_2": review_2,
        "review_3": review_3,
        "Combined_review": final_summary
    })
    # Progress marker: prints the zero-based position of the business just processed.
    print(f"Total Complete : {i} ")


Original: ['- reviewer: "this place SUCKS', '- all they care about is the sale, not YOU!" reviewer: "i was first on the list for a test drive on a car that was a solid deal" reviewer: "they asked me to have a seat in the waiting room because someone is buying the car"']
Total Complete : 0 
Original: ["- reviewer's issues with cochran Hyundai revolve around the deceptive nature of her car buying experience", '- cochran Hyundai of south hills gave me a quote of 23,306', '- when it came time to purchase they pulled out a price of 24,600 .']
Total Complete : 1 
Original: ["- reviewer says he was refused an appointment for his new sorento's 1st oil change", '- he said he would be cared for within 20 minutes if he showed up on his day off', "- reviewer says he's never left with an issue unresolved ."]
Total Complete : 2 
Original: ['- avoid #1 Cochran Nissan at all costs', "- don't sell your car for less than you'd planned, since it's not a 2016", "- don't sell your car for less than you'd p

In [46]:
def _scrub(column):
    """Remove square brackets, hyphens, double quotes and the word 'reviewer' from a string Series."""
    without_marks = column.str.replace(r'[\[\]\-"]', '', regex=True)
    return without_marks.str.replace(r'\breviewer\b', '', regex=True)


# Build the results table from the per-business records.
results_df = pd.DataFrame(records)

# Combined_review holds a list of bullets: join it into one space-separated
# string (anything that is not a list is simply stringified), then clean it.
combined_text = results_df['Combined_review'].map(
    lambda x: " ".join(x) if isinstance(x, list) else str(x)
)
results_df['Combined_review'] = _scrub(combined_text)

# The individual bullet columns get the same cleaning after being stringified.
for col in ('review_1', 'review_2', 'review_3'):
    results_df[col] = _scrub(results_df[col].map(str))

In [47]:
# Write the cleaned results table to an Excel workbook without the index column.
results_path = "codefest_results_base_model.xlsx"
results_df.to_excel(results_path, index=False)