In [1]:
import json
import math
from collections import defaultdict
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import os

In [2]:

class HotelReviewStats:
    """Aggregate per-hotel review statistics and turn them into a quality score.

    ``self.stats`` maps a hotel id to a dict with the keys ``count_review``,
    ``count_review_with_image``, ``count_long_review``, ``total_score``,
    ``num_score`` and ``avg_score``. The scoring methods combine four
    normalised components (average score, review volume, share of reviews
    with a photo, share of long reviews) into one weighted value.
    """

    # A review is "long" when its combined positive + negative text exceeds this length.
    LONG_REVIEW_MIN_CHARS = 150
    # Photo / long-review ratios at or above this value earn the full component score.
    RATIO_SATURATION = 0.5
    # Weights of the four normalised components (they sum to 1.0).
    WEIGHT_AVG_SCORE = 0.6
    WEIGHT_COUNT_REVIEW = 0.15
    WEIGHT_REVIEW_WITH_IMAGE = 0.125
    WEIGHT_LONG_REVIEW = 0.125

    @staticmethod
    def _empty_stats():
        """Return a fresh zeroed statistics record for one hotel."""
        return {
            "count_review": 0,
            "count_review_with_image": 0,
            "count_long_review": 0,
            "total_score": 0.0,
            "num_score": 0,
            "avg_score": 0.0
        }

    @staticmethod
    def _parse_score(raw_score):
        """Extract a numeric score from ``raw_score``; return ``None`` if impossible.

        Strings are parsed by taking their last whitespace-separated token and
        treating a decimal comma as a decimal point (e.g. ``"Scored 8,5"`` ->
        ``8.5``). Plain ints/floats are accepted as they are.
        """
        if isinstance(raw_score, (int, float)) and not isinstance(raw_score, bool):
            return float(raw_score)
        try:
            return float(raw_score.split()[-1].replace(",", "."))
        except (AttributeError, IndexError, TypeError, ValueError):
            # Missing (None), empty, or non-numeric score: the caller skips it.
            return None

    def __init__(self, reviews=None):
        """Store the raw reviews and initialise empty statistics.

        Args:
            reviews: Sequence of review dicts (read via ``.get``) or ``None``.
        """
        self.reviews = reviews
        self.results = []
        self.stats = defaultdict(self._empty_stats)
        self.final_scores = {}

    def process_reviews(self):
        """Accumulate the basic per-hotel statistics from ``self.reviews``.

        Reviews without an ``id`` are ignored. A review whose score cannot be
        parsed is still counted (and checked for photo / length) but does not
        contribute to the average. Calling this method again adds to the
        existing ``self.stats`` rather than resetting it.

        Returns:
            ``self.stats`` (hotel id -> statistics dict).
        """
        # Treat "no reviews supplied" as an empty collection instead of failing on len(None).
        reviews = self.reviews if self.reviews is not None else []

        for review in tqdm(reviews, total=len(reviews), desc="Processing: "):
            hotel_id = review.get('id')
            if not hotel_id:
                continue

            hotel_stats = self.stats[hotel_id]
            hotel_stats['count_review'] += 1

            # Review with image
            if review.get('review_photo'):
                hotel_stats['count_review_with_image'] += 1

            # Long review: `or ""` keeps a stored None from being rendered as
            # the text "None", which would inflate the measured length.
            pos = review.get("review_positive") or ""
            neg = review.get("review_negative") or ""
            text = f"{pos} {neg}".strip()
            if len(text) > self.LONG_REVIEW_MIN_CHARS:
                hotel_stats['count_long_review'] += 1

            score = self._parse_score(review.get("review_score"))
            if score is None:
                continue
            hotel_stats["total_score"] += score
            hotel_stats["num_score"] += 1
            hotel_stats["avg_score"] = hotel_stats["total_score"] / hotel_stats["num_score"]

        return self.stats

    def save_stats(self, filename='review_quality.json.json'):
        """Write ``self.stats`` to a JSON file.

        Args:
            filename: Output path (default: ``review_quality.json.json``).
                NOTE(review): the default carries a doubled ``.json`` suffix and
                differs from the file ``load_stats`` reads by default; it is
                kept unchanged so existing callers keep writing the same file.
        """
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(dict(self.stats), f, ensure_ascii=False, indent=4)
        print(f"Stats saved to {filename}")

    def load_stats(self, filename="D:\\graduate_dissertation\\final\\dataset\\review_hotel\\review_quality.json"):
        """Replace ``self.stats`` with the content of a JSON file.

        Args:
            filename: Path of the JSON file to read. The default is a
                machine-specific absolute path; pass an explicit path elsewhere.

        Returns:
            ``self.stats`` rebuilt as a ``defaultdict`` from the file content.

        Raises:
            FileNotFoundError: If ``filename`` does not exist.
        """
        if not os.path.exists(filename):
            raise FileNotFoundError(f"File {filename} not found")

        with open(filename, 'r', encoding='utf-8') as f:
            loaded_stats = json.load(f)

        # Rebuild as a defaultdict so unseen hotel ids still get a zeroed record.
        self.stats = defaultdict(self._empty_stats)
        self.stats.update(loaded_stats)

        print(f"Stats loaded from {filename}")
        return self.stats

    def calculate_statistics(self, hotel_ids=None):
        """Compute summary statistics of each per-hotel metric.

        Args:
            hotel_ids: Optional iterable of hotel ids. When given, only those
                ids that exist in ``self.stats`` are considered; when ``None``
                every hotel is used.

        Returns:
            Dict keyed by ``count_review``, ``count_review_with_image``,
            ``count_long_review`` and ``avg_score``; each value is a dict with
            ``count``, ``mean``, ``std``, ``min`` and ``max`` (all 0 when there
            is no data; ``std`` is 0 for fewer than two values).
        """
        if hotel_ids is not None:
            # Membership test first so unknown ids never create defaultdict entries.
            filtered_stats = {hid: self.stats[hid] for hid in hotel_ids if hid in self.stats}
        else:
            filtered_stats = self.stats

        def compute_stats(values):
            return {
                'count': len(values),
                'mean': np.mean(values) if values else 0,
                'std': np.std(values) if len(values) > 1 else 0,
                'min': np.min(values) if values else 0,
                'max': np.max(values) if values else 0
            }

        metrics = ('count_review', 'count_review_with_image', 'count_long_review', 'avg_score')
        return {
            metric: compute_stats([data[metric] for data in filtered_stats.values()])
            for metric in metrics
        }

    def _score_hotels(self, hotel_ids, stats_info, scale):
        """Score ``hotel_ids`` (all present in ``self.stats``) against ``stats_info``.

        Args:
            hotel_ids: Ids to score; each must already be a key of ``self.stats``.
            stats_info: Result of ``calculate_statistics`` for the reference population.
            scale: Multiplier applied to the weighted 0-1 score before rounding.

        Returns:
            Dict hotel id -> score rounded to 2 decimals (plain Python floats).
        """
        min_avg_score = stats_info['avg_score']['min']
        avg_score_range = stats_info['avg_score']['max'] - min_avg_score
        max_log_review = math.log1p(stats_info['count_review']['max'])

        scores = {}
        for hotel_id in hotel_ids:
            data = self.stats[hotel_id]
            count_review = data['count_review']

            # 1. avg_score: linear min-max scaling to 0-1. When every hotel has
            #    the same average the range is 0; use 0.0 instead of dividing by
            #    zero (which produced NaN scores).
            if avg_score_range > 0:
                norm_avg_score = (data['avg_score'] - min_avg_score) / avg_score_range
            else:
                norm_avg_score = 0.0

            if count_review > 0:
                # 2. count_review: log scaling to damp the effect of very large counts.
                norm_count_review = (
                    math.log1p(count_review) / max_log_review if max_log_review > 0 else 0.0
                )
                image_ratio = data['count_review_with_image'] / count_review
                long_review_ratio = data['count_long_review'] / count_review
            else:
                norm_count_review = 0
                image_ratio = 0
                long_review_ratio = 0

            # 3./4. Ratios saturate: RATIO_SATURATION or more earns the full component.
            norm_review_with_image = min(image_ratio / self.RATIO_SATURATION, 1.0)
            norm_long_review = min(long_review_ratio / self.RATIO_SATURATION, 1.0)

            final_score = (
                self.WEIGHT_AVG_SCORE * norm_avg_score +
                self.WEIGHT_COUNT_REVIEW * norm_count_review +
                self.WEIGHT_REVIEW_WITH_IMAGE * norm_review_with_image +
                self.WEIGHT_LONG_REVIEW * norm_long_review
            ) * scale

            scores[hotel_id] = round(float(final_score), 2)

        return scores

    def calculate_final_score_local_data(self, hotel_ids, print_warning=True):
        """Score the given hotels, normalising against that subset only.

        The result is on a 0-1 scale (unlike ``calculate_final_score_global_data``,
        which reports 0-10).

        Args:
            hotel_ids: Iterable of hotel ids to score.
            print_warning: When true, print a warning for every id that is not
                present in ``self.stats``. Unknown ids are skipped either way.

        Returns:
            Dict hotel id -> score in [0, 1], rounded to 2 decimals. Empty when
            none of the ids is known.
        """
        valid_ids = [hid for hid in hotel_ids if hid in self.stats]
        if not valid_ids:
            print("Warning: No valid hotel IDs provided")
            return {}

        if print_warning:
            for hotel_id in hotel_ids:
                if hotel_id not in self.stats:
                    print(f"Warning: Hotel ID {hotel_id} not found in stats")

        # Normalisation bounds come from the requested subset, not from all hotels.
        stats_info = self.calculate_statistics(hotel_ids=valid_ids)
        return self._score_hotels(valid_ids, stats_info, scale=1.0)

    def calculate_final_score_global_data(self):
        """Score every hotel in ``self.stats`` on a 0-10 scale.

        Normalisation bounds are computed over all hotels. The result is also
        stored in ``self.final_scores``.

        Returns:
            Dict hotel id -> score in [0, 10], rounded to 2 decimals.
        """
        stats_info = self.calculate_statistics()
        self.final_scores = self._score_hotels(list(self.stats), stats_info, scale=10.0)
        return self.final_scores

    @staticmethod
    def _save_and_show(fig, output_file):
        """Save ``fig`` to ``output_file``, display it, then release it."""
        # Save before show(): after show() returns the figure may already be
        # cleared/closed by the backend, so saving afterwards can write a blank image.
        fig.savefig(output_file)
        plt.show()
        plt.close(fig)

    def visualize_distributions(self, output_file='distributions.png'):
        """Plot a 2x2 grid of histograms, one per metric in ``self.stats``.

        Args:
            output_file: Path the figure is saved to.
        """
        panels = [
            ('count_review', 'Count Review', 'skyblue'),
            ('count_review_with_image', 'Count Review with Image', 'lightgreen'),
            ('count_long_review', 'Count Long Review', 'salmon'),
            ('avg_score', 'Avg Score', 'lightcoral'),
        ]

        fig, axes = plt.subplots(2, 2, figsize=(12, 8))
        for ax, (metric, label, color) in zip(axes.flat, panels):
            values = [data[metric] for data in self.stats.values()]
            ax.hist(values, bins=20, color=color, edgecolor='black')
            ax.set_title(f'Distribution of {label}')
            ax.set_xlabel(label)
            ax.set_ylabel('Frequency')

        fig.tight_layout()
        self._save_and_show(fig, output_file)

    def visualize_scatter(self, output_file='scatter.png'):
        """Scatter ``avg_score`` against ``count_review``.

        Marker area is proportional to each hotel's ``count_long_review``.

        Args:
            output_file: Path the figure is saved to.
        """
        count_reviews = [data['count_review'] for data in self.stats.values()]
        avg_scores = [data['avg_score'] for data in self.stats.values()]
        count_long_reviews = [data['count_long_review'] for data in self.stats.values()]

        # Fall back to 1 both for "no hotels" and for "no long review anywhere",
        # so the size normalisation never divides by zero.
        max_long_review = max(count_long_reviews, default=0) or 1
        sizes = [(cl / max_long_review) * 1000 for cl in count_long_reviews]

        fig, ax = plt.subplots(figsize=(10, 6))
        ax.scatter(count_reviews, avg_scores, s=sizes, alpha=0.5, c='blue')
        ax.set_title('Scatter: Avg Score vs Count Review')
        ax.set_xlabel('Count Review')
        ax.set_ylabel('Avg Score')
        ax.grid(True)
        self._save_and_show(fig, output_file)

    def visualize_final_scores(self, output_file='final_scores.png'):
        """Plot a histogram of the global final scores.

        Computes the scores with ``calculate_final_score_global_data`` first if
        ``self.final_scores`` is still empty.

        Args:
            output_file: Path the figure is saved to.
        """
        if not self.final_scores:
            self.calculate_final_score_global_data()

        fig, ax = plt.subplots(figsize=(8, 6))
        ax.hist(list(self.final_scores.values()), bins=20, color='purple', edgecolor='black')
        ax.set_title('Distribution of Final Scores')
        ax.set_xlabel('Final Score')
        ax.set_ylabel('Frequency')
        self._save_and_show(fig, output_file)

In [6]:
# NOTE(review): this cell reads `hotel_review_stats` and `ids_result`, which are only
# assigned in the cells labelled In [7] and In [5] further down. On a fresh kernel
# (Restart & Run All) it raises NameError unless those cells are moved above it.
# Load the persisted per-hotel statistics from an explicit (machine-specific) path.
stats = hotel_review_stats.load_stats("D:\\graduate_dissertation\\final\\dataset\\review_hotel\\review_quality.json")
# Score the hotels listed in `ids_result`; the returned dict is this cell's displayed output.
hotel_review_stats.calculate_final_score_local_data(ids_result, print_warning=False)

Stats loaded from D:\graduate_dissertation\final\dataset\review_hotel\review_quality.json


{'10000593': 0.77,
 '10151628': 0.79,
 '10602492': 0.74,
 '10679047': 0.35,
 '1087896': 0.51,
 '1089325': 0.37,
 '10896945': 0.62,
 '1095388': 0.79,
 '1116170': 0.72,
 '11240069': 0.59,
 '11242456': 0.58,
 '11256679': 0.73,
 '11307999': 0.76,
 '11320501': 0.66,
 '11402236': 0.81,
 '11429049': 0.67,
 '11438385': 0.7,
 '11438620': 0.63,
 '11465602': 0.63,
 '11510053': 0.28,
 '11512963': 0.72,
 '11542377': 0.79,
 '11569083': 0.68,
 '11676701': 0.65,
 '11684019': 0.63,
 '11695111': 0.69,
 '11715691': 0.53,
 '11754233': 0.7,
 '11765433': 0.71,
 '11766845': 0.85,
 '11789496': 0.27,
 '11833388': 0.64,
 '11841796': 0.71,
 '11900070': 0.83,
 '1197436': 0.79,
 '11985781': 0.64,
 '11998735': 0.76,
 '12404896': 0.86,
 '12440315': 0.65,
 '12466042': 0.68,
 '12596924': 0.71,
 '12638180': 0.67,
 '12701535': 0.61,
 '12755317': 0.46,
 '12765115': 0.71,
 '1290297': 0.5,
 '13001177': 0.59,
 '13172950': 0.63,
 '13191337': 0.6,
 '13210896': 0.55,
 '13216464': 0.77,
 '13247067': 0.7,
 '13266737': 0.81,
 '13

In [5]:
# Hotel ids (as strings) passed to calculate_final_score_local_data in other cells.
# NOTE(review): hard-coded literal; its origin is not shown in this notebook.
ids_result = ['10000593', '10151628', '1052120', '10602492', '10679047', '1087896', '1089325', '10896945', '1095388', '1116170', '11161908', '11240069', '11242456', '11256679', '11270049', '11295606', '11307999', '11320326', '11320501', '11373382', '1139091', '11402236', '11429049', '11438385', '11438620', '11465602', '11480287', '11510053', '11512963', '11542377', '11569083', '11577537', '11591287', '11676701', '11684019', '11695111', '11715691', '11754233', '11765433', '11766845', '11788268', '11789496', '11802757', '11822990', '11833388', '11841796', '11900070', '11918325', '1197436', '11985781', '11991522', '11998735', '12005491', '12039789', '12144659', '12404896', '12440315', '12466042', '12596924', '12638180', '12670972', '12671360', '12673545', '12697585', '12701535', '12731957', '12755317', '12765115', '12767709', '12837627', '12870016', '1290297', '13001177', '13136703', '13172950', '13191337', '13210281', '13210896', '13216464', '13247067', '13266737', '13275333', '13297433', '13312901', '13328353', '13357213', '13359845', '13363606', '13386802', '13393412', '13409368', '13425214', '13429128', '13449473', '13488416', '13488847', '13512307', '13513785', '13540175', '13545552', '13550135', '13579007', '13612785', '1379498', '1571248', '1576786', '1587323', '1607798', '1645562', '1669977', '1942807', '1965263', '2006627', '2028376', '2111117', '2125472', '2186818', '2186835', '2238468', '2358664', '2361422', '2423887', '2423890', '2466914', '2547059', '2591951', '2614816', '262337', '2648923', '2677308', '2683907', '270280', '2705522', '2707873', '2707962', '2783725', '2787259', '2808220', '2808654', '2812915', '2855703', '2868256', '2876005', '2907137', '2915693', '2929961', '2956757', '2967978', '3033796', '304845', '3075263', '3133832', '3147128', '3161761', '3173598', '3212139', '3240994', '3249623', '3273792', '3313111', '331758', '3384511', '3438123', '3456776', '3504809', '3533675', '3543329', '3563537', '3595537', '3609505', '3663771', '3665026', 
'3721327', '3808521', '3818641', '3822679', '3860193', '3919497', '3941996', '3943143', '3984378', '4009904', '4057986', '4061862', '4089700', '4096398', '4171022', '4181231', '4209896', '4278997', '4314435', '4352129', '4353800', '4423717', '4427271', '4433027', '4464775', '4512098', '4542179', '4546420', '4614860', '4663940', '4669324', '4743578', '4751534', '4753777', '4767623', '4769154', '4779725', '4822969', '4824897', '4850213', '4852870', '486891', '4915922', '4975479', '5031561', '5051247', '5086310', '5100864', '5173273', '5218878', '5221523', '5254031', '5372971', '5393536', '5411283', '5481359', '5485336', '5485416', '5565161', '5589200', '5651037', '5679592', '5680682', '5705367', '5712596', '5756378', '5798125', '5816635', '5887851', '5892057', '5901361', '5922809', '5937836', '5958880', '5989166', '6007463', '6031358', '6050675', '6064355', '6089703', '6094650', '6120309', '6129593', '6149923', '6183780', '6196342', '6242892', '6271222', '6349461', '6356526', '6443338', '6531554', '6536748', '6540211', '6562884', '6588380', '6639579', '6698545', '676890', '6891345', '6917864', '6921137', '7090264', '7268884', '7975590', '8120358', '8276094', '8436941', '8488274', '8497122', '8527558', '8728584', '8771886', '8803524', '9000235', '901551', '9019044', '902502', '9159977', '9353769', '9452473', '9474560', '9578549', '9610313', '9647686', '9670494', '9738970', '9742003', '9811403', '9915387', '9915429', '9915871', '9926830', '9933885', '9935029']

In [7]:
# NOTE(review): this import sits mid-notebook instead of in the first import cell, and the
# instance below is built from the external `review_quality` module rather than from the
# HotelReviewStats class defined in this notebook — presumably the same code; confirm the
# two have not diverged.
import review_quality
hotel_review_stats = review_quality.HotelReviewStats()
# Called without arguments, so the method's default filename is used.
states = hotel_review_stats.load_stats()
# Q_i: hotel id -> score for the hotels in `ids_result` (defined in the cell labelled In [5]).
Q_i = hotel_review_stats.calculate_final_score_local_data(ids_result, print_warning=False)

Stats loaded from D:\graduate_dissertation\final\dataset\review_hotel\review_quality.json


In [8]:

# Step 1: order the (hotel_id, score) pairs by ascending score.
sorted_items = sorted(Q_i.items(), key=lambda pair: pair[1])

# Step 2: pick up to `num_samples` hotel ids spread evenly across that ranking.
num_samples = 10
n = len(sorted_items)

if n > num_samples:
    # Fractional stride so both the lowest- and highest-ranked hotels are included.
    step = (n - 1) / (num_samples - 1)
    indices = [round(step * i) for i in range(num_samples)]
    result = [sorted_items[position][0] for position in indices]
else:
    # No more than `num_samples` hotels available: keep every id.
    result = [hotel_id for hotel_id, _score in sorted_items]

# Result
print("10 id chia đều từ thấp đến cao:", result)

10 id chia đều từ thấp đến cao: ['11510053', '13172950', '2787259', '2683907', '6917864', '4427271', '5901361', '6064355', '5887851', '4464775']


In [9]:
# Print the raw statistics record of each sampled hotel, in the order stored in `result`
# (`result` and `states` come from the cells labelled In [8] and In [7]).
for i in result:
    print(states[i])

{'count_review': 13, 'count_review_with_image': 0, 'count_long_review': 1, 'total_score': 99.0, 'num_score': 13, 'avg_score': 7.615384615384615}
{'count_review': 18, 'count_review_with_image': 0, 'count_long_review': 7, 'total_score': 167.0, 'num_score': 18, 'avg_score': 9.277777777777779}
{'count_review': 58, 'count_review_with_image': 5, 'count_long_review': 24, 'total_score': 502.0, 'num_score': 58, 'avg_score': 8.655172413793103}
{'count_review': 86, 'count_review_with_image': 10, 'count_long_review': 40, 'total_score': 739.0, 'num_score': 86, 'avg_score': 8.593023255813954}
{'count_review': 134, 'count_review_with_image': 5, 'count_long_review': 72, 'total_score': 1176.0, 'num_score': 134, 'avg_score': 8.776119402985074}
{'count_review': 230, 'count_review_with_image': 12, 'count_long_review': 107, 'total_score': 2060.0, 'num_score': 230, 'avg_score': 8.956521739130435}
{'count_review': 203, 'count_review_with_image': 18, 'count_long_review': 99, 'total_score': 1937.0, 'num_score'

In [None]:
# Display the raw statistics record of one specific hotel id.
states['13275333']

{'count_review': 21,
 'count_review_with_image': 5,
 'count_long_review': 8,
 'total_score': 199.0,
 'num_score': 21,
 'avg_score': 9.476190476190476}