### Basic imports & helper functions

In [2]:
import pickle
import re
import pandas as pd
import numpy as np
import random

In [3]:
class RatingProcessor:
    """Build a class-balanced sample of credit-rating observations.

    The pipeline loads a pickled ``{company: {quarter: rating}}``-style
    mapping, converts letter ratings to ordinal ranks, computes the
    quarter-over-quarter change for every company, and draws an equally sized
    random sample of upgrades, downgrades and unchanged observations.

    Attributes are filled in step by step:
        df: raw letter ratings (rows = companies, columns = quarters).
        df_num: the same frame as float ranks, NaN where no rank exists.
        df_diff: rank change versus the previous column.
        final_ratings: ``{company: {quarter: rating}}`` for sampled cells.
    """

    # Letter ratings ordered from worst to best. 'CC' ranks *below* the CCC
    # bucket on the standard agency scale; the previous table had it above
    # 'CCC+', which flipped the sign of every CC <-> CCC* transition.
    RATING_ORDER = (
        'D', 'CC', 'CCC-', 'CCC', 'CCC+',
        'B-', 'B', 'B+', 'BB-', 'BB', 'BB+',
        'BBB-', 'BBB', 'BBB+', 'A-', 'A', 'A+',
        'AA-', 'AA', 'AA+', 'AAA',
    )
    # Built once at class creation instead of once per DataFrame cell.
    _RATING_TO_NUM = {rating: rank for rank, rating in enumerate(RATING_ORDER)}

    def __init__(self, file_path):
        """Remember the input pickle path; no I/O happens here."""
        self.file_path = file_path
        self.df = None
        self.df_num = None
        self.df_diff = None
        self.final_ratings = {}

    def load_data(self):
        """Load the pickle at ``file_path`` into ``self.df`` (transposed)."""
        # NOTE(security): unpickling executes arbitrary code embedded in the
        # file. Only load pickles that come from a trusted source.
        with open(self.file_path, 'rb') as f:
            data = pickle.load(f)
        self.df = pd.DataFrame(data).T

    def rating_to_numeric(self, rating):
        """Return the ordinal rank of ``rating`` (0 = worst).

        Returns ``None`` for missing values and for symbols that are not in
        ``RATING_ORDER``.
        """
        if pd.isna(rating):
            return None
        return self._RATING_TO_NUM.get(rating)

    def convert_ratings(self):
        """Populate ``self.df_num`` with the numeric rank of every cell."""
        # ``DataFrame.applymap`` was renamed to ``DataFrame.map`` in
        # pandas 2.1; prefer the new name when the installed pandas has it.
        if hasattr(pd.DataFrame, 'map'):
            converted = self.df.map(self.rating_to_numeric)
        else:
            converted = self.df.applymap(self.rating_to_numeric)
        # Force float so that None becomes NaN everywhere; an object-dtype
        # column (e.g. a quarter with no valid rating) would break ``diff``.
        self.df_num = converted.astype(float)

    def calculate_diff(self):
        """Populate ``self.df_diff`` with the change versus the prior column.

        The first column has no predecessor, so it is set to 0 wherever a
        valid rank exists and left as NaN otherwise.
        """
        self.df_diff = self.df_num.diff(axis=1)
        if self.df_diff.shape[1] == 0:
            return  # nothing to seed on a frame without columns
        # Use the numeric frame (not the raw one) so that an unrecognised
        # symbol in the first quarter is treated as missing, exactly as it is
        # in every later quarter, instead of being counted as "no change".
        has_rank = self.df_num.iloc[:, 0].notna()
        self.df_diff.iloc[:, 0] = np.where(has_rank, 0.0, np.nan)

    def count_changes(self):
        """Return ``(no_change, increase, decrease)`` cell counts."""
        no_change = int((self.df_diff == 0).sum().sum())
        increase = int((self.df_diff > 0).sum().sum())
        decrease = int((self.df_diff < 0).sum().sum())
        return no_change, increase, decrease

    def _cells_where(self, mask):
        """Return ``(company, quarter)`` labels of the True cells of ``mask``."""
        index = self.df_diff.index
        columns = self.df_diff.columns
        # ``np.nonzero`` walks the array in row-major order, i.e. company by
        # company and quarter by quarter, the same order as a nested loop.
        rows, cols = np.nonzero(mask.to_numpy())
        return [(index[r], columns[c]) for r, c in zip(rows, cols)]

    def extract_ratings_by_change_type(self):
        """Return ``(increase, decrease, no_change)`` lists of cell labels.

        Each list holds ``(company, quarter)`` tuples; cells whose change is
        NaN appear in none of them.
        """
        increase = self._cells_where(self.df_diff > 0)
        decrease = self._cells_where(self.df_diff < 0)
        no_change = self._cells_where(self.df_diff == 0)
        return increase, decrease, no_change

    def select_samples(self, increase, decrease, no_change, seed=None):
        """Draw an equally sized random sample from each change category.

        Args:
            increase, decrease, no_change: lists of ``(company, quarter)``.
            seed: optional seed for a private random generator, making the
                draw reproducible. ``None`` keeps using the global ``random``
                state.

        Returns:
            Sampled increases, then decreases, then unchanged cells; each
            group has the size of the smallest category.
        """
        rng = random if seed is None else random.Random(seed)
        num_samples = min(len(increase), len(decrease), len(no_change))
        selected_increase = rng.sample(increase, num_samples)
        selected_decrease = rng.sample(decrease, num_samples)
        selected_no_change = rng.sample(no_change, num_samples)
        return selected_increase + selected_decrease + selected_no_change

    def create_final_ratings(self, selected_data):
        """Add the raw rating of every selected cell to ``final_ratings``."""
        for company, quarter in selected_data:
            per_company = self.final_ratings.setdefault(company, {})
            per_company[quarter] = self.df.loc[company, quarter]

    def save_final_ratings(self, output_path):
        """Pickle ``final_ratings`` to ``output_path``."""
        pd.to_pickle(self.final_ratings, output_path)

    def process_ratings(self, output_path, seed=None):
        """Run the whole pipeline and return the sampled ratings.

        Prints the category counts, writes the sample to ``output_path`` and
        returns it. ``seed`` is forwarded to ``select_samples``.
        """
        self.load_data()
        self.convert_ratings()
        self.calculate_diff()
        no_change, increase, decrease = self.count_changes()
        print(f"No Change: {no_change}")
        print(f"Increase: {increase}")
        print(f"Decrease: {decrease}")

        increase, decrease, no_change = self.extract_ratings_by_change_type()
        selected_data = self.select_samples(increase, decrease, no_change, seed=seed)
        # Start from a clean slate so re-running on the same instance does
        # not mix in cells from an earlier random draw.
        self.final_ratings = {}
        self.create_final_ratings(selected_data)
        self.save_final_ratings(output_path)
        return self.final_ratings

# Usage example: run the full pipeline on the retail/industrial ratings
# pickle, write the balanced sample next to it, and show the result.
processor = RatingProcessor('data/ratings_retail_indus.pkl')
# process_ratings prints the per-category counts, saves the sample to the
# given path and returns it as {company: {quarter: rating}}.
final_ratings = processor.process_ratings('data/ratings_retail_indus_processed.pkl')
print("Final Ratings:", final_ratings)


No Change: 19962
Increase: 357
Decrease: 168
Final Ratings: {'URI': {'2012Q4': 'B+', '2014Q1': 'BB-', '2019Q3': 'BB-'}, 'TGI': {'2014Q1': 'BB+', '2016Q3': 'BB-', '2015Q4': 'BB'}, 'CP': {'2015Q1': 'BBB+', '2011Q3': 'BBB-'}, 'ALK': {'2013Q4': 'BB', '2015Q1': 'BBB-', '2010Q4': 'B+', '2014Q1': 'BB+', '2017Q1': 'BB+', '2010Q1': 'B', '2014Q3': 'BB+', '2020Q4': 'BB+'}, 'MGA': {'2014Q1': 'A-'}, 'TSN': {'2013Q2': 'BBB', '2011Q2': 'BBB-', '2010Q4': 'BB+', '2011Q1': 'BB+'}, 'M': {'2011Q3': 'BBB-', '2017Q2': 'BBB-', '2016Q2': 'BBB'}, 'HLT': {'2016Q3': 'BB+', '2015Q2': 'BB', '2023Q3': 'BB+'}, 'AAL': {'2015Q3': 'BB-', '2012Q1': 'D', '2014Q3': 'B'}, 'ABG': {'2013Q2': 'BB', '2012Q3': 'BB-', '2019Q1': 'BB+', '2010Q4': 'B+', '2013Q4': 'BB'}, 'PFMT': {'2013Q2': 'BB-', '2015Q3': 'B+'}, 'HAS': {'2011Q3': 'BBB+', '2013Q2': 'BBB', '2018Q4': 'BBB', '2020Q1': 'BBB'}, 'OC': {'2016Q4': 'BBB'}, 'DDS': {'2011Q2': 'BB-', '2012Q2': 'BB', '2013Q2': 'BB+'}, 'CMI': {'2011Q4': 'A', '2014Q4': 'A+'}, 'KDP': {'2014Q3': 'BB