In [95]:
import pandas as pd
import numpy as np
import joblib

In [90]:
import pandas as pd
import numpy as np

class MatrixLoader:
    """Load the sheets of an input-output matrix workbook into labelled DataFrames.

    On construction the workbook at ``file_path`` is opened and every sheet
    (except the ignored ones) is split into row labels, column labels and a
    data block, which are stored in ``self.matrix`` under a short key.

    Attributes:
        base_url: Root URL under which the workbooks are published.
        year: Year passed by the caller; used to build ``file_path``.
        agg: Value passed by the caller; used as a path segment of ``file_path``.
        file_path: Full URL of the workbook that is read.
        matrix: Dict mapping a sheet key to a dict with the entries
            ``'row_indexes'``, ``'column_indexes'``, ``'data_matrix'`` and
            ``'aligned_matrix'``.
    """

    # Workbook sheet name -> short key used in ``self.matrix``.
    # Sheets missing from this map keep their original name as key.
    SHEET_NAME_MAP = {
        'Producao': 'P',
        'Usos PxS': 'PxS',
        'Usos SxS': 'SxS',
        'Mat A Coef Tec': 'A',
        'Inv Leontief': 'Leontief_Inv',
        'Importacoes': 'M',
        'Imposto Import': 'T',
        'ICMS': 'ICMS',
        'IPI': 'IPI',
        'OIIL': 'OIIL',
        'MG Com': 'MG_Com',
        'MG Transp': 'MG_Transp'
    }

    # Sheets that are skipped while loading.
    IGNORED_SHEETS = ('Referência',)

    def __init__(self, year, agg):
        """Store the parameters, build the workbook URL and load all sheets.

        Args:
            year: Year used in the directory and file name of the workbook.
            agg: Path segment placed between ``base_url`` and ``year``.
        """
        self.base_url = 'https://github.com/GuilhermeZiegler/input_output/raw/master/input_output/data/iom'
        self.year = year
        self.agg = agg
        # NOTE(review): the file name hard-codes '68S' while the directory uses
        # ``agg`` -- presumably other aggregations need a different file name;
        # confirm against the repository layout before generalizing.
        self.file_path = f'{self.base_url}/{self.agg}/{self.year}/MIP-BR-CN10-68S-{self.year}.xlsx'
        self.matrix = {}  # Dictionary to hold data from each sheet
        self.load_data()

    def load_data(self):
        """Read every non-ignored sheet of the workbook into ``self.matrix``."""
        # The context manager closes the workbook handle even if a sheet fails.
        with pd.ExcelFile(self.file_path) as xls:
            for sheet_name in xls.sheet_names:
                if sheet_name in self.IGNORED_SHEETS:
                    continue
                translated_name = self.SHEET_NAME_MAP.get(sheet_name, sheet_name)
                self.extract_indices_and_matrix(translated_name, sheet_name, xls)

    def extract_indices_and_matrix(self, translated_name, sheet_name, xls):
        """Split one sheet into row labels, column labels and data, and store them.

        Args:
            translated_name: Key under which the result is stored in ``self.matrix``.
            sheet_name: Name of the sheet inside the workbook.
            xls: Open ``pd.ExcelFile`` (or anything ``pd.read_excel`` accepts).
        """
        df = pd.read_excel(xls, sheet_name=sheet_name, header=None)

        # Layout of a sheet (0-based positions): row labels start at A5,
        # column labels occupy D2:…4, the data block starts at D5.
        row_start_index = 4
        col_start_row, col_end_row = 1, 3
        col_start_col = 3

        row_indexes = df.iloc[row_start_index:, :3].reset_index(drop=True)
        column_indexes = df.iloc[col_start_row:col_end_row + 1, col_start_col:].reset_index(drop=True)

        # Entirely empty columns are removed from the stored raw block; the
        # surviving columns keep their original labels, which lets
        # align_matrix put them back under the right header.
        data_matrix = df.iloc[row_start_index:, col_start_col:].reset_index(drop=True)
        data_matrix = data_matrix.dropna(axis=1, how='all')

        self.matrix[translated_name] = {
            'row_indexes': row_indexes,
            'column_indexes': column_indexes,
            'data_matrix': data_matrix,
            'aligned_matrix': self.align_matrix(row_indexes, column_indexes, data_matrix)
        }

    def align_matrix(self, row_indexes, column_indexes, data_matrix):
        """Build a DataFrame with two-level row and column MultiIndexes.

        The inputs are never modified.

        Args:
            row_indexes: Frame whose first two columns become the row levels.
            column_indexes: Frame whose first two rows become the column levels;
                one column per data column.
            data_matrix: Frame holding the values.

        Returns:
            DataFrame whose index and columns are MultiIndexes named
            ``['codes', 'sectors']``.

        Raises:
            ValueError: If the number of data rows differs from the number of
                row labels (raised by the DataFrame constructor).
        """
        n_cols = column_indexes.shape[1]
        header_labels = column_indexes.columns
        data_labels = data_matrix.columns

        if (header_labels.is_unique and data_labels.is_unique
                and data_labels.isin(header_labels).all()):
            # Data columns are a subset of the header columns: align by label
            # so a column dropped from the middle leaves a NaN gap in place
            # instead of shifting later columns under the wrong header.
            values = data_matrix.reindex(columns=header_labels)
        else:
            # Unrelated labels: fall back to positional alignment, truncating
            # surplus columns or padding NaN columns on the right.
            values = data_matrix.iloc[:, :n_cols].copy()
            for k in range(n_cols - values.shape[1]):
                values[f'NaN_{k}'] = np.nan

        # Exactly two row levels: keep the first two columns, pad with NaN.
        row_levels = row_indexes.iloc[:, :2].copy()
        for k in range(row_levels.shape[1], 2):
            row_levels[f'dummy_{k}'] = np.nan

        # Exactly two column levels: keep the first two rows, pad with NaN rows.
        col_levels = column_indexes.iloc[:2].reset_index(drop=True).reindex(range(2))

        level_names = ['codes', 'sectors']
        row_index = pd.MultiIndex.from_frame(row_levels, names=level_names)
        col_index = pd.MultiIndex.from_frame(col_levels.T, names=level_names)

        return pd.DataFrame(values.values, index=row_index, columns=col_index)

    def get_aligned_matrix(self, sheet_name):
        """Return the aligned matrix stored under ``sheet_name``, or None if absent.

        ``sheet_name`` is the key used in ``self.matrix`` (the short name from
        ``SHEET_NAME_MAP`` when the sheet has one).
        """
        return self.matrix.get(sheet_name, {}).get('aligned_matrix', None)

    def display_aligned_matrices(self):
        """Display the aligned matrices for all sheets."""
        # ``display`` is the rich-output function that IPython/Jupyter injects
        # into the notebook namespace.
        for sheet_name, data in self.matrix.items():
            print(f"Sheet: {sheet_name}")
            print("Aligned Matrix:")
            display(data['aligned_matrix'])
            print("\n")

# Example usage
# loader = MatrixLoader(2024, 'some_agg_value')
# loader.display_aligned_matrices()



In [91]:
# Example usage: build a loader for year '2018' with agg '68S'.
# Constructing MatrixLoader calls load_data(), so the workbook is read here.
year = '2018'
agg = '68S'
min_68S_2018 = MatrixLoader(year, agg)



In [92]:
# List the keys under which the loaded sheets are stored.
# Only the keys are printed, so iterate the dict directly instead of .items().
for key in min_68S_2018.matrix:
    print(key)
   


P
PxS
SxS
A
Leontief_Inv
M
T
ICMS
IPI
OIIL
MG_Com
MG_Transp


In [94]:
# Inspect the entry stored under key 'P': a dict with 'row_indexes',
# 'column_indexes', 'data_matrix' and 'aligned_matrix'.
min_68S_2018.matrix['P']

{'row_indexes':        0                                                  1     2
 0   0191  Agricultura, inclusive o apoio à agricultura e...   1.0
 1   0192             Pecuária, inclusive o apoio à pecuária   2.0
 2   0280            Produção florestal; pesca e aquicultura   3.0
 3   0580  Extração de carvão mineral e de minerais não-m...   4.0
 4   0680  Extração de petróleo e gás, inclusive as ativi...   5.0
 ..   ...                                                ...   ...
 66  9480  Organizações associativas e outros serviços pe...  67.0
 67  9700                                Serviços domésticos  68.0
 68   NaN                                  Total\ndo produto  69.0
 69   NaN                                                NaN   NaN
 70   NaN                                        Conferência   NaN
 
 [71 rows x 3 columns],
 'column_indexes':                              3              4    \
 0                          01911          01912   
 1  Arroz, trigo e outros cereais