# Form 1

In [13]:
import pandas as pd
import logging
import datetime
from indices import form_1_indices as INDICES


# Report files to process, read one by one by construct_df.
csv_list = ['./Reports/1.csv', './Reports/2.csv', './Reports/3.csv', './Reports/4.csv']
# Logging: the root logger writes DEBUG-and-above records to transform_logs.log.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
# Re-running this cell must not stack a second FileHandler on the root logger
# (every record would then be written to the file once per handler), so an
# already attached handler for the same log file is reused.
fh = next(
    (handler for handler in logger.handlers
     if isinstance(handler, logging.FileHandler)
     and handler.baseFilename.endswith('transform_logs.log')),
    None,
)
if fh is None:
    fh = logging.FileHandler('transform_logs.log', encoding='utf-8')
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)
# Name of the report script this configuration belongs to.
NAME = 'report_form_1.py'

def construct_df(csv_list):
    '''
    Build the Form 1 pivots (all calls and RPC calls) from a list of report CSVs.

    Every file is read, cleaned and pivoted on its own; the resulting pivot is
    then appended column-wise to a frame whose index starts from INDICES.

    Author's timing notes (linear time):
    ~12 sec for 1 day
    ~6 min for 1 month

    Parameters
    ----------
    csv_list : iterable of str
        Paths of ';'-separated UTF-8 CSV reports with a header row.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        ``(df_main, df_rpc)``: the pivot over all rows and the pivot over rows
        whose 'Контактное лицо' equals 'Должник'.
    '''
    def sort_by_date(pivot):
        # Column labels arrive as 'dd.mm.YYYY' strings; parse them so the
        # columns sort chronologically rather than lexicographically.
        pivot.columns = pd.to_datetime(pivot.columns, format='%d.%m.%Y')
        return pivot.sort_index(axis=1)

    def create_pivot(df):
        # (1) Error count per call list and date; zero counts are shown as ''.
        pivot_df_mistakes = df.pivot_table(index='Имя колл-листа', columns='Дата', values='Ошибки', aggfunc='sum')
        pivot_df_mistakes = pivot_df_mistakes.fillna(0)
        pivot_df_mistakes = pivot_df_mistakes.replace(0.00, '')
        pivot_df_mistakes = sort_by_date(pivot_df_mistakes)
        # Keep a copy with the original row labels for the error-rate division.
        tmp_pivot_df_mistakes = pivot_df_mistakes.copy()
        pivot_df_mistakes.index = pivot_df_mistakes.index + ' (ошибки шт.)'
        # (2) Number of calls per call list and date.
        pivot_df_calls = df.pivot_table(index='Имя колл-листа', columns='Дата', values='Результат автооценки', aggfunc='count', fill_value=0)
        pivot_df_calls = sort_by_date(pivot_df_calls)
        tmp_pivot_df_calls = pivot_df_calls.copy()
        pivot_df_calls.index = pivot_df_calls.index + ' (всего шт.)'
        # (3) Mean autoscore per call list and date.
        pivot_df_mean = df.pivot_table(index='Имя колл-листа', columns='Дата', values='Результат автооценки', aggfunc='mean', fill_value='')
        pivot_df_mean = sort_by_date(pivot_df_mean)
        pivot_df_mean.index = pivot_df_mean.index + ' (средняя АО)'
        # (4) Error share = errors / calls; undefined cells are shown as ''.
        pivot_df_mistakes_filled = tmp_pivot_df_mistakes.replace('', 0)
        pivot_df_error_rate = (pivot_df_mistakes_filled / tmp_pivot_df_calls).applymap(lambda x: x if not pd.isna(x) else '')
        pivot_df_error_rate.index = pivot_df_error_rate.index + ' (доля ошибок %)'
        # Stack the four metric tables vertically and order rows by label, so
        # the metrics of one call list end up next to each other.
        pivot_table = pd.concat([pivot_df_mistakes, pivot_df_calls, pivot_df_mean, pivot_df_error_rate], axis=0)
        pivot_table = pivot_table.sort_index()
        return pivot_table

    # Accumulators; rows start from the external index in indices.py.
    df_main = pd.DataFrame(index=INDICES)
    df_rpc = pd.DataFrame(index=INDICES)
    for csv_path in csv_list:
        # Reports are processed one by one and merged on the external index.
        # Original author's note: this cuts the RAM cost about 30 times at the
        # price of a slower run.
        df = pd.read_csv(csv_path, sep=';', encoding='utf-8', header=0)
        # Drop rows without '№ п/п' (original note: multi-contract rows for RSB).
        df = df[df['№ п/п'].notna()]
        # Explicit copy: the columns assigned below must be written to an
        # independent frame, not to a slice of the raw one (this is what raised
        # SettingWithCopyWarning). Done as a separate statement so the raw frame
        # is already released while the copy is made.
        df = df.copy()
        # Convert the 'Длительность звонка' column to Timedelta
        df['Длительность звонка'] = pd.to_timedelta(df['Длительность звонка'])
        # A call counts as an error when its autoscore is not exactly 100.
        df['Ошибки'] = df['Результат автооценки'] != 100
        # Reduce the call timestamp to a 'dd.mm.YYYY' day label.
        df['Дата'] = pd.to_datetime(df['Дата звонка'], format='%d.%m.%Y %H:%M:%S')
        df['Дата'] = df['Дата'].dt.strftime('%d.%m.%Y')
        df = df.reset_index(drop=True)
        # RPC subset: rows where the contact person is 'Должник'.
        rpc_df = df[df['Контактное лицо'] == 'Должник']
        rpc_df = rpc_df.reset_index(drop=True)
        # Warn when a single report spans more than one date.
        if len(df['Дата'].unique()) > 1:
            logger.warning('%s Warning: more than a single date in df...', datetime.datetime.now())

        # Append each pivot to its accumulator and drop the temporary right
        # away (original note: saves ~10MB each).
        main_pivot = create_pivot(df)
        df_main = pd.concat([df_main, main_pivot], axis=1)
        del main_pivot
        rpc_pivot = create_pivot(rpc_df)
        df_rpc = pd.concat([df_rpc, rpc_pivot], axis=1)
        del rpc_pivot
    # Returns 2 complete pivots
    return df_main, df_rpc

def construct_summary(df_main, df_rpc):
    '''
    Build the Form 1 summary sheet from the two pivots.

    For each pivot one summary column is produced: call counts and error
    counts are summed over the date columns, error share and mean autoscore
    are averaged over the date columns (a simple, unweighted mean by day).

    Parameters
    ----------
    df_main : DataFrame
        Pivot over all calls, as returned by construct_df.
    df_rpc : DataFrame
        Pivot over RPC calls, as returned by construct_df.

    Returns
    -------
    DataFrame
        Indexed by INDICES (sorted), with the columns 'Свод: все звонки' and
        'Свод: RPC'.
    '''
    def create_col(pivot, title):
        def rows_for(suffix):
            # The suffixes contain '(', ')' and '.', so they must be matched
            # literally: as a regex the parentheses form a capture group
            # (pandas warns about it) and '.' matches any character.
            selected = pivot[pivot.index.str.contains(suffix, regex=False)]
            # Empty cells are '' strings; coerce them to NaN before aggregating.
            return selected.apply(pd.to_numeric, errors='coerce')

        calls = rows_for('(всего шт.)').sum(axis=1)
        error_rate = rows_for('(доля ошибок %)').mean(axis=1, skipna=True, numeric_only=True)
        errors = rows_for('(ошибки шт.)').sum(axis=1)
        score = rows_for('(средняя АО)').mean(axis=1, skipna=True, numeric_only=True)
        # The four series carry disjoint row labels; the assignment aligns
        # them with the external index.
        summary = pd.DataFrame(index=INDICES)
        summary[f'Свод: {title}'] = pd.concat([calls, error_rate, errors, score], axis=0)
        summary = summary.sort_index()
        return summary

    # One summary column per slice, side by side.
    return pd.concat([create_col(df_main, 'все звонки'),
                      create_col(df_rpc, 'RPC')], axis=1)

# Form 2

In [200]:
import pandas as pd
import logging
import datetime
from indices import form_2_indices as INDICES


# Report files to process, read one by one by construct_df.
csv_list = ['15GB.csv']
# Logging: the root logger writes DEBUG-and-above records to transform_logs.log.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
# Re-running this cell must not stack a second FileHandler on the root logger
# (every record would then be written to the file once per handler), so an
# already attached handler for the same log file is reused.
fh = next(
    (handler for handler in logger.handlers
     if isinstance(handler, logging.FileHandler)
     and handler.baseFilename.endswith('transform_logs.log')),
    None,
)
if fh is None:
    fh = logging.FileHandler('transform_logs.log', encoding='utf-8')
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)
# Name of the report script this configuration belongs to.
NAME = 'report_form_2.py'

In [201]:
def construct_df(csv_list):
    '''
    Build the Form 2 pivots (all calls and RPC calls) from a list of report CSVs.

    Rows are keyed by ('Имя колл-листа', 'Результат робота'); columns form a
    two-level header (date, metric). Every file is read, cleaned and pivoted on
    its own, and the pivot is appended column-wise to a frame whose index is
    built from INDICES.

    Author's timing notes (linear time):
    ~9 sec for 1 day
    ~5 min for 1 month

    Parameters
    ----------
    csv_list : iterable of str
        Paths of ';'-separated UTF-8 CSV reports with a header row.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        ``(df_main, df_rpc)``: the pivot over all rows and the pivot over rows
        whose 'Контактное лицо' equals 'Должник'.
    '''
    def sort_by_date(pivot):
        # Parse the 'dd.mm.YYYY' column labels so the columns sort
        # chronologically rather than lexicographically.
        pivot.columns = pd.to_datetime(pivot.columns, format='%d.%m.%Y')
        return pivot.sort_index(axis=1)

    def create_pivot(df):
        def create_multiindex(dataframe, sub_index: str):
            # Turn each column label into the pair (label, sub_index).
            dataframe.columns = pd.MultiIndex.from_tuples(
                [(column, sub_index) for column in dataframe]
            )
            return dataframe

        # Number of calls per (call list, robot result) and date.
        pivot_df_calls = df.pivot_table(index=['Имя колл-листа', 'Результат робота'], columns='Дата', values='Результат автооценки', aggfunc='count', fill_value='')
        pivot_df_calls = sort_by_date(pivot_df_calls)
        # Number of errors.
        pivot_df_errors = df.pivot_table(index=['Имя колл-листа', 'Результат робота'], columns='Дата', values='Ошибки', aggfunc='sum', fill_value='')
        pivot_df_errors = sort_by_date(pivot_df_errors)
        # Mean autoscore.
        pivot_df_mean = df.pivot_table(index=['Имя колл-листа', 'Результат робота'], columns='Дата', values='Результат автооценки', aggfunc='mean', fill_value='')
        pivot_df_mean = sort_by_date(pivot_df_mean)
        # Error share = errors / calls; '' placeholders are treated as missing
        # for the division and undefined results are shown as '' again.
        pivot_df_error_rate = (pivot_df_errors.replace("", pd.NA) / pivot_df_calls.replace("", pd.NA)).applymap(lambda x: x if not pd.isna(x) else '')
        pivot_df_error_rate = sort_by_date(pivot_df_error_rate)
        # Attach the metric name as the second header level.
        pivot_df_calls = create_multiindex(pivot_df_calls, 'Зв.(шт.)')
        pivot_df_errors = create_multiindex(pivot_df_errors, 'Ошб.(шт.)')
        pivot_df_mean = create_multiindex(pivot_df_mean, 'Ср.АО')
        pivot_df_error_rate = create_multiindex(pivot_df_error_rate, 'Ошб.%')
        # Metric order inside every date group of the merged table.
        dfs_to_merge = [pivot_df_error_rate, pivot_df_errors, pivot_df_calls, pivot_df_mean]
        merged_df = pd.DataFrame(index=pivot_df_calls.index)
        multi_index = []
        # Interleave the metric tables date by date. Columns are taken by
        # position, so the four tables are expected to list the same dates in
        # the same order.
        for num in range(len(pivot_df_calls.columns)):
            for dataframe in dfs_to_merge:
                col_name = dataframe.columns[num]
                multi_index.append(col_name)
                merged_df[col_name] = dataframe.iloc[:, num]
        merged_df.columns = pd.MultiIndex.from_tuples(multi_index)
        # Returns merged df
        return merged_df

    # Accumulators: rows come from the external index; a placeholder column
    # gives the empty frames a two-level header and is removed at the end.
    multi_index = pd.MultiIndex.from_tuples(INDICES)
    multi_header = pd.MultiIndex.from_tuples([('tmp1','tmp2')])
    df_main = pd.DataFrame(index=multi_index, columns=multi_header)
    df_rpc = pd.DataFrame(index=multi_index, columns=multi_header)
    for csv_path in csv_list:
        # Reports are processed one by one and merged on the external index.
        # Original author's note: this cuts the RAM cost about 30 times at the
        # price of a slower run.
        df = pd.read_csv(csv_path, sep=';', encoding='utf-8', header=0)
        # Drop rows without '№ п/п' (original note: multi-contract rows for RSB).
        df = df[df['№ п/п'].notna()]
        # Explicit copy: the columns assigned below must be written to an
        # independent frame, not to a slice of the raw one (this is what raised
        # SettingWithCopyWarning). Done as a separate statement so the raw frame
        # is already released while the copy is made.
        df = df.copy()
        # Convert the 'Длительность звонка' column to Timedelta
        df['Длительность звонка'] = pd.to_timedelta(df['Длительность звонка'])
        # A call counts as an error when its autoscore is not exactly 100.
        df['Ошибки'] = df['Результат автооценки'] != 100
        # Reduce the call timestamp to a 'dd.mm.YYYY' day label.
        df['Дата'] = pd.to_datetime(df['Дата звонка'], format='%d.%m.%Y %H:%M:%S')
        df['Дата'] = df['Дата'].dt.strftime('%d.%m.%Y')
        df = df.reset_index(drop=True)
        # RPC subset: rows where the contact person is 'Должник'.
        rpc_df = df[df['Контактное лицо'] == 'Должник']
        rpc_df = rpc_df.reset_index(drop=True)
        # Warn when a single report spans more than one date.
        if len(df['Дата'].unique()) > 1:
            logger.warning('%s Warning: more than a single date in df...', datetime.datetime.now())
        # Append each pivot to its accumulator and drop the temporary right
        # away (original note: saves ~10MB each).
        main_pivot = create_pivot(df)
        df_main = pd.concat([df_main, main_pivot], axis=1)
        del main_pivot
        rpc_pivot = create_pivot(rpc_df)
        df_rpc = pd.concat([df_rpc, rpc_pivot], axis=1)
        del rpc_pivot
    # Remove the placeholder columns.
    del df_main[('tmp1','tmp2')]
    del df_rpc[('tmp1','tmp2')]
    # Returns 2 complete pivots
    return df_main, df_rpc

def construct_summary(df_main, df_rpc):
    '''
    Build the Form 2 summary sheet from the two pivots.

    For each pivot the per-date metric columns are collapsed into four summary
    columns under one title: error share and mean autoscore are averaged over
    the dates (a simple, unweighted mean by day), error and call counts are
    summed over the dates.

    Parameters
    ----------
    df_main : DataFrame
        Pivot over all calls with a two-level column header (date, metric).
    df_rpc : DataFrame
        Pivot over RPC calls with the same layout.

    Returns
    -------
    DataFrame
        Two-level columns (title, metric) for 'Срез: все звонки' and
        'Срез: RPC', in the metric order 'Ошб.%', 'Ошб.(шт.)', 'Зв.(шт.)',
        'Ср.АО'.
    '''
    def create_col(pivot, title):
        # The pivots mark empty cells with '' strings; coerce everything to
        # numbers first, otherwise mean/sum operate on object columns and
        # fail or mis-aggregate.
        df = pivot.apply(pd.to_numeric, errors='coerce')
        metric = df.columns.get_level_values(1)
        errors_percent = df.loc[:, metric == 'Ошб.%'].mean(axis=1, skipna=True)
        errors_count = df.loc[:, metric == 'Ошб.(шт.)'].sum(axis=1, numeric_only=True, skipna=True)
        calls_count = df.loc[:, metric == 'Зв.(шт.)'].sum(axis=1, numeric_only=True, skipna=True)
        score = df.loc[:, metric == 'Ср.АО'].mean(axis=1, skipna=True)
        # A placeholder column gives the empty frame a two-level header; it is
        # removed once the real columns are in place.
        df = pd.DataFrame(index=pd.MultiIndex.from_tuples(df.index), columns=pd.MultiIndex.from_tuples([(title,'')]))
        # Rows without any calls get an empty error count instead of 0
        # (a sum over all-missing values is 0).
        mask = calls_count == 0
        df[(title, 'Ошб.%')] = errors_percent
        df[(title, 'Ошб.(шт.)')] = errors_count[~mask]
        df[(title, 'Зв.(шт.)')] = calls_count.replace(0,pd.NA)
        df[(title, 'Ср.АО')] = score
        del df[(title, '')]
        # Returns the summary block for this slice
        return df

    # One summary block per slice, side by side.
    return pd.concat([create_col(df_main, 'Срез: все звонки'),
                      create_col(df_rpc, 'Срез: RPC')], axis=1)

In [202]:
# Build both pivots: i receives the first returned frame (all calls), j the
# second (RPC calls). Later cells refer to these names.
i, j = construct_df(csv_list)

In [226]:
# FUNC Construct summary
def construct_summary(df_main, df_rpc):
    '''
    Summary sheet for Form 2.

    Collapses the per-date columns of each pivot into four columns under one
    title: mean error share, total errors, total calls and mean autoscore.
    Means are simple (unweighted) averages over the dates.
    '''
    def create_col(pivot, title):
        # Empty cells are stored as non-numeric values; turn them into NaN.
        numeric = pivot.apply(pd.to_numeric, errors='coerce')
        metric_labels = numeric.columns.get_level_values(1)

        def pick(metric_name):
            # All date columns that carry the given metric.
            return numeric.loc[:, metric_labels == metric_name]

        mean_error_share = pick('Ошб.%').mean(axis=1, skipna=True)
        total_errors = pick('Ошб.(шт.)').sum(axis=1, numeric_only=True, skipna=True)
        total_calls = pick('Зв.(шт.)').sum(axis=1, numeric_only=True, skipna=True)
        mean_score = pick('Ср.АО').mean(axis=1, skipna=True)

        # Start from a frame with a throw-away column so the header has two
        # levels from the beginning.
        placeholder = (title, '')
        block = pd.DataFrame(
            index=pd.MultiIndex.from_tuples(numeric.index),
            columns=pd.MultiIndex.from_tuples([placeholder]),
        )
        # Error totals are kept only for rows that had calls; a sum over
        # all-missing values would otherwise show up as 0.
        has_calls = total_calls != 0
        block[(title, 'Ошб.%')] = mean_error_share
        block[(title, 'Ошб.(шт.)')] = total_errors[has_calls]
        block[(title, 'Зв.(шт.)')] = total_calls.replace(0, pd.NA)
        block[(title, 'Ср.АО')] = mean_score
        del block[placeholder]
        return block

    # Place the two slices side by side.
    slices = [
        create_col(df_main, 'Срез: все звонки'),
        create_col(df_rpc, 'Срез: RPC'),
    ]
    return pd.concat(slices, axis=1)


In [227]:
# Summary sheet built from the all-calls pivot (i) and the RPC pivot (j).
z = construct_summary(i, j)

In [225]:
# Scratch check: show the per-date 'Ошб.%' columns of the RPC pivot (j) before
# any aggregation; the row-wise mean below is left disabled on purpose.
errors_percent = j.loc[:, j.columns.get_level_values(1) == 'Ошб.%']
#errors_percent = errors_percent.mean(axis=1, skipna=True)
errors_percent

Unnamed: 0_level_0,Unnamed: 1_level_0,2023-09-18 00:00:00,2023-09-19 00:00:00,2023-09-20 00:00:00,2023-09-21 00:00:00,2023-09-22 00:00:00,2023-09-23 00:00:00,2023-09-24 00:00:00
Unnamed: 0_level_1,Unnamed: 1_level_1,Ошб.%,Ошб.%,Ошб.%,Ошб.%,Ошб.%,Ошб.%,Ошб.%
Актуализация,АО: Абонент не отвечает,,,,,,,
Актуализация,АО: Абонент недоступен,,,,,,,
Актуализация,АО: Номер не существует,,,,,,,
Актуализация,АО: Нужен внутренний номер,,,,,,,
Актуализация,АО: Соединение установлено,,,,,,,
...,...,...,...,...,...,...,...,...
ФЗ-230,Отрицает долг,0.0331126,0.0444444,0,0.0263158,0.010101,0.0307692,0.0204082
ФЗ-230,Перезвонить,,,,,,,
ФЗ-230,Просьба передать информацию,,,,,,,
ФЗ-230,Сброс звонка роботом,,,,,,,


In [236]:
# Inspect the column labels of the summary after the row index has been moved
# into regular columns.
z.reset_index().columns

MultiIndex([(         'level_0',          ''),
            (         'level_1',          ''),
            ('Срез: все звонки',     'Ошб.%'),
            ('Срез: все звонки', 'Ошб.(шт.)'),
            ('Срез: все звонки',  'Зв.(шт.)'),
            ('Срез: все звонки',     'Ср.АО'),
            (       'Срез: RPC',     'Ошб.%'),
            (       'Срез: RPC', 'Ошб.(шт.)'),
            (       'Срез: RPC',  'Зв.(шт.)'),
            (       'Срез: RPC',     'Ср.АО')],
           )