In [1]:
import pandas as pd
from datetime import datetime
from gsheets_connect import GoogleSheets, Connect
import helper_functions as hlp
from expense import Expense
import os
from credit_card_object import CreditCard

GSHEET_ID = '14a36tesQZ2AH0aIEdG5Ch2F8iVyrnXCvlxeASLX47N4'

gsheet = GoogleSheets()

mnths_dict = {
    'Ene':'01','Feb':'02','Mar':'03',
    'Abr':'04','May':'05','Jun':'06',
    'Jul':'07','Ago':'08','Sep':'09',
    'Oct':'10','Nov':'11','Dic':'12'
}

# Helper Functions

In [2]:
def loadXlsx(file_path):
    data = pd.read_excel(file_path, header=None)
    
    ix_of_first = index_of_first_spend(data)
    
    data = data.iloc[
        ix_of_first:,:3
    ].copy().reset_index(drop=True)
    
    # Date column as datetime
    data.iloc[:,0] = data.iloc[:,0].apply(
        lambda x: parse_date(x)[1]
    )
    
    # Amount column as float
    data.iloc[:,2] = data.iloc[:,2].astype(float)
    
    return data

def loadXls(file_path):
    file = pd.read_html(file_path)
    data = file[1]
    
    ix_of_first = index_of_first_spend(data)

    
    data = data.iloc[
        ix_of_first:,[0,2,3]
    ].copy().reset_index(drop=True)
    
    # Date columns as datetime
    data.iloc[:,0] = data.iloc[:,0].apply(
        lambda x: parse_date(x)[1]
    )
    
    # Amount column as float
    data.iloc[:,2] = data.iloc[:,2].astype(float)
    
    return data

def get_available_files():
    data_sources = []
    cwd = os.getcwd()
    files_path = os.path.join(cwd,"input_data")
    dir_content = os.listdir(files_path)
    for file_name in dir_content:
        file_path = os.path.join(files_path, file_name)
        if file_path.endswith('.xlsx') or file_path.endswith('.xls'):
            data_sources.append(file_path)

    return data_sources

def list_files(files_list):
    lst_str = ""
    for i, obj in enumerate(files_list):
        lst_str += '\n'+str(i)+') '+obj.split('/')[-1]
    return lst_str

def open_file(file_path):
    if file_path.endswith('.xlsx'):
        loaded_data = loadXlsx(file_path)
    elif file_path.endswith('.xls'):
        loaded_data = loadXls(file_path)
    else:
        raise ValueError("Unable to open file: {0}".format(file_path))
    return loaded_data

def ask_user_for_file():
    files = get_available_files()
    listed_files = list_files(files)
    if len(files)==0:
        raise ValueError('No files to process')
    elif len(files)>1:
        file_num = input("""
I have this files available: """+listed_files+"""

Which one do you want to process?: """)
    else:
        file_num = 0
    return files[int(file_num)]

def load_data():
    file_to_process = ask_user_for_file()
    data = open_file(file_to_process)
    
    print("Data loaded correctly \n")
    return data.dropna()

def index_of_first_spend(data_frame):
    for ix, row in data_frame.iterrows():
        val = row.iloc[0]
        if not pd.isnull(val) and parse_date(val)[0]:
            return ix
    return None
    
def parse_date(date_string):
    try:
        date_string = parse_month(date_string)
        date = datetime.strptime(date_string, '%d/%m/%Y')
        return (True, date)
    except (ValueError, TypeError):
        return (False, None)

def parse_month(date_string):
    for m,v in mnths_dict.items():
        if m in date_string:
            date_string = date_string.replace(m,v)
    return date_string

def update_expenses_data_frame(cur_df, expense, ignored=False):
    
    lst_ix = 0 if cur_df.shape[0] == 0 else cur_df.index.max()+1
    new_df = pd.DataFrame(index=[lst_ix], columns=cur_df.columns)
    
    new_df.iloc[0].loc["expense_id"] = expense.getId()
    new_df.iloc[0].loc["expense_date"]=expense.getDateAsString()
    new_df.iloc[0].loc["description"] = expense.getDescription()
    new_df.iloc[0].loc["amount"]=expense.getAmount()
    new_df.iloc[0].loc["installments"]=expense.getInstallments()
    new_df.iloc[0].loc["ignored"]=ignored
    new_df.iloc[0].loc["payment_method"]=expense.getPaymentMethod()
    new_df.iloc[0].loc["method_name"]=expense.getPaymentMethodName()
    
    cur_df = pd.concat([cur_df, new_df])
    
    return cur_df

def update_installments_data_frame(cur_df, expenses):
    
    for i, expense in enumerate(expenses):
        lst_ix = 0 if cur_df.shape[0] == 0 else cur_df.index.max()+1
        new_df = pd.DataFrame(index=[lst_ix], columns=cur_df.columns)
        
        new_df.iloc[0].loc["expense_id"] = expense.getId()
        new_df.iloc[0].loc["installment_amount"]=expense.getAmount()
        new_df.iloc[0].loc["expense_date"] = expense.getDateAsString()
        new_df.iloc[0].loc["expense_month"]=expense.getMonthNum()
        new_df.iloc[0].loc["payment_date"]=expense.getPaymentDateAsString()
        new_df.iloc[0].loc["payment_month"]=expense.getPaymentMonthNum()
        new_df.iloc[0].loc["total_installments"]=expense.getInstallments()
        new_df.iloc[0].loc["installment_num"]=str(i)
        new_df.iloc[0].loc["payment_method"]=expense.getPaymentMethod()
        new_df.iloc[0].loc["method_name"]=expense.getPaymentMethodName()
        new_df.iloc[0].loc["category"]=expense.getMainCategory()
        new_df.iloc[0].loc["sub_category"]=expense.getSubCategory()
        new_df.iloc[0].loc["payment_fortnight"]=expense.getPaymentFortnight()
        new_df.iloc[0].loc["description"]=expense.getDescription()
        
        cur_df = pd.concat([cur_df, new_df])
    
    return cur_df

def modifyInfo(an_expense, row, main_cats, sub_cats):
    print("""
    Registering a new expense. At any time:
        * r is for restart, and it restarts 
            the entire expense information logging, 
        * q kills the entire process.
    """)
    
    try:
        date, concept, amount = row

        cat, scat = hlp.selectCategory(main_cats, sub_cats)
        installments = hlp.askForInstallments(
            is_credit = type(pm) == CreditCard
        )

        an_expense.updateData(
            payment_date=date, 
            description=concept, 
            category=cat, 
            sub_category=scat, 
            amount=amount, 
            payment_method='credit' if type(pm) == CreditCard else pm, 
            n_installments=installments,
            credit_card_used=pm if type(pm) == CreditCard else None
        )

        print(an_expense.toString())
        
        return True
    except RuntimeError:
        print('Quit')
        return False
    except Exception as e:
        if str(e) == 'restart':
            return modifyInfo(an_expense, row, main_cats, sub_cats)
        else:
            print('Error: {0}'.format(e))
            
def pivotData(data_frame, how='bimonthly'):
    
    date_ix = 'payment_'+('fortnight' if how =='bimonthly' else 'month')
    pivot = pd.pivot_table(
        data_frame, 
        values = 'installment_amount',
        aggfunc='sum',
        index=[date_ix, 'category', 'sub_category']
    ).reset_index()
    
    return pivot

# Load Data

In [4]:
file_to_process = ask_user_for_file()


I have this files available: 
0) descarga (8).xlsx
1) descarga.xlsx
2) movimientos (2).xlsx
3) 5470467005458188 - 17_Ene_20.xls
4) movimientos (1).xlsx
5) movimientos.xlsx

Which one do you want to process?: 5


In [5]:
file_to_process

'/Users/data/Documents/personal_finance/input_data/movimientos.xlsx'

In [6]:
data = pd.read_excel(file_to_process, header=None)
    
ix_of_first = index_of_first_spend(data)

In [8]:
data

Unnamed: 0,0,1,2,3,4
0,Cuenta: 1542512912,,,,
1,DETALLE DE MOVIMIENTOS,,,,
2,,,,,
3,FECHA,DESCRIPCIÓN,CARGO,ABONO,SALDO
4,21/01/2020,SUPERAMA DAKOTA CASH / ******9603 RF...,-944.5,,6684.39
5,21/01/2020,SANBORNS WTC / ******9603 RF...,-18.75,,7628.89
6,20/01/2020,SPEI RECIBIDOBANAMEX / 0007505838 002 02...,,450.00,7647.64
7,20/01/2020,ADAMANTA GYM ESCANDON / ******9603 RF...,-200.00,,7197.64
8,20/01/2020,CONFITERIA / ******9603 RF...,-210.00,,7397.64
9,20/01/2020,SUPERAMA LOS MORALES / ******9603 RF...,-136.3,,7607.64


In [7]:
ix_of_first

4

In [9]:
data = data.iloc[
    ix_of_first:,:3
].copy().reset_index(drop=True)

In [11]:
data.iloc[:,0] = data.iloc[:,0].apply(
    lambda x: parse_date(x)[1]
)

In [None]:


# Amount column as float
data.iloc[:,2] = data.iloc[:,2].astype(float)

In [3]:
data = load_data()


I have this files available: 
0) descarga (8).xlsx
1) descarga.xlsx
2) movimientos (2).xlsx
3) 5470467005458188 - 17_Ene_20.xls
4) movimientos (1).xlsx
5) movimientos.xlsx

Which one do you want to process?: 5


ValueError: could not convert string to float: '-11,962.41'

## Load current Data

In [5]:
try:
    cur_inst_data = hlp.retrieveDataFromSheet(
        gsheet, 
        GSHEET_ID, 
        sheet_name = 'data'
    )
    
    cur_inst_data['expense_id'] = cur_inst_data['expense_id'].astype(int)
    cur_inst_data['installment_amount'] = cur_inst_data['installment_amount'].str.replace(',', '').astype(float)
    cur_inst_data['expense_month'] = cur_inst_data['expense_month'].astype(int)
    cur_inst_data['payment_month'] = cur_inst_data['payment_month'].astype(int)
    cur_inst_data['installment_num'] = cur_inst_data['installment_num'].astype(str)
    cur_inst_data['total_installments'] = cur_inst_data['total_installments'].astype(int)
    
    cur_exp_data = hlp.retrieveDataFromSheet(
        gsheet, 
        GSHEET_ID, 
        sheet_name = 'expenses'
    )
    
    cur_exp_data['expense_id'] = cur_exp_data['expense_id'].astype(int)
    cur_exp_data['amount'] = cur_exp_data['amount'].str.replace(',', '').astype(float)
    cur_exp_data['ignored'] = cur_exp_data['ignored'].astype(str)
    cur_exp_data['installments'] = cur_exp_data['installments'].astype(int)

    cur_exp_data['key'] = (
        cur_exp_data['expense_date'] +'_'+
        cur_exp_data['description']+'_'+
        round(cur_exp_data['amount'].astype(float)).astype(int).astype(str)
    )

    s1 = set(cur_exp_data['key'])
    last_id = cur_exp_data['expense_id'].max()
    cur_exp_data.drop('key',axis=1, inplace=True)
    
    empty_data=False
    
except IndexError:
    empty_data = True
    s1 = set()
    last_id = 0

In [6]:
positive_data = data[data.iloc[:,2].astype(float)>0].copy()

positive_data['key'] = (
    positive_data.iloc[:,0].astype(str)+'_'+
    positive_data.iloc[:,1]+'_'+
    round(positive_data.iloc[:,2].astype(float)).astype(int).astype(str)
)

s2 = set(positive_data['key'])

In [7]:
missing_data = positive_data[
    ~positive_data['key'].isin(
        s1.intersection(s2)
    )
].reset_index(drop=True).drop('key', axis=1).copy()

In [8]:
# Init Data Frames
cols_1 = [
    "expense_id",
    "installment_amount",
    "expense_date", 
    "expense_month", 
    "payment_date", 
    "payment_month", 
    "total_installments", 
    "installment_num",
    "payment_method",
    "method_name",
    "category", 
    "sub_category",
    "payment_fortnight",
    "description"
]

cols_2 = [
    "expense_id",
    "expense_date", 
    "description", 
    "amount",
    "installments",
    "ignored",
    "payment_method",
    "method_name"
]

installments_df = pd.DataFrame(index=[0],columns=cols_1)
expenses_df = pd.DataFrame(index=[0],columns=cols_2)

# Log Expenses

In [9]:
new_expenses = []
main_cats, sub_cats = hlp.getCategoriesFromGSheet(gsheet, GSHEET_ID)

pm = hlp.askForPaymentMethod()

ix=0
while ix < missing_data.shape[0]:
    an_expense = Expense(this_id=(int(last_id)+1)+ix)
    row = missing_data.loc[ix,:]
    print('')
    print('----------------------------------')
    print("Currently Processing this expense:")
    print(row.to_string())
    log_expense = hlp.askYesOrNo("Log this expense? ")
    
    if log_expense:       
        if modifyInfo(an_expense, row, main_cats, sub_cats):
            
            sub_expenses = an_expense.divideExpense()
            installments_df = update_installments_data_frame(
                installments_df, 
                expenses = sub_expenses
            ) 
            expenses_df = update_expenses_data_frame(
                cur_df = expenses_df, 
                expense = an_expense, 
                ignored=False
            )
            
        else:
            break
    else:
        date, concept, amount = row
        an_expense.updateData(
            payment_date=date, 
            description=concept, 
            category='', 
            sub_category='', 
            amount=amount, 
            payment_method='credit' if type(pm) == CreditCard else pm, 
            n_installments=1,
            credit_card_used=pm if type(pm) == CreditCard else None
        )
        expenses_df = update_expenses_data_frame(
            cur_df = expenses_df, 
            expense = an_expense, 
            ignored=True
        )
        
    ix+=1

print('')
print('Done! That was the end of the file.')


Which payment method was used? 
0) Santander-Free
1) Bancomer-Platinum
2) debit
3) cash

Enter the number of the method: 1
Bancomer-Platinum


----------------------------------
Currently Processing this expense:
0    2020-01-21 00:00:00
1      CAFEBRERIA EL PEN
2                    251
Log this expense? y

    Registering a new expense. At any time:
        * r is for restart, and it restarts 
            the entire expense information logging, 
        * q kills the entire process.
    
Select one Category from the below list:
1.  Food
2.  Getting Around
3.  Fun Stuff
4.  Health Care
5.  Personal Stuff
6.  Apartment Spends
7.  Removed From Savings

Enter number of the Category: 1
Select one Sub Category from the below list:
1.  Super Market
2.  Restaurants
3.  Take-out and bingeing
4.  Coffe
5.  Other

Enter number of the Sub Category: 2

How many installments? 1
Expense:
    Id:             107
    Date:           2020-01-21
    Category:       Food
    Sub Category:   Restaurants


Select one Sub Category from the below list:
1.  Super Market
2.  Restaurants
3.  Take-out and bingeing
4.  Coffe
5.  Other

Enter number of the Sub Category: 2

How many installments? 1
Expense:
    Id:             115
    Date:           2020-01-17
    Category:       Food
    Sub Category:   Restaurants
    Payment Method: credit
    Installments:   1
    Amount:        $192.0

----------------------------------
Currently Processing this expense:
0     2020-01-16 00:00:00
1    SUPERAMA LOS MORALES
2                   200.1
Log this expense? y

    Registering a new expense. At any time:
        * r is for restart, and it restarts 
            the entire expense information logging, 
        * q kills the entire process.
    
Select one Category from the below list:
1.  Food
2.  Getting Around
3.  Fun Stuff
4.  Health Care
5.  Personal Stuff
6.  Apartment Spends
7.  Removed From Savings

Enter number of the Category: 1
Select one Sub Category from the below list:
1.  Super Market
2. 

In [98]:
updated_inst_data = pd.concat(
    [cur_inst_data, installments_df], 
    ignore_index=True
) if not empty_data else installments_df
updated_inst_data.sort_values(by=['expense_date'], inplace=True)
updated_inst_data.index = range(len(updated_inst_data))
updated_inst_data.drop_duplicates(
    subset=['expense_date', 'description', 'installment_amount'],
    inplace=True
)

inst_values_list = hlp.dataFrameToListOfValues(updated_inst_data.dropna())

gsheet.clear_values(
    spreadsheet_id = GSHEET_ID,
    range_name = 'data'
)

gsheet.values_to_gsheet(
    spreadsheet_id = GSHEET_ID,
    values_list=inst_values_list, 
    range_name='data'
)

KeyboardInterrupt: 

In [149]:
updated_exp_data = pd.concat(
    [cur_exp_data, expenses_df], 
    ignore_index=True 
)if not empty_data else expenses_df
updated_exp_data.sort_values(by=['expense_id'], inplace=True)
updated_exp_data.index = range(len(updated_exp_data))
updated_exp_data.drop_duplicates(
    subset=['expense_date', 'description', 'amount'],
    inplace=True
)

exp_values_list = hlp.dataFrameToListOfValues(updated_exp_data.dropna())

gsheet.clear_values(
    spreadsheet_id = GSHEET_ID,
    range_name = 'expenses'
)

gsheet.values_to_gsheet(
    spreadsheet_id = GSHEET_ID,
    values_list=exp_values_list, 
    range_name='expenses'
)

944 cells updated.


In [157]:
 # Pivot bimonthly (every 15 days)
bimonthly_pivot = pivotData(cur_inst_data, how='bimonthly')
bimonthly_values = hlp.dataFrameToListOfValues(bimonthly_pivot)
gsheet.clear_values(
    spreadsheet_id = GSHEET_ID,
    range_name = 'bimonthly pivot!A1'
)
gsheet.values_to_gsheet(
    spreadsheet_id = GSHEET_ID,
    values_list=bimonthly_values, 
    range_name='bimonthly pivot!A1'
)

# Pivot monthly
monthly_pivot = pivotData(cur_inst_data, how='monthly')
monthly_values = hlp.dataFrameToListOfValues(monthly_pivot)
gsheet.clear_values(
    spreadsheet_id = GSHEET_ID,
    range_name = 'monthly pivot!A1'
)
gsheet.values_to_gsheet(
    spreadsheet_id = GSHEET_ID,
    values_list=monthly_values, 
    range_name='monthly pivot!A1'
)

print(" ")
print("Expenses were succesfully uploaded to Google Doc.")

208 cells updated.
168 cells updated.
 
Expenses were succesfully uploaded to Google Doc.
