From b2ccbf62cf04adddbf3464875c6521582b353964 Mon Sep 17 00:00:00 2001 From: Derek Atkins Date: Sat, 8 Mar 2003 19:48:50 +0000 Subject: [PATCH] * src/import-export/import-backend.c: * src/import-export/import-match-map.c: * src/import-export/import-match-map.h: Chris Morgan's Baysian Matching code, to match transactions based on Bayesian filtering of previously matched transactions. git-svn-id: svn+ssh://svn.gnucash.org/repo/gnucash/trunk@8044 57a11ea4-9604-0410-9ed3-97b8803252fd --- ChangeLog | 8 + src/import-export/import-backend.c | 169 ++++++++++-- src/import-export/import-match-map.c | 383 ++++++++++++++++++++++++++- src/import-export/import-match-map.h | 16 +- 4 files changed, 552 insertions(+), 24 deletions(-) diff --git a/ChangeLog b/ChangeLog index 5f6339eedfb..32b0b55efac 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +2003-03-08 Derek Atkins + + * src/import-export/import-backend.c: + * src/import-export/import-match-map.c: + * src/import-export/import-match-map.h: + Chris Morgan's Baysian Matching code, to match transactions + based on Bayesian filtering of previously matched transactions. + 2003-03-06 Christian Stimming * src/import-export/hbci/dialog-hbcitrans.c: Include a latest diff --git a/src/import-export/import-backend.c b/src/import-export/import-backend.c index 99b4ad55697..1f79e4942c3 100644 --- a/src/import-export/import-backend.c +++ b/src/import-export/import-backend.c @@ -44,6 +44,9 @@ #include "gnc-ui-util.h" +#define IMPORT_PAGE "Online Banking & Importing" /* from app-utils/prefs.scm */ +#define BAYES_OPTION "Use Bayesian Matching?" 
+ /********************************************************************\ * Constants * \********************************************************************/ @@ -90,6 +93,9 @@ struct _transactioninfo GNCImportAction action; GNCImportAction previous_action; + /* A list of tokenized strings to use for bayesian matching purposes */ + GList * match_tokens; + /* In case of a single destination account it is stored here. */ Account *dest_acc; gboolean dest_acc_selected_manually; @@ -241,6 +247,15 @@ void gnc_import_TransInfo_delete (GNCImportTransInfo *info) xaccTransDestroy(info->trans); xaccTransCommitEdit(info->trans); } + if (info->match_tokens) + { + GList *node; + + for (node = info->match_tokens; node; node = node->next) + g_free (node->data); + + g_list_free (info->match_tokens); + } g_free(info); } } @@ -343,28 +358,128 @@ GdkPixmap* gen_probability_pixmap(gint score_original, GNCImportSettings *settin * MatchMap- related functions (storing and retrieving) */ -/* searches using the GNCImportTransInfo through all existing transactions */ -/* if there is an exact match of the description and memo */ +/* Tokenize a string and append to an existing GList(or an empty GList) + * the tokens + */ +static GList* +tokenize_string(GList* existing_tokens, const char *string) +{ + char **tokenized_strings; /* array of strings returned by g_strsplit() */ + char **stringpos; + + tokenized_strings = g_strsplit(string, " ", 0); + stringpos = tokenized_strings; + + /* add each token to the token GList */ + while(stringpos && *stringpos) + { + /* prepend the char* to the token GList */ + existing_tokens = g_list_prepend(existing_tokens, g_strdup(*stringpos)); + + /* then move to the next string */ + stringpos++; + } + + /* free up the strings that g_strsplit() created */ + g_strfreev(tokenized_strings); + + return existing_tokens; +} + +/* create and return a list of tokens for a given transaction info. 
*/ +static GList* +TransactionGetTokens(GNCImportTransInfo *info) +{ + Transaction* transaction; + GList* tokens; + const char* text; + time_t transtime; + struct tm *tm_struct; + char local_day_of_week[16]; + Split* split; + int split_index; + + g_return_val_if_fail (info, NULL); + if (info->match_tokens) return info->match_tokens; + + transaction = gnc_import_TransInfo_get_trans(info); + g_assert(transaction); + + tokens = 0; /* start off with an empty list */ + + /* make tokens from the transaction description */ + text = xaccTransGetDescription(transaction); + tokens = tokenize_string(tokens, text); + + /* the day of week the transaction occurred is a good indicator of + * what account this transaction belongs in; get the date and convert + * it to a day of week token + */ + transtime = xaccTransGetDate(transaction); + tm_struct = gmtime(&transtime); + if(!strftime(local_day_of_week, sizeof(local_day_of_week), "%A", tm_struct)) + { + PERR("TransactionGetTokens: error, strftime failed\n"); + } + + /* we cannot add a locally allocated string to this array, dup it so + * it frees the same way the rest do + */ + tokens = g_list_prepend(tokens, g_strdup(local_day_of_week)); + + /* make tokens from the memo of each split of this transaction */ + split_index = 0; + while((split = xaccTransGetSplit(transaction, split_index))) + { + text = xaccSplitGetMemo(split); + tokens = tokenize_string(tokens, text); + split_index++; /* next split */ + } + + /* remember the list of tokens for later.. 
*/ + info->match_tokens = tokens; + + /* return the pointer to the GList */ + return tokens; +} + +/* searches using the GNCImportTransInfo through all existing transactions + * if there is an exact match of the description and memo + */ static Account * -matchmap_find_destination (GncImportMatchMap *matchmap, - GNCImportTransInfo *info) +matchmap_find_destination (GncImportMatchMap *matchmap, GNCImportTransInfo *info) { GncImportMatchMap *tmp_map; Account *result; + GList* tokens; + gboolean useBayes; + g_assert (info); - tmp_map = ((matchmap != NULL) ? matchmap : gnc_imap_create_from_account (xaccSplitGetAccount (gnc_import_TransInfo_get_fsplit (info)))); - result = gnc_imap_find_account - (tmp_map, GNCIMPORT_DESC, - xaccTransGetDescription (gnc_import_TransInfo_get_trans (info))); + useBayes = gnc_lookup_boolean_option(IMPORT_PAGE, BAYES_OPTION, TRUE); + if(useBayes) + { + /* get the tokens for this transaction* */ + tokens = TransactionGetTokens(info); + + /* try to find the destination account for this transaction from its tokens */ + result = gnc_imap_find_account_bayes(tmp_map, tokens); + + } else { + /* old system of transaction to account matching */ + result = gnc_imap_find_account + (tmp_map, GNCIMPORT_DESC, + xaccTransGetDescription (gnc_import_TransInfo_get_trans (info))); + } /* Disable matching by memo, until bayesian filtering is implemented. - It's currently unlikely to help, and has adverse effects, causing false positives, - since very often the type of the transaction is stored there. + * It's currently unlikely to help, and has adverse effects, + * causing false positives, since very often the type of the + * transaction is stored there. 
if (result == NULL) result = gnc_imap_find_account @@ -390,6 +505,9 @@ matchmap_store_destination (GncImportMatchMap *matchmap, GncImportMatchMap *tmp_matchmap = NULL; Account *dest; const char *descr, *memo; + GList *tokens; + gboolean useBayes; + g_assert (trans_info); /* This will store the destination account of the selected match if @@ -410,20 +528,33 @@ matchmap_store_destination (GncImportMatchMap *matchmap, (xaccSplitGetAccount (gnc_import_TransInfo_get_fsplit (trans_info)))); - descr = xaccTransGetDescription - (gnc_import_TransInfo_get_trans (trans_info)); - if (descr && (strlen (descr) > 0)) - gnc_imap_add_account (tmp_matchmap, + /* see what matching system we are currently using */ + useBayes = gnc_lookup_boolean_option(IMPORT_PAGE, BAYES_OPTION, TRUE); + if(useBayes) + { + /* tokenize this transaction */ + tokens = TransactionGetTokens(trans_info); + + /* add the tokens to the imap with the given destination account */ + gnc_imap_add_account_bayes(tmp_matchmap, tokens, dest); + + } else { + /* old matching system */ + descr = xaccTransGetDescription + (gnc_import_TransInfo_get_trans (trans_info)); + if (descr && (strlen (descr) > 0)) + gnc_imap_add_account (tmp_matchmap, GNCIMPORT_DESC, descr, dest); - memo = xaccSplitGetMemo - (gnc_import_TransInfo_get_fsplit (trans_info)); - if (memo && (strlen (memo) > 0)) - gnc_imap_add_account (tmp_matchmap, + memo = xaccSplitGetMemo + (gnc_import_TransInfo_get_fsplit (trans_info)); + if (memo && (strlen (memo) > 0)) + gnc_imap_add_account (tmp_matchmap, GNCIMPORT_MEMO, memo, dest); + } /* if(useBayes) */ if (matchmap == NULL) gnc_imap_destroy (tmp_matchmap); @@ -935,7 +1066,7 @@ gnc_import_TransInfo_refresh_destacc (GNCImportTransInfo *transaction_info, /* if we haven't manually selected a destination account for this transaction */ if(gnc_import_TransInfo_get_destacc_selected_manually(transaction_info) == FALSE) { - /* Try to find a previous selected destination account string match for the ADD action */ + /* 
Try to find the destination account for this transaction based on prior ones */ new_destacc = matchmap_find_destination(matchmap, transaction_info); gnc_import_TransInfo_set_destacc(transaction_info, new_destacc, FALSE); } else diff --git a/src/import-export/import-match-map.c b/src/import-export/import-match-map.c index 8080ffc164a..e2222483b05 100644 --- a/src/import-export/import-match-map.c +++ b/src/import-export/import-match-map.c @@ -25,11 +25,22 @@ An import mapper service that stores Account Maps for the generic importer. This allows importers to map various "strings" to Gnucash accounts in a generic manner. - @author Copyright (C) 2002 Derek Atkins + @author Copyright (C) 2002,2003 Derek Atkins */ #include +#include #include "import-match-map.h" #include "kvp_frame.h" +#include "Group.h" +#include "gnc-ui-util.h" +#include "gnc-engine-util.h" + +/********************************************************************\ + * Constants * +\********************************************************************/ + +static short module = MOD_IMPORT; + struct _GncImportMatchMap { kvp_frame * frame; @@ -37,7 +48,8 @@ struct _GncImportMatchMap { GNCBook * book; }; -#define IMAP_FRAME "import-map" +#define IMAP_FRAME "import-map" +#define IMAP_FRAME_BAYES "import-map-bayes" static GncImportMatchMap * gnc_imap_create_from_frame (kvp_frame *frame, Account *acc, GNCBook *book) @@ -99,6 +111,9 @@ void gnc_imap_clear (GncImportMatchMap *imap) /* Clear the IMAP_FRAME kvp */ kvp_frame_set_slot_path (imap->frame, NULL, IMAP_FRAME); + /* Clear the bayes kvp, IMAP_FRAME_BAYES */ + kvp_frame_set_slot_path (imap->frame, NULL, IMAP_FRAME_BAYES); + /* XXX: mark the account (or book) as dirty! */ } @@ -143,4 +158,368 @@ void gnc_imap_add_account (GncImportMatchMap *imap, const char *category, /* XXX Mark the account (or book) as dirty! 
*/ } + + + +/* Below here is the bayes transaction to account matching system */ +struct account_token_count +{ + char* account_name; + gint64 token_count; /* occurrences of a given token for this account_name */ +}; + +/* total_count and the token_count for a given account let us calculate the + * probability of a given account with any single token + */ +struct token_accounts_info +{ + GList *accounts; /* list of struct account_token_count* */ + gint64 total_count; +}; + +/* data is a pointer to a struct token_accounts_info + * NOTE: can always assume that keys are unique, reduces code in this function + */ +static void buildTokenInfo(const char *key, kvp_value *value, gpointer data) +{ + struct token_accounts_info *tokenInfo = (struct token_accounts_info*)data; + struct account_token_count* this_account; + + // PINFO("buildTokenInfo: account '%s', token_count: '%ld'\n", (char*)key, + // (long)kvp_value_get_gint64(value)); + + /* add the count to the total_count */ + tokenInfo->total_count += kvp_value_get_gint64(value); + + /* allocate a new structure for this account and its token count */ + this_account = (struct account_token_count*) + g_new0(struct account_token_count, 1); + + /* fill in the account name and number of tokens found for this account name */ + this_account->account_name = (char*)key; + this_account->token_count = kvp_value_get_gint64(value); + + /* prepend onto the glist a pointer to the new account_token_count structure */ + tokenInfo->accounts = g_list_prepend(tokenInfo->accounts, this_account); +} + +/* intermediate values used to calculate the bayes probability of a given account + * where p(AB) = (a*b)/[a*b + (1-a)(1-b)], product is (a*b), + * product_difference is (1-a) * (1-b) + */ +struct account_probability +{ + double product; /* product of probabilities */ + double product_difference; /* product of (1-probabilities) */ +}; + +/* convert a hash table of account names and (struct account_probability*) + * into a hash table of 
100000x the percentage match value, i.e. 10% would be + * 0.10 * 100000 = 10000 + */ +#define PROBABILITY_FACTOR 100000 +static void buildProbabilities(gpointer key, gpointer value, gpointer data) +{ + GHashTable *final_probabilities = (GHashTable*)data; + struct account_probability *account_p = (struct account_probability*)value; + + /* P(AB) = A*B / [A*B + (1-A)*(1-B)] + * NOTE: so we only keep track of a running product(A*B*C...) + * and product difference ((1-A)(1-B)...) + */ + gint32 probability = + (account_p->product / + (account_p->product + account_p->product_difference)) + * PROBABILITY_FACTOR; + + PINFO("P('%s') = '%d'\n", (char*)key, probability); + + g_hash_table_insert(final_probabilities, key, (gpointer)probability); +} + +/* g_hash_table_foreach() callback: frees one struct account_probability value */ +static void freeProbabilities(gpointer key, gpointer value, gpointer data) +{ + /* free up the struct account_probability that was allocated + * in gnc_imap_find_account_bayes() + */ + g_free(value); +} + +/* holds an account name and its corresponding integer probability + * the integer probability is the real probability scaled by PROBABILITY_FACTOR + */ +struct account_info +{ + char* account_name; + gint32 probability; +}; + +/* Find the highest probability and the corresponding account name + * store in data, a (struct account_info*) + * NOTE: this is a g_hash_table_foreach() function for a hash table of entries + * key is a pointer to the account name, value is a gint32, 100000x + * the probability for this account + */ +static void highestProbability(gpointer key, gpointer value, gpointer data) +{ + struct account_info *account_i = (struct account_info*)data; + + /* if the current probability is greater than the stored, store the current */ + if((gint32)value > account_i->probability) + { + /* Save the new highest probability and the associated account name */ + account_i->probability = (gint32)value; + account_i->account_name = key; + } +} + + +#define threshold (.90 * PROBABILITY_FACTOR) 
/* 90% */ + +/* Look up an Account in the map */ +Account* gnc_imap_find_account_bayes(GncImportMatchMap *imap, GList *tokens) +{ + struct token_accounts_info tokenInfo; /* holds the accounts and total + * token count for a single token */ + GList *current_token; /* pointer to the current token from the + * input GList *tokens */ + GList *current_account_token; /* pointer to the struct + * account_token_count */ + struct account_token_count *account_c; /* an account name and the number + * of times a token has appeared + * for the account */ + struct account_probability *account_p; /* intermediate storage of values + * to compute the bayes probability + * of an account */ + GHashTable *running_probabilities = g_hash_table_new(g_str_hash, g_str_equal); + GHashTable *final_probabilities = g_hash_table_new(g_str_hash, g_str_equal); + struct account_info account_i; + kvp_value* value; + kvp_frame* token_frame; + + ENTER(" "); + + /* check to see if the imap is NULL */ + if(!imap) + { + PINFO("imap is null, returning null"); + LEAVE(" "); + return NULL; + } + + /* find the probability for each account that contains any of the tokens + * in the input tokens list + */ + for(current_token = tokens; current_token; current_token = current_token->next) + { + /* zero out the token_accounts_info structure */ + memset(&tokenInfo, 0, sizeof(struct token_accounts_info)); + + PINFO("token: '%s'", (char*)current_token->data); + + /* find the slot for the given token off of the source account + * for these tokens, search off of the IMAP_FRAME_BAYES path so + * we aren't looking from the parent of the entire kvp tree + */ + value = kvp_frame_get_slot_path(imap->frame, IMAP_FRAME_BAYES, + (char*)current_token->data, NULL); + + /* if value is null we should skip over this token */ + if(!value) + continue; + + /* convert the slot(value) into a the frame that contains the + * list of accounts + */ + token_frame = kvp_value_get_frame(value); + + /* token_frame should NEVER be null */ + 
if(!token_frame) + { + PERR("token '%s' has no accounts", (char*)current_token->data); + continue; /* skip over this token */ + } + + /* process the accounts for this token, adding the account if it + * doesn't already exist or adding to the existing account's token + * count if it does + */ + kvp_frame_for_each_slot(token_frame, buildTokenInfo, &tokenInfo); + + /* for each account we have just found, see if the account already exists + * in the list of account probabilities, if not add it + */ + for(current_account_token = tokenInfo.accounts; current_account_token; + current_account_token = current_account_token->next) + { + /* get the account name and corresponding token count */ + account_c = (struct account_token_count*)current_account_token->data; + + PINFO("account_c->account_name('%s'), " + "account_c->token_count('%ld')/total_count('%ld')", + account_c->account_name, (long)account_c->token_count, + (long)tokenInfo.total_count); + + account_p = g_hash_table_lookup(running_probabilities, + account_c->account_name); + + /* if the account exists in the list then continue + * the running probabilities + */ + if(account_p) + { + account_p->product = + ((double)account_c->token_count / (double)tokenInfo.total_count) + * account_p->product; + account_p->product_difference = + ((double)1 - ((double)account_c->token_count / + (double)tokenInfo.total_count)) + * account_p->product_difference; + PINFO("product == %f, product_difference == %f", + account_p->product, account_p->product_difference); + } + else + { + /* add a new entry */ + PINFO("adding a new entry for this account"); + account_p = (struct account_probability*) + g_new0(struct account_probability, 1); + + /* set the product and product difference values */ + account_p->product = ((double)account_c->token_count / + (double)tokenInfo.total_count); + account_p->product_difference = + (double)1 - ((double)account_c->token_count / + (double)tokenInfo.total_count); + + PINFO("product == %f, product_difference == 
%f", + account_p->product, account_p->product_difference); + + /* add the account name and (struct account_probability*) + * to the hash table */ + g_hash_table_insert(running_probabilities, + account_c->account_name, account_p); + } + } /* for all accounts in tokenInfo */ + + /* free the data in tokenInfo */ + for(current_account_token = tokenInfo.accounts; current_account_token; + current_account_token = current_account_token->next) + { + /* free up each struct account_token_count we allocated */ + g_free((struct account_token_count*)current_account_token->data); + } + + g_list_free(tokenInfo.accounts); /* free the accounts GList */ + } + + /* build a hash table of account names and their final probabilities + * from each entry in the running_probabilities hash table + */ + g_hash_table_foreach(running_probabilities, buildProbabilities, + final_probabilities); + + /* find the highest probability and the corresponding account */ + memset(&account_i, 0, sizeof(struct account_info)); + g_hash_table_foreach(final_probabilities, highestProbability, &account_i); + + /* free each element of the running_probabilities hash */ + g_hash_table_foreach(running_probabilities, freeProbabilities, NULL); + + /* free the hash tables */ + g_hash_table_destroy(running_probabilities); + g_hash_table_destroy(final_probabilities); + + PINFO("highest P('%s') = '%d'", account_i.account_name, account_i.probability); + + /* has this probability met our threshold? 
*/ + if(account_i.probability >= threshold) + { + PINFO("found match"); + LEAVE(" "); + return xaccGetAccountFromFullName(gnc_book_get_group(imap->book), + account_i.account_name, + gnc_get_account_separator()); + } + + PINFO("no match"); + LEAVE(" "); + + return NULL; /* we didn't meet our threshold, return NULL for an account */ +} + + +/* Updates the imap for a given account using a list of tokens */ +void gnc_imap_add_account_bayes(GncImportMatchMap *imap, GList *tokens, Account *acc) +{ + GList *current_token; + kvp_value *value; + gint64 token_count; + char* account_fullname; + kvp_value *new_value; /* the value that will be added back into the kvp tree */ + + ENTER(" "); + + /* if imap is null return */ + if(!imap) + { + LEAVE(" "); + return; + } + + account_fullname = xaccAccountGetFullName(acc, gnc_get_account_separator()); + + PINFO("account name: '%s'\n", account_fullname); + + /* process each token in the list */ + for(current_token = g_list_first(tokens); current_token; + current_token = current_token->next) + { + /* start off with no tokens for this account */ + token_count = 0; + + PINFO("adding token '%s'\n", (char*)current_token->data); + + /* is this token/account_name already in the kvp tree? 
*/ + value = kvp_frame_get_slot_path(imap->frame, IMAP_FRAME_BAYES, + (char*)current_token->data, account_fullname, + NULL); + + /* if the token/account is already in the tree, read the current + * value from the tree and use this for the basis of the value we + * are putting back + */ + if(value) + { + PINFO("found existing value of '%ld'\n", + (long)kvp_value_get_gint64(value)); + + /* convert this value back into an integer */ + token_count+=kvp_value_get_gint64(value); + } + + /* increment the token count */ + token_count++; + + /* create a new value */ + new_value = kvp_value_new_gint64(token_count); + + /* insert the value into the kvp tree at + * /imap->frame/IMAP_FRAME_BAYES/token_string/account_name_string + */ + kvp_frame_set_slot_path(imap->frame, new_value, IMAP_FRAME_BAYES, + (char*)current_token->data, account_fullname, NULL); + + /* kvp_frame_set_slot_path() copied the value so we + * need to delete this one ;-) */ + kvp_value_delete(new_value); + } + + /* free up the account fullname string */ + g_free(account_fullname); + + LEAVE(" "); +} + /** @} */ diff --git a/src/import-export/import-match-map.h b/src/import-export/import-match-map.h index 1a106d105f2..592a6dd3fdb 100644 --- a/src/import-export/import-match-map.h +++ b/src/import-export/import-match-map.h @@ -24,7 +24,7 @@ An import mapper service that stores Account Maps for the generic importer. This allows importers to map various "strings" to Gnucash accounts in a generic manner. 
- @author Copyright (C) 2002 Derek Atkins + @author Copyright (C) 2002,2003 Derek Atkins */ #ifndef GNC_IMPORT_MATCH_MAP_H #define GNC_IMPORT_MATCH_MAP_H @@ -48,8 +48,8 @@ void gnc_imap_destroy (GncImportMatchMap *imap); void gnc_imap_clear (GncImportMatchMap *imap); /** Look up an Account in the map */ -Account * gnc_imap_find_account (GncImportMatchMap *imap, const char *category, - const char *key); +Account* gnc_imap_find_account(GncImportMatchMap *imap, const char* category, + const char *key); /** Store an Account in the map. This mapping is immediatly stored in the underlying kvp frame, regardless of whether the MatchMap is @@ -57,6 +57,16 @@ Account * gnc_imap_find_account (GncImportMatchMap *imap, const char *category, void gnc_imap_add_account (GncImportMatchMap *imap, const char *category, const char *key, Account *acc); +/** Look up an Account in the map from a GList* of pointers to strings(tokens) + from the current transaction */ +Account* gnc_imap_find_account_bayes (GncImportMatchMap *imap, GList* tokens); + +/** Store an Account in the map. This mapping is immediatly stored in + the underlying kvp frame, regardless of whether the MatchMap is + destroyed later or not. */ +void gnc_imap_add_account_bayes (GncImportMatchMap *imap, GList* tokens, + Account *acc); + /** @name Some well-known categories