In [1]:
#Importing dependencies
import pandas as pd
import os

In [2]:
## getDataFrameFromFiles - use pandas to read the csv file and get a dataframe to work with it
# @params:
# path - the path of the data file
# @return:
# dataframe created from the file
##
def getDataFrameFromFile(path):
    #Use index_col so pandas doesn't use the fisrt column as an index column.
    return pd.read_csv(path, index_col=False)

In [3]:
## sortDataFrame - return the dataframe sorted by column(s) and ascending or descending order
# @params:
# dataframe - dataframe that will be sorted
# column - name of column or columns (separated by comma) to sort as string ie. "TransactionID" or "TransactionID,TransactionType"
# ascending - True for ascending sort. False for descending sort.
##
def sortDataFrame(dataframe, column, ascending):
    sortedDF = dataframe.sort_values(by=[column], ascending=ascending)

    #Reset the index of the dataframe
    sortedDF.index = [x for x in range(0, len(sortedDF))]
    return sortedDF

In [33]:
def checkForDuplicate(previous, actual):
    if(previous.empty):
        return False
    else:
        if((previous[4] == actual[4]) and (previous[5] == actual[5])):
            return True
        else:
            return False

In [5]:
#csv files path
clientFilePath = os.path.join("..", "Resources", "ClientMarkoffFile20140113.csv")
tutukaFilePath = os.path.join("..", "Resources", "TutukaMarkoffFile20140113.csv")

In [6]:
#Create dataframes for each file
client_df = getDataFrameFromFile(clientFilePath)
tutuka_df = getDataFrameFromFile(tutukaFilePath)

In [7]:
#Sort client datframe by TransactionID (ascending order)
sortedClient = sortDataFrame(client_df, "TransactionID", True)
sortedClient.head()

Unnamed: 0,ProfileName,TransactionDate,TransactionAmount,TransactionNarrative,TransactionDescription,TransactionID,TransactionType,WalletReference
0,Card Campaign,2014-01-12 08:24:56,-10250,Sunset Butchery 100343 Francistown BW,DEDUCT,4012304765521,0,P_NzI4Njk3MTBfMTM4MTQ5MTEwNy4wNTMy
1,Card Campaign,2014-01-12 08:41:20,-3475,Choppies Superst102145 Lobatse BW,DEDUCT,4012314602553,0,P_NzU3OTY0MzBfMTM4NzIwMzAxOS4zMzYz
2,Card Campaign,2014-01-12 08:41:20,-3475,Choppies Superst102145 Lobatse BW,DEDUCT,4012314602553,0,P_NzU3OTY0MzBfMTM4NzIwMzAxOS4zMzYz
3,Card Campaign,2014-01-12 12:56:25,4310,766831 SHOPRITE GABORONE BOTSWANA BW,REVERSAL,4012321856252,0,P_NzI0MTE0MjJfMTM4ODEzMTA0Mi42MTI3
4,Card Campaign,2014-01-12 12:56:25,-4310,766831 SHOPRITE GABORONE BOTSWANA BW,DEDUCT,4012321856252,0,P_NzI0MTE0MjJfMTM4ODEzMTA0Mi42MTI3


In [8]:
#Sort tutuka dataframe by TransactionID (ascending order)
sortedTutuka = sortDataFrame(tutuka_df, "TransactionID", True)
sortedTutuka.head()

Unnamed: 0,ProfileName,TransactionDate,TransactionAmount,TransactionNarrative,TransactionDescription,TransactionID,TransactionType,WalletReference
0,Card Campaign,2014-01-12 08:24:56,-10250,Sunset Butchery 100343 Francistown BW,DEDUCT,4012304765521,0,P_NzI4Njk3MTBfMTM4MTQ5MTEwNy4wNTMy
1,Card Campaign,2014-01-12 08:41:20,-3475,Choppies Superst102145 Lobatse BW,DEDUCT,4012314602553,0,P_NzU3OTY0MzBfMTM4NzIwMzAxOS4zMzYz
2,Card Campaign,2014-01-12 12:56:25,4310,766831 SHOPRITE GABORONE BOTSWANA BW,REVERSAL,4012321856252,0,P_NzI0MTE0MjJfMTM4ODEzMTA0Mi42MTI3
3,Card Campaign,2014-01-12 12:56:25,-4310,766831 SHOPRITE GABORONE BOTSWANA BW,DEDUCT,4012321856252,0,P_NzI0MTE0MjJfMTM4ODEzMTA0Mi42MTI3
4,Card Campaign,2014-01-12 12:57:31,-4310,766831 SHOPRITE GABORONE BOTSWANA BW,DEDUCT,4012322510401,0,P_NzI0MTE0MjJfMTM4ODEzMTA0Mi42MTI3


In [47]:
#iterator for dataframe
index = 0
clientPrevRow = pd.Series()
tutukaPrevRow = pd.Series()
while(index < 10):
    clientRow = sortedClient.iloc[index]
    tutukaRow = sortedTutuka.iloc[index]
    if((checkForDuplicate(clientPrevRow, clientRow))):
        sortedClient.drop([index], inplace=True)
        sortedClient.reset_index()
        #print(f'Description:{clientRow[4]}  TransactionID:{clientRow[5]}')
    if((checkForDuplicate(tutukaPrevRow, tutukaRow))):
        sortedTutuka.drop([index], inplace=True)
        #print(f'Description:{tutukaRow[4]}  TransactionID:{tutukaRow[5]}')
    #if(clientRow[5] == tutukaRow[5]):
        #print(f'ID Match: {clientRow[5]}')
    clientPrevRow = clientRow.copy()
    tutukaPrevRow = tutukaRow.copy()
    #print('--------------------')
    index += 1

Unnamed: 0,ProfileName,TransactionDate,TransactionAmount,TransactionNarrative,TransactionDescription,TransactionID,TransactionType,WalletReference
0,Card Campaign,2014-01-12 08:24:56,-10250,Sunset Butchery 100343 Francistown BW,DEDUCT,4012304765521,0,P_NzI4Njk3MTBfMTM4MTQ5MTEwNy4wNTMy
1,Card Campaign,2014-01-12 08:41:20,-3475,Choppies Superst102145 Lobatse BW,DEDUCT,4012314602553,0,P_NzU3OTY0MzBfMTM4NzIwMzAxOS4zMzYz
3,Card Campaign,2014-01-12 12:56:25,4310,766831 SHOPRITE GABORONE BOTSWANA BW,REVERSAL,4012321856252,0,P_NzI0MTE0MjJfMTM4ODEzMTA0Mi42MTI3
4,Card Campaign,2014-01-12 12:56:25,-4310,766831 SHOPRITE GABORONE BOTSWANA BW,DEDUCT,4012321856252,0,P_NzI0MTE0MjJfMTM4ODEzMTA0Mi42MTI3
5,Card Campaign,2014-01-12 12:57:31,-4310,766831 SHOPRITE GABORONE BOTSWANA BW,DEDUCT,4012322510401,0,P_NzI0MTE0MjJfMTM4ODEzMTA0Mi42MTI3


In [None]:
#Get the length of both files so we can choose the largest file and iterate over that one
clientLen = len(client_df)
tutukaLen = len(tutuka_df)

#Assign the biggest length
rowsLen = clientLen if (clientLen > tutukaLen) else tutukaLen
print(rowsLen)
    

In [None]:
sortedClient.reset_index()

In [None]:
duplicated = sortedTutuka[sortedTutuka.duplicated(["TransactionID"])]
duplicated

In [None]:
client_df.dtypes