In [1]:
import pandas as pd

# Path of the cleaned export, relative to the notebook's working directory.
file_path = "../data/EXPORT_Cleaned_TPA_data.csv"


def classify_transaction(row):
    """Label a single transaction from its amount, status and promotion flag.

    Defined at top level (not inside the ``try`` below) so the name exists
    even when the CSV fails to load and so the rules can be checked on their
    own.

    Args:
        row: Mapping-like row (for example a ``pandas.Series`` produced by
            ``DataFrame.apply(..., axis=1)``) exposing the keys
            'Amount (£)', 'Transaction Status' and
            'Promotion/Discount Applied'.

    Returns:
        str: "Unknown" when the amount is missing; "Purchase" for a positive
        amount; "Refund", "Failed Refund" or "Pending Refund" for a negative
        amount depending on the status; "Failed Transaction",
        "Loyalty Redemption" or "Invalid Completed Transaction" for a zero
        amount; "Unclassified" for any combination not covered above.
    """
    amount = row['Amount (£)']
    if pd.isna(amount):
        return "Unknown"

    if amount > 0:
        return "Purchase"

    status = row['Transaction Status']

    if amount < 0:
        # "Completed" with a negative amount is treated as a refund as well.
        if status in ("Refunded", "Completed"):
            return "Refund"
        if status == "Failed":
            return "Failed Refund"
        if status == "Pending":
            return "Pending Refund"
    elif amount == 0:
        if status == "Failed":
            return "Failed Transaction"
        promotion = row['Promotion/Discount Applied']
        # A missing promotion value is not evidence that a promotion was
        # applied; NaN != "No" is True, so it must be excluded explicitly.
        if pd.notna(promotion) and promotion != "No":
            return "Loyalty Redemption"
        if status == "Completed":
            return "Invalid Completed Transaction"  # Explicitly marking the issue

    return "Unclassified"  # Instead of "Other", this helps track unknown cases


try:
    df = pd.read_csv(file_path)

    # Classify every row; the label is stored in a new 'Transaction Type' column.
    df['Transaction Type'] = df.apply(classify_transaction, axis=1)

    # Collect the rows flagged as zero-amount "Completed" transactions without a promotion.
    invalid_completed_transactions = df[df['Transaction Type'] == "Invalid Completed Transaction"]

    # Save problematic transactions to a CSV file in the working directory.
    invalid_completed_transactions.to_csv("invalid_transactions.csv", index=False)
    print("Invalid transactions saved to invalid_transactions.csv")

except FileNotFoundError:
    print(f"Error: The file {file_path} was not found. Please check the file path.")
except Exception as e:
    # Best-effort reporting: any other failure is printed rather than raised.
    print(f"An unexpected error occurred: {e}")

Invalid transactions saved to invalid_transactions.csv


In [2]:
import sys
# Show the path of the Python interpreter running this kernel
# (handy for confirming which environment the notebook is using).
print(sys.executable)

/Users/jackrobertson/TPA_CleanAndAnalysis/venv/bin/python


In [3]:
import pandas as pd

# Path of the cleaned export, relative to the notebook's working directory.
file_path = "../data/EXPORT_Cleaned_TPA_data.csv"


def classify_transaction(row):
    """Label a single transaction from its amount, status and promotion flag.

    This definition shadows the function of the same name from the first
    cell. It differs in two ways: a negative amount is a "Refund" only when
    the status is "Refunded", and a zero-amount "Completed" row without a
    promotion yields ``None`` instead of a label.

    Args:
        row: Mapping-like row (for example a ``pandas.Series`` produced by
            ``DataFrame.apply(..., axis=1)``) exposing the keys
            'Amount (£)', 'Transaction Status' and
            'Promotion/Discount Applied'.

    Returns:
        str | None: "Unknown", "Purchase", "Refund", "Failed Refund",
        "Pending Refund", "Failed Transaction", "Loyalty Redemption" or
        "Unclassified"; ``None`` marks a zero-amount "Completed" row with no
        promotion (a row the cleaning step removes).
    """
    amount = row['Amount (£)']
    if pd.isna(amount):
        return "Unknown"

    if amount > 0:
        return "Purchase"

    status = row['Transaction Status']

    if amount < 0:
        if status == "Refunded":
            return "Refund"
        if status == "Failed":
            return "Failed Refund"
        if status == "Pending":
            return "Pending Refund"
    elif amount == 0:
        if status == "Failed":
            return "Failed Transaction"
        promotion = row['Promotion/Discount Applied']
        # A missing promotion value is not evidence that a promotion was
        # applied; NaN != "No" is True, so it must be excluded explicitly.
        if pd.notna(promotion) and promotion != "No":
            return "Loyalty Redemption"
        if status == "Completed":
            return None  # Marking for deletion

    return "Unclassified"  # Instead of "Other", this helps track unknown cases


def clean_transactions(transactions):
    """Return a cleaned, classified copy of ``transactions``.

    The input frame is not modified. Steps, in order:

    1. Drop rows with a zero amount and status "Completed".
    2. Relabel negative-amount "Completed" rows as "Refunded".
    3. Relabel "Debit Card" payments made on a "Mobile" device as
       "Digital Wallet".
    4. Drop "Refunded" rows with a positive amount.
    5. Compute 'Transaction Type' with ``classify_transaction``.

    Classification runs last so that 'Transaction Type' reflects the
    corrected 'Transaction Status'. Classifying first left relabelled refunds
    tagged "Unclassified" in the saved file.

    Args:
        transactions: DataFrame with the columns 'Amount (£)',
            'Transaction Status', 'Payment Method', 'Customer Device Type'
            and 'Promotion/Discount Applied'.

    Returns:
        pandas.DataFrame: the cleaned rows (original index labels kept) with
        a 'Transaction Type' column.
    """
    amount_col = "Amount (£)"
    status_col = "Transaction Status"

    # NOTE(review): this removes every zero-amount "Completed" row, including
    # ones classify_transaction would label "Loyalty Redemption" (promotion
    # applied), not just the rows it marks with None. Confirm this is intended.
    zero_completed = (transactions[amount_col] == 0) & (transactions[status_col] == "Completed")
    # .copy() gives an independent frame, so the .loc writes below do not act
    # on a slice of the input (avoids SettingWithCopyWarning).
    cleaned = transactions.loc[~zero_completed].copy()

    # Ensure refunds remain, but reclassify mislabelled refunds.
    mislabelled_refund = (cleaned[amount_col] < 0) & (cleaned[status_col] == "Completed")
    cleaned.loc[mislabelled_refund, status_col] = "Refunded"

    # Ensure debit card transactions on mobile are categorized as digital wallet.
    mobile_debit = (cleaned["Payment Method"] == "Debit Card") & (cleaned["Customer Device Type"] == "Mobile")
    cleaned.loc[mobile_debit, "Payment Method"] = "Digital Wallet"

    # Drop incorrect positive refunds.
    positive_refund = (cleaned[status_col] == "Refunded") & (cleaned[amount_col] > 0)
    cleaned = cleaned.loc[~positive_refund].copy()

    # Classify using the corrected statuses.
    cleaned["Transaction Type"] = cleaned.apply(classify_transaction, axis=1)
    return cleaned


try:
    df = pd.read_csv(file_path)

    df_cleaned = clean_transactions(df)

    # Save the cleaned dataset (written to the current working directory).
    cleaned_file_path = "EXPORT_Cleaned_TPA_data_Final.csv"
    df_cleaned.to_csv(cleaned_file_path, index=False)

    print("Cleaned dataset saved as EXPORT_Cleaned_TPA_data_Final.csv")

except FileNotFoundError:
    print(f"Error: The file {file_path} was not found. Please check the file path.")
except Exception as e:
    # Best-effort reporting: any other failure is printed rather than raised.
    print(f"An unexpected error occurred: {e}")

Cleaned dataset saved as EXPORT_Cleaned_TPA_data_Final.csv


# Format Updated

In [5]:
import pandas as pd

# Load the final cleaned export.
# NOTE(review): the cleaning cell writes "EXPORT_Cleaned_TPA_data_Final.csv"
# to the working directory with columns such as 'Amount (£)', whereas this
# file lives under ../data/ and is read with a lowercase 'date' column.
# Presumably the file was moved/reformatted outside the notebook; confirm.
final_export_path = "../data/EXPORT_Cleaned_TPA_data_Final.csv"
df = pd.read_csv(final_export_path)

# Parse the 'date' strings as day-first (UK style) dates.
parsed_dates = pd.to_datetime(df["date"], dayfirst=True)
df = df.assign(date=parsed_dates)

# Inspect the dtypes: 'date' should now be reported as 'datetime64[ns]'.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 152 entries, 0 to 151
Data columns (total 13 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   transaction_ID                 152 non-null    object        
 1   date                           152 non-null    datetime64[ns]
 2   Amount_GBP                     152 non-null    float64       
 3   payment_method                 152 non-null    object        
 4   merchant_category              152 non-null    object        
 5   location                       152 non-null    object        
 6   customer_segment               152 non-null    object        
 7   transaction_status             152 non-null    object        
 8   sales_channel                  152 non-null    object        
 9   customer_device_type           152 non-null    object        
 10  promotion_or_discount_applied  152 non-null    object        
 11  time               

In [7]:
# Parse 'date' as day-first datetimes; errors="coerce" turns any value that
# cannot be parsed into NaT instead of raising.
df["date"] = pd.to_datetime(df["date"], dayfirst=True, errors="coerce")

In [9]:
# Write the current frame to the data directory, omitting the index column.
df.to_csv(path_or_buf="../data/F_EXPORT_Cleaned_TPA_data_Final.csv", index=False)
