In [1]:
import pandas as pd
import re
# Load the checkpoint written by the amount-cleaning step.
amount_checkpoint_path = "../data/Amount_Cleaned.csv"
df = pd.read_csv(amount_checkpoint_path)

In [2]:
# Surface any rows whose merchant category is not a Python string
# (a missing value, for instance, would show up here).
category_is_text = df['Merchant Category'].apply(lambda value: isinstance(value, str))
non_strings = df.loc[~category_is_text]
print(non_strings[['Merchant Category']].head(10))

Empty DataFrame
Columns: [Merchant Category]
Index: []


In [3]:
# Missing categories become empty strings so every value supports the
# string operations below.
df['Merchant Category'] = df['Merchant Category'].fillna('').astype(str)

# Known misspellings mapped to their canonical category.
# NOTE(review): the last entry maps a misspelt column header to the header
# text itself; such a row looks like a stray header line rather than a real
# category - confirm whether it should be dropped instead.
category_corrections = {
    'Food & Bevergae': 'Food & Beverage',
    'Trave': 'Travel',
    'Srvices': 'Services',
    'Food & Bevera ge': 'Food & Beverage',
    'Travél': 'Travel',
    'Tra®vel': 'Travel',
    'merchent catagot': 'Merchant Category'
}

# Trim and title-case the column first, and look corrections up by keys
# normalised the same way. An exact-match replace on the raw column would
# miss variants that differ only in surrounding whitespace or casing
# (e.g. ' trave ').
normalised_category_corrections = {
    misspelt.strip().title(): canonical
    for misspelt, canonical in category_corrections.items()
}
df['Merchant Category'] = (
    df['Merchant Category']
    .str.strip()
    .str.title()
    .replace(normalised_category_corrections)
)

print(df['Merchant Category'].unique())

['Retail' 'Food & Beverage' 'Travel' 'Entertainment' 'Services']


In [4]:
# Remove every run of characters that is not a letter, whitespace, '&' or '/'.
disallowed_category_chars = re.compile(r'[^a-zA-Z\s&/]+', re.IGNORECASE)
merchant_category = df['Merchant Category']
df['Merchant Category'] = merchant_category.replace(disallowed_category_chars, '', regex=True)

print(df['Merchant Category'].unique())

['Retail' 'Food & Beverage' 'Travel' 'Entertainment' 'Services']


In [5]:
# Preview the first ten merchant categories.
first_ten_categories = df['Merchant Category'].head(10)
print(first_ten_categories)

0             Retail
1    Food & Beverage
2             Travel
3      Entertainment
4           Services
5             Retail
6    Food & Beverage
7             Travel
8      Entertainment
9           Services
Name: Merchant Category, dtype: object


In [6]:
# Sanity check: frame dimensions followed by the first rows.
for overview in (df.shape, df.head()):
    print(overview)


(170, 13)
  Transaction ID        Date  Amount (£)  Payment Method Merchant Category  \
0        T000001  2023-01-15      245.67     Credit Card            Retail   
1        T000002  2023-02-11       98.34  Mobile Payment   Food & Beverage   
2        T000003  2023-03-22      500.25  Digital Wallet            Travel   
3        T000004  2023-04-10       75.89      Debit Card     Entertainment   
4        T000005  2023-05-30      320.00   Bank Transfer          Services   

     Location Customer Segment Transaction Status Sales Channel  \
0      London            26-35          Completed        Online   
1  Manchester            18-25            Pending    Mobile App   
2  Birmingham            36-45          Completed        Online   
3     Glasgow            46-60             Failed      In-Store   
4     Bristol              60+           Refunded  Subscription   

  Customer Device Type Promotion/Discount Applied   Time Transaction Type  
0               Mobile                    

In [7]:
# Checkpoint the frame after merchant-category cleaning (row index omitted).
merchant_checkpoint_path = "../data/Merchant_Cleaned.csv"
df.to_csv(merchant_checkpoint_path, index=False)

### Location cleaning 

In [8]:
# Known misspellings mapped to the canonical city name. Identity entries
# (e.g. 'London' -> 'London') are omitted because they change nothing.
location_corrections = {
    'Birmingh@m': 'Birmingham',
    'Birminghm': 'Birmingham',
    'Birmngham': 'Birmingham',
    'Glasow': 'Glasgow',
}

# Trim and title-case the column first, and look corrections up by keys
# normalised the same way. An exact-match replace on the raw column would
# miss variants that differ only in surrounding whitespace or casing
# (e.g. ' glasow'). Missing values pass through the .str accessors unchanged.
normalised_location_corrections = {
    misspelt.strip().title(): canonical
    for misspelt, canonical in location_corrections.items()
}
df['Location'] = (
    df['Location']
    .str.strip()
    .str.title()
    .replace(normalised_location_corrections)
)

print(df['Location'].unique())

['London' 'Manchester' 'Birmingham' 'Glasgow' 'Bristol' nan]


In [9]:
# Discard rows that have no location, then confirm the result:
# new dimensions, remaining distinct values, and the count of missing values.
df = df.dropna(subset=['Location'])

location_after_drop = df['Location']
print(df.shape)
print(location_after_drop.unique())
print(location_after_drop.isna().sum())

(169, 13)
['London' 'Manchester' 'Birmingham' 'Glasgow' 'Bristol']
0


In [10]:
# Checkpoint the frame after location cleaning (row index omitted).
location_checkpoint_path = "../data/Location_Cleaned.csv"
df.to_csv(location_checkpoint_path, index=False)

### Age format review

In [11]:
import pandas as pd

# Reload the location checkpoint into a separate frame for the age-band review.
age_review_source = "../data/Location_Cleaned.csv"
dfAge = pd.read_csv(age_review_source)

print(dfAge.head())

  Transaction ID        Date  Amount (£)  Payment Method Merchant Category  \
0        T000001  2023-01-15      245.67     Credit Card            Retail   
1        T000002  2023-02-11       98.34  Mobile Payment   Food & Beverage   
2        T000003  2023-03-22      500.25  Digital Wallet            Travel   
3        T000004  2023-04-10       75.89      Debit Card     Entertainment   
4        T000005  2023-05-30      320.00   Bank Transfer          Services   

     Location Customer Segment Transaction Status Sales Channel  \
0      London            26-35          Completed        Online   
1  Manchester            18-25            Pending    Mobile App   
2  Birmingham            36-45          Completed        Online   
3     Glasgow            46-60             Failed      In-Store   
4     Bristol              60+           Refunded  Subscription   

  Customer Device Type Promotion/Discount Applied   Time Transaction Type  
0               Mobile                         No  1

In [12]:
# Trim and title-case each segment label, then remove any run of characters
# other than letters, digits, whitespace, '-' or '+'.
segment_noise = re.compile(r'[^a-zA-Z0-9\s\-+]+', re.IGNORECASE)
dfAge['Customer Segment'] = (
    dfAge['Customer Segment']
    .str.strip()
    .str.title()
    .replace(segment_noise, '', regex=True)
)

print(dfAge['Customer Segment'].unique())

['26-35' '18-25' '36-45' '46-60' '60+']


#### No age cleaning needed: the Customer Segment values are already clean (continuing from Location_Cleaned)

### Formatting the Transaction Status column

In [13]:
# List rows whose transaction status is outside the accepted set.
valid_statuses = ['Chargeback', 'Completed', 'Failed', 'Pending', 'Refunded']
status_is_valid = df['Transaction Status'].isin(valid_statuses)
unexpected_statuses = df[~status_is_valid]

print(unexpected_statuses)

Empty DataFrame
Columns: [Transaction ID, Date, Amount (£), Payment Method, Merchant Category, Location, Customer Segment, Transaction Status, Sales Channel, Customer Device Type, Promotion/Discount Applied, Time, Transaction Type]
Index: []


In [14]:
# Checkpoint the frame after the transaction-status check (row index omitted).
transaction_checkpoint_path = "../data/Transaction_data.csv"
df.to_csv(transaction_checkpoint_path, index=False)

In [15]:
# Known variants mapped to their canonical sales-channel label.
sales_channel_corrections = {
    'On-Line': 'Online',
    'Mob App': 'Mobile App',
}

# Trim and title-case the column first, and look corrections up by keys
# normalised the same way. An exact-match replace on the raw column would
# miss variants that differ only in surrounding whitespace or casing
# (e.g. ' on-line').
normalised_channel_corrections = {
    variant.strip().title(): canonical
    for variant, canonical in sales_channel_corrections.items()
}
df['Sales Channel'] = (
    df['Sales Channel']
    .str.strip()
    .str.title()
    .replace(normalised_channel_corrections)
)

print(df['Sales Channel'].unique())

['Online' 'Mobile App' 'In-Store' 'Subscription' 'Social Media']


In [16]:
# List rows whose sales channel is outside the accepted set.
valid_sales_channels = ['In-Store', 'Mobile App', 'Online', 'Social Media', 'Subscription']
channel_is_valid = df['Sales Channel'].isin(valid_sales_channels)
unexpected_sales_channels = df[~channel_is_valid]

print(unexpected_sales_channels)

Empty DataFrame
Columns: [Transaction ID, Date, Amount (£), Payment Method, Merchant Category, Location, Customer Segment, Transaction Status, Sales Channel, Customer Device Type, Promotion/Discount Applied, Time, Transaction Type]
Index: []


In [17]:
# Show the distinct sales-channel labels.
distinct_sales_channels = df['Sales Channel'].unique()
print(distinct_sales_channels)


['Online' 'Mobile App' 'In-Store' 'Subscription' 'Social Media']


In [18]:
# Checkpoint the frame after sales-channel cleaning (row index omitted).
sales_checkpoint_path = "../data/Sales_data.csv"
df.to_csv(sales_checkpoint_path, index=False)


In [19]:
# Trim stray whitespace and title-case the device labels, then list the distinct values.
device_type = df['Customer Device Type']
df['Customer Device Type'] = device_type.str.strip().str.title()

print(df['Customer Device Type'].unique())

['Mobile' 'Desktop' 'Smartwatch' 'Tablet']


In [20]:
# List rows whose device type is outside the accepted set.
valid_device_types = ['Desktop', 'Mobile', 'Smartwatch', 'Tablet']
device_is_valid = df['Customer Device Type'].isin(valid_device_types)
unexpected_device_types = df[~device_is_valid]

print(unexpected_device_types)

Empty DataFrame
Columns: [Transaction ID, Date, Amount (£), Payment Method, Merchant Category, Location, Customer Segment, Transaction Status, Sales Channel, Customer Device Type, Promotion/Discount Applied, Time, Transaction Type]
Index: []


In [21]:
# Show the distinct device-type labels.
distinct_device_types = df['Customer Device Type'].unique()
print(distinct_device_types)


['Mobile' 'Desktop' 'Smartwatch' 'Tablet']


In [22]:
# Checkpoint the frame after device-type cleaning (row index omitted).
device_checkpoint_path = "../data/Device_data.csv"
df.to_csv(device_checkpoint_path, index=False)


# Seasonal offer corrections

In [23]:
# Trim and title-case first, then fix the known misspelling. Replacing on the
# raw column would miss variants that differ only in surrounding whitespace
# or casing (e.g. 'seasonal offr ').
df['Promotion/Discount Applied'] = (
    df['Promotion/Discount Applied']
    .str.strip()
    .str.title()
    .replace('Seasonal Offr', 'Seasonal Offer')
)

print(df['Promotion/Discount Applied'].unique())

['No' 'Seasonal Offer' 'Yes' 'Loyalty Points Redeemed'
 'First-Time User Discount']


In [24]:
# List rows whose promotion label is outside the accepted set.
valid_promotions = ['First-Time User Discount', 'Loyalty Points Redeemed', 'No', 'Seasonal Offer', 'Yes']
promotion_is_valid = df['Promotion/Discount Applied'].isin(valid_promotions)
unexpected_promotions = df[~promotion_is_valid]

print(unexpected_promotions)

Empty DataFrame
Columns: [Transaction ID, Date, Amount (£), Payment Method, Merchant Category, Location, Customer Segment, Transaction Status, Sales Channel, Customer Device Type, Promotion/Discount Applied, Time, Transaction Type]
Index: []


In [25]:
# Checkpoint the frame after promotion cleaning (row index omitted).
seasonal_checkpoint_path = "../data/Seasonal_data.csv"
df.to_csv(seasonal_checkpoint_path, index=False)


In [26]:
# Export the cleaned frame (row index omitted).
cleaned_export_path = "../data/EXPORT_Cleaned_TPA_data.csv"
df.to_csv(cleaned_export_path, index=False)