In [2]:
# Import required libraries
import pandas as pd

# Load the dataset
# Forward slashes are accepted as path separators on Windows as well as on
# macOS/Linux; the previous backslash-separated literal only resolved on Windows.
file_path = '../../csv_files/wire.csv'  # Update with your file path
wire_data = pd.read_csv(file_path)

# Display basic information about the dataset
print("Dataset Summary:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that emitted a stray "None" line).
wire_data.info()

# Display summary statistics for numeric and categorical columns
print("\nSummary Statistics:")
print(wire_data.describe(include='all'))

# Identify unique values in each column
print("\nUnique Values:")
print(wire_data.nunique())

# Check for missing values
print("\nMissing Values:")
print(wire_data.isnull().sum())

# Analyze 'amount_cad' for outliers using Tukey's fences:
# anything beyond 1.5 * IQR from the quartiles is treated as a potential outlier.
amount_stats = wire_data['amount_cad'].describe()
iqr = amount_stats['75%'] - amount_stats['25%']
outlier_threshold_high = amount_stats['75%'] + 1.5 * iqr
outlier_threshold_low = amount_stats['25%'] - 1.5 * iqr

print("\nOutlier Thresholds for 'amount_cad':")
print(f"High Outlier Threshold: {outlier_threshold_high}")
print(f"Low Outlier Threshold: {outlier_threshold_low}")

# Filter potential outliers (strict inequalities: values equal to a fence are kept as inliers)
high_outliers = wire_data[wire_data['amount_cad'] > outlier_threshold_high]
low_outliers = wire_data[wire_data['amount_cad'] < outlier_threshold_low]

print("\nHigh Value Transactions (Potential Outliers):")
print(high_outliers)

print("\nLow Value Transactions (Potential Outliers):")
print(low_outliers)

# Analyze transaction time patterns
# value_counts() already returns counts sorted in descending order, so no
# additional sort is needed before taking the ten most frequent values.
time_pattern = wire_data['transaction_time'].value_counts().head(10)

print("\nTop 10 Most Common Transaction Times:")
print(time_pattern)

# Optional: Save results for further analysis
# (written to the current working directory; existing files are overwritten)
high_outliers.to_csv('high_outliers.csv', index=False)
low_outliers.to_csv('low_outliers.csv', index=False)


Dataset Summary:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4758 entries, 0 to 4757
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   wire_id           4758 non-null   object 
 1   customer_id       4758 non-null   object 
 2   amount_cad        4758 non-null   float64
 3   debit_credit      4758 non-null   object 
 4   transaction_date  4758 non-null   object 
 5   transaction_time  4758 non-null   object 
dtypes: float64(1), object(5)
memory usage: 223.2+ KB
None

Summary Statistics:
                     wire_id       customer_id    amount_cad debit_credit  \
count                   4758              4758  4.758000e+03         4758   
unique                  4758               856           NaN            2   
top     WIR00000000000004953  SYNCID0000013956           NaN        debit   
freq                       1               659           NaN         2647   
mean                     NaN        

In [3]:
# Drop the transaction_time column from the dataset.
# errors='ignore' keeps this cell idempotent: because the result is bound back
# to the same name, a second run would otherwise raise KeyError for the
# already-removed column.
wire_data = wire_data.drop(columns=['transaction_time'], errors='ignore')

# Display the updated dataset
print("\nDataset after dropping 'transaction_time' column:")
print(wire_data.head())

# Optional: Save the updated dataset to a new CSV file
# (written to the current working directory; an existing file is overwritten)
wire_data.to_csv('updated_wire_data.csv', index=False)




Dataset after dropping 'transaction_time' column:
                wire_id       customer_id  amount_cad debit_credit  \
0  WIR00000000000000000  SYNCID0000000000     6316.04        debit   
1  WIR00000000000000001  SYNCID0000000000        0.03       credit   
2  WIR00000000000000005  SYNCID0000000038    67268.48       credit   
3  WIR00000000000000006  SYNCID0000000055     6354.06       credit   
4  WIR00000000000000007  SYNCID0000000061     7217.61       credit   

  transaction_date  
0       2022-11-18  
1       2022-12-31  
2       2022-12-07  
3       2022-12-07  
4       2023-01-10  


In [4]:
# Flag transactions that are unusually large compared with the same customer's
# other transactions (amount > customer mean + 3 * customer standard deviation).

# Columns derived by this cell. Removing any leftovers from a previous run keeps
# the cell re-runnable: merging onto a frame that already contains
# 'mean_amount'/'std_amount' would produce '_x'/'_y' suffixed columns and the
# column lookups below would then raise KeyError.
derived_columns = ['mean_amount', 'std_amount', 'threshold', 'is_unusual']
wire_data = wire_data.drop(columns=derived_columns, errors='ignore')

# Group by customer_id and calculate mean and standard deviation for transaction amounts
customer_stats = (
    wire_data.groupby('customer_id')['amount_cad']
    .agg(['mean', 'std'])
    .reset_index()
    .rename(columns={'mean': 'mean_amount', 'std': 'std_amount'})
)

# Merge the statistics back into the original dataset
# (left join: every transaction row is kept and gains its customer's statistics)
wire_data = wire_data.merge(customer_stats, on='customer_id', how='left')

# Define a threshold for flagging unusual transactions (e.g., 3 standard deviations above the mean)
# Note: pandas' 'std' is the sample standard deviation, which is NaN for a
# customer with a single transaction; the threshold is then NaN too.
wire_data['threshold'] = wire_data['mean_amount'] + 3 * wire_data['std_amount']

# Flag transactions that are unusually high
# A comparison against a NaN threshold evaluates to False, so single-transaction
# customers are never flagged.
# NOTE(review): with a sample standard deviation, a value can sit at most
# (n - 1) / sqrt(n) deviations above its group's mean, so customers with ten or
# fewer transactions can never exceed the 3-sigma threshold either - confirm
# this is acceptable for the analysis.
wire_data['is_unusual'] = wire_data['amount_cad'] > wire_data['threshold']

# Filter the flagged transactions
unusual_transactions = wire_data[wire_data['is_unusual']]

# Display the unusual transactions
print("\nUnusual Transactions:")
print(unusual_transactions)

# Optional: Save the flagged transactions to a CSV file
# (written to the current working directory; an existing file is overwritten)
unusual_transactions.to_csv('unusual_transactions.csv', index=False)



Unusual Transactions:
                   wire_id       customer_id   amount_cad debit_credit  \
73    WIR00000000000000076  SYNCID0000000580     47516.81        debit   
150   WIR00000000000000153  SYNCID0000001010   1136825.74       credit   
184   WIR00000000000000188  SYNCID0000001235    154413.52        debit   
283   WIR00000000000000287  SYNCID0000001487    201882.71       credit   
323   WIR00000000000000327  SYNCID0000001487    443592.73       credit   
414   WIR00000000000000418  SYNCID0000001663    156085.05        debit   
470   WIR00000000000000486  SYNCID0000001889     89604.73        debit   
471   WIR00000000000000487  SYNCID0000001889    146319.19        debit   
693   WIR00000000000000720  SYNCID0000002830     51656.29        debit   
805   WIR00000000000000832  SYNCID0000002952    734081.64       credit   
849   WIR00000000000000876  SYNCID0000002954    888603.17       credit   
868   WIR00000000000000895  SYNCID0000002954   1063364.85       credit   
887   WIR000000