# ChatGPT Exercises

### Exercise 1: Fraud Detection — Flag High-Risk Transactions

Instruction:

Given transaction data, identify transactions above $10,000 from new customers (less than 90 days since account opening).

Sample Data:

In [18]:
import pandas as pd
from datetime import datetime

data = {
    "transaction_id": [1, 2, 3, 4, 5],
    "customer_id": ["C1", "C2", "C3", "C4", "C5"],
    "transaction_amount": [500, 15000, 20000, 300, 12000],
    "account_open_date": [
        "2023-06-01", "2024-08-01", "2024-07-15", "2022-01-10", "2024-09-01"
    ],
    "transaction_date": [
        "2024-09-01", "2024-09-01", "2024-09-01", "2024-09-01", "2024-09-01"
    ]
}
df = pd.DataFrame(data)
df["account_open_date"] = pd.to_datetime(df["account_open_date"])
df["transaction_date"] = pd.to_datetime(df["transaction_date"])

In [3]:
df

Unnamed: 0,transaction_id,customer_id,transaction_amount,account_open_date,transaction_date
0,1,C1,500,2023-06-01,2024-09-01
1,2,C2,15000,2024-08-01,2024-09-01
2,3,C3,20000,2024-07-15,2024-09-01
3,4,C4,300,2022-01-10,2024-09-01
4,5,C5,12000,2024-09-01,2024-09-01


In [11]:
df[((df['transaction_date'] - df['account_open_date']).dt.days < 90) & (df['transaction_amount'] > 10_000)]

Unnamed: 0,transaction_id,customer_id,transaction_amount,account_open_date,transaction_date
1,2,C2,15000,2024-08-01,2024-09-01
2,3,C3,20000,2024-07-15,2024-09-01
4,5,C5,12000,2024-09-01,2024-09-01


Exercise 1 solution

In [19]:
df["days_active"] = (df["transaction_date"] - df["account_open_date"]).dt.days
suspicious = df[(df["transaction_amount"] > 10000) & (df["days_active"] < 90)]
suspicious

Unnamed: 0,transaction_id,customer_id,transaction_amount,account_open_date,transaction_date,days_active
1,2,C2,15000,2024-08-01,2024-09-01,31
2,3,C3,20000,2024-07-15,2024-09-01,48
4,5,C5,12000,2024-09-01,2024-09-01,0


### Exercise 2: AML — Count Transactions by Country

Instruction:

Group transactions by country and calculate the number of transactions and total transaction value.

Sample Data:

In [21]:
data = {
    "transaction_id": [101, 102, 103, 104, 105, 106],
    "country": ["US", "US", "UK", "CN", "CN", "UK"],
    "amount": [2000, 5000, 3000, 7000, 15000, 8000]
}
df = pd.DataFrame(data)

In [13]:
df

Unnamed: 0,transaction_id,country,amount
0,101,US,2000
1,102,US,5000
2,103,UK,3000
3,104,CN,7000
4,105,CN,15000
5,106,UK,8000


In [15]:
countrycount = df['transaction_id'].groupby(df['country']).count()
countrycount

country
CN    2
UK    2
US    2
Name: transaction_id, dtype: int64

In [16]:
countrycount = df['amount'].groupby(df['country']).sum()
countrycount

country
CN    22000
UK    11000
US     7000
Name: amount, dtype: int64

Exercise 2 solution

In [22]:
# review groupby with agg() function
country_summary = df.groupby("country").agg(
    transaction_count=("transaction_id", "count"),
    total_value=("amount", "sum")
).reset_index()
country_summary

Unnamed: 0,country,transaction_count,total_value
0,CN,2,22000
1,UK,2,11000
2,US,2,7000


### Exercise 3: Tax Evasion — Detect Under-Reported Income

Instruction:

Compare declared income vs. actual income (derived from bank transactions) and flag customers underreporting by more than 20%.

In [23]:
data = {
    "customer_id": ["C1", "C2", "C3", "C4"],
    "declared_income": [50000, 40000, 60000, 70000],
    "bank_income": [60000, 45000, 80000, 65000]
}
df = pd.DataFrame(data)


In [24]:
df

Unnamed: 0,customer_id,declared_income,bank_income
0,C1,50000,60000
1,C2,40000,45000
2,C3,60000,80000
3,C4,70000,65000


In [47]:
df['ratio'] = (df['bank_income'] - df['declared_income']) / df['bank_income']
df

Unnamed: 0,customer_id,declared_income,bank_income,difference,ratio,underreporting_ratio
0,C1,50000,60000,-10000,0.166667,0.166667
1,C2,40000,45000,-5000,0.111111,0.111111
2,C3,60000,80000,-20000,0.25,0.25
3,C4,70000,65000,5000,-0.076923,-0.076923


In [48]:
flag = df[df['ratio'] > 0.2]
flag

Unnamed: 0,customer_id,declared_income,bank_income,difference,ratio,underreporting_ratio
2,C3,60000,80000,-20000,0.25,0.25


Exercise 3 solution

In [41]:
df["underreporting_ratio"] = (df["bank_income"] - df["declared_income"]) / df["bank_income"]
suspicious = df[df["underreporting_ratio"] > 0.2]
suspicious

Unnamed: 0,customer_id,declared_income,bank_income,difference,ratio,underreporting_ratio
2,C3,60000,80000,-20000,59999.0,0.25


### Exercise 4: Export Control — Sanctions Screening

Instruction:

Check if any counterparties appear on a sanctions list.

Sample Data:

In [49]:
transactions = pd.DataFrame({
    "transaction_id": [201, 202, 203, 204],
    "counterparty": ["ABC Corp", "XYZ Ltd", "Iran Trading", "Global Inc"]
})

sanctions_list = pd.DataFrame({
    "entity": ["Iran Trading", "North Korea Exports"]
})

In [50]:
transactions

Unnamed: 0,transaction_id,counterparty
0,201,ABC Corp
1,202,XYZ Ltd
2,203,Iran Trading
3,204,Global Inc


In [51]:
sanctions_list

Unnamed: 0,entity
0,Iran Trading
1,North Korea Exports


In [53]:
transactions[transactions['counterparty'].isin(sanctions_list['entity'])]

Unnamed: 0,transaction_id,counterparty
2,203,Iran Trading


Exercise 4 solution

In [54]:
flagged = transactions[transactions["counterparty"].isin(sanctions_list["entity"])]
flagged

Unnamed: 0,transaction_id,counterparty
2,203,Iran Trading


### Exercise 5: Model Evaluation — Confusion Matrix in Compliance

Instruction:

Given model predictions for suspicious transactions, compute confusion matrix metrics: accuracy, precision, recall.

Sample Data:

In [55]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

y_true = [1, 0, 1, 1, 0, 0, 1]   # 1 = suspicious, 0 = normal
y_pred = [1, 0, 1, 0, 0, 1, 1]

Exercise 5 solution

In [59]:
cm = confusion_matrix(y_true, y_pred)
acc = accuracy_score(y_true, y_pred)
prec = precision_score(y_true, y_pred)
rec = recall_score(y_true, y_pred)

cm, acc, prec, rec

(array([[2, 1],
        [1, 3]], dtype=int64),
 0.7142857142857143,
 0.75,
 0.75)

In [58]:
"""
Explanation:

confusion_matrix: [[TN, FP], [FN, TP]]
Accuracy = overall correctness.
Precision = TP / (TP+FP) → "Of flagged suspicious, how many were truly suspicious?"
Recall = TP / (TP+FN) → "How many suspicious did we catch?"
This is core for evaluating compliance/fraud models.
"""

'\nExplanation:\n\nconfusion_matrix: [[TN, FP], [FN, TP]]\nAccuracy = overall correctness.\nPrecision = TP / (TP+FP) → "Of flagged suspicious, how many were truly suspicious?"\nRecall = TP / (TP+FN) → "How many suspicious did we catch?"\nThis is core for evaluating compliance/fraud models.\n'

### Exercise 6: Transaction Monitoring — Rolling Suspicious Patterns

Instruction:

For each customer, calculate a 7-day rolling sum of transaction amounts. Flag customers whose rolling sum exceeds $50,000.

Sample Data:

In [126]:
import pandas as pd

data = {
    "customer_id": ["C1","C1","C1","C2","C2","C2","C3","C3"],
    "transaction_date": [
        "2024-08-01","2024-08-03","2024-08-07",
        "2024-08-01","2024-08-05","2024-08-08",
        "2024-08-02","2024-08-09"
    ],
    "amount": [20000, 15000, 18000, 10000, 30000, 12000, 5000, 60000]
}
df = pd.DataFrame(data)
df["transaction_date"] = pd.to_datetime(df["transaction_date"])


In [127]:
df

Unnamed: 0,customer_id,transaction_date,amount
0,C1,2024-08-01,20000
1,C1,2024-08-03,15000
2,C1,2024-08-07,18000
3,C2,2024-08-01,10000
4,C2,2024-08-05,30000
5,C2,2024-08-08,12000
6,C3,2024-08-02,5000
7,C3,2024-08-09,60000


Exercise 6 solution

In [129]:
# ensure sorted by customer and date
df = df.sort_values(["customer_id", "transaction_date"])

# rolling 7-day sum per customer
df["rolling_sum"] = (
    df.groupby("customer_id", group_keys=False)
      .apply(lambda t: t.rolling("7D", on="transaction_date")["amount"].sum())
)

# flag suspicious
suspicious = df[df["rolling_sum"] > 50000]
print(suspicious)

  customer_id transaction_date  amount  rolling_sum
2          C1       2024-08-07   18000      53000.0
7          C3       2024-08-09   60000      60000.0


  .apply(lambda t: t.rolling("7D", on="transaction_date")["amount"].sum())


### Exercise 7: Customer Risk Scoring — Weighted Aggregation

Instruction:

Compute a risk score per customer as:
```
>>> risk = 0.6 × (mean transaction amount) + 0.4 × (transaction count)
```
Return the top 3 riskiest customers.

Sample Data:

In [87]:
data = {
    "customer_id": ["C1","C1","C2","C2","C2","C3","C4"],
    "amount": [1000, 5000, 20000, 15000, 3000, 10000, 2000]
}
df = pd.DataFrame(data)


In [88]:
mean = df['amount'].mean()
count = df['customer_id'].groupby(df['customer_id']).count()
def score(x):
    return 0.6 * mean + 0.4 * count
df['score'] = df[df['customer_id'].groupby(df['customer_id']).apply(score)]

KeyError: "None of [Index([4800.8, 4801.2, 4800.4, 4800.4, 4800.8, 4801.2, 4800.4, 4800.4, 4800.8,\n       4801.2, 4800.4, 4800.4, 4800.8, 4801.2, 4800.4, 4800.4],\n      dtype='float64')] are in the [columns]"

Exercise 7 solution

In [89]:
agg = df.groupby("customer_id").agg(
    mean_amount=("amount","mean"),
    txn_count=("amount","count")
).reset_index()

agg["risk_score"] = 0.6*agg["mean_amount"] + 0.4*agg["txn_count"]

top3 = agg.sort_values("risk_score", ascending=False).head(3)
top3


Unnamed: 0,customer_id,mean_amount,txn_count,risk_score
1,C2,12666.666667,3,7601.2
2,C3,10000.0,1,6000.4
0,C1,3000.0,2,1800.8


### Exercise 8: Data Quality — Detect Duplicate Suspicious Records

Instruction:

From the transactions dataset, identify duplicate records (same customer, date, and amount). Report the percentage of duplicates.

Sample Data:

In [91]:
data = {
    "customer_id": ["C1","C1","C2","C2","C2","C3","C3"],
    "transaction_date": [
        "2024-08-01","2024-08-01",
        "2024-08-03","2024-08-03","2024-08-03",
        "2024-08-04","2024-08-05"
    ],
    "amount": [1000, 1000, 2000, 2000, 2500, 3000, 3000]
}
df = pd.DataFrame(data)
df["transaction_date"] = pd.to_datetime(df["transaction_date"])


In [92]:
df

Unnamed: 0,customer_id,transaction_date,amount
0,C1,2024-08-01,1000
1,C1,2024-08-01,1000
2,C2,2024-08-03,2000
3,C2,2024-08-03,2000
4,C2,2024-08-03,2500
5,C3,2024-08-04,3000
6,C3,2024-08-05,3000


Exercise 8 solution

In [99]:
duplicates = df.duplicated(subset=["customer_id","transaction_date","amount"])
duplicate_count = duplicates.sum()
duplicate_percentage = duplicate_count / len(df) * 100

duplicate_count, duplicate_percentage


(2, 28.57142857142857)

In [100]:
duplicates

0    False
1     True
2    False
3     True
4    False
5    False
6    False
dtype: bool

### Exercise 9: Sanctions Evasion — Name Matching (Fuzzy Join Simplified)

Instruction:

You have counterparties and a sanctions list. Detect possible near-matches using simple text rules (lowercasing, stripping punctuation, partial string match).

Sample Data:

In [101]:
transactions = pd.DataFrame({
    "counterparty": ["Iran Trading Ltd", "XYZ Global", "NorthKorea Exports", "ABC Corp"]
})

sanctions = pd.DataFrame({
    "entity": ["iran trading", "north korea exports"]
})


Exercise 9 solution

In [102]:
transactions["clean_name"] = transactions["counterparty"].str.lower().str.replace("[^a-z ]","", regex=True)

matches = transactions[transactions["clean_name"].apply(
    lambda x: any(s in x for s in sanctions["entity"].str.lower())
)]
matches


Unnamed: 0,counterparty,clean_name
0,Iran Trading Ltd,iran trading ltd


### Exercise 10: Model Validation — Precision/Recall by Threshold

Instruction:

Given fraud risk scores, calculate precision and recall at thresholds 0.3, 0.5, and 0.7.

Sample Data:

In [103]:
import numpy as np
from sklearn.metrics import precision_score, recall_score

y_true = np.array([1,0,1,0,1,0,0,1,1,0])
scores = np.array([0.9,0.2,0.7,0.4,0.8,0.1,0.35,0.6,0.55,0.05])


Exercise 10 solution

In [105]:
thresholds = [0.3, 0.5, 0.7]
results = []

for t in thresholds:
    preds = (scores >= t).astype(int)
    prec = precision_score(y_true, preds)
    rec = recall_score(y_true, preds)
    results.append((t, prec, rec))

pd.DataFrame(results, columns=["threshold","precision","recall"])


Unnamed: 0,threshold,precision,recall
0,0.3,0.714286,1.0
1,0.5,1.0,1.0
2,0.7,1.0,0.6


### Exercise 11: Handling Missing Values (Data Quality Check)

Instruction:

Identify transactions with missing amount or counterparty, and impute missing amounts with the median transaction amount.

Sample Data:

In [115]:
import pandas as pd
import numpy as np

df = pd.DataFrame({
    "transaction_id": [1,2,3,4,5],
    "counterparty": ["ABC Corp", None, "XYZ Ltd", "DEF Inc", None],
    "amount": [1000, 2000, np.nan, 5000, np.nan]
})


In [116]:
df

Unnamed: 0,transaction_id,counterparty,amount
0,1,ABC Corp,1000.0
1,2,,2000.0
2,3,XYZ Ltd,
3,4,DEF Inc,5000.0
4,5,,


In [117]:
df['amount'].fillna(df['amount'].median(), inplace=True)
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['amount'].fillna(df['amount'].median(), inplace=True)


Unnamed: 0,transaction_id,counterparty,amount
0,1,ABC Corp,1000.0
1,2,,2000.0
2,3,XYZ Ltd,2000.0
3,4,DEF Inc,5000.0
4,5,,2000.0


Exercise 11 solution

In [118]:
median_amount = df["amount"].median()
df["amount"].fillna(median_amount, inplace=True)
missing_counterparty = df["counterparty"].isna()
df[missing_counterparty]


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["amount"].fillna(median_amount, inplace=True)


Unnamed: 0,transaction_id,counterparty,amount
1,2,,2000.0
4,5,,2000.0


### Exercise 12: GroupBy – Customer Risk Summary

Instruction:

For each customer, calculate total transaction amount, number of transactions, and average transaction amount.

Sample Data:

In [119]:
df = pd.DataFrame({
    "customer_id": ["C1","C1","C2","C2","C3","C3","C3"],
    "amount": [5000, 7000, 12000, 8000, 2000, 3000, 1000]
})


In [120]:
df

Unnamed: 0,customer_id,amount
0,C1,5000
1,C1,7000
2,C2,12000
3,C2,8000
4,C3,2000
5,C3,3000
6,C3,1000


In [121]:
agg = df.groupby("customer_id").agg(
    cum_amount=("amount","sum"),
    txn_count=("amount","count"),
    avg_count=("amount", "mean")
).reset_index()

In [122]:
agg

Unnamed: 0,customer_id,cum_amount,txn_count,avg_count
0,C1,12000,2,6000.0
1,C2,20000,2,10000.0
2,C3,6000,3,2000.0


Exercise 12 solution

In [123]:
summary = df.groupby("customer_id").agg(
    total_amount=("amount","sum"),
    txn_count=("amount","count"),
    avg_amount=("amount","mean")
).reset_index()
summary


Unnamed: 0,customer_id,total_amount,txn_count,avg_amount
0,C1,12000,2,6000.0
1,C2,20000,2,10000.0
2,C3,6000,3,2000.0


### Exercise 13: Pivot Table – Transaction Count by Country and Month

Instruction:

Create a pivot table showing number of transactions per country per month.

Sample Data:

In [133]:
df = pd.DataFrame({
    "transaction_id": [1,2,3,4,5,6],
    "country": ["US","US","UK","UK","CN","CN"],
    "date": pd.to_datetime(["2024-08-01","2024-08-15","2024-08-03","2024-09-01","2024-08-07","2024-09-02"])
})


In [134]:
df

Unnamed: 0,transaction_id,country,date
0,1,US,2024-08-01
1,2,US,2024-08-15
2,3,UK,2024-08-03
3,4,UK,2024-09-01
4,5,CN,2024-08-07
5,6,CN,2024-09-02


In [136]:
df['month'] = df[df['date'].dt.month]
df.pivot_table(index='country', columns='date', values='amount', aggfunc='sum')

KeyError: "None of [Index([8, 8, 8, 9, 8, 9], dtype='int32')] are in the [columns]"

Exercise 13 solution

In [140]:
df["month"] = df["date"].dt.to_period("M")
pivot = pd.pivot_table(df, index="country", columns="month", values="transaction_id", aggfunc="count", fill_value=0)
pivot


month,2024-08,2024-09
country,Unnamed: 1_level_1,Unnamed: 2_level_1
CN,1,1
UK,1,1
US,2,0


### Exercise 14: Crosstab – High-Risk Transactions by Type

Instruction:

Generate a crosstab of transaction type vs high-risk flag.

Sample Data:

In [141]:
df = pd.DataFrame({
    "transaction_type": ["Wire","Wire","ACH","ACH","Wire","ACH"],
    "high_risk": [1,0,1,0,1,0]
})


In [142]:
df

Unnamed: 0,transaction_type,high_risk
0,Wire,1
1,Wire,0
2,ACH,1
3,ACH,0
4,Wire,1
5,ACH,0


In [144]:
pd.crosstab(df['transaction_type'], df['high_risk'], margins=True)

high_risk,0,1,All
transaction_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ACH,2,1,3
Wire,1,2,3
All,3,3,6


Exercise 14 solution

In [145]:
ct = pd.crosstab(df["transaction_type"], df["high_risk"], margins=True)
ct


high_risk,0,1,All
transaction_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ACH,2,1,3
Wire,1,2,3
All,3,3,6


### Exercise 15: Merge/Join – Combine Customer and Transaction Data

Instruction:

Merge customer master and transaction dataset to include customer segment in transaction data.

Sample Data:

In [146]:
customers = pd.DataFrame({
    "customer_id": ["C1","C2","C3"],
    "segment": ["Retail","Corporate","Retail"]
})

transactions = pd.DataFrame({
    "transaction_id": [101,102,103,104],
    "customer_id": ["C1","C2","C1","C3"],
    "amount": [5000, 12000, 7000, 2000]
})


In [147]:
customers

Unnamed: 0,customer_id,segment
0,C1,Retail
1,C2,Corporate
2,C3,Retail


In [148]:
transactions

Unnamed: 0,transaction_id,customer_id,amount
0,101,C1,5000
1,102,C2,12000
2,103,C1,7000
3,104,C3,2000


In [151]:
pd.merge(customers, transactions, on='customer_id', how='left')

Unnamed: 0,customer_id,segment,transaction_id,amount
0,C1,Retail,101,5000
1,C1,Retail,103,7000
2,C2,Corporate,102,12000
3,C3,Retail,104,2000


Exercise 15 solution

In [152]:
merged = transactions.merge(customers, on="customer_id", how="left")
merged


Unnamed: 0,transaction_id,customer_id,amount,segment
0,101,C1,5000,Retail
1,102,C2,12000,Corporate
2,103,C1,7000,Retail
3,104,C3,2000,Retail


### Exercise 16: Concatenation – Combine Multiple Transaction Files

Instruction:

Concatenate two monthly transaction datasets into a single DataFrame.

Sample Data:

In [153]:
df_aug = pd.DataFrame({"transaction_id":[1,2],"amount":[1000,2000]})
df_sep = pd.DataFrame({"transaction_id":[3,4],"amount":[1500,2500]})


In [154]:
df_aug

Unnamed: 0,transaction_id,amount
0,1,1000
1,2,2000


In [155]:
df_sep

Unnamed: 0,transaction_id,amount
0,3,1500
1,4,2500


In [157]:
pd.concat([df_aug, df_sep])

Unnamed: 0,transaction_id,amount
0,1,1000
1,2,2000
0,3,1500
1,4,2500


Exercise 16 solution

In [158]:
all_txn = pd.concat([df_aug, df_sep], ignore_index=True)
all_txn


Unnamed: 0,transaction_id,amount
0,1,1000
1,2,2000
2,3,1500
3,4,2500


### Exercise 17: Apply Lambda – Flag Large Transactions

Instruction:

Create a new column flag that is 1 if transaction > 10,000, else 0, using apply and lambda.

Sample Data:

In [159]:
df = pd.DataFrame({"transaction_id":[1,2,3],"amount":[5000,12000,7000]})


In [160]:
df

Unnamed: 0,transaction_id,amount
0,1,5000
1,2,12000
2,3,7000


In [161]:
df['flag'] = df['amount'].apply(lambda x : 1 if x > 10_000 else 0)
df

Unnamed: 0,transaction_id,amount,flag
0,1,5000,0
1,2,12000,1
2,3,7000,0


Exercise 17 solution

In [162]:
df["flag"] = df["amount"].apply(lambda x: 1 if x > 10000 else 0)
df


Unnamed: 0,transaction_id,amount,flag
0,1,5000,0
1,2,12000,1
2,3,7000,0
