# Challenge

Another approach to identifying fraudulent transactions is to look for outliers in the data. Standard deviation or quartiles are often used to detect outliers. Using this starter notebook, code two Python functions:

* One that uses standard deviation to identify anomalies for any cardholder.

* Another that uses interquartile range to identify anomalies for any cardholder.

## Identifying Outliers using Standard Deviation

In [1]:
# Initial imports
import pandas as pd
import numpy as np
import random
from sqlalchemy import create_engine



In [2]:
# Create a connection to the database
engine = create_engine("postgresql://postgres:postgres@localhost:5432/fraud_detection")



In [3]:
# Write function that locates outliers using standard deviation

def find_possible_fraudulent_transactions(engine):
    # SQL query to extract relevant data
    query = """
        SELECT t.*, ch.name AS card_holder_name
        FROM transaction t
        INNER JOIN credit_card cc ON t.card = cc.card
        INNER JOIN card_holder ch ON cc.cardholder_id = ch.id
        WHERE t.amount < 2
        OR (EXTRACT(HOUR FROM t.date) BETWEEN 7 AND 9)
        ORDER BY t.amount DESC
        LIMIT 100
    """

    # Execute query and fetch results into a DataFrame
    with engine.connect() as conn:
        df = pd.read_sql(query, conn)

    # Calculate standard deviation of transaction amounts for each card holder
    std_dev = df.groupby('card_holder_name')['amount'].transform('std')

    # Filter outliers based on standard deviation
    outliers = df[df['amount'] > (std_dev * 2)]

    return outliers

# Usage
possible_fraudulent_transactions = find_possible_fraudulent_transactions(engine)
print(possible_fraudulent_transactions)
                                           

      id                date   amount                 card  id_merchant  \
0   3163 2018-12-07 07:22:03  1894.00  4761049645711555811            9   
1   2451 2018-03-05 08:26:08  1617.00     5570600642865857            4   
2   2840 2018-03-06 07:18:09  1334.00        4319653513507           87   
3   2461 2018-12-21 09:56:32  1301.00       30142966699187           96   
8    136 2018-07-18 09:19:08   974.00      344119623920892           19   
..   ...                 ...      ...                  ...          ...   
93  2909 2018-09-26 07:52:23    14.94  4743204091443101526           34   
96  2511 2018-07-11 09:57:31    14.80  4279104135293225293           58   
97  2858 2018-07-16 09:25:54    14.78     5175947111814778           72   
98  2998 2018-10-17 09:49:28    14.67  4165305432349489280          137   
99  2819 2018-08-02 07:13:49    14.42     3516952396080247           14   

     card_holder_name  
0      Robert Johnson  
1       Crystal Clark  
2     Nancy Contreras  
3  

In [4]:
# Find anomalous transactions for 3 random card holders

def find_anomalous_transactions(engine):
    # Select 3 random card holders
    with engine.connect() as conn:
        query = "SELECT id FROM card_holder ORDER BY RANDOM() LIMIT 3"
        random_card_holders = pd.read_sql(query, conn)['id'].tolist()

    # SQL query to extract relevant data
    query = """
        SELECT t.*, ch.name AS card_holder_name
        FROM transaction t
        INNER JOIN credit_card cc ON t.card = cc.card
        INNER JOIN card_holder ch ON cc.cardholder_id = ch.id
        WHERE cc.cardholder_id IN ({})
    """.format(', '.join(map(str, random_card_holders)))

    # Execute query and fetch results into a DataFrame
    with engine.connect() as conn:
        df = pd.read_sql(query, conn)

    # Calculate standard deviation of transaction amounts for each card holder
    std_dev = df.groupby('card_holder_name')['amount'].transform('std')

    # Filter outliers based on standard deviation
    outliers = df[df['amount'] > (std_dev * 2)]

    return outliers

# Result
anomalous_transactions = find_anomalous_transactions(engine)
print(anomalous_transactions)



       id                date   amount              card  id_merchant  \
0    3309 2018-01-01 23:13:30    19.03  4263694062533017            5   
2    2918 2018-01-02 11:28:35    15.85   180098539019105           16   
7    3085 2018-01-08 19:31:23    17.43      584226564303           19   
15   2843 2018-01-14 03:20:21    14.69      584226564303          133   
20   2838 2018-01-18 08:58:13    17.53   180098539019105          137   
..    ...                 ...      ...               ...          ...   
507  2930 2018-12-24 10:13:57    16.16      584226564303           95   
508  1119 2018-12-25 19:10:42  1035.00    30142966699187           57   
509  3198 2018-12-26 04:52:48    17.21  4263694062533017           31   
512  3234 2018-12-26 21:56:44    16.81  4263694062533017           92   
513  2987 2018-12-27 01:02:19    17.45  4263694062533017           12   

     card_holder_name  
0      Danielle Green  
2      Brandon Pineda  
7      Danielle Green  
15     Danielle Green  
20 

## Identifying Outliers Using Interquartile Range

In [5]:
# Write a function that locates outliers using interquartile range

def find_outliers_iqr(engine):
    # SQL query to extract relevant data
    query = """
        SELECT t.*, ch.name AS card_holder_name
        FROM transaction t
        INNER JOIN credit_card cc ON t.card = cc.card
        INNER JOIN card_holder ch ON cc.cardholder_id = ch.id
    """

    # Execute query and fetch results into a DataFrame
    with engine.connect() as conn:
        df = pd.read_sql(query, conn)

    # Calculate IQR for transaction amounts
    q1 = df['amount'].quantile(0.25)
    q3 = df['amount'].quantile(0.75)
    iqr = q3 - q1

    # Define threshold for outliers
    lower_threshold = q1 - 1.5 * iqr
    upper_threshold = q3 + 1.5 * iqr

    # Identify outliers
    outliers = df[(df['amount'] < lower_threshold) | (df['amount'] > upper_threshold)]

    return outliers

# Results

outliers_iqr = find_outliers_iqr(engine)
print(outliers_iqr)


        id                date  amount                 card  id_merchant  \
15      99 2018-01-02 23:27:46  1031.0         501879657465           95   
27    2650 2018-01-04 03:05:18  1685.0     3516952396080247           80   
53    3457 2018-01-07 01:10:54   175.0      344119623920892           12   
62    1291 2018-01-08 02:34:32  1029.0     3581345943543942          145   
67     812 2018-01-08 11:15:36   333.0      344119623920892           95   
...    ...                 ...     ...                  ...          ...   
3405  2461 2018-12-21 09:56:32  1301.0       30142966699187           96   
3429  2520 2018-12-24 15:55:06  1634.0     5570600642865857            7   
3433  1119 2018-12-25 19:10:42  1035.0       30142966699187           57   
3472  2164 2018-12-28 16:20:31   313.0       30078299053512           12   
3492  1293 2018-12-30 23:23:09  1033.0  4761049645711555811           57   

      card_holder_name  
15         Megan Price  
27         Sean Taylor  
53       Mal

In [7]:
# Find anomalous transactions for 3 random card holders

def find_anomalous_transactions_iqr(engine):
    # Select 3 random card holders
    with engine.connect() as conn:
        query = "SELECT id FROM card_holder ORDER BY RANDOM() LIMIT 4"
        random_card_holders = pd.read_sql(query, conn)['id'].tolist()

    # SQL query to extract relevant data for selected card holders
    query = """
        SELECT t.*, ch.name AS card_holder_name
        FROM transaction t
        INNER JOIN credit_card cc ON t.card = cc.card
        INNER JOIN card_holder ch ON cc.cardholder_id = ch.id
        WHERE cc.cardholder_id IN ({})
    """.format(', '.join(map(str, random_card_holders)))

    # Execute query and fetch results into a DataFrame
    with engine.connect() as conn:
        df = pd.read_sql(query, conn)
        print(df.head())  # Add this line to print the first few rows of the DataFrame

    # Filter transactions for selected card holders
    df_selected_card_holders = df[df['id'].isin(random_card_holders)]

    # Calculate IQR for transaction amounts
    q1 = df_selected_card_holders['amount'].quantile(0.25)
    q3 = df_selected_card_holders['amount'].quantile(0.75)
    iqr = q3 - q1

    # Define threshold for outliers
    lower_threshold = q1 - 1.5 * iqr
    upper_threshold = q3 + 1.5 * iqr

    # Identify outliers
    outliers = df_selected_card_holders[(df_selected_card_holders['amount'] < lower_threshold) | 
                                         (df_selected_card_holders['amount'] > upper_threshold)]

    return outliers

# Usage
anomalous_transactions_iqr = find_anomalous_transactions_iqr(engine)
print(anomalous_transactions_iqr)


     id                date  amount                 card  id_merchant  \
0  2083 2018-01-02 02:06:21    1.46        4319653513507           93   
1  2120 2018-01-03 21:04:28    1.91     3561072557118696          108   
2  1552 2018-01-05 06:26:45   10.74      372414832802279           86   
3   487 2018-01-05 06:27:06    4.60  4159836738768855913           48   
4   314 2018-01-05 10:42:55    5.31     3561072557118696            8   

  card_holder_name  
0  Nancy Contreras  
1      Peter Mckay  
2  Nancy Contreras  
3   Beth Hernandez  
4      Peter Mckay  
Empty DataFrame
Columns: [id, date, amount, card, id_merchant, card_holder_name]
Index: []
