In [1]:
!pip install faker

Collecting faker
  Downloading Faker-36.1.1-py3-none-any.whl.metadata (15 kB)
Downloading Faker-36.1.1-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-36.1.1


In [2]:
import pandas as pd
import numpy as np
from faker import Faker
from scipy.stats import beta, poisson

def generate_synthetic_data(df, seed=42):
    np.random.seed(seed)
    fake = Faker()
    Faker.seed(seed)

    # Generate realistic user patterns
    df['User'] = [fake.uuid4()[:8] for _ in range(len(df))]
    df['Card'] = [fake.credit_card_number() for _ in range(len(df))]

    # Enhanced temporal patterns
    def generate_times(num_users):
        # Bimodal distribution for normal vs night users
        night_users = np.random.choice([0,1], num_users, p=[0.95, 0.05])
        time_params = []

        for is_night in night_users:
            if is_night:
                # Beta distribution for night transactions (10 PM - 4 AM)
                alpha, beta_ = 2, 5
                base_time = 22 + 6 * beta.rvs(alpha, beta_)
            else:
                # Bimodal for day transactions (9 AM - 12 PM and 2 PM - 6 PM)
                mode = np.random.choice([0,1], p=[0.6, 0.4])
                if mode == 0:
                    alpha, beta_ = 3, 3  # Morning peak
                    base_time = 9 + 3 * beta.rvs(alpha, beta_)
                else:
                    alpha, beta_ = 2, 4  # Afternoon peak
                    base_time = 14 + 4 * beta.rvs(alpha, beta_)

            # Add minute variation
            minute = np.random.normal(30, 15) % 60
            return f"{int(base_time%24):02d}:{int(minute):02d}"

    # Generate time based on user behavior
    user_time_profiles = df.groupby('User')['User'].first().apply(
        lambda x: generate_times(1)[0]
    )
    df = df.merge(user_time_profiles.rename('TimeProfile'), on='User')

    # Convert to datetime with realistic patterns
    df['Datetime'] = pd.to_datetime(
        df[['Year', 'Month', 'Day']].astype(str).agg('-'.join, axis=1) + ' ' + df['TimeProfile']
    )

    # Generate chargeback patterns
    df = df.sort_values(['User', 'Datetime']).reset_index(drop=True)

    # Create chargeback flags
    df['Chargeback'] = 0
    chargeback_users = np.random.choice(
        df['User'].unique(),
        size=int(len(df['User'].unique())*0.03),  # 3% risky users
        replace=False
    )

    # Track chargeback history per user
    user_chargebacks = {user: 0 for user in df['User'].unique()}

    for idx, row in df.iterrows():
        if row['User'] in chargeback_users:
            # Base probability increases with transaction amount
            cb_prob = 0.01 + (row['Amount']/df['Amount'].max())*0.15
            # Weekend transactions have higher chargeback probability
            if row['Datetime'].weekday() >= 5:
                cb_prob += 0.05
            # Time since last transaction (rush transactions)
            if user_chargebacks[row['User']] > 0:
                time_since_last = row['Datetime'] - df.iloc[idx-1]['Datetime']
                if time_since_last < pd.Timedelta('1h'):
                    cb_prob += 0.1

            if np.random.rand() < cb_prob:
                df.at[idx, 'Chargeback'] = 1
                user_chargebacks[row['User']] += 1

    # Create rolling chargeback count feature
    df['nof.chargebacks'] = df.groupby('User')['Chargeback'].cumsum()

    # Add temporal context features
    df['Hour'] = df['Datetime'].dt.hour
    df['Minute'] = df['Datetime'].dt.minute
    df['DayOfWeek'] = df['Datetime'].dt.dayofweek
    df['DaysFromWeekend'] = np.abs(df['DayOfWeek'] - 5)  # Friday/Saturday proximity

    # Create cyclical time features
    df['Hour_sin'] = np.sin(2 * np.pi * df['Hour']/24)
    df['Hour_cos'] = np.cos(2 * np.pi * df['Hour']/24)
    df['Minute_sin'] = np.sin(2 * np.pi * df['Minute']/60)
    df['Minute_cos'] = np.cos(2 * np.pi * df['Minute']/60)

    # Generate time-based anomalies
    df['NightTransaction'] = ((df['Hour'] >= 23) | (df['Hour'] <= 4)).astype(int)
    df['RushHour'] = ((df['Hour'] >= 7) & (df['Hour'] <= 9)).astype(int)

    # Create temporal clusters
    conditions = [
        (df['Hour'].between(0,4)),   # Late Night
        (df['Hour'].between(5,9)),   # Morning
        (df['Hour'].between(10,15)), # Midday
        (df['Hour'].between(16,19)), # Evening
        (df['Hour'].between(20,23)), # Night
    ]
    df['TimeCluster'] = np.select(conditions, [0,1,2,3,4], default=2)

    # Add error patterns correlated with anomalies
    df['Errors?'] = np.where(
        (df['NightTransaction'] == 1) |
        (df['nof.chargebacks'] > 2) |
        (df['Amount'] > df['Amount'].quantile(0.98)),
        'High Risk Error', 'No Error'
    )

    return df.drop(columns=['TimeProfile'])

In [None]:
def validate_with_rl(df, api_key, model_id="llama-v3-8b-w", max_samples=100, batch_size=5):
    """
    Validates generated synthetic data using reinforcement learning via Fireworks AI API.

    Args:
        df (pd.DataFrame): The synthetic data to validate
        api_key (str): Fireworks API key
        model_id (str): Fireworks model ID to use
        max_samples (int): Maximum number of samples to validate
        batch_size (int): Number of samples to process in each batch

    Returns:
        tuple: (validated_df, validation_scores, feedback)
    """
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    # Prepare sample data for validation (limit to max_samples)
    validation_samples = df.sample(min(len(df), max_samples)).reset_index(drop=True)
    validation_scores = []
    validation_feedback = []

    # Process in batches to avoid overwhelming the API
    for i in range(0, len(validation_samples), batch_size):
        batch = validation_samples.iloc[i:i+batch_size]

        # Convert batch to list of dictionaries for the API
        batch_records = batch.to_dict('records')

        # Create prompt for the LLM
        prompt = f"""You are a reinforcement learning system validating synthetic financial transaction data.

Please analyze these {len(batch_records)} transaction records and evaluate their realism on a scale of 0-10,
where 0 means completely unrealistic and 10 means indistinguishable from real data.
For each record, provide feedback on:
1. Temporal patterns (time of day, day of week)
2. User behavior consistency
3. Chargeback patterns
4. Anomaly indicators

For each record, give a score and specific feedback on how to improve realism.

Transaction Records:
{json.dumps(batch_records, indent=2, default=str)}

Respond with a JSON object containing:
1. An array of scores (one per record)
2. An array of feedback strings (one per record)
3. An overall batch score
4. Suggestions for improving the data generation model
"""

        # Call the Fireworks API
        response = requests.post(
            f"https://api.fireworks.ai/inference/v1/completions",
            headers=headers,
            json={
                "model": model_id,
                "prompt": prompt,
                "max_tokens": 2048,
                "temperature": 0.2,
                "top_p": 0.9
            }
        )

        if response.status_code != 200:
            print(f"API Error: {response.status_code}")
            print(response.text)
            continue

        # Parse the response
        try:
            result = response.json()
            llm_response = result.get('choices', [{}])[0].get('text', '{}')

            # Extract JSON from the response (handle potential text wrapping)
            import re
            json_match = re.search(r'({[\s\S]*})', llm_response)
            if json_match:
                validation_result = json.loads(json_match.group(1))

                # Store scores and feedback
                scores = validation_result.get('scores', [5.0] * len(batch))
                feedbacks = validation_result.get('feedback', ['No feedback'] * len(batch))

                validation_scores.extend(scores)
                validation_feedback.extend(feedbacks)

                # Apply reinforcement learning - adjust data based on feedback
                for j, (score, feedback) in enumerate(zip(scores, feedbacks)):
                    idx = i + j
                    if idx < len(validation_samples):
                        # Store validation results with the data
                        validation_samples.at[idx, 'RL_Score'] = score
                        validation_samples.at[idx, 'RL_Feedback'] = feedback

                # Allow API to rest between batches
                time.sleep(1)
            else:
                print("Failed to extract JSON from response")

        except Exception as e:
            print(f"Error processing response: {e}")
            print(f"Response: {llm_response}")

    # Calculate overall validation metrics
    if validation_scores:
        avg_score = sum(validation_scores) / len(validation_scores)
        print(f"Average validation score: {avg_score:.2f}/10")

        # Apply reinforcement learning adjustments to the entire dataset
        df_adjusted = adjust_data_with_rl_feedback(df, validation_samples)

        return df_adjusted, validation_scores, validation_feedback

    return df, [], []


def adjust_data_with_rl_feedback(original_df, validated_samples):
    """
    Applies reinforcement learning feedback to adjust the synthetic data generation.

    Args:
        original_df (pd.DataFrame): The original synthetic data
        validated_samples (pd.DataFrame): Samples with validation scores

    Returns:
        pd.DataFrame: Adjusted synthetic data
    """
    # Create a copy of the original data
    adjusted_df = original_df.copy()

    # If we don't have enough validation data, return the original
    if len(validated_samples) < 10 or 'RL_Score' not in validated_samples.columns:
        return adjusted_df

    # Identify patterns in low-scoring samples
    low_score_threshold = 5.0
    low_scoring_samples = validated_samples[validated_samples['RL_Score'] < low_score_threshold]

    if len(low_scoring_samples) > 0:
        # Analyze temporal patterns
        problematic_hours = low_scoring_samples['Hour'].value_counts().index.tolist()[:3]

        # Find problematic chargeback patterns
        mean_cb_low = low_scoring_samples['Chargeback'].mean()
        mean_cb_all = original_df['Chargeback'].mean()

        # Adjust data based on findings
        if mean_cb_low > mean_cb_all * 1.5:
            # Too many chargebacks in the data - reduce them
            chargeback_mask = adjusted_df['Chargeback'] == 1
            random_indices = np.random.choice(
                adjusted_df[chargeback_mask].index,
                size=int(sum(chargeback_mask) * 0.2),  # Reduce by 20%
                replace=False
            )
            adjusted_df.loc[random_indices, 'Chargeback'] = 0

            # Recalculate the chargeback counts
            adjusted_df['nof.chargebacks'] = adjusted_df.groupby('User')['Chargeback'].cumsum()

        # Adjust problematic hour distributions
        for hour in problematic_hours:
            hour_mask = adjusted_df['Hour'] == hour
            if sum(hour_mask) > 100:
                # Find records in problematic hours and shift them slightly
                for idx in adjusted_df[hour_mask].sample(int(sum(hour_mask) * 0.3)).index:
                    # Shift hour by ±1-2 hours
                    shift = np.random.choice([-2, -1, 1, 2])
                    new_hour = (adjusted_df.loc[idx, 'Hour'] + shift) % 24

                    # Update hour and datetime
                    adjusted_df.at[idx, 'Hour'] = new_hour
                    adjusted_df.at[idx, 'Datetime'] = adjusted_df.at[idx, 'Datetime'].replace(
                        hour=new_hour
                    )

                    # Update derived features
                    adjusted_df.at[idx, 'Hour_sin'] = np.sin(2 * np.pi * new_hour/24)
                    adjusted_df.at[idx, 'Hour_cos'] = np.cos(2 * np.pi * new_hour/24)
                    adjusted_df.at[idx, 'NightTransaction'] = ((new_hour >= 23) | (new_hour <= 4)).astype(int)
                    adjusted_df.at[idx, 'RushHour'] = ((new_hour >= 7) & (new_hour <= 9)).astype(int)

                    # Update time cluster
                    if 0 <= new_hour <= 4:
                        adjusted_df.at[idx, 'TimeCluster'] = 0
                    elif 5 <= new_hour <= 9:
                        adjusted_df.at[idx, 'TimeCluster'] = 1
                    elif 10 <= new_hour <= 15:
                        adjusted_df.at[idx, 'TimeCluster'] = 2
                    elif 16 <= new_hour <= 19:
                        adjusted_df.at[idx, 'TimeCluster'] = 3
                    elif 20 <= new_hour <= 23:
                        adjusted_df.at[idx, 'TimeCluster'] = 4

        # Re-evaluate error patterns
        adjusted_df['Errors?'] = np.where(
            (adjusted_df['NightTransaction'] == 1) |
            (adjusted_df['nof.chargebacks'] > 2) |
            (adjusted_df['Amount'] > adjusted_df['Amount'].quantile(0.98)),
            'High Risk Error', 'No Error'
        )

    return adjusted_df


def generate_and_validate_data(base_df, seed=42, api_key=None, iterations=3):
    """
    Generate synthetic data and iteratively improve it using RL validation.

    Args:
        base_df (pd.DataFrame): Base dataframe with Year, Month, Day, and Amount columns
        seed (int): Random seed for reproducibility
        api_key (str): Fireworks API key
        iterations (int): Number of RL improvement iterations

    Returns:
        pd.DataFrame: The final validated and improved synthetic data
    """
    # Generate initial synthetic data
    print("Generating initial synthetic data...")
    df = generate_synthetic_data(base_df, seed=seed)

    if not api_key:
        print("No API key provided. Skipping validation.")
        return df

    # Iteratively improve data through RL validation
    for i in range(iterations):
        print(f"\nIteration {i+1}/{iterations} of RL validation")
        df, scores, feedback = validate_with_rl(df, api_key, max_samples=min(100, len(df)//10))

        if not scores:
            print("Validation failed. Using current data.")
            break

        avg_score = sum(scores) / len(scores)
        print(f"Average validation score: {avg_score:.2f}/10")

        # If we reach a good score, we can stop early
        if avg_score >= 8.5:
            print(f"Reached target quality score {avg_score:.2f}. Stopping iterations.")
            break

    print("Data generation and validation complete.")
    return df


In [5]:
# Generate synthetic data with enhanced features
df = pd.read_csv('/content/drive/MyDrive/card_transaction.v1.csv',
                         skiprows=lambda x: x > 0 and x % 4 != 0)

enhanced_df = generate_and_validate_data(df, api_key="fw_3ZZEeumfFNs2Ajn1WJbyQcgC", iterations=3)

# Add anomaly labels combining multiple factors
enhanced_df['Is Fraud?'] = np.where(
    (enhanced_df['NightTransaction'] &
     (enhanced_df['Amount'] > enhanced_df['Amount'].median())) |
    (enhanced_df['nof.chargebacks'] > 2) |
    (enhanced_df['TimeSinceLastTxn'] < 3600),  # 1 hour
    1, 0
)

  df['Datetime'] = pd.to_datetime(


DateParseError: Unknown datetime string format, unable to parse: 2002-9-2 1, at position 0

In [None]:
enhanced_df.head()

In [None]:
!pip install tensorflow keras

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout, LayerNormalization, MultiHeadAttention
from tensorflow.keras.models import Model

def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0):
    # Normalization and Attention
    x = LayerNormalization(epsilon=1e-6)(inputs)
    x = MultiHeadAttention(
        key_dim=head_size, num_heads=num_heads, dropout=dropout
    )(x, x)
    x = Dropout(dropout)(x)
    res = x + inputs

    # Feed Forward Part
    x = LayerNormalization(epsilon=1e-6)(res)
    x = Dense(ff_dim, activation="gelu")(x)
    x = Dropout(dropout)(x)
    x = Dense(inputs.shape[-1])(x)
    return x + res

def build_model(
    input_shape,
    head_size,
    num_heads,
    ff_dim,
    num_transformer_blocks,
    mlp_units,
    dropout=0,
    mlp_dropout=0,
):
    inputs = Input(shape=input_shape)
    x = inputs
    for _ in range(num_transformer_blocks):
        x = transformer_encoder(x, head_size, num_heads, ff_dim, dropout)

    for dim in mlp_units:
        x = Dense(dim, activation="gelu")(x)
        x = Dropout(mlp_dropout)(x)

    outputs = Dense(input_shape[0], activation="linear")(x)  # Reconstruction
    return Model(inputs, outputs)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# 1. Identify numerical and categorical columns
numerical_columns = enhanced_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_columns = enhanced_df.select_dtypes(include=['object']).columns.tolist()

# Remove target columns from features
columns_to_remove = ['Is Fraud?', 'Chargeback', 'User', 'Card']
if 'Datetime' in numerical_columns:
    columns_to_remove.append('Datetime')

numerical_columns = [col for col in numerical_columns if col not in columns_to_remove]
categorical_columns = [col for col in categorical_columns if col not in columns_to_remove]

# 2. Create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', 'drop', categorical_columns)  # Drop categorical for now, could use OneHotEncoder instead
    ]
)

# 3. Prepare data
X = preprocessor.fit_transform(enhanced_df)
input_shape = X.shape[1]

# 4. Define model the same way you had it
model = build_model(
    input_shape=(input_shape,),
    head_size=256,
    num_heads=4,
    ff_dim=4,
    num_transformer_blocks=4,
    mlp_units=[128],
    mlp_dropout=0.25,
    dropout=0.25,
)

In [None]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss="mse",
    metrics=["mae"]
)

In [None]:
normal_data = X[enhanced_df['Is Fraud?'] == 0]
history = model.fit(
    normal_data, normal_data,
    epochs=50,
    batch_size=256,
    validation_split=0.2,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)
    ]
)

# 6. Calculate reconstruction error
train_pred = model.predict(normal_data)
train_mae = np.mean(np.abs(train_pred - normal_data), axis=1)

# 7. Set threshold at 95th percentile
threshold = np.percentile(train_mae, 95)

# 8. Save threshold for inference
import joblib
joblib.dump(threshold, 'anomaly_threshold.pkl')

# 9. Save entire pipeline
import pickle

with open('preprocessor.pkl', 'wb') as f:
    pickle.dump(preprocessor, f)

model.save('mtw_fraud_detection_transformer.h5')

In [None]:
all_data_pred = model.predict(X)
all_data_mae = np.mean(np.abs(all_data_pred - X), axis=1)
predicted_fraud = all_data_mae > threshold

# Calculate accuracy metrics
from sklearn.metrics import classification_report, confusion_matrix

print("Fraud Detection Results:")
print(classification_report(enhanced_df['Is Fraud?'], predicted_fraud))
print("Confusion Matrix:")
print(confusion_matrix(enhanced_df['Is Fraud?'], predicted_fraud))