In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from cuml.svm import SVC
from sklearn.metrics import classification_report, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle

class RobustLabelEncoder:
    """Integer label encoder that tolerates categories unseen during fitting.

    Each distinct value seen by ``fit`` receives the next free integer code,
    in order of first appearance. Values that were never fitted are encoded
    by ``transform`` as ``self.counter``, i.e. one past the largest assigned
    code, so unseen categories do not raise.

    Calling ``fit`` more than once extends the mapping: values that already
    have a code keep it, and only new values receive fresh codes. Note that
    the "unseen" code is whatever ``self.counter`` is at transform time, so
    it grows whenever new categories are fitted.
    """

    def __init__(self):
        # value -> integer code
        self.mapping = {}
        # next free code; doubles as the code returned for unseen values
        self.counter = 0

    def fit(self, data):
        """Assign codes to every not-yet-known value in ``data``.

        Args:
            data: Any 1-D input accepted by ``pd.Series``.

        Returns:
            ``self``, to allow call chaining.
        """
        for value in pd.Series(data).unique():
            # Keep existing codes stable on repeated fits; re-assigning here
            # would remap known categories and leave gaps in the code range.
            if value not in self.mapping:
                self.mapping[value] = self.counter
                self.counter += 1
        return self

    def transform(self, data):
        """Encode ``data`` with the fitted mapping.

        Args:
            data: Any 1-D input accepted by ``pd.Series``.

        Returns:
            A ``pd.Series`` of codes. When ``data`` is already a Series its
            index is preserved. Unknown values map to ``self.counter``.
        """
        unknown_code = self.counter
        return pd.Series(data).map(lambda x: self.mapping.get(x, unknown_code))

    def fit_transform(self, data):
        """Fit on ``data`` and return its encoded form."""
        self.fit(data)
        return self.transform(data)

def train_fraud_detection(train_path, test_path=None, fraud_ratio=0.5, sample_size=100000):
    """Train a GPU SVM fraud classifier on a re-balanced sample and evaluate it.

    The training CSV is loaded, identifier/PII-like columns are dropped, and
    fraud / non-fraud rows are sampled separately so that fraud rows make up
    (at most) ``fraud_ratio`` of ``sample_size``. Categorical columns are
    integer-encoded, numerical columns are standardised, and a cuML ``SVC``
    is fitted. The model is then evaluated on the test set: a classification
    report is printed, a confusion-matrix heatmap is shown, and the false
    negative rate is printed.

    Args:
        train_path: Path to the training CSV. Must contain an ``is_fraud``
            column plus the columns listed in ``drop_cols``,
            ``categorical_cols`` and ``numerical_cols`` below.
        test_path: Optional path to a test CSV with the same columns. When
            omitted, 20% of the sampled training data is held out
            (stratified on ``is_fraud``) and used for evaluation instead.
        fraud_ratio: Requested share of fraud rows in the sampled training
            data. The achieved share is lower when the file contains fewer
            than ``sample_size * fraud_ratio`` fraud rows.
        sample_size: Requested total number of sampled training rows.

    Returns:
        A tuple ``(model, encoders, scaler)``: the fitted SVC, a dict mapping
        each categorical column name to its fitted ``RobustLabelEncoder``,
        and the fitted ``StandardScaler``.

    Raises:
        FileNotFoundError: If a CSV path does not exist (from ``pd.read_csv``).
        KeyError: If an expected column is missing from a CSV.
    """
    print("Loading training data...")
    train_data = pd.read_csv(train_path)

    # Remove unnecessary columns
    drop_cols = [
        'Unnamed: 0', 'cc_num', 'first', 'last', 'street', 'city',
        'state', 'zip', 'dob', 'trans_num', 'trans_date_trans_time'
    ]
    train_data = train_data.drop(columns=drop_cols)

    # Separate fraud and non-fraud cases
    fraud_data = train_data[train_data['is_fraud'] == 1]
    non_fraud_data = train_data[train_data['is_fraud'] == 0]

    print(f"Original dataset - Fraud cases: {len(fraud_data)}, Non-fraud cases: {len(non_fraud_data)}")

    # Requested per-class counts, capped by what the file actually contains
    n_fraud = min(len(fraud_data), int(sample_size * fraud_ratio))
    n_non_fraud = min(len(non_fraud_data), int(sample_size * (1 - fraud_ratio)))

    # Sample each class separately so fraud is over-represented
    fraud_sampled = fraud_data.sample(n=n_fraud, random_state=42)
    non_fraud_sampled = non_fraud_data.sample(n=n_non_fraud, random_state=42)

    # Combine and shuffle
    train_data = pd.concat([fraud_sampled, non_fraud_sampled])
    train_data = shuffle(train_data, random_state=42)

    print(f"Training dataset - Fraud cases: {len(fraud_sampled)}, Non-fraud cases: {len(non_fraud_sampled)}")

    # Split features and target
    X = train_data.drop('is_fraud', axis=1)
    y = train_data['is_fraud']

    # Handle test data
    if test_path:
        print("Loading test data...")
        test_data = pd.read_csv(test_path)
        test_data = test_data.drop(columns=drop_cols)
        X_test = test_data.drop('is_fraud', axis=1)
        y_test = test_data['is_fraud']
    else:
        from sklearn.model_selection import train_test_split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        # Work on independent copies: the column assignments below can
        # otherwise trigger pandas' chained-assignment warning on split frames.
        X, y = X_train.copy(), y_train
        X_test = X_test.copy()

    print("Preprocessing data...")

    # Encode categorical features; encoders are fitted on training data only
    categorical_cols = ['merchant', 'category', 'gender', 'job']
    encoders = {}
    for col in categorical_cols:
        encoders[col] = RobustLabelEncoder()
        X[col] = encoders[col].fit_transform(X[col])
        X_test[col] = encoders[col].transform(X_test[col])

    # Scale numerical features; scaler statistics come from training data only
    numerical_cols = ['amt', 'lat', 'long', 'city_pop', 'unix_time', 'merch_lat', 'merch_long']
    scaler = StandardScaler()
    X[numerical_cols] = scaler.fit_transform(X[numerical_cols])
    X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

    # Convert to numpy arrays
    X = X.to_numpy()
    y = y.to_numpy()
    X_test = X_test.to_numpy()
    y_test = y_test.to_numpy()

    print("Training model on GPU...")

    # SVM hyper-parameters chosen with the aim of reducing false negatives
    model = SVC(
        kernel='rbf',
        C=5.0,
        gamma='auto',
        probability=False,
        cache_size=4000,
        # Weight the fraud class (1) four times as heavily as non-fraud (0)
        class_weight={0: 1, 1: 4},
        random_state=42
    )

    model.fit(X, y)

    print("Evaluating model...")
    y_pred = model.predict(X_test)

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Plot confusion matrix
    plt.figure(figsize=(8, 6))
    cm = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix - Optimized for Lower False Negatives')
    plt.show()

    # Count FN/TP straight from the label arrays. The crosstab is only 2x2
    # when both classes occur in y_test AND y_pred, so indexing it by position
    # breaks (or reads the wrong cell) if the model predicts a single class.
    actual_fraud = y_test == 1
    fn = int(np.sum(actual_fraud & (y_pred == 0)))  # False negatives
    tp = int(np.sum(actual_fraud & (y_pred == 1)))  # True positives
    n_actual_fraud = fn + tp
    # The rate is undefined when the test set has no fraud rows at all
    false_negative_rate = fn / n_actual_fraud if n_actual_fraud else float('nan')
    print(f"\nFalse Negative Rate: {false_negative_rate:.4f}")

    return model, encoders, scaler

def predict_fraud(model, encoders, scaler, transaction_df):
    """Run the trained classifier on a frame of new transactions.

    The input frame is copied, so the caller's data is left untouched. The
    categorical columns are encoded with the matching entries of ``encoders``
    and the numerical columns are passed through ``scaler.transform`` before
    the whole frame is handed to ``model.predict`` as a numpy array.

    NOTE(review): every column of ``transaction_df`` is fed to the model, so
    it presumably has to contain exactly the training feature columns in the
    training order - confirm against the caller.

    Args:
        model: Fitted estimator exposing ``predict``.
        encoders: Mapping from categorical column name to a fitted encoder
            exposing ``transform``.
        scaler: Fitted scaler exposing ``transform``.
        transaction_df: DataFrame holding the transactions to score.

    Returns:
        Whatever ``model.predict`` returns for the prepared feature matrix.
    """
    frame = transaction_df.copy()

    # Replace each categorical column by its integer codes
    for name in ['merchant', 'category', 'gender', 'job']:
        encoder = encoders[name]
        frame[name] = encoder.transform(frame[name])

    # Apply the fitted scaling to the numerical columns
    scaled_names = ['amt', 'lat', 'long', 'city_pop', 'unix_time', 'merch_lat', 'merch_long']
    scaled_values = scaler.transform(frame[scaled_names])
    frame[scaled_names] = scaled_values

    # The model expects a plain numpy matrix
    features = frame.to_numpy()
    predictions = model.predict(features)
    return predictions

# Train on the fraud CSVs and keep the fitted model, encoders and scaler.
# fraud_ratio is the requested share of fraud rows in the sampled training
# set; sample_size is the requested number of sampled rows (adjust it based
# on available GPU memory).
model, encoders, scaler = train_fraud_detection(
    '/home/hessel/code/fraudTrain.csv',
    sample_size=100000,
    fraud_ratio=0.48,
    test_path='/home/hessel/code/fraudTest.csv',
)





Loading training data...


FileNotFoundError: [Errno 2] No such file or directory: 'fraudTrain.csv'