# Fraud Detection in Applications

### Dataset Overview
- Size: 2,000 applications with 15% fraud rate (300 fraudulent)

- Features: 28 columns including personal info, behavior patterns, and derived anomaly detection features

- Target: Binary classification (is_fraudulent)

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
import hashlib

# Set random seed for reproducibility
np.random.seed(42)
random.seed(42)

def generate_fraud_detection_dataset(n_records=2000, fraud_rate=0.15):
    # Personal information patterns
    domains = ['gmail.com', 'yahoo.com', 'hotmail.com', 'outlook.com', 'company.com']
    universities = ['State University', 'Tech Institute', 'Liberal Arts College', 
                   'Community College', 'International University', 'Online University']
    
    # Countries and timezones
    countries = ['USA', 'India', 'China', 'UK', 'Canada', 'Germany', 'Brazil', 'Australia']
    
    # Skills and positions
    technical_skills = ['Python', 'Java', 'SQL', 'JavaScript', 'Machine Learning', 'Data Analysis']
    soft_skills = ['Communication', 'Teamwork', 'Leadership', 'Problem Solving']
    positions = ['Data Science Intern', 'Software Developer Intern', 'Marketing Intern', 
                'Research Assistant', 'Business Analyst Intern']
    
    data = []
    
    # Generate some legitimate applicant patterns
    legitimate_applicants = []
    for i in range(int(n_records * (1 - fraud_rate))):
        applicant_id = f"APP_{i+1:06d}"
        
        # Generate consistent personal information
        first_name = random.choice(['John', 'Jane', 'Mike', 'Sarah', 'David', 'Emily', 'Chris', 'Lisa'])
        last_name = random.choice(['Smith', 'Johnson', 'Williams', 'Brown', 'Jones', 'Garcia', 'Miller'])
        email = f"{first_name.lower()}.{last_name.lower()}{random.randint(1, 999)}@{random.choice(domains)}"
        
        # Consistent geographic information
        country = random.choice(countries)
        timezone = random.choice(['EST', 'PST', 'CST', 'GMT', 'IST', 'CET'])
        
        # Realistic application behavior
        application_date = datetime(2024, 1, 1) + timedelta(days=random.randint(0, 180))
        submission_time = application_date + timedelta(hours=random.randint(9, 17))
        
        # Normal submission duration (minutes spent on application)
        submission_duration = random.randint(20, 120)
        
        # Consistent device and IP patterns
        device_type = random.choice(['Windows Chrome', 'Mac Safari', 'Windows Firefox', 'Mobile iOS'])
        ip_prefix = f"192.168.{random.randint(1, 255)}"
        
        legitimate_applicants.append({
            'applicant_id': applicant_id,
            'first_name': first_name,
            'last_name': last_name,
            'email': email,
            'phone': f"+1-{random.randint(200, 999)}-{random.randint(200, 999)}-{random.randint(1000, 9999)}",
            'university': random.choice(universities),
            'gpa': round(np.random.normal(3.4, 0.3), 2),
            'country': country,
            'timezone': timezone,
            'application_date': application_date.strftime('%Y-%m-%d'),
            'submission_time': submission_time.strftime('%H:%M:%S'),
            'submission_duration_min': submission_duration,
            'position_applied': random.choice(positions),
            'years_experience': max(0, int(np.random.normal(1.5, 1))),
            'technical_skills_count': random.randint(2, 5),
            'soft_skills_count': random.randint(2, 4),
            'resume_file_size_kb': random.randint(50, 500),
            'device_type': device_type,
            'ip_address': f"{ip_prefix}.{random.randint(1, 255)}",
            'referral_source': random.choice(['University', 'LinkedIn', 'Company Website', 'Job Portal']),
            'email_domain': email.split('@')[1],
            'is_fraudulent': False
        })
    
    # Generate fraudulent applications with suspicious patterns
    fraudulent_applicants = []
    fraud_patterns = ['duplicate_submission', 'rapid_fire', 'inconsistent_data', 'synthetic_identity']
    
    for i in range(int(n_records * fraud_rate)):
        fraud_type = random.choice(fraud_patterns)
        applicant_id = f"APP_FRAUD_{i+1:04d}"
        
        if fraud_type == 'duplicate_submission':
            # Duplicate entries with slight variations
            base_app = random.choice(legitimate_applicants)
            email = base_app['email'].replace('@', f"{random.randint(1,9)}@")
            first_name = base_app['first_name']
            last_name = base_app['last_name'] + random.choice(['', 'son', 'sen', 'ovich'])
            
        elif fraud_type == 'rapid_fire':
            # Rapid submissions from same IP/device
            first_name = ''.join(random.choices('abcdefghijklmnopqrstuvwxyz', k=6)).capitalize()
            last_name = ''.join(random.choices('abcdefghijklmnopqrstuvwxyz', k=8)).capitalize()
            email = f"{random.randint(100000, 999999)}@{random.choice(['temp-mail.org', 'fake.com', 'spam.net'])}"
            
        elif fraud_type == 'inconsistent_data':
            # Inconsistent information
            first_name = random.choice(['X Ã† A-12', '123Test', 'Admin', 'User'])
            last_name = random.choice(['Test', 'Demo', 'Example', 'User'])
            email = f"{first_name.lower()}{random.randint(1, 999)}@invalid.com"
            
        else:  # synthetic_identity
            # Completely synthetic but plausible-looking
            first_name = random.choice(['John', 'Jane', 'Robert', 'Maria'])
            last_name = random.choice(['Smith', 'Johnson', 'Davis', 'Wilson'])
            email = f"{first_name.lower()}.{last_name.lower()}@synthetic-mail.com"
        
        # Fraudulent application behavior patterns
        application_date = datetime(2024, 1, 1) + timedelta(days=random.randint(0, 180))
        
        if fraud_type == 'rapid_fire':
            submission_time = application_date + timedelta(minutes=random.randint(0, 5))
            submission_duration = random.randint(1, 5)  # Very fast submissions
        else:
            submission_time = application_date + timedelta(hours=random.randint(0, 23))
            submission_duration = random.randint(2, 10)  # Suspiciously fast
        
        # Suspicious patterns
        if fraud_type in ['rapid_fire', 'duplicate_submission']:
            ip_address = f"10.0.{random.randint(1, 255)}.1"  # Suspicious IP range
            device_type = "Automated Script"
        else:
            ip_address = f"192.168.{random.randint(1, 255)}.{random.randint(1, 255)}"
            device_type = random.choice(['Windows Chrome', 'Unknown Browser'])
        
        # Inconsistent data patterns
        if fraud_type == 'inconsistent_data':
            gpa = round(random.uniform(1.0, 4.0), 2)
            years_experience = random.randint(0, 10)
            university = random.choice(['Fake University', 'Online Degree Mill', 'No University'])
        else:
            gpa = round(np.random.normal(3.8, 0.1), 2)  # Suspiciously high
            years_experience = random.randint(5, 8)  # Overqualified for intern
            university = random.choice(universities)
        
        fraudulent_applicants.append({
            'applicant_id': applicant_id,
            'first_name': first_name,
            'last_name': last_name,
            'email': email,
            'phone': f"+{random.randint(1, 999)}-{random.randint(100, 999)}-{random.randint(100, 999)}-{random.randint(1000, 9999)}",
            'university': university,
            'gpa': gpa,
            'country': random.choice(['Unknown', 'Multiple', ''] + countries),
            'timezone': random.choice(['Unknown', 'UTC', '']),
            'application_date': application_date.strftime('%Y-%m-%d'),
            'submission_time': submission_time.strftime('%H:%M:%S'),
            'submission_duration_min': submission_duration,
            'position_applied': random.choice(positions),
            'years_experience': years_experience,
            'technical_skills_count': random.randint(6, 10),  # Suspiciously high
            'soft_skills_count': random.randint(5, 8),  # Suspiciously high
            'resume_file_size_kb': random.randint(10, 100),  # Suspiciously small
            'device_type': device_type,
            'ip_address': ip_address,
            'referral_source': random.choice(['Direct', 'Unknown', '']),
            'email_domain': email.split('@')[1],
            'is_fraudulent': True,
            'fraud_type': fraud_type
        })
    
    # Combine datasets
    all_applications = legitimate_applicants + fraudulent_applicants
    random.shuffle(all_applications)
    
    df = pd.DataFrame(all_applications)
    
    # Add derived features for anomaly detection
    df['application_datetime'] = pd.to_datetime(df['application_date'] + ' ' + df['submission_time'])
    df['email_username_length'] = df['email'].apply(lambda x: len(x.split('@')[0]))
    df['has_suspicious_domain'] = df['email_domain'].isin(['temp-mail.org', 'fake.com', 'spam.net', 'synthetic-mail.com']).astype(int)
    df['name_length_ratio'] = (df['first_name'].str.len() + df['last_name'].str.len()) / 20
    df['suspicious_ip'] = df['ip_address'].str.startswith('10.0.').astype(int)
    df['submission_speed'] = df['submission_duration_min'] / df['technical_skills_count']
    
    # Time-based features
    df['application_hour'] = pd.to_datetime(df['submission_time']).dt.hour
    df['is_night_hour'] = ((df['application_hour'] >= 0) & (df['application_hour'] <= 6)).astype(int)
    
    return df

# Generate the dataset
print("Generating fraud detection dataset for internship applications...")
df = generate_fraud_detection_dataset(2000, fraud_rate=0.15)

# Save to CSV
csv_filename = 'internship_fraud_detection.csv'
df.to_csv(csv_filename, index=False)

print(f"Dataset successfully saved as '{csv_filename}'")
print(f"Dataset shape: {df.shape}")

# Display comprehensive fraud analysis
print("\n" + "="*70)
print("FRAUD DETECTION DATASET SUMMARY")
print("="*70)

print(f"\nTotal applications: {len(df):,}")
print(f"Fraudulent applications: {df['is_fraudulent'].sum():,} ({df['is_fraudulent'].mean():.1%})")

print(f"\n=== FRAUD TYPE BREAKDOWN ===")
if 'fraud_type' in df.columns:
    fraud_breakdown = df[df['is_fraudulent']]['fraud_type'].value_counts()
    for fraud_type, count in fraud_breakdown.items():
        percentage = count / df['is_fraudulent'].sum() * 100
        print(f"{fraud_type:<20}: {count:>4} cases ({percentage:.1f}%)")

print(f"\n=== SUSPICIOUS PATTERN ANALYSIS ===")
suspicious_metrics = {
    'Rapid Submissions (<5 min)': (df['submission_duration_min'] < 5).mean(),
    'Suspicious Email Domains': df['has_suspicious_domain'].mean(),
    'Night-time Applications (12AM-6AM)': df['is_night_hour'].mean(),
    'Suspicious IP Addresses': df['suspicious_ip'].mean(),
    'Overqualified Applicants (5+ years exp)': (df['years_experience'] >= 5).mean(),
    'Perfect GPA (3.9-4.0)': (df['gpa'] >= 3.9).mean()
}

for pattern, rate in suspicious_metrics.items():
    print(f"{pattern:<35}: {rate:.1%}")

print(f"\n=== COMPARISON: LEGITIMATE vs FRAUDULENT APPLICATIONS ===")
comparison_metrics = [
    'submission_duration_min', 'gpa', 'years_experience', 
    'technical_skills_count', 'resume_file_size_kb', 'application_hour'
]

legit_stats = df[~df['is_fraudulent']][comparison_metrics].mean()
fraud_stats = df[df['is_fraudulent']][comparison_metrics].mean()

comparison_df = pd.DataFrame({
    'Legitimate': legit_stats,
    'Fraudulent': fraud_stats,
    'Difference': fraud_stats - legit_stats
}).round(2)

print(comparison_df)

print(f"\n=== ANOMALY DETECTION FEATURES ===")
anomaly_features = [
    'submission_duration_min', 'submission_speed', 'gpa', 'years_experience',
    'technical_skills_count', 'resume_file_size_kb', 'email_username_length',
    'name_length_ratio', 'has_suspicious_domain', 'suspicious_ip', 'is_night_hour'
]

print("Features engineered for anomaly detection:")
for feature in anomaly_features:
    print(f"  - {feature}")

print(f"\n=== MACHINE LEARNING READINESS ===")
print("Target variable: is_fraudulent (Binary classification)")
print(f"Number of features: {len(df.columns)}")
print(f"Fraud rate: {df['is_fraudulent'].mean():.1%} (Slightly imbalanced - good for anomaly detection)")

print(f"\nRecommended algorithms:")
print("1. Isolation Forest - For unsupervised anomaly detection")
print("2. Local Outlier Factor (LOF) - For density-based anomaly detection")
print("3. K-Means Clustering - For pattern-based fraud grouping")
print("4. Random Forest - For supervised fraud classification")

print(f"\n=== SAMPLE FRAUDULENT APPLICATIONS ===")
fraud_samples = df[df['is_fraudulent']].head(3)
print(fraud_samples[['applicant_id', 'email', 'submission_duration_min', 'gpa', 'fraud_type']].to_string(index=False))

print(f"\n=== SAMPLE LEGITIMATE APPLICATIONS ===")
legit_samples = df[~df['is_fraudulent']].head(3)
print(legit_samples[['applicant_id', 'email', 'submission_duration_min', 'gpa']].to_string(index=False))

Generating fraud detection dataset for internship applications...
Dataset successfully saved as 'internship_fraud_detection.csv'
Dataset shape: (2000, 31)

FRAUD DETECTION DATASET SUMMARY

Total applications: 2,000
Fraudulent applications: 300 (15.0%)

=== FRAUD TYPE BREAKDOWN ===
synthetic_identity  :   95 cases (31.7%)
rapid_fire          :   75 cases (25.0%)
inconsistent_data   :   68 cases (22.7%)
duplicate_submission:   62 cases (20.7%)

=== SUSPICIOUS PATTERN ANALYSIS ===
Rapid Submissions (<5 min)         : 6.0%
Suspicious Email Domains           : 8.5%
Night-time Applications (12AM-6AM) : 7.0%
Suspicious IP Addresses            : 6.9%
Overqualified Applicants (5+ years exp): 13.5%
Perfect GPA (3.9-4.0)              : 6.8%

=== COMPARISON: LEGITIMATE vs FRAUDULENT APPLICATIONS ===
                         Legitimate  Fraudulent  Difference
submission_duration_min       69.50        5.47      -64.03
gpa                            3.40        3.49        0.09
years_experience     

  df['application_hour'] = pd.to_datetime(df['submission_time']).dt.hour
