In [1]:
pip install pandas scikit-learn matplotlib seaborn





In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Load dataset
# Only the training file is loaded here. The previous version also read
# 'fraudTest.csv' into the same variable and immediately overwrote it, which
# parsed a large CSV for nothing.
data = pd.read_csv('fraudTrain.csv')

In [9]:
# Display basic information about the dataset.
# DataFrame.info() writes its report itself and returns None, so it must not be
# wrapped in print() (that would append a stray "None" line to the output).
data.info()

# Display the first few rows of the dataset
print(data.head())

# Check for class imbalance: number of rows per value of the target column
print(data['is_fraud'].value_counts())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 23 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1296675 non-null  int64  
 1   trans_date_trans_time  1296675 non-null  object 
 2   cc_num                 1296675 non-null  int64  
 3   merchant               1296675 non-null  object 
 4   category               1296675 non-null  object 
 5   amt                    1296675 non-null  float64
 6   first                  1296675 non-null  object 
 7   last                   1296675 non-null  object 
 8   gender                 1296675 non-null  object 
 9   street                 1296675 non-null  object 
 10  city                   1296675 non-null  object 
 11  state                  1296675 non-null  object 
 12  zip                    1296675 non-null  int64  
 13  lat                    1296675 non-null  float64
 14  long              

In [15]:
from sklearn.preprocessing import StandardScaler

# Standardize the 'amt' and 'unix_time' columns, overwriting each column of
# `data` in place. A separate StandardScaler is fitted for every column.
# NOTE(review): the scalers are fitted on every row of `data`, i.e. before any
# train/test split, and the later preprocessing cell scales numeric columns
# again - confirm this cell is still needed.
for column in ('amt', 'unix_time'):
    # StandardScaler expects 2-D input, hence the single-column reshape.
    column_2d = data[column].to_numpy().reshape(-1, 1)
    data[column] = StandardScaler().fit_transform(column_2d)


In [17]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_selector as selector

# Check for missing values
print(data.isnull().sum())

# Define the feature matrix and target variable
X = data.drop(columns='is_fraud')
y = data['is_fraud']

# Split the RAW rows first. Fitting the imputers, the scaler and the one-hot
# encoder on the full dataset (as was done before) lets statistics of the test
# rows leak into training. stratify=y preserves the class proportions of the
# target in both partitions, which matters for a rare positive class.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)

# Preprocessing: handle categorical and numerical data.
# The selectors pick columns by dtype at fit time.
# NOTE(review): every object column is one-hot encoded, including columns that
# look like per-row identifiers or free text - confirm this is intended, since
# it can create a very wide matrix.
numeric_features = selector(dtype_include=['int64', 'float64'])
categorical_features = selector(dtype_include=object)

# Numeric columns: fill gaps with the median, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical columns: fill gaps with the mode, then one-hot encode.
# handle_unknown='ignore' makes categories unseen during fit encode as all
# zeros instead of raising when the test rows are transformed.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine both branches into one column-wise transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Learn the preprocessing from the training rows only, then apply the fitted
# transformer unchanged to the held-out rows.
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)



Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_selector as selector
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
import joblib

# Local import: f_classif replaces chi2 as the feature-selection score below.
from sklearn.feature_selection import f_classif

# Load datasets
train_data = pd.read_csv('fraudTrain.csv')
test_data = pd.read_csv('fraudTest.csv')

# Combine both files into a single frame; it is re-split further down.
data = pd.concat([train_data, test_data], ignore_index=True)

# Check for missing values
print(data.isnull().sum())

# Define the feature matrix and target variable
X = data.drop(columns='is_fraud')
y = data['is_fraud']

# Split the RAW rows before anything is fitted. Previously the preprocessor and
# the feature selector were fitted on all rows (and all labels), so information
# from the test partition leaked into training and into every CV fold.
# stratify=y preserves the class proportions of the target in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)

# Feature selection: keep only the top k features
k = 20  # You can adjust k to keep more or fewer features


def build_selected_pipeline(model):
    """Return an unfitted preprocessing -> feature selection -> model pipeline.

    A fresh set of transformers is created on every call so that pipelines for
    different models never share fitted state.

    Parameters
    ----------
    model : estimator
        The classifier placed as the final pipeline step.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Steps named 'preprocessor', 'selector' and 'model'.
    """
    # Numeric columns: median imputation followed by standardization.
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])

    # Categorical columns: mode imputation followed by one-hot encoding;
    # categories unseen during fit are encoded as all zeros.
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    # Columns are routed to a branch by dtype at fit time.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer,
             selector(dtype_include=['int64', 'float64'])),
            ('cat', categorical_transformer,
             selector(dtype_include=object))])

    # chi2 only accepts non-negative features and raises ValueError on the
    # zero-centred output of StandardScaler, so the ANOVA F-score is used.
    feature_selector = SelectKBest(score_func=f_classif, k=k)

    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('selector', feature_selector),
        ('model', model)])


# Define the models with optimized parameters
decision_tree = DecisionTreeClassifier(random_state=42)
random_forest = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)  # Parallel processing
logistic_regression = LogisticRegression(max_iter=1000, random_state=42)

# Use cross-validation to evaluate models and choose the best one.
# Each model is wrapped in its own pipeline so that preprocessing and feature
# selection are re-fitted inside every CV fold on that fold's training rows.
models = {
    "Decision Tree": build_selected_pipeline(decision_tree),
    "Random Forest": build_selected_pipeline(random_forest),
    "Logistic Regression": build_selected_pipeline(logistic_regression)
}

best_model = None
best_auc = 0

for name, model in models.items():
    # Use cross-validation for a more robust evaluation
    auc_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')
    mean_auc = auc_scores.mean()

    print(f"{name} AUC (cross-validated): {mean_auc}\n")

    if mean_auc > best_auc:
        best_auc = mean_auc
        best_model = model

# Fail with a clear message instead of an AttributeError on None.fit(...)
if best_model is None:
    raise RuntimeError("No model achieved a cross-validated AUC above 0")

# Train the best pipeline on the entire training set
best_model.fit(X_train, y_train)

# Evaluate the model on the test set.
# The confusion matrix and report need hard labels; ROC AUC is a ranking metric
# and must be computed from the positive-class probability, matching what the
# 'roc_auc' scorer used during cross-validation.
y_pred = best_model.predict(X_test)
y_score = best_model.predict_proba(X_test)[:, 1]
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")
print(f"Classification Report:\n{classification_report(y_test, y_pred)}")
print(f"AUC: {roc_auc_score(y_test, y_score)}\n")

# Save the best model. The whole pipeline is persisted so the artifact can
# score raw transaction rows without re-creating the preprocessing by hand.
joblib.dump(best_model, 'fraud_detection_model.pkl')

print(f"Best model saved with AUC: {best_auc}")


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_selector as selector
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
import joblib

# Load datasets
train_data = pd.read_csv('fraudTrain.csv')
test_data = pd.read_csv('fraudTest.csv')

# Combine both files into a single frame; it is re-split further down.
data = pd.concat([train_data, test_data], ignore_index=True)

# Check for missing values
print(data.isnull().sum())

# Define the feature matrix and target variable
X = data.drop(columns='is_fraud')
y = data['is_fraud']

# Split the RAW rows before anything is fitted. Previously the preprocessor was
# fitted on all rows, so statistics of the test partition leaked into training.
# stratify=y preserves the class proportions of the target in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)


def build_pipeline(model):
    """Return an unfitted preprocessing -> model pipeline.

    A fresh set of transformers is created on every call so that pipelines for
    different models never share fitted state.

    Parameters
    ----------
    model : estimator
        The classifier placed as the final pipeline step.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Steps named 'preprocessor' and 'model'.
    """
    # Numeric columns: median imputation followed by standardization.
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])

    # Categorical columns: mode imputation followed by one-hot encoding;
    # categories unseen during fit are encoded as all zeros.
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    # Columns are routed to a branch by dtype at fit time.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer,
             selector(dtype_include=['int64', 'float64'])),
            ('cat', categorical_transformer,
             selector(dtype_include=object))])

    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)])


# Train the Decision Tree model
decision_tree = build_pipeline(DecisionTreeClassifier(random_state=42))
decision_tree.fit(X_train, y_train)

# Train the Random Forest model
random_forest = build_pipeline(RandomForestClassifier(random_state=42))
random_forest.fit(X_train, y_train)

# Train the Logistic Regression model
logistic_regression = build_pipeline(
    LogisticRegression(max_iter=1000, random_state=42))
logistic_regression.fit(X_train, y_train)

# Evaluate the models
models = {
    "Decision Tree": decision_tree,
    "Random Forest": random_forest,
    "Logistic Regression": logistic_regression
}

best_model = None
best_auc = 0

# NOTE(review): the winner is chosen by its score on the test partition, so the
# reported best AUC is optimistically biased - consider selecting on a separate
# validation split or with cross-validation.
for name, model in models.items():
    # Hard labels feed the confusion matrix and the report; ROC AUC is a
    # ranking metric and is computed from the positive-class probability.
    y_pred = model.predict(X_test)
    y_score = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_score)

    print(f"{name}:")
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print(f"AUC: {auc}\n")

    if auc > best_auc:
        best_auc = auc
        best_model = model

# Save the best model. The whole pipeline is persisted so the artifact can
# score raw transaction rows without re-creating the preprocessing by hand.
joblib.dump(best_model, 'fraud_detection_model.pkl')

print(f"Best model saved with AUC: {best_auc}")


Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64
