In [None]:
import os
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings("ignore")

In [None]:
# Directory holding fraudTrain.csv / fraudTest.csv. Set the FRAUD_DATA_DIR
# environment variable to run the notebook on another machine; the default
# is the original author's location, so existing runs are unaffected.
DATA_DIR = Path(os.environ.get("FRAUD_DATA_DIR", "C:/Users/naure/OneDrive/Desktop/codsoft/fraud"))

train = pd.read_csv(DATA_DIR / "fraudTrain.csv")
test = pd.read_csv(DATA_DIR / "fraudTest.csv")


In [None]:
# Show the column names of both splits so their schemas can be compared by eye.
print("Train columns:", list(train.columns))
print("Test columns:", list(test.columns))

In [5]:
# Total number of null cells (summed over every column) in each split.
train_missing = train.isna().sum().sum()
test_missing = test.isna().sum().sum()
print("Missing values in train:", train_missing)
print("Missing values in test:", test_missing)

Missing values in train: 0
Missing values in test: 0


In [6]:

# Remove leading/trailing whitespace from the header names of both splits.
for frame in (train, test):
    frame.columns = frame.columns.str.strip()

In [7]:

# Drop unnecessary columns
drop_cols = ['Unnamed: 0', 'trans_date_trans_time', 'first', 'last', 'street', 'job', 'dob', 'trans_num']

# Rebind to a new frame instead of mutating in place, and tolerate columns
# that are already gone: with `inplace=True` a second run of this cell
# raised KeyError because the columns had been removed by the first run.
train = train.drop(columns=drop_cols, errors="ignore")
test = test.drop(columns=drop_cols, errors="ignore")

In [8]:
# Split each frame into its feature matrix (every column except the label)
# and its label vector.
target_col = 'is_fraud'
X_train, y_train = train.drop(columns=[target_col]), train[target_col]
X_test, y_test = test.drop(columns=[target_col]), test[target_col]

In [9]:
# Stack the train rows on top of the test rows so that both splits end up
# with the same dummy columns after encoding, then pick out the
# object-dtype (string) columns that need encoding.
combined = pd.concat((X_train, X_test), axis=0)
categorical_cols = combined.select_dtypes(include='object').columns

In [10]:
# Expand every categorical column into indicator columns; the first level of
# each column is dropped (drop_first=True), leaving it as the implicit baseline.
combined_encoded = pd.get_dummies(
    data=combined,
    columns=categorical_cols,
    drop_first=True,
)

In [11]:
# Undo the concatenation by position: the first len(X_train) rows are the
# training split, everything after them is the test split.
n_train_rows = len(X_train)
X_train_encoded = combined_encoded.iloc[:n_train_rows]
X_test_encoded = combined_encoded.iloc[n_train_rows:]

In [12]:
# Standardise every encoded column. The scaler learns its statistics from the
# training split only and then applies them unchanged to the test split.
scaler = StandardScaler().fit(X_train_encoded)
X_train_scaled = scaler.transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

In [13]:
# Candidate classifiers keyed by display name, in the order they are trained,
# plus the running "best so far" bookkeeping used by the comparison.
models = dict([
    ("Logistic Regression", LogisticRegression()),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("Random Forest", RandomForestClassifier(n_estimators=100, random_state=42)),
])
best_model, best_accuracy = None, 0

In [None]:
# Fit every candidate on the scaled training data, report its test-set
# metrics, and remember which one scored the highest accuracy.
#
# The bookkeeping is reset here so that re-running this cell starts from a
# clean slate instead of comparing against a stale best from an earlier run.
# NOTE(review): accuracy is the selection metric because the summary line
# reports it; if the fraud class is rare, recall/F1 from the classification
# report may be a better criterion - confirm against the class balance.
best_model = None
best_accuracy = 0

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    preds = model.predict(X_test_scaled)

    acc = accuracy_score(y_test, preds)
    print(f"\n{name} Accuracy: {acc:.4f}")
    print("Confusion Matrix:\n", confusion_matrix(y_test, preds))
    print("Classification Report:\n", classification_report(y_test, preds))

    # This comparison must run once per model. When it lived in a separate
    # cell after the loop it only saw the last model's `acc`, so the last
    # model was always reported as the best one.
    if acc > best_accuracy:
        best_accuracy = acc
        best_model = name

print(f"\n✅ Best Model: {best_model} with Accuracy: {best_accuracy:.4f}")

In [None]:

def plot_confusion_matrix(y_true, y_pred, model_name):
    """Draw an annotated heatmap of a binary confusion matrix.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; expected to be 0 (legit) or 1 (fraud), matching
        the tick labels used below.
    y_pred : array-like
        Predicted labels, aligned element-wise with ``y_true``.
    model_name : str
        Name shown in the figure title.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Pin the label order so the matrix is always 2x2 and lines up with the
    # two hard-coded tick labels, even if one class is missing from the
    # predictions.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    class_names = ['Legit (0)', 'Fraud (1)']
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Reds',
                xticklabels=class_names,
                yticklabels=class_names,
                ax=ax)

    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')
    ax.set_title(f'Confusion Matrix - {model_name}')
    plt.show()
# Plot the confusion matrix of the model recorded in `best_model`.
# The original call passed `y_pred`, which is never defined anywhere in the
# notebook (the training loop stores its predictions in `preds`), and labelled
# the figure with the loop variable `name` rather than the selected model.
y_pred = models[best_model].predict(X_test_scaled)
plot_confusion_matrix(y_test, y_pred, model_name=best_model)