In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
# Read the train and test splits from disk.
train_df, test_df = (pd.read_csv(path) for path in ("fraudTrain.csv", "fraudTest.csv"))

# Remove leading/trailing whitespace from the column headers of both frames.
for frame in (train_df, test_df):
    frame.columns = frame.columns.str.strip()

# Show which columns were loaded.
print(train_df.columns.tolist())



In [None]:
# Columns removed from both frames before modelling.
columns_to_drop = [
    'Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'first', 'last',
    'street', 'city', 'state', 'zip', 'job', 'dob', 'trans_num',
]

# errors='ignore' skips any listed column that a frame does not contain.
train_df = train_df.drop(columns=columns_to_drop, errors='ignore')
test_df = test_df.drop(columns=columns_to_drop, errors='ignore')

# Show what is left.
print(train_df.columns.tolist())


In [None]:
# Convert categorical columns to numeric using One-Hot Encoding.
#
# Encoding each frame on its own lets the two dummy layouts drift apart: a level
# that is absent from one split removes its column there, and drop_first may then
# drop a different baseline level in each frame. Casting both frames to a
# categorical dtype whose levels are taken from the training data makes
# get_dummies emit the same columns, in the same order, for train and test.
categorical_cols = ['category', 'gender']
for col in categorical_cols:
    level_dtype = train_df[col].astype('category').dtype
    train_df[col] = train_df[col].astype(level_dtype)
    # Levels seen only in the test data become NaN and encode as all-zero dummies.
    test_df[col] = test_df[col].astype(level_dtype)

train_df = pd.get_dummies(train_df, columns=categorical_cols, drop_first=True)
test_df  = pd.get_dummies(test_df, columns=categorical_cols, drop_first=True)

# Check new columns
print(train_df.columns.tolist())


In [None]:


# Numeric columns that get standardized.
num_cols = ['amt', 'lat', 'long', 'city_pop', 'unix_time', 'merch_lat', 'merch_long']

# Fit the scaler on the training data only, then apply it to both frames.
scaler = StandardScaler().fit(train_df[num_cols])
for frame in (train_df, test_df):
    frame[num_cols] = scaler.transform(frame[num_cols])

# Preview the first 5 rows.
print(train_df.head())


In [None]:
# Separate predictors from the label; 'merchant' is left out of the predictors for now.
non_feature_cols = ['is_fraud', 'merchant']

X_train, y_train = train_df.drop(columns=non_feature_cols), train_df['is_fraud']
X_test, y_test = test_df.drop(columns=non_feature_cols), test_df['is_fraud']

# Report the dimensions of each split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, target.shape)


In [None]:

# Build and train the classifier in one step; class_weight='balanced' handles class imbalance.
model = LogisticRegression(class_weight='balanced', max_iter=1000).fit(X_train, y_train)

# Predict labels for the test features.
y_pred = model.predict(X_test)

# Print the confusion matrix first, then the per-class report.
for report in (confusion_matrix, classification_report):
    print(report(y_test, y_pred))


In [None]:
# train_df's 'long'/'lat' were standardized in place by `scaler`, so plotting them
# directly would show z-scores under axes labelled as degrees. Undo the scaling
# to recover the original coordinates for the plot.
coords = pd.DataFrame(
    scaler.inverse_transform(train_df[num_cols]),
    columns=num_cols,
    index=train_df.index,
)

fraud_mask = train_df['is_fraud'] == 1
fraud_df = train_df[fraud_mask]
# Restrict the "Legit" layer to non-fraud rows; previously it drew every row,
# so fraud points were also plotted (and counted) as legit.
legit_coords = coords[train_df['is_fraud'] == 0]
fraud_coords = coords[fraud_mask]

plt.figure(figsize=(10,6))
# Plot legit transactions lightly
plt.scatter(legit_coords['long'], legit_coords['lat'], c='blue', s=1, alpha=0.1, label='Legit')
# Plot fraud transactions in red
plt.scatter(fraud_coords['long'], fraud_coords['lat'], c='red', s=10, alpha=0.6, label='Fraud')

plt.title("Fraud vs Legit Transactions by Location")
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.legend()
plt.show()
