In [None]:
# Show the kernel's current working directory.
# NOTE(review): bare `pwd` relies on IPython automagic being enabled — confirm.
pwd

In [None]:
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from  sklearn.ensemble import RandomForestClassifier

In [None]:
# Locations of the two raw CSV files; the first row of each file is the header.
TRAIN_CSV_PATH = '/content/fraudTrain.csv'
TEST_CSV_PATH = '/content/fraudTest.csv'

df_train = pd.read_csv(TRAIN_CSV_PATH, header=0)
df_test = pd.read_csv(TEST_CSV_PATH, header=0)

In [None]:
# Preview the first five rows of the training frame.
df_train.head(n=5)

In [None]:
# Preview the first five rows of the test frame.
df_test.head(n=5)

In [None]:
# Preview the last five rows of the training frame.
df_train.tail(n=5)

In [None]:
# (rows, columns) of the training frame.
df_train.shape

In [None]:
# (rows, columns) of the test frame.
df_test.shape

In [None]:
# Total number of cells (rows x columns) in the training frame.
df_train.size

In [None]:
# Total number of cells (rows x columns) in the test frame.
df_test.size

In [None]:
# Print the structural summary of the training frame (columns, dtypes, memory usage).
df_train.info()

In [None]:
# Print the structural summary of the test frame (columns, dtypes, memory usage).
df_test.info()

In [None]:
# Descriptive statistics of the training frame, using pandas' default column selection.
df_train.describe()

In [None]:
# Descriptive statistics of the test frame, using pandas' default column selection.
df_test.describe()

In [None]:
# True if at least one cell of the training frame is missing.
df_train.isna().to_numpy().any()

In [None]:
# True if at least one cell of the test frame is missing.
df_test.isna().to_numpy().any()

In [None]:
# Number of non-missing values in each column of the training frame.
df_train.notna().sum()

In [None]:
# Number of non-missing values in each column of the test frame.
df_test.notna().sum()

In [None]:
# Stack the two frames row-wise into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it both
# inputs contribute their own row labels, so every label would appear twice
# and any later label-based lookup or alignment would be ambiguous.
df_combined = pd.concat([df_train, df_test], axis=0, ignore_index=True)

In [None]:
# Preview the first five rows of the combined frame.
df_combined.head(n=5)

In [None]:
# (rows, columns) of the combined frame.
df_combined.shape

In [None]:
# Total number of cells (rows x columns) in the combined frame.
df_combined.size

In [None]:
# Print the structural summary of the combined frame (columns, dtypes, memory usage).
df_combined.info()

In [None]:
# Columns that are not carried forward as model features.
DROPPED_COLUMNS = [
    "first", "last", "job", "dob", "trans_num",
    "street", "trans_date_trans_time", "city", "state",
]

# Rebind to the reduced frame instead of mutating in place.
df_combined = df_combined.drop(columns=DROPPED_COLUMNS)

In [None]:
# Preview the combined frame after the column drop.
df_combined.head(n=5)

In [None]:
# Plot how many rows fall under each gender label.
# The one-hot 'gender_M' column is only produced later by pd.get_dummies, so at
# this point the data still holds the raw 'gender' column; checking for
# 'gender_M' here would always take the fallback branch and never plot.
if 'gender' in df_combined.columns:
    sns.countplot(x='gender', data=df_combined)
    plt.title("Gender Distribution")
    plt.show()
else:
    # Column missing: list what is available to help diagnose.
    print("Column 'gender' does not exist. Available columns are:")
    print(df_combined.columns)


In [None]:
# Pairwise correlations between the numeric columns of the combined frame.
numeric_df = df_combined.select_dtypes(include='number')
correlation_matrix = numeric_df.corr()

# Draw the matrix as a heatmap on an explicitly created axes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=False, fmt=".2f", ax=ax)
ax.set_title("Correlation Matrix")
plt.show()


In [None]:
# Replace the string 'merchant' column with integer codes in 'merchant_new'.
# The encoded column is kept and the raw string column is removed; creating
# 'merchant_new' and then dropping it again would leave the frame unchanged and
# let the raw strings be one-hot expanded by pd.get_dummies later on.
if "merchant" in df_combined.columns:
    encoder = LabelEncoder()
    merchant_codes = encoder.fit_transform(df_combined["merchant"].values)
    df_combined = df_combined.drop(columns=["merchant"]).assign(merchant_new=merchant_codes)
    print("Column 'merchant_new' created successfully.")
    print("Column 'merchant' dropped successfully.")
elif "merchant_new" in df_combined.columns:
    # Cell was already run once: nothing left to encode.
    print("Column 'merchant' is already encoded as 'merchant_new'.")
else:
    print("Column 'merchant' not found. Please check the DataFrame.")


In [None]:
# Replace the string 'category' column with integer codes in 'category_new'.
# The encoded column is kept and the raw string column is removed; creating
# 'category_new' and then dropping it again would leave the frame unchanged.
if "category" in df_combined.columns:
    encoder = LabelEncoder()
    category_codes = encoder.fit_transform(df_combined["category"].values)
    df_combined = df_combined.drop(columns=["category"]).assign(category_new=category_codes)
    print("Column 'category_new' created successfully.")
    print("Column 'category' dropped successfully.")
elif "category_new" in df_combined.columns:
    # Cell was already run once: nothing left to encode.
    print("Column 'category' is already encoded as 'category_new'.")
else:
    print("Column 'category' not found in the DataFrame. Please check the data.")


In [None]:
# One-hot encode the remaining non-numeric columns, then discard the
# 'gender_M' indicator.
df_combined = pd.get_dummies(df_combined).drop(columns=['gender_M'])

In [None]:
# Preview the encoded frame.
df_combined.head(n=5)

In [None]:
# Target is the 'is_fraud' column; every other column is a feature.
TARGET_COLUMN = "is_fraud"
y = df_combined[TARGET_COLUMN]
X = df_combined.drop(columns=[TARGET_COLUMN])

In [None]:

# Hold out 20% of the rows for evaluation; the seed fixes the shuffle.
TEST_FRACTION = 0.2
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [None]:
# Remove incomplete training rows from features and labels together.
# Calling dropna() on X_train and y_train separately can remove a different
# set of rows from each, leaving them with mismatched lengths or mis-paired
# samples. The mask is positional (NumPy), so it does not depend on the row
# labels being unique. X_test / y_test are left untouched, as before.
train_complete = X_train.notna().all(axis=1).to_numpy() & y_train.notna().to_numpy()
X_train = X_train.loc[train_complete]
y_train = y_train.loc[train_complete]

In [None]:
# Logistic regression fitted on standardised features.
# The feature matrix mixes 0/1 dummy columns with raw numeric columns, and the
# solver is iterative: without scaling (and with the default iteration cap) it
# is prone to stopping before convergence. The pipeline exposes the same
# fit()/predict() interface as the bare estimator.
lr_model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))

In [None]:
# Train the logistic-regression model on the training partition.
lr_model.fit(X=X_train, y=y_train)

In [None]:
# Predicted labels for the held-out rows.
lr_predictions = lr_model.predict(X=X_test)

In [None]:
# Report confusion matrix, per-class metrics and accuracy for logistic regression.
print("Logistic Regression Model: ")
for metric in (confusion_matrix, classification_report):
    print(metric(y_test, lr_predictions))
lr_accuracy = accuracy_score(y_test, lr_predictions)
print("Accuracy: ", lr_accuracy)

In [None]:
# Decision tree with a fixed seed so repeated runs produce the same tree
# (the train/test split and the random forest are seeded with 42 as well).
dt_model = DecisionTreeClassifier(random_state=42)

In [None]:
# Train the decision tree on the training partition.
dt_model.fit(X=X_train, y=y_train)

In [None]:
# Predicted labels for the held-out rows.
dt_predictions = dt_model.predict(X=X_test)

In [None]:
# Report confusion matrix, per-class metrics and accuracy for the decision tree.
print("Decision Tree Model: ")
for metric in (confusion_matrix, classification_report):
    print(metric(y_test, dt_predictions))
dt_accuracy = accuracy_score(y_test, dt_predictions)
print("Accuracy: ", dt_accuracy)

In [None]:
# Random forest: 100 trees, built on all available cores, seeded for repeatability.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
)

In [None]:
# Train the random forest on the training partition.
rf_model.fit(X=X_train, y=y_train)

In [None]:
# Predicted labels for the held-out rows.
rf_predictions = rf_model.predict(X=X_test)

In [None]:
# Report confusion matrix, per-class metrics and accuracy for the random forest.
print("Random Forest Model: ")
for metric in (confusion_matrix, classification_report):
    print(metric(y_test, rf_predictions))
rf_accuracy = accuracy_score(y_test, rf_predictions)
print("Accuracy: ", rf_accuracy)