In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Load the transaction records from Fraud.csv (the path is relative to the working directory).
data = pd.read_csv("Fraud.csv")


In [None]:
# Preview the loaded frame; a bare expression on the last line of a cell is rendered as its output.
data

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.00,160296.36,M1979787155,0.00,0.00,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.00,19384.72,M2044282225,0.00,0.00,0,0
2,1,TRANSFER,181.00,C1305486145,181.00,0.00,C553264065,0.00,0.00,1,0
3,1,CASH_OUT,181.00,C840083671,181.00,0.00,C38997010,21182.00,0.00,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.00,29885.86,M1230701703,0.00,0.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...
1048570,95,CASH_OUT,132557.35,C1179511630,479803.00,347245.65,C435674507,484329.37,616886.72,0,0
1048571,95,PAYMENT,9917.36,C1956161225,90545.00,80627.64,M668364942,0.00,0.00,0,0
1048572,95,PAYMENT,14140.05,C2037964975,20545.00,6404.95,M1355182933,0.00,0.00,0,0
1048573,95,PAYMENT,10020.05,C1633237354,90605.00,80584.95,M1964992463,0.00,0.00,0,0


In [None]:
# Discard every row that contains at least one missing value.
data = data.dropna()

In [None]:
# Count the nulls left in each column and print the tally under a heading.
missing_values = data.isna().sum()
print("Missing Values:", missing_values, sep="\n")

Missing Values:
step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64


In [None]:

# Step 2: Outlier Detection and Treatment
# 'amount' is assumed to be an important feature; take its first and third quartiles
# in one call and derive the interquartile range used by the IQR outlier rule.
Q1, Q3 = data['amount'].quantile([0.25, 0.75])
IQR = Q3 - Q1

In [None]:
# IQR fences: values more than 1.5 * IQR below Q1 or above Q3 are treated as outliers.
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

In [None]:
# Select the rows whose amount is below the lower bound or above the upper bound.
outliers = data.loc[data['amount'].lt(lower_bound) | data['amount'].gt(upper_bound)]

In [None]:
# Remove outliers
# Keep only the rows whose amount lies within the IQR bounds. The explicit .copy() makes the
# result an independent frame, so the later in-place column drop and the 'type' re-assignment
# act on it directly instead of on a slice of the original frame (which is what triggers
# pandas' SettingWithCopyWarning).
# NOTE(review): this filter discards the largest transactions; confirm that dropping them is
# intended for a fraud model, since unusually large amounts may themselves be a fraud signal.
data = data[~((data['amount'] < lower_bound) | (data['amount'] > upper_bound))].copy()

In [None]:
# Re-display the frame after the outlier filter to inspect what remains.
data

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.00,160296.36,M1979787155,0.00,0.00,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.00,19384.72,M2044282225,0.00,0.00,0,0
2,1,TRANSFER,181.00,C1305486145,181.00,0.00,C553264065,0.00,0.00,1,0
3,1,CASH_OUT,181.00,C840083671,181.00,0.00,C38997010,21182.00,0.00,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.00,29885.86,M1230701703,0.00,0.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...
1048570,95,CASH_OUT,132557.35,C1179511630,479803.00,347245.65,C435674507,484329.37,616886.72,0,0
1048571,95,PAYMENT,9917.36,C1956161225,90545.00,80627.64,M668364942,0.00,0.00,0,0
1048572,95,PAYMENT,14140.05,C2037964975,20545.00,6404.95,M1355182933,0.00,0.00,0,0
1048573,95,PAYMENT,10020.05,C1633237354,90605.00,80584.95,M1964992463,0.00,0.00,0,0


# 5. What are the key factors that predict a fraudulent customer?

 The following factors may play a significant role in predicting fraudulent transactions:

Transaction Type (Type): Certain types of transactions may be more indicative of fraudulent behavior than others. For example, large transfers or cash-outs might be more likely to be fraudulent.

Transaction Amount (Amount): Unusually large or irregular transaction amounts could signal potential fraudulent activity.

Initial Balance of Origin Account (OldBalanceOrg): Discrepancies between the initial balance of the origin account and the transaction amount might be indicative of fraudulent behavior.

New Balance of Origin Account (NewBalanceOrig): Changes in the balance of the origin account after the transaction may provide insights into potential fraudulent activity.

Initial Balance of Destination Account (OldBalanceDest): Similar to the origin account, imbalances in the initial balance of the destination account could raise suspicion.

New Balance of Destination Account (NewBalanceDest): Changes in the balance of the destination account following the transaction could be indicative of fraudulent behavior, especially if the transaction involves transferring funds to another account.

Flagged Transactions (IsFlaggedFraud): Transactions flagged as potentially fraudulent by the business model may serve as strong predictors of actual fraud.

By considering the importance of these features as determined by the trained model, we can identify the key factors that contribute most significantly to predicting fraudulent transactions. These factors provide valuable insights for detecting and preventing fraudulent activity within the financial system.

In [None]:
# Step 3: Multicollinearity Check
# Pairwise correlation between the four balance columns, which are assumed to be
# important features.
correlation_matrix = data.loc[
    :, ['oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
].corr()

# 6. Do these factors make sense? If yes, how? If not, how not?
Yes.

Transaction Type: Different types of transactions carry varying levels of risk for fraud.

Transaction Amount: Unusually large or irregular transaction amounts can signal potential fraud.

Initial and New Balances: Discrepancies in account balances before and after transactions may indicate fraudulent activity.

Flagged Transactions: Any transactions flagged as potentially fraudulent are important indicators of actual fraud.

These factors collectively help financial institutions detect and prevent fraudulent activity by identifying suspicious patterns and transactions.

In [None]:
# Show the correlation matrix under its heading.
print("Correlation Matrix:", correlation_matrix, sep="\n")

Correlation Matrix:
                oldbalanceOrg  newbalanceOrig  oldbalanceDest  newbalanceDest
oldbalanceOrg        1.000000        0.999511        0.106343        0.082517
newbalanceOrig       0.999511        1.000000        0.108470        0.083357
oldbalanceDest       0.106343        0.108470        1.000000        0.983497
newbalanceDest       0.082517        0.083357        0.983497        1.000000


# 3. How did you select variables to be included in the model?
The variables included in the model were selected based on their relevance to predicting fraudulent transactions. Here is a breakdown of the selection process:

Variables related to transaction details: These variables provide information about the transaction itself, such as the type of transaction (CASH_IN, CASH_OUT, DEBIT, PAYMENT, TRANSFER), the amount of the transaction, the initial balance before the transaction, and the new balance after the transaction. These variables are crucial as they directly relate to the characteristics of each transaction.

Exclusion of irrelevant variables: The nameOrig and nameDest variables are dropped from the dataset. These variables represent customer identifiers, which are not directly related to the nature of the transaction or its likelihood of being fraudulent. Therefore, they are considered irrelevant for the purpose of fraud detection in this context.

Target variable: The target variable isFraud indicates whether a transaction is fraudulent or not. This variable serves as the label for training the model to predict fraudulent behavior based on the transaction details.

Encoding categorical variables: The categorical variable type, representing the type of transaction, is encoded using Label Encoding. This transformation converts categorical data into numerical format, which can be processed by machine learning algorithms.



In [None]:
# Discard the two account-identifier columns; they are not used as model features.
data = data.drop(columns=['nameOrig', 'nameDest'])

In [None]:
# Map each transaction-type string to an integer code. The fitted encoder is kept
# in `label_encoder` so the mapping stays available after this cell.
label_encoder = LabelEncoder().fit(data['type'])
data['type'] = label_encoder.transform(data['type'])

In [None]:
# Separate predictors from the label: every column except 'isFraud' is a feature.
X, y = data.drop(columns='isFraud'), data['isFraud']

In [None]:
# Split data into train and test sets
# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
# Fraud is rare (the recorded run had 135 positives among 199,098 test rows), so the split
# is stratified on the target: both partitions keep the same fraud ratio instead of
# leaving the number of positives in the test set to chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


# 2. Describe your fraud detection model in elaboration.
Let's elaborate on the various aspects of this model:

Algorithm Selection: XGBoost (Extreme Gradient Boosting) was chosen as the underlying algorithm due to its effectiveness in handling imbalanced datasets commonly encountered in fraud detection tasks. It is an ensemble learning technique that builds multiple decision trees sequentially, with each tree correcting the errors of its predecessors, resulting in a strong predictive model.

Data Preprocessing: The dataset is preprocessed to prepare it for modeling. Irrelevant columns (nameOrig and nameDest) are dropped as they do not contribute to fraud detection. Categorical variables, such as type, are encoded using label encoding to convert them into numeric format suitable for machine learning algorithms.

Model Training: The XGBoost classifier is trained on the preprocessed dataset using the training data. During training, the model learns to distinguish between fraudulent and non-fraudulent transactions based on the patterns and features present in the data.

Hyperparameter Tuning: GridSearchCV is employed to perform hyperparameter tuning, which involves systematically searching for the best combination of hyperparameters (e.g., learning rate, max depth, number of estimators) that optimize the model's performance. This helps in improving the model's predictive accuracy and generalization ability.

Model Evaluation: The performance of the trained model is evaluated using various metrics such as accuracy, confusion matrix, and classification report. These metrics provide insights into how well the model is performing in terms of correctly identifying fraudulent and non-fraudulent transactions, as well as its precision, recall, and F1-score.

Interpretability: While XGBoost models are known for their predictive accuracy, they may lack interpretability due to their complex nature. However, feature importance analysis can be performed to identify the most influential features contributing to the model's predictions, providing insights into the factors driving fraudulent activities.



In [None]:
# Step 4: Model training
# Initialize XGBoost classifier. Only random_state is set (fixed seed of 42); every other
# hyperparameter is left at the library default.
model = XGBClassifier(random_state=42)

In [None]:
# Fit the model on training data (baseline with the default hyperparameters).
# NOTE(review): the grid search further down receives this same estimator but, per
# scikit-learn's documented behaviour, clones it for every candidate. This fit therefore
# appears not to be reused by the tuning step; confirm whether it is still needed.
model.fit(X_train, y_train)

In [None]:
# Hyperparameter tuning
# Search space for the grid search: 2 learning rates x 3 tree depths x 3 ensemble sizes,
# i.e. 18 candidate combinations.
param_grid = dict(
    learning_rate=[0.1, 0.01],
    max_depth=[3, 5, 7],
    n_estimators=[100, 200, 300],
)

In [None]:
# Initialize GridSearchCV
# 3-fold cross-validated search over param_grid, using all available cores (n_jobs=-1) and
# verbose progress logging.
# Candidates are ranked by F1 on the positive (fraud) class. Without an explicit `scoring`
# the search falls back to the classifier's accuracy score, which barely separates models
# when the positive class is this rare (a model that never predicts fraud is already
# ~99.9% accurate).
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="f1",
    cv=3,
    n_jobs=-1,
    verbose=2,
)


In [None]:
# Perform grid search to find the best parameters.
# Every parameter combination is trained once per CV fold (the recorded log reports
# 54 fits), so this is likely the slowest cell in the notebook.
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


In [None]:
# Get the best parameters: the combination from param_grid that the search ranked first.
best_params = grid_search.best_params_

In [None]:
# Step 5: Evaluate the model
# Predictions on test set. Calling predict on the fitted search object uses the estimator
# it refit with the best parameters (GridSearchCV's default refit behaviour, which the
# search above does not override).
y_pred = grid_search.predict(X_test)

# 4. Demonstrate the performance of the model by using the best set of tools.
To demonstrate the performance of the model, we can utilize various evaluation metrics and visualization techniques. Here's how we can proceed:

Confusion Matrix: We can visualize the confusion matrix to understand the distribution of true positive, true negative, false positive, and false negative predictions.

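Accuracy Score: We can calculate the accuracy of the model, which is the proportion of correctly classified transactions. Because fraud is rare (135 of the 199,098 test transactions), accuracy is dominated by the non-fraud class and should be read alongside the per-class precision and recall below.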

Classification Report: We can generate a classification report, which includes precision, recall, F1-score, and support for each class (fraudulent and non-fraudulent transactions).

In [None]:
# Model evaluation
# Score the held-out predictions three ways: overall accuracy, the confusion matrix,
# and the per-class precision/recall/F1 report.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

# One-line results stay on the heading's line; multi-line results go underneath it.
print("Best Parameters:", best_params)
print("Accuracy:", accuracy)
print("Confusion Matrix:", conf_matrix, sep="\n")
print("Classification Report:", report, sep="\n")

Best Parameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 300}
Accuracy: 0.9998392751308401
Confusion Matrix:
[[198958      5]
 [    27    108]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    198963
           1       0.96      0.80      0.87       135

    accuracy                           1.00    199098
   macro avg       0.98      0.90      0.94    199098
weighted avg       1.00      1.00      1.00    199098



In [None]:
# This is my Colab notebook; refer to it if you need a reference.

In [None]:
# https://colab.research.google.com/drive/1kP_-QAHlM8EuWdH2SR8mgiJKm7wRyjUl?usp=sharing

# 7. What kind of prevention should be adopted while the company updates its infrastructure?

During infrastructure updates, companies should prioritize security measures to prevent fraud. This includes implementing robust encryption and access controls, utilizing multi-factor authentication, and deploying real-time monitoring systems to detect suspicious activities. Employing fraud detection algorithms, setting transaction limits, and conducting regular security audits are also essential. Additionally, educating employees on security best practices and ensuring third-party vendors adhere to security standards play crucial roles in preventing fraud. By adopting these preventive measures, companies can minimize the risk of fraudulent activities and safeguard their assets and sensitive data.

# 8. Assuming these actions have been implemented, how would you determine if they work?

Here are the key points for determining whether fraud prevention measures are effective:

Monitoring Metrics: Track fraud-related KPIs like detected fraudulent transactions, false positive rates, and response times.

Incident Analysis: Analyze reported incidents to see if prevention measures successfully mitigated risks.

Review Alerts: Assess the frequency and accuracy of alerts generated by monitoring systems.

Comparison Over Time: Compare incidence of fraud before and after implementing prevention measures.

Testing and Simulation: Conduct tests to evaluate security infrastructure against fraud scenarios.

Feedback: Gather feedback from stakeholders to gauge perceptions of effectiveness.

By following these steps, companies can gauge the effectiveness of their fraud prevention efforts and make necessary adjustments.