In [32]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, fbeta_score, log_loss, matthews_corrcoef
from sklearn.metrics import average_precision_score, precision_recall_curve, auc
import pickle
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [2]:
final_df=pd.read_csv('Final_DF.csv')

In [3]:
# Set up display options
pd.set_option('display.max_colwidth', None)  # or use a large number if 'None' does not work
specific_trx_id = '018db93d-4402-766e-93b4-ec51c0f67cbb'
filtered_df = final_df[final_df['trx_id'] == specific_trx_id]

# Display the filtered DataFrame
filtered_df

Unnamed: 0,date,time,trx_id,trx_type,trx_status,category,business_type_class,wallet_number_from,wallet_number_to,amount,...,Days Since First Transaction,Transaction Frequency,Adjusted Account Age,flagged,ip_number,country,city,region,lat,long
3,2019-10-05,08:08:38,018db93d-4402-766e-93b4-ec51c0f67cbb,client_transaction_transfer,processed,Computers,ClientTransactionProcess,3.0,731009,9.81,...,1665.0,416.25,0.017779,False,1979451563,India,Delhi,Delhi,28.666775,77.216681


In [4]:
# Remove the specified columns from the DataFrame
final_df = final_df.drop(columns=['Wallet_to_change_frequency', 'Transaction Count'])
final_df = final_df.drop(columns=['Transaction Frequency'])
final_df = final_df.drop(columns=['amount_bin','Days Since First Transaction'])
final_df = final_df.drop(columns=['currency','ip','First Transaction Date'])
final_df = final_df.drop(columns=['date','time'])

In [5]:
final_df['datetime'] = pd.to_datetime(final_df['datetime'])

In [6]:
# Assuming 'final_df' is your DataFrame and it contains a column 'datetime' of datatype datetime
final_df['date'] = final_df['datetime'].dt.day
final_df['month'] = final_df['datetime'].dt.month
final_df['year'] = final_df['datetime'].dt.year
final_df['hour'] = final_df['datetime'].dt.hour
final_df['minute'] = final_df['datetime'].dt.minute
final_df['second'] = final_df['datetime'].dt.second

In [7]:
columns_to_encode = ['trx_type', 'trx_status','category', 'business_type_class','description', 'browser_environment','flagged','country', 'city','region'] 

In [8]:
# Initialize LabelEncoder
label_encoder = LabelEncoder()
encoders = {}
# Applying label encoding to each specified categorical column
for column in columns_to_encode:
    le = LabelEncoder()
    final_df[column] = le.fit_transform(final_df[column])
    encoders[column] = le

# Save the dictionary of encoders
with open('encoders.pkl', 'wb') as file:
    pickle.dump(encoders, file)

In [13]:
selected_columns = ['trx_type', 'trx_status', 'category', 'business_type_class', 'wallet_number_from', 'wallet_number_to', 'amount', 'description', 'browser_environment', 'hour', 'day_of_week', 'IP Change Frequency', 'Adjusted Account Age', 'ip_number', 'country', 'city', 'region', 'lat', 'long', 'month', 'year', 'minute', 'second', 'date']

In [16]:
X = final_df[selected_columns]
# Assume X is the feature matrix
autoencoder = MLPRegressor(hidden_layer_sizes=(len(X.columns)//2, len(X.columns)//4, len(X.columns)//2), max_iter=100)
autoencoder.fit(X, X)  # X is both the input and the output

# Calculate reconstruction error
reconstruction_error = ((X - autoencoder.predict(X)) ** 2).mean(axis=0)
feature_importance = reconstruction_error.sort_values(ascending=False)
# Number of features you want to keep
num_features_to_select = 10

# Select the top N features with the highest reconstruction error
top_features = feature_importance.nlargest(num_features_to_select).index

# Now, top_features contains the names of the top N features you might consider most important
print(top_features)

['ip_number', 'wallet_number_to', 'region', 'Adjusted Account Age', 'browser_environment', 'wallet_number_from', 'business_type_class', 'city', 'trx_type', 'country', 'amount']


In [17]:
selected_columns = ['ip_number', 'wallet_number_to', 'region', 'Adjusted Account Age','browser_environment', 'wallet_number_from', 'business_type_class',
'city', 'trx_type', 'country','amount']
#selected_columns=['amount']

#### Isolation Forest

In [18]:
# Assuming 'df' is your DataFrame and it has the columns 'feature1', 'feature2', 'feature3'
X = final_df[selected_columns]

# Initialize Isolation Forest
iso_forest = IsolationForest(n_estimators=100, contamination='auto', random_state=42)

# Fit the model on the selected features
iso_forest.fit(X)

# Predict anomalies (-1 for outliers and 1 for inliers)
predictions = iso_forest.predict(X)

# Add predictions to the DataFrame
final_df['anomaly'] = predictions

# Get anomaly scores
scores = iso_forest.decision_function(X)
final_df['scores'] = scores

# Print the DataFrame with anomaly labels and scores
print(final_df[['amount', 'anomaly', 'scores']])

        amount  anomaly    scores
0         6.46       -1 -0.059312
1         5.00       -1 -0.006662
2         5.00       -1 -0.006049
3         9.81        1  0.047185
4         9.81        1  0.054686
...        ...      ...       ...
485526   10.15       -1 -0.045190
485527    1.73       -1 -0.077499
485528    5.00       -1 -0.079429
485529    8.00       -1 -0.069603
485530    1.00       -1 -0.058882

[485531 rows x 3 columns]


In [19]:
# Assuming 'final_df' is your DataFrame and it contains an 'anomaly' column after Isolation Forest prediction
# Count the number of anomalies (entries with anomaly = -1)
number_of_anomalies = (final_df['anomaly'] == -1).sum()

# Get the total number of entries in the DataFrame
total_entries = len(final_df)

# Calculate the percentage of anomalies
percentage_of_anomalies = (number_of_anomalies / total_entries) * 100

print(f"Number of anomalies: {number_of_anomalies}")
print(f"Total number of entries: {total_entries}")
print(f"Percentage of anomalies: {percentage_of_anomalies:.2f}%")

Number of anomalies: 138462
Total number of entries: 485531
Percentage of anomalies: 28.52%


In [20]:
final_df.head(5)

Unnamed: 0,trx_id,trx_type,trx_status,category,business_type_class,wallet_number_from,wallet_number_to,amount,description,invoice_details,...,region,lat,long,date,month,year,minute,second,anomaly,scores
0,018db93d-2fd0-7eb2-98e5-c8a7bf0fa5be,0,0,14,0,1.0,302391,6.46,4819,Nicole Martinez,...,3,0.0,0.0,4,5,2020,8,6,-1,-0.059312
1,018db75b-8219-7b29-ada7-73a672510b7a,0,0,13,0,1.0,969391,5.0,3681,Dustin Frost,...,553,38.901566,-77.05078,24,8,2020,9,0,-1,-0.006662
2,018dbe06-4ae0-7f3b-9f7f-37ffc8f9f9c3,1,0,15,1,1.0,405063180069,5.0,4817,Linda Mendoza,...,1266,42.365079,-71.104519,18,1,2024,36,25,-1,-0.006049
3,018db93d-4402-766e-93b4-ec51c0f67cbb,0,0,5,0,3.0,731009,9.81,2986,Joel Carter,...,536,28.666775,77.216681,5,10,2019,8,38,1,0.047185
4,018db93d-4f80-795a-ab26-c684492342eb,0,0,4,0,3.0,744481,9.81,3866,Douglas Smith,...,755,21.024411,105.84146,22,2,2020,8,18,1,0.054686


In [21]:
# Assuming final_df is your DataFrame
# Convert anomalies from -1 (anomaly) and 1 (normal) to 1 (anomaly) and 0 (normal) to align with 'flagged'
final_df['predicted'] = final_df['anomaly'].apply(lambda x: 1 if x == -1 else 0)

# Generate classification report
report = classification_report(final_df['flagged'], final_df['predicted'], target_names=['Normal', 'Anomaly'])

# Print the classification report
print(report)

              precision    recall  f1-score   support

      Normal       0.81      0.80      0.80    350352
     Anomaly       0.49      0.50      0.50    135179

    accuracy                           0.72    485531
   macro avg       0.65      0.65      0.65    485531
weighted avg       0.72      0.72      0.72    485531



In [21]:
final_df.head(5)

Unnamed: 0,trx_id,trx_type,trx_status,category,business_type_class,wallet_number_from,wallet_number_to,amount,description,invoice_details,browser_environment,datetime,hour,day_of_week,IP Change Frequency,Adjusted Account Age,flagged,ip_number,country,city,region,lat,long,date,month,year,minute,second,anomaly,scores,predicted
0,018db93d-2fd0-7eb2-98e5-c8a7bf0fa5be,0,0,14,0,1.0,302391,6.46,4819,Nicole Martinez,10,2020-05-04 13:08:06,13,0,3.0,0.015004,0,2887286643,0,14,3,0.0,0.0,4,5,2020,8,6,-1,-0.059312,1
1,018db75b-8219-7b29-ada7-73a672510b7a,0,0,13,0,1.0,969391,5.0,3681,Dustin Frost,24,2020-08-24 19:09:00,19,0,3.0,0.015004,0,3438293894,215,19662,553,38.901566,-77.05078,24,8,2020,9,0,-1,-0.006662,1
2,018dbe06-4ae0-7f3b-9f7f-37ffc8f9f9c3,1,0,15,1,1.0,405063180069,5.0,4817,Linda Mendoza,33,2024-01-18 16:36:25,16,3,3.0,0.015004,0,303929162,215,2955,1266,42.365079,-71.104519,18,1,2024,36,25,-1,-0.006049,1
3,018db93d-4402-766e-93b4-ec51c0f67cbb,0,0,5,0,3.0,731009,9.81,2986,Joel Carter,21,2019-10-05 08:08:38,8,5,4.0,0.017779,0,1979451563,94,4540,536,28.666775,77.216681,5,10,2019,8,38,1,0.047185,0
4,018db93d-4f80-795a-ab26-c684492342eb,0,0,4,0,3.0,744481,9.81,3866,Douglas Smith,12,2020-02-22 05:08:18,5,5,4.0,0.017779,0,1908295359,220,6979,755,21.024411,105.84146,22,2,2020,8,18,1,0.054686,0


In [35]:
final_df.to_csv('Final_Encoded.csv')