<a href="https://colab.research.google.com/github/LigiaKaczmarek/Uczenie-Maszynowe-w-finansach/blob/main/Uczenie_maszynowe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.decomposition import TruncatedSVD
from imblearn.over_sampling import SMOTE

df = pd.read_csv('case1.csv', delimiter=';')

df_filtered = df.iloc[:, 3:-2]

missing_values_before = df_filtered.isnull().sum()
print("Braki danych przed przetwarzaniem:")
print(missing_values_before)

imputer = SimpleImputer(strategy='median')
data = imputer.fit_transform(df_filtered)

data = pd.DataFrame(data, columns=df_filtered.columns)

data = data.loc[:, (data != 0).any(axis=0)]

svd = TruncatedSVD(n_components=10)
data_reduced = svd.fit_transform(data)

smote = SMOTE(sampling_strategy='auto')
data_reduced_X, data_reduced_Y = smote.fit_resample(data_reduced, data['FLAG'])

missing_values_after = data.isnull().sum()
print("\nBraki danych po przetwarzaniu:")
print(missing_values_after)


Braki danych przed przetwarzaniem:
FLAG                                                      0
Avg min between sent tnx                                  0
Avg min between received tnx                              0
Time Diff between first and last (Mins)                   0
Sent tnx                                                  0
Received Tnx                                              0
Number of Created Contracts                               0
Unique Received From Addresses                            0
Unique Sent To Addresses                                  0
min value received                                        0
max value received                                        0
avg val received                                          0
min val sent                                              0
max val sent                                              0
avg val sent                                              0
min value sent to contract                                0
max v

In [5]:
data

Unnamed: 0,FLAG,Avg min between sent tnx,Avg min between received tnx,Time Diff between first and last (Mins),Sent tnx,Received Tnx,Number of Created Contracts,Unique Received From Addresses,Unique Sent To Addresses,min value received,...,ERC20 uniq sent addr.1,ERC20 uniq rec contract addr,ERC20 min val rec,ERC20 max val rec,ERC20 avg val rec,ERC20 min val sent,ERC20 max val sent,ERC20 avg val sent,ERC20 uniq sent token name,ERC20 uniq rec token name
0,0.0,844.26,1093.71,704785.63,721.0,89.0,0.0,40.0,118.0,0.000000,...,0.0,58.0,0.00,1.500000e+07,265586.147600,0.000000,1.683100e+07,271779.920000,39.0,57.0
1,0.0,12709.07,2958.44,1218216.73,94.0,8.0,0.0,5.0,14.0,0.000000,...,0.0,7.0,0.00,3.650000e+02,57.632615,2.260809,2.260809e+00,2.260809,1.0,7.0
2,0.0,246194.54,2434.02,516729.30,2.0,10.0,0.0,10.0,2.0,0.113119,...,0.0,8.0,0.00,4.428198e+02,65.189009,0.000000,0.000000e+00,0.000000,0.0,8.0
3,0.0,10219.60,15785.09,397555.90,25.0,9.0,0.0,7.0,13.0,0.000000,...,0.0,11.0,0.00,1.141223e+04,1555.550174,100.000000,9.029231e+03,3804.076893,1.0,11.0
4,0.0,36.61,10707.77,382472.42,4598.0,20.0,1.0,7.0,19.0,0.000000,...,0.0,27.0,0.00,9.000000e+04,4934.232147,0.000000,4.500000e+04,13726.659220,6.0,27.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9836,1.0,12635.10,631.39,58748.48,4.0,13.0,0.0,11.0,4.0,0.004082,...,0.0,2.0,0.00,1.337000e+00,0.668500,0.000000,0.000000e+00,0.000000,0.0,2.0
9837,1.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,1.0,13.37,1.337000e+01,13.370000,0.000000,0.000000e+00,0.000000,0.0,1.0
9838,1.0,2499.44,2189.29,261601.88,67.0,43.0,0.0,31.0,44.0,0.001078,...,0.0,5.0,0.00,1.500000e+03,300.693901,0.000000,0.000000e+00,0.000000,0.0,5.0
9839,1.0,0.00,0.00,0.00,0.0,1.0,0.0,1.0,0.0,0.500000,...,0.0,1.0,0.00,0.000000e+00,0.000000,0.000000,0.000000e+00,0.000000,0.0,1.0


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

X_train, X_test, y_train, y_test = train_test_split(data_reduced_X, data_reduced_Y, test_size=0.2, random_state=42)

models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Support Vector Machine': SVC()
}

results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc_roc = roc_auc_score(y_test, y_pred)
    results[name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'AUC-ROC': auc_roc
    }

best_model = max(results, key=lambda x: results[x]['Accuracy'])

print("Wyniki oceny modeli:")
for name, metrics in results.items():
    print(f"\n{name}:")
    for metric, value in metrics.items():
        print(f"{metric}: {value}")

print("\nNajlepszy model:", best_model)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Wyniki oceny modeli:

Logistic Regression:
Accuracy: 0.5722675367047309
Precision: 0.9912280701754386
Recall: 0.1472312703583062
F1 Score: 0.2563811684628474
AUC-ROC: 0.5729620404079112

Random Forest:
Accuracy: 0.9393148450244698
Precision: 0.9304403318442884
Recall: 0.9498371335504886
F1 Score: 0.9400386847195359
AUC-ROC: 0.9392976517425645

Gradient Boosting:
Accuracy: 0.8962479608482871
Precision: 0.8984937786509496
Recall: 0.8938110749185668
F1 Score: 0.8961463096015676
AUC-ROC: 0.8962519426880416

Support Vector Machine:
Accuracy: 0.5050570962479608
Precision: 0.5029546946815495
Recall: 0.9980456026058632
F1 Score: 0.6688495961580441
AUC-ROC: 0.5042515594728663

Najlepszy model: Random Forest


Najlepszym modelem do klasyfikacji oszustw na podstawie dostarczonych wyników jest Random Forest. Model Random Forest osiągnął najwyższą dokładność (93.18%) oraz najwyższy F1 Score (93.19%), co oznacza, że jest w stanie skutecznie identyfikować zarówno pozytywne, jak i negatywne przypadki oszustw. Dodatkowo, AUC-ROC wynoszący 0.93 wskazuje na wysoką zdolność modelu do rozróżniania między klasami.