In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

# Load the dataset from CSV (UTF-8 encoded).
df = pd.read_csv(
    './data_without_extra_features/lemmatized_and_misspelled_removed_SEFACED.csv',
    encoding='utf-8',
)

# Text column used as the corpus, and the classification target.
corpus = df['lemmatized_tokens']
y = df['Class_Label']

# Split every document into word tokens with NLTK.
tokenized_corpus = list(map(word_tokenize, corpus))

# Train Word2Vec on the tokenized documents.
# NOTE(review): the model is fitted on the whole corpus, i.e. before the
# train/test split further down - confirm that this is intended.
word2vec_model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Build a document embedding by averaging the vectors of its words.
def document_embedding(tokens, model=None):
    """Return the mean word vector of one tokenized document.

    Args:
        tokens: Iterable of word tokens belonging to a single document.
        model: Object exposing ``.wv`` (supporting ``in`` and ``[]`` lookups)
            and ``.vector_size``. When omitted, the notebook-level
            ``word2vec_model`` is used, which keeps existing one-argument
            calls working unchanged.

    Returns:
        A 1-D array of length ``model.vector_size``: the element-wise mean of
        the vectors of all tokens found in the vocabulary, or an all-zero
        vector when no token is in the vocabulary (including empty input).
    """
    if model is None:
        model = word2vec_model
    keyed_vectors = model.wv

    vectors = [keyed_vectors[word] for word in tokens if word in keyed_vectors]
    if not vectors:
        # Use float32 for the fallback so that a single out-of-vocabulary
        # document does not upcast the stacked embedding matrix to float64
        # (gensim stores its word vectors as float32).
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)

# Embed every document as the mean of its word vectors.
X_word2vec = np.array(list(map(document_embedding, tokenized_corpus)))

# Hold out 20% of the samples for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_word2vec,
    y,
    test_size=0.2,
    random_state=42,
)

# Create the logistic regression classifier and fit it on the training split.
lr_model = LogisticRegression(max_iter=10000)
lr_model.fit(X_train, y_train)

# Predict labels for the held-out split.
lr_predictions = lr_model.predict(X_test)

# Score the predictions; precision and recall use the 'weighted' average.
lr_accuracy = accuracy_score(y_test, lr_predictions)
lr_precision = precision_score(y_test, lr_predictions, average='weighted')
lr_recall = recall_score(y_test, lr_predictions, average='weighted')

print(
    "Logistic Regression Classifier:",
    f"Accuracy: {lr_accuracy}",
    f"Precision: {lr_precision}",
    f"Recall: {lr_recall}",
    "Sınıflandırma Raporu:",
    classification_report(y_test, lr_predictions),
    sep="\n",
)


Logistic Regression Classifier:
Accuracy: 0.9316893424036281
Precision: 0.9313379460802081
Recall: 0.9316893424036281
Sınıflandırma Raporu:
              precision    recall  f1-score   support

  Fraudulent       0.92      0.84      0.88      1041
      Normal       0.94      0.97      0.95      2487

    accuracy                           0.93      3528
   macro avg       0.93      0.90      0.92      3528
weighted avg       0.93      0.93      0.93      3528



In [2]:
#RF

from sklearn.ensemble import RandomForestClassifier

# Create the random forest classifier. The seed is fixed because the forest
# is randomized (bootstrap samples, feature sub-sampling); without it the
# metrics printed below change on every re-run of the notebook.
rf_model = RandomForestClassifier(random_state=42)

# Fit on the training split.
rf_model.fit(X_train, y_train)

# Predict labels for the held-out split.
rf_predictions = rf_model.predict(X_test)

# Score the predictions; precision and recall use the 'weighted' average.
rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_precision = precision_score(y_test, rf_predictions, average='weighted')
rf_recall = recall_score(y_test, rf_predictions, average='weighted')

print("Random Forest Classifier:")
print(f"Accuracy: {rf_accuracy}")
print(f"Precision: {rf_precision}")
print(f"Recall: {rf_recall}")
print("Sınıflandırma Raporu:")
print(classification_report(y_test, rf_predictions))


Random Forest Classifier:
Accuracy: 0.9654195011337868
Precision: 0.9660396211626434
Recall: 0.9654195011337868
Sınıflandırma Raporu:
              precision    recall  f1-score   support

  Fraudulent       0.98      0.90      0.94      1041
      Normal       0.96      0.99      0.98      2487

    accuracy                           0.97      3528
   macro avg       0.97      0.95      0.96      3528
weighted avg       0.97      0.97      0.96      3528



In [3]:
#NB

from sklearn.naive_bayes import GaussianNB

# Create a Gaussian naive Bayes classifier and fit it on the training split
# (fit() returns the estimator itself, so the two steps can be chained).
nb_model = GaussianNB().fit(X_train, y_train)

# Predict labels for the held-out split.
nb_predictions = nb_model.predict(X_test)

# Score the predictions; precision and recall use the 'weighted' average.
nb_accuracy = accuracy_score(y_test, nb_predictions)
nb_precision = precision_score(y_test, nb_predictions, average='weighted')
nb_recall = recall_score(y_test, nb_predictions, average='weighted')

nb_report_lines = [
    "Naive Bayes Classifier:",
    f"Accuracy: {nb_accuracy}",
    f"Precision: {nb_precision}",
    f"Recall: {nb_recall}",
    "Sınıflandırma Raporu:",
    classification_report(y_test, nb_predictions),
]
print("\n".join(nb_report_lines))


Naive Bayes Classifier:
Accuracy: 0.8460884353741497
Precision: 0.8422861936728206
Recall: 0.8460884353741497
Sınıflandırma Raporu:
              precision    recall  f1-score   support

  Fraudulent       0.79      0.65      0.71      1041
      Normal       0.86      0.93      0.89      2487

    accuracy                           0.85      3528
   macro avg       0.83      0.79      0.80      3528
weighted avg       0.84      0.85      0.84      3528



In [4]:
# SVM

from sklearn.svm import SVC

# Create a support vector classifier with default settings and fit it on the
# training split.
svm_model = SVC()
svm_model.fit(X_train, y_train)

# Predict labels for the held-out split.
svm_predictions = svm_model.predict(X_test)

# Score the predictions; precision and recall use the 'weighted' average.
svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_precision = precision_score(y_test, svm_predictions, average='weighted')
svm_recall = recall_score(y_test, svm_predictions, average='weighted')

for svm_report_line in (
    "Support Vector Machine Classifier:",
    f"Accuracy: {svm_accuracy}",
    f"Precision: {svm_precision}",
    f"Recall: {svm_recall}",
    "Classification Report:",
    classification_report(y_test, svm_predictions),
):
    print(svm_report_line)




Support Vector Machine Classifier:
Accuracy: 0.9404761904761905
Precision: 0.9408155990361117
Recall: 0.9404761904761905
Classification Report:
              precision    recall  f1-score   support

  Fraudulent       0.95      0.84      0.89      1041
      Normal       0.94      0.98      0.96      2487

    accuracy                           0.94      3528
   macro avg       0.94      0.91      0.93      3528
weighted avg       0.94      0.94      0.94      3528



In [None]:
# CNN

from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

# Hyperparameters. The original cell referenced these names without defining
# them anywhere; the values below are starting points to be tuned.
embedding_dim = 100   # size of the trainable token embeddings
maxlen = 100          # documents are truncated / zero-padded to this many tokens
num_filters = 128     # number of Conv1D filters
kernel_size = 5       # Conv1D window width, in tokens

# An Embedding layer consumes integer token ids, not the averaged Word2Vec
# vectors stored in X_train, so build an id sequence per document here.
# Id 0 is reserved for padding; real tokens start at 1.
cnn_word_index = {}
for tokens in tokenized_corpus:
    for token in tokens:
        cnn_word_index.setdefault(token, len(cnn_word_index) + 1)

X_cnn = np.zeros((len(tokenized_corpus), maxlen), dtype='int32')
for row, tokens in enumerate(tokenized_corpus):
    token_ids = [cnn_word_index[token] for token in tokens[:maxlen]]
    if token_ids:
        X_cnn[row, :len(token_ids)] = token_ids

# binary_crossentropy needs 0/1 targets; map the class labels to integer codes
# and keep the sorted class array so predictions can be decoded afterwards.
cnn_classes, y_cnn = np.unique(y, return_inverse=True)
if len(cnn_classes) != 2:
    raise ValueError(
        f"Binary CNN head expects exactly 2 classes, found {len(cnn_classes)}"
    )

# Same test_size and random_state as the split used for the classical models.
# Separate variable names are used so X_train / y_train stay untouched for
# the other cells.
X_cnn_train, X_cnn_test, y_cnn_train, y_cnn_test = train_test_split(
    X_cnn, y_cnn, test_size=0.2, random_state=42
)

cnn_model = Sequential()
cnn_model.add(Embedding(input_dim=len(cnn_word_index) + 1, output_dim=embedding_dim))
cnn_model.add(Conv1D(filters=num_filters, kernel_size=kernel_size, activation='relu'))
cnn_model.add(GlobalMaxPooling1D())
cnn_model.add(Dense(64, activation='relu'))
cnn_model.add(Dropout(0.5))
cnn_model.add(Dense(1, activation='sigmoid'))


cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


# The original called `model.fit`, but the network built above is `cnn_model`.
# NOTE(review): the held-out split doubles as validation data, as in the
# original cell; it is only monitored here, not used for model selection.
cnn_model.fit(
    X_cnn_train,
    y_cnn_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_cnn_test, y_cnn_test),
)


# Threshold the sigmoid output at 0.5, then decode the 0/1 codes back to the
# original class labels so the report is comparable with the other models.
cnn_probabilities = cnn_model.predict(X_cnn_test)
cnn_predictions = cnn_classes[(cnn_probabilities > 0.5).astype(int).ravel()]
y_cnn_test_labels = cnn_classes[y_cnn_test]

cnn_accuracy = accuracy_score(y_cnn_test_labels, cnn_predictions)
cnn_precision = precision_score(y_cnn_test_labels, cnn_predictions, average='weighted')
cnn_recall = recall_score(y_cnn_test_labels, cnn_predictions, average='weighted')

print("Convolutional Neural Network Classifier:")
print(f"Accuracy: {cnn_accuracy}")
print(f"Precision: {cnn_precision}")
print(f"Recall: {cnn_recall}")

print("Classification Report:")
print(classification_report(y_cnn_test_labels, cnn_predictions))



In [None]:
# LSTM

from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report


# The LSTM below is declared with input_shape=(n_features, 1), i.e. it expects
# 3-D input (samples, timesteps, 1), while X_train / X_test are 2-D
# (samples, n_features). Add the trailing axis explicitly.
# NOTE(review): this feeds each embedding dimension as one "timestep" of a
# scalar sequence, following the original input_shape - confirm this is the
# intended modelling choice.
X_lstm_train = np.asarray(X_train, dtype='float32')[..., np.newaxis]
X_lstm_test = np.asarray(X_test, dtype='float32')[..., np.newaxis]

# binary_crossentropy needs 0/1 targets; encode the class labels, treating the
# second class in sorted order as the positive (1) class.
lstm_classes = np.unique(y_train)
if len(lstm_classes) != 2:
    raise ValueError(
        f"Binary LSTM head expects exactly 2 classes, found {len(lstm_classes)}"
    )
y_lstm_train = (np.asarray(y_train) == lstm_classes[1]).astype('float32')
y_lstm_test = (np.asarray(y_test) == lstm_classes[1]).astype('float32')


lstm_model = Sequential()
lstm_model.add(LSTM(128, input_shape=(X_lstm_train.shape[1], 1)))
lstm_model.add(Dense(64, activation='relu'))
lstm_model.add(Dropout(0.5))
lstm_model.add(Dense(1, activation='sigmoid'))


lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


# NOTE(review): the held-out split doubles as validation data, as in the
# original cell; it is only monitored here, not used for model selection.
lstm_model.fit(
    X_lstm_train,
    y_lstm_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_lstm_test, y_lstm_test),
)


# Threshold the sigmoid output at 0.5, then decode the 0/1 codes back to the
# original class labels so they can be scored against y_test directly.
lstm_probabilities = lstm_model.predict(X_lstm_test)
lstm_predictions = lstm_classes[(lstm_probabilities > 0.5).astype(int).ravel()]


lstm_accuracy = accuracy_score(y_test, lstm_predictions)
lstm_precision = precision_score(y_test, lstm_predictions, average='weighted')
lstm_recall = recall_score(y_test, lstm_predictions, average='weighted')

print("LSTM Classifier:")
print(f"Accuracy: {lstm_accuracy}")
print(f"Precision: {lstm_precision}")
print(f"Recall: {lstm_recall}")

print("Classification Report:")
print(classification_report(y_test, lstm_predictions))


In [3]:
# Show the document-embedding matrix built from the averaged word vectors
# (NumPy abbreviates large arrays with "..." when printing).
print(X_word2vec)

[[ 0.34259304  0.41239572  0.17739823 ... -0.57248175 -0.2753076
   0.13642502]
 [ 0.0241638   0.11879435  0.32477924 ... -0.80595094 -0.18799649
  -0.37546167]
 [ 0.33062476 -0.31394276  0.5538537  ... -0.71585536 -0.2277005
  -0.46150118]
 ...
 [ 0.2645029   0.15665384  0.33787295 ... -0.66036075 -0.12096028
  -0.03157748]
 [-0.02474763  0.3378032   0.20469375 ... -0.79250836  0.03731209
  -0.19065219]
 [ 0.06246347  0.3957846   0.19652408 ... -0.6336944  -0.05793769
  -0.09663817]]
