In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

# CSV dosyanızı yükleyin
df = pd.read_csv('./preprocessed_data/lemmatized_and_misspelled_removed_SEFACED.csv', encoding='utf-8')

# Corpusu (lemmatize edilmiş belgeler) ve hedef değişkeni çıkarın
corpus = df['lemmatized_tokens']
y = df['Class_Label']

# Metni tokenize edin
tokenized_corpus = [word_tokenize(text) for text in corpus]

# Word2Vec modelini eğitin
word2vec_model = Word2Vec(tokenized_corpus, vector_size=100, window=5, min_count=1, workers=4)

# Belge gömülerini ortalama kelime vektörlerini kullanarak oluşturan bir fonksiyon
def document_embedding(tokens):
    vectors = [word2vec_model.wv[word] for word in tokens if word in word2vec_model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(word2vec_model.vector_size)

# Belge gömülerini oluşturun
X_word2vec = np.array([document_embedding(tokens) for tokens in tokenized_corpus])

# Veri kümesini eğitim ve test setlerine ayırın
X_train, X_test, y_train, y_test = train_test_split(X_word2vec, y, test_size=0.2, random_state=42)

# Lojistik Regresyon modelini başlatın
lr_model = LogisticRegression(max_iter=10000)

# Lojistik Regresyon modelini eğitin
lr_model.fit(X_train, y_train)

# Test seti üzerinde tahminler yapın
lr_predictions = lr_model.predict(X_test)

# Modeli değerlendirin

lr_accuracy = accuracy_score(y_test, lr_predictions)
lr_precision = precision_score(y_test, lr_predictions, average='weighted')
lr_recall = recall_score(y_test, lr_predictions, average='weighted')

print("Logistic Regression Classifier:")
print(f"Accuracy: {lr_accuracy}")
print(f"Precision: {lr_precision}")
print(f"Recall: {lr_recall}")
print("Sınıflandırma Raporu:")
print(classification_report(y_test, lr_predictions))


Logistic Regression Classifier:
Accuracy: 0.9322562358276644
Precision: 0.9326226596269023
Recall: 0.9322562358276644
Sınıflandırma Raporu:
              precision    recall  f1-score   support

  Fraudulent       0.88      0.89      0.89      1041
      Normal       0.96      0.95      0.95      2487

    accuracy                           0.93      3528
   macro avg       0.92      0.92      0.92      3528
weighted avg       0.93      0.93      0.93      3528



In [2]:
#RF

from sklearn.ensemble import RandomForestClassifier

# Rastgele Orman modelini başlatın
rf_model = RandomForestClassifier()

# Rastgele Orman modelini eğitin
rf_model.fit(X_train, y_train)

# Test seti üzerinde tahminler yapın
rf_predictions = rf_model.predict(X_test)

# Modeli değerlendirin

rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_precision = precision_score(y_test, rf_predictions, average='weighted')
rf_recall = recall_score(y_test, rf_predictions, average='weighted')

print("Random Forest Classifier:")
print(f"Accuracy: {rf_accuracy}")
print(f"Precision: {rf_precision}")
print(f"Recall: {rf_recall}")
print("Sınıflandırma Raporu:")
print(classification_report(y_test, rf_predictions))


Random Forest Classifier:
Accuracy: 0.9665532879818595
Precision: 0.9672072588572905
Recall: 0.9665532879818595
Sınıflandırma Raporu:
              precision    recall  f1-score   support

  Fraudulent       0.99      0.90      0.94      1041
      Normal       0.96      0.99      0.98      2487

    accuracy                           0.97      3528
   macro avg       0.97      0.95      0.96      3528
weighted avg       0.97      0.97      0.97      3528



In [3]:
#NB

from sklearn.naive_bayes import GaussianNB

# Çoklu Naive Bayes modelini başlatın
nb_model = GaussianNB()

# Çoklu Naive Bayes modelini eğitin
nb_model.fit(X_train, y_train)

# Test seti üzerinde tahminler yapın
nb_predictions = nb_model.predict(X_test)

# Modeli değerlendirin

nb_accuracy = accuracy_score(y_test, nb_predictions)
nb_precision = precision_score(y_test, nb_predictions, average='weighted')
nb_recall = recall_score(y_test, nb_predictions, average='weighted')

print("Naive Bayes Classifier:")
print(f"Accuracy: {nb_accuracy}")
print(f"Precision: {nb_precision}")
print(f"Recall: {nb_recall}")
print("Sınıflandırma Raporu:")
print(classification_report(y_test, nb_predictions))


Naive Bayes Classifier:
Accuracy: 0.8378684807256236
Precision: 0.8334594897401826
Recall: 0.8378684807256236
Sınıflandırma Raporu:
              precision    recall  f1-score   support

  Fraudulent       0.78      0.63      0.70      1041
      Normal       0.86      0.92      0.89      2487

    accuracy                           0.84      3528
   macro avg       0.82      0.78      0.79      3528
weighted avg       0.83      0.84      0.83      3528



In [5]:
# SVM

from sklearn.svm import SVC

# SVM modelini başlat
svm_model = SVC()

# SVM modelini eğit
svm_model.fit(X_train, y_train)

# Test seti üzerinde tahminler yap
svm_predictions = svm_model.predict(X_test)

# Modeli değerlendir

svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_precision = precision_score(y_test, svm_predictions, average='weighted')
svm_recall = recall_score(y_test, svm_predictions, average='weighted')

print("Support Vector Machine Classifier:")
print(f"Accuracy: {svm_accuracy}")
print(f"Precision: {svm_precision}")
print(f"Recall: {svm_recall}")

print("Classification Report:")
print(classification_report(y_test, svm_predictions))


Support Vector Machine Classifier:
Accuracy: 0.9404761904761905
Precision: 0.9406785158908646
Recall: 0.9404761904761905
Classification Report:
              precision    recall  f1-score   support

  Fraudulent       0.95      0.85      0.89      1041
      Normal       0.94      0.98      0.96      2487

    accuracy                           0.94      3528
   macro avg       0.94      0.91      0.93      3528
weighted avg       0.94      0.94      0.94      3528

