In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
from scipy import sparse


# Read the preprocessed data.
df = pd.read_csv('./preprocessed_data/lemmatized_and_misspelled_removed_SEFACED.csv', encoding='utf-8')

# Documents to vectorize: the 'lemmatized_tokens' column.
corpus = df['lemmatized_tokens']

# Create a TfidfVectorizer with its default settings.
vectorizer = TfidfVectorizer()

# Learn the vocabulary / IDF weights and transform the corpus in one step.
# NOTE(review): this fit uses every row, while the train/test split only
# happens later in the notebook, so the IDF statistics also see the rows that
# end up in the test set. Consider fitting on the training rows only.
X_tfidf = vectorizer.fit_transform(corpus)

# Convert the sparse TF-IDF matrix into a dense DataFrame, one column per term.
df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=vectorizer.get_feature_names_out())

# Load the CSV file that holds the additional features.
df_features = pd.read_csv('./data_with_extra_features/SEFACED_all_features.csv')

# Keep only the wanted columns from df_features.
desired_columns = ['html_tag_count', 'css_tag_count', 'spam_phrase_count', 'link_count', 'grammar_error_count', 'misspelled_word_count']
df_desired_features = df_features[desired_columns]

# pd.concat(axis=1) aligns on the index and pads missing rows with NaN, so a
# row-count mismatch between the two CSV files would silently produce
# NaN-filled rows that only blow up later inside a model's fit(). Fail loudly
# here instead.
# NOTE(review): this still assumes both files list the same records in the
# same order - confirm against the preprocessing step.
if len(df_tfidf) != len(df_desired_features):
    raise ValueError(
        f"Row count mismatch: {len(df_tfidf)} TF-IDF rows vs "
        f"{len(df_desired_features)} extra-feature rows"
    )

# Append the extra feature columns to the TF-IDF columns.
df_tfidf_merged = pd.concat([df_tfidf, df_desired_features], axis=1)

print(df_tfidf_merged)


        aa  aba  abandon  abandoned  abandonment  abased  abatement  abb  \
0      0.0  0.0      0.0        0.0          0.0     0.0        0.0  0.0   
1      0.0  0.0      0.0        0.0          0.0     0.0        0.0  0.0   
2      0.0  0.0      0.0        0.0          0.0     0.0        0.0  0.0   
3      0.0  0.0      0.0        0.0          0.0     0.0        0.0  0.0   
4      0.0  0.0      0.0        0.0          0.0     0.0        0.0  0.0   
...    ...  ...      ...        ...          ...     ...        ...  ...   
17635  0.0  0.0      0.0        0.0          0.0     0.0        0.0  0.0   
17636  0.0  0.0      0.0        0.0          0.0     0.0        0.0  0.0   
17637  0.0  0.0      0.0        0.0          0.0     0.0        0.0  0.0   
17638  0.0  0.0      0.0        0.0          0.0     0.0        0.0  0.0   
17639  0.0  0.0      0.0        0.0          0.0     0.0        0.0  0.0   

       abbas  abbey  ...  zone  zoning  zoo  zoom  html_tag_count  \
0        0.0    0.

In [2]:
# LR

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report



# Pull out the corpus (lemmatized documents) and the target variable.
corpus = df['lemmatized_tokens']
y = df['Class_Label']

# Hold out 20% of the rows as a test set; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_tfidf_merged,
    y,
    test_size=0.2,
    random_state=42,
)

# Build the Logistic Regression model and train it (fit returns the estimator).
lr_model = LogisticRegression(max_iter=10000).fit(X_train, y_train)

# Predict labels for the held-out rows.
lr_predictions = lr_model.predict(X_test)

# Score the predictions; precision and recall are weighted by class support.
lr_accuracy = accuracy_score(y_true=y_test, y_pred=lr_predictions)
lr_precision = precision_score(y_true=y_test, y_pred=lr_predictions, average='weighted')
lr_recall = recall_score(y_true=y_test, y_pred=lr_predictions, average='weighted')

# Emit the summary followed by the per-class report, one item per line.
print(
    "Logistic Regression Classifier:",
    f"Accuracy: {lr_accuracy}",
    f"Precision: {lr_precision}",
    f"Recall: {lr_recall}",
    "Sınıflandırma Raporu:",
    classification_report(y_true=y_test, y_pred=lr_predictions),
    sep="\n",
)


Logistic Regression Classifier:
Accuracy: 0.9719387755102041
Precision: 0.9720385547345911
Recall: 0.9719387755102041
Sınıflandırma Raporu:
              precision    recall  f1-score   support

  Fraudulent       0.98      0.93      0.95      1041
      Normal       0.97      0.99      0.98      2487

    accuracy                           0.97      3528
   macro avg       0.97      0.96      0.97      3528
weighted avg       0.97      0.97      0.97      3528



In [3]:
# RF

from sklearn.ensemble import RandomForestClassifier

# Initialise the Random Forest model.
# A fixed random_state makes the bootstrap sampling and per-split feature
# sub-sampling repeatable, so the reported metrics can be reproduced on a
# re-run; 42 matches the seed already used for the train/test split.
rf_model = RandomForestClassifier(random_state=42)

# Train the Random Forest model.
rf_model.fit(X_train, y_train)

# Predict on the test set.
rf_predictions = rf_model.predict(X_test)

# Evaluate the model; precision and recall are weighted by class support.
rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_precision = precision_score(y_test, rf_predictions, average='weighted')
rf_recall = recall_score(y_test, rf_predictions, average='weighted')

print("Random Forest Classifier:")
print(f"Accuracy: {rf_accuracy}")
print(f"Precision: {rf_precision}")
print(f"Recall: {rf_recall}")
print("Sınıflandırma Raporu:")
print(classification_report(y_test, rf_predictions))


Random Forest Classifier:
Accuracy: 0.9866780045351474
Precision: 0.9866596552902617
Recall: 0.9866780045351474
Sınıflandırma Raporu:
              precision    recall  f1-score   support

  Fraudulent       0.98      0.97      0.98      1041
      Normal       0.99      0.99      0.99      2487

    accuracy                           0.99      3528
   macro avg       0.99      0.98      0.98      3528
weighted avg       0.99      0.99      0.99      3528



In [4]:
#NB

from sklearn.naive_bayes import MultinomialNB

# Build the Multinomial Naive Bayes model and train it (fit returns the estimator).
nb_model = MultinomialNB().fit(X_train, y_train)

# Predict labels for the held-out rows.
nb_predictions = nb_model.predict(X_test)

# Score the predictions; precision and recall are weighted by class support.
nb_accuracy = accuracy_score(y_true=y_test, y_pred=nb_predictions)
nb_precision = precision_score(y_true=y_test, y_pred=nb_predictions, average='weighted')
nb_recall = recall_score(y_true=y_test, y_pred=nb_predictions, average='weighted')

# Emit the summary followed by the per-class report, one item per line.
print(
    "Naive Bayes Classifier:",
    f"Accuracy: {nb_accuracy}",
    f"Precision: {nb_precision}",
    f"Recall: {nb_recall}",
    "Sınıflandırma Raporu:",
    classification_report(y_true=y_test, y_pred=nb_predictions),
    sep="\n",
)


Naive Bayes Classifier:
Accuracy: 0.927437641723356
Precision: 0.9276555819262243
Recall: 0.927437641723356
Sınıflandırma Raporu:
              precision    recall  f1-score   support

  Fraudulent       0.87      0.88      0.88      1041
      Normal       0.95      0.95      0.95      2487

    accuracy                           0.93      3528
   macro avg       0.91      0.91      0.91      3528
weighted avg       0.93      0.93      0.93      3528



In [5]:
# SVM

from sklearn.svm import SVC

# Build the SVM model with default settings and train it (fit returns the estimator).
svm_model = SVC().fit(X_train, y_train)

# Predict labels for the held-out rows.
svm_predictions = svm_model.predict(X_test)

# Score the predictions; precision and recall are weighted by class support.
svm_accuracy = accuracy_score(y_true=y_test, y_pred=svm_predictions)
svm_precision = precision_score(y_true=y_test, y_pred=svm_predictions, average='weighted')
svm_recall = recall_score(y_true=y_test, y_pred=svm_predictions, average='weighted')

# Emit the summary followed by the per-class report, one item per line.
print(
    "Support Vector Machine Classifier:",
    f"Accuracy: {svm_accuracy}",
    f"Precision: {svm_precision}",
    f"Recall: {svm_recall}",
    "Classification Report:",
    classification_report(y_true=y_test, y_pred=svm_predictions),
    sep="\n",
)