In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Read the preprocessed dataset.
df = pd.read_csv('./preprocessed_data/lemmatized_and_misspelled_removed_SEFACED.csv', encoding='utf-8')

# Use the 'lemmatized_tokens' column as the text corpus.
# Missing entries become empty documents, because CountVectorizer rejects NaN.
corpus = df['lemmatized_tokens'].fillna('')

# Create a CountVectorizer (bag-of-words token counts).
vectorizer = CountVectorizer()

# Learn the vocabulary and transform the corpus into a sparse count matrix.
# NOTE(review): the vocabulary is fitted on every row, including the rows that
# are later held out as the test set — consider fitting on the training split only.
X_bow = vectorizer.fit_transform(corpus)

# Convert the sparse matrix into a dense DataFrame, one column per vocabulary term.
df_bow = pd.DataFrame(X_bow.toarray(), columns=vectorizer.get_feature_names_out())

# Load the CSV file that contains the additional features.
df_features = pd.read_csv('./data_with_extra_features/SEFACED_all_features.csv')

# Select the desired columns from df_features.
desired_columns = ['html_tag_count', 'css_tag_count', 'spam_phrase_count', 'link_count', 'grammar_error_count', 'misspelled_word_count']
df_desired_features = df_features[desired_columns]

# The two files are joined purely by row position. If their lengths differ,
# concat would silently pad the shorter side with NaN, so fail early instead.
# NOTE(review): this also assumes both files list the rows in the same order — confirm.
if len(df_desired_features) != len(df_bow):
    raise ValueError(
        f"Row count mismatch: {len(df_bow)} bag-of-words rows vs "
        f"{len(df_desired_features)} extra-feature rows"
    )

# Merge df_bow with the desired features column-wise.
df_merged = pd.concat([df_bow, df_desired_features], axis=1)

print(df_merged)



      aa  aba  abandon  abandoned  abased  abbas  abbreviation  abdal  \
0      0    0        0          0       0      0             0      0   
1      0    0        0          0       0      0             0      0   
2      0    0        0          0       0      0             0      0   
3      0    0        0          0       0      0             0      0   
4      0    0        0          0       0      0             0      0   
...   ..  ...      ...        ...     ...    ...           ...    ...   
7995   0    0        0          0       0      0             0      0   
7996   0    0        0          0       0      0             0      0   
7997   0    0        0          0       0      0             0      0   
7998   0    0        0          0       0      0             0      0   
7999   0    0        0          0       0      0             0      0   

      abduction  abey  ...  zipper  zone  zoning  zoom  html_tag_count  \
0             0     0  ...       0     0       0 

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, classification_report

# Target vector: the 'Class_Label' column of the preprocessed dataset.
y = df['Class_Label']

# Split the merged feature frame into training and test sets
# (20% held out for testing, fixed seed so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    df_merged,
    y,
    test_size=0.2,
    random_state=42,
)


In [4]:
# Logistic regression

# Build the model and fit it on the training split in one step
# (fit returns the estimator itself).
model = LogisticRegression(max_iter=10000).fit(X_train, y_train)

# Predict labels for the held-out test split.
predictions = model.predict(X_test)

# Evaluate: accuracy plus support-weighted precision and recall.
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')

print(
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    sep="\n",
)

# Per-class breakdown.
print("Sınıflandırma Raporu:")
print(classification_report(y_test, predictions))

Accuracy: 0.96875
Precision: 0.9687533203332521
Recall: 0.96875
Sınıflandırma Raporu:
              precision    recall  f1-score   support

  Fraudulent       0.97      0.97      0.97       796
      Normal       0.97      0.97      0.97       804

    accuracy                           0.97      1600
   macro avg       0.97      0.97      0.97      1600
weighted avg       0.97      0.97      0.97      1600



In [5]:
#RF

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, classification_report




# Initialise the Random Forest with a fixed seed: the forest is built from
# random bootstrap samples and feature subsets, so without a seed the metrics
# below change on every run. 42 matches the seed used for the train/test split.
rf_model = RandomForestClassifier(random_state=42)

# Train the Random Forest on the training split.
rf_model.fit(X_train, y_train)

# Predict labels for the held-out test split.
rf_predictions = rf_model.predict(X_test)

# Evaluate: accuracy plus support-weighted precision and recall.
rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_precision = precision_score(y_test, rf_predictions, average='weighted')
rf_recall = recall_score(y_test, rf_predictions, average='weighted')

print("Random Forest Classifier:")
print(f"Accuracy: {rf_accuracy}")
print(f"Precision: {rf_precision}")
print(f"Recall: {rf_recall}")

# Per-class breakdown.
print("Sınıflandırma Raporu:")
print(classification_report(y_test, rf_predictions))



Random Forest Classifier:
Accuracy: 0.979375
Precision: 0.9794670568527902
Recall: 0.979375
Sınıflandırma Raporu:
              precision    recall  f1-score   support

  Fraudulent       0.97      0.99      0.98       796
      Normal       0.99      0.97      0.98       804

    accuracy                           0.98      1600
   macro avg       0.98      0.98      0.98      1600
weighted avg       0.98      0.98      0.98      1600



In [6]:
# NB

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, classification_report

# Build the multinomial Naive Bayes model and fit it on the training split
# (fit returns the estimator itself).
nb_model = MultinomialNB().fit(X_train, y_train)

# Predict labels for the held-out test split.
nb_predictions = nb_model.predict(X_test)

# Evaluate: accuracy plus support-weighted precision and recall.
nb_accuracy = accuracy_score(y_test, nb_predictions)
nb_precision = precision_score(y_test, nb_predictions, average='weighted')
nb_recall = recall_score(y_test, nb_predictions, average='weighted')

print(
    "Naive Bayes Classifier:",
    f"Accuracy: {nb_accuracy}",
    f"Precision: {nb_precision}",
    f"Recall: {nb_recall}",
    sep="\n",
)

# Per-class breakdown.
print("Classification Report:")
print(classification_report(y_test, nb_predictions))




Naive Bayes Classifier:
Accuracy: 0.9425
Precision: 0.9451942083074159
Recall: 0.9425
Classification Report:
              precision    recall  f1-score   support

  Fraudulent       0.91      0.98      0.94       796
      Normal       0.98      0.90      0.94       804

    accuracy                           0.94      1600
   macro avg       0.95      0.94      0.94      1600
weighted avg       0.95      0.94      0.94      1600



In [5]:
# SVM

from sklearn.svm import SVC

# Build the support vector classifier with default settings and fit it on the
# training split (fit returns the estimator itself).
svm_model = SVC().fit(X_train, y_train)

# Predict labels for the held-out test split.
svm_predictions = svm_model.predict(X_test)

# Evaluate: accuracy plus support-weighted precision and recall.
svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_precision = precision_score(y_test, svm_predictions, average='weighted')
svm_recall = recall_score(y_test, svm_predictions, average='weighted')

print(
    "Support Vector Machine Classifier:",
    f"Accuracy: {svm_accuracy}",
    f"Precision: {svm_precision}",
    f"Recall: {svm_recall}",
    sep="\n",
)

# Per-class breakdown.
print("Classification Report:")
print(classification_report(y_test, svm_predictions))




Support Vector Machine Classifier:
Accuracy: 0.900625
Precision: 0.9008628619436522
Recall: 0.900625
Classification Report:
              precision    recall  f1-score   support

  Fraudulent       0.89      0.91      0.90       796
      Normal       0.91      0.89      0.90       804

    accuracy                           0.90      1600
   macro avg       0.90      0.90      0.90      1600
weighted avg       0.90      0.90      0.90      1600

