In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Read the dataset from its CSV file.
df = pd.read_csv(
    '../data/lemmatized_and_misspelled_removed_SEFACED.csv',
    encoding='utf-8',
)

# The lemmatized documents are the corpus to vectorize.
corpus = df['lemmatized_tokens']

# Create the vectorizer, then fit it on the corpus and produce the sparse
# TF-IDF matrix in a single call.
# NOTE(review): the vectorizer is fit on every row of `corpus`, i.e. before the
# train/test split performed later in the notebook.
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(corpus)

# Densify the matrix into a DataFrame whose columns are the vectorizer's
# feature names (`toarray()` materialises every cell of the sparse matrix).
df_tfidf = pd.DataFrame(
    data=X_tfidf.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

print(df_tfidf)


        aa  aba  abandon  abandoned  abased  abbas  abbreviation  abdal  \
0      0.0  0.0      0.0        0.0     0.0    0.0           0.0    0.0   
1      0.0  0.0      0.0        0.0     0.0    0.0           0.0    0.0   
2      0.0  0.0      0.0        0.0     0.0    0.0           0.0    0.0   
3      0.0  0.0      0.0        0.0     0.0    0.0           0.0    0.0   
4      0.0  0.0      0.0        0.0     0.0    0.0           0.0    0.0   
...    ...  ...      ...        ...     ...    ...           ...    ...   
15995  0.0  0.0      0.0        0.0     0.0    0.0           0.0    0.0   
15996  0.0  0.0      0.0        0.0     0.0    0.0           0.0    0.0   
15997  0.0  0.0      0.0        0.0     0.0    0.0           0.0    0.0   
15998  0.0  0.0      0.0        0.0     0.0    0.0           0.0    0.0   
15999  0.0  0.0      0.0        0.0     0.0    0.0           0.0    0.0   

       abduction  abey  ...  zing  zip  zipper   zo  zombie  zone  zoned  \
0            0.0   0.0 

In [2]:
from sklearn.model_selection import train_test_split

# Extract the target variable and the corpus (lemmatized documents).
y = df['Class_Label']
corpus = df['lemmatized_tokens']

# Split the TF-IDF features and labels into training and test sets:
# 20% of the rows are held out, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_tfidf,
    y,
    test_size=0.2,
    random_state=42,
)


In [2]:
# LR

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report




# Create the logistic regression model and train it on the training split.
# `fit` returns the estimator itself, so construction and training are chained.
lr_model = LogisticRegression(max_iter=10000).fit(X_train, y_train)

# Predict labels for the test split.
lr_predictions = lr_model.predict(X_test)

# Evaluate the model; precision and recall use the 'weighted' average.
lr_accuracy = accuracy_score(y_test, lr_predictions)
lr_precision = precision_score(y_test, lr_predictions, average='weighted')
lr_recall = recall_score(y_test, lr_predictions, average='weighted')

# Print the headline metrics followed by the per-class report, one item per line.
print(
    "Logistic Regression Classifier:",
    f"Accuracy: {lr_accuracy}",
    f"Precision: {lr_precision}",
    f"Recall: {lr_recall}",
    "Sınıflandırma Raporu:",
    classification_report(y_test, lr_predictions),
    sep="\n",
)


Logistic Regression Classifier:
Accuracy: 0.8775
Precision: 0.8789020827092141
Recall: 0.8775
Sınıflandırma Raporu:
              precision    recall  f1-score   support

  Fraudulent       0.96      0.91      0.93       837
  Harrasment       0.83      0.85      0.84       737
      Normal       0.93      0.95      0.94       767
  Suspicious       0.80      0.81      0.80       859

    accuracy                           0.88      3200
   macro avg       0.88      0.88      0.88      3200
weighted avg       0.88      0.88      0.88      3200



In [3]:
# RF

from sklearn.ensemble import RandomForestClassifier

# Create the random forest model.
# A fixed `random_state` makes the forest's internal randomness repeatable, so
# the metrics printed below are the same on every Restart & Run All. The value
# matches the seed already used for the train/test split.
rf_model = RandomForestClassifier(random_state=42)

# Train the random forest on the training split.
rf_model.fit(X_train, y_train)

# Predict labels for the test split.
rf_predictions = rf_model.predict(X_test)

# Evaluate the model; precision and recall use the 'weighted' average.
rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_precision = precision_score(y_test, rf_predictions, average='weighted')
rf_recall = recall_score(y_test, rf_predictions, average='weighted')

# Print the headline metrics followed by the per-class report.
print("Random Forest Classifier:")
print(f"Accuracy: {rf_accuracy}")
print(f"Precision: {rf_precision}")
print(f"Recall: {rf_recall}")
print("Sınıflandırma Raporu:")
print(classification_report(y_test, rf_predictions))


Random Forest Classifier:
Accuracy: 0.8740625
Precision: 0.8756715507313997
Recall: 0.8740625
Sınıflandırma Raporu:
              precision    recall  f1-score   support

  Fraudulent       0.87      0.97      0.92       837
  Harrasment       0.81      0.86      0.83       737
      Normal       0.95      0.93      0.94       767
  Suspicious       0.87      0.73      0.80       859

    accuracy                           0.87      3200
   macro avg       0.88      0.88      0.87      3200
weighted avg       0.88      0.87      0.87      3200



In [4]:
#NB

from sklearn.naive_bayes import MultinomialNB

# Create the multinomial naive Bayes model and train it on the training split
# (`fit` returns the estimator, so the two steps are chained).
nb_model = MultinomialNB().fit(X_train, y_train)

# Predict labels for the test split.
nb_predictions = nb_model.predict(X_test)

# Evaluate the model; precision and recall use the 'weighted' average.
nb_accuracy = accuracy_score(y_test, nb_predictions)
nb_precision = precision_score(y_test, nb_predictions, average='weighted')
nb_recall = recall_score(y_test, nb_predictions, average='weighted')

# Assemble the report lines and emit them as one newline-separated block.
print("\n".join([
    "Naive Bayes Classifier:",
    f"Accuracy: {nb_accuracy}",
    f"Precision: {nb_precision}",
    f"Recall: {nb_recall}",
    "Sınıflandırma Raporu:",
    classification_report(y_test, nb_predictions),
]))


Naive Bayes Classifier:
Accuracy: 0.8465625
Precision: 0.8703527509999011
Recall: 0.8465625
Sınıflandırma Raporu:
              precision    recall  f1-score   support

  Fraudulent       0.97      0.90      0.93       837
  Harrasment       0.67      0.93      0.78       737
      Normal       0.91      0.95      0.93       767
  Suspicious       0.92      0.64      0.75       859

    accuracy                           0.85      3200
   macro avg       0.86      0.85      0.85      3200
weighted avg       0.87      0.85      0.85      3200



In [4]:
# SVM

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, classification_report

# Create the support vector classifier with default settings and train it on
# the training split (`fit` returns the fitted estimator).
svm_model = SVC().fit(X_train, y_train)

# Predict labels for the test split.
svm_predictions = svm_model.predict(X_test)

# Evaluate the model; precision and recall share the same 'weighted' averaging.
svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_precision, svm_recall = (
    precision_score(y_test, svm_predictions, average='weighted'),
    recall_score(y_test, svm_predictions, average='weighted'),
)

# Print the headline metrics, then the per-class report.
print(
    "Support Vector Machine Classifier:",
    f"Accuracy: {svm_accuracy}",
    f"Precision: {svm_precision}",
    f"Recall: {svm_recall}",
    sep="\n",
)
print("Classification Report:", classification_report(y_test, svm_predictions), sep="\n")



Support Vector Machine Classifier:
Accuracy: 0.888125
Precision: 0.8884487021953597
Recall: 0.888125
Classification Report:
              precision    recall  f1-score   support

  Fraudulent       0.90      0.97      0.93       837
  Harrasment       0.83      0.88      0.85       737
      Normal       0.97      0.94      0.95       767
  Suspicious       0.86      0.77      0.81       859

    accuracy                           0.89      3200
   macro avg       0.89      0.89      0.89      3200
weighted avg       0.89      0.89      0.89      3200

