In [None]:
# Cell 1 (Updated)
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_recall_curve, auc
from sklearn.linear_model import LogisticRegression  
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Cell 2
# Load the raw dataset. `df` is kept untouched so this cell stays idempotent on re-run.
df = pd.read_csv('cybertroll_dataset.csv')

# Rows missing either the text or the label cannot be used: a NaN in 'content'
# survives `.str.lower()` as NaN and makes the vectorizers fail downstream, and a
# NaN label breaks model fitting. Drop them up front; with no missing values this
# is a no-op.
df_clean = df.dropna(subset=['content', 'annotation'])

# Features are the text in 'content'; the target is the 'annotation' column.
# Lowercasing is kept explicit here (the default vectorizers also lowercase).
X = df_clean['content'].str.lower()
y = df_clean['annotation']


In [None]:
# Cell 3
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Cell 4 (Updated)
# TF-IDF features: the vectorizer is fitted on the training text only, and the
# test text is then transformed with that fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Raw term-count features, built the same way (fit on train, transform test).
count_vectorizer = CountVectorizer()
X_train_count = count_vectorizer.fit_transform(X_train)
X_test_count = count_vectorizer.transform(X_test)

In [None]:
def evaluate_model(model, X_test_tfidf, y_test, model_name):
    """Plot and print evaluation diagnostics for a fitted binary classifier.

    Draws a precision-recall curve and a confusion-matrix heatmap side by side,
    then prints the accuracy and the classification report.

    Parameters
    ----------
    model : fitted classifier
        Must expose ``predict``, ``classes_`` and either ``predict_proba`` or
        ``decision_function``.
    X_test_tfidf : feature matrix
        Test features in the same representation the model was trained on.
        Despite the name, callers also pass count-vectorized features; the
        parameter name is kept so existing keyword callers keep working.
    y_test : array-like
        True labels for the test samples.
    model_name : str
        Label used in the plot titles and the printed output.

    Returns
    -------
    None
        Results are shown as a figure and printed text only.

    Notes
    -----
    The precision-recall curve is only meaningful for a two-class problem:
    ``model.classes_[1]`` is treated as the positive class.
    """
    y_pred = model.predict(X_test_tfidf)

    # Continuous score for the positive class: a probability when the model
    # offers one, otherwise the signed distance from the decision boundary.
    if hasattr(model, 'predict_proba'):
        prob_pos = model.predict_proba(X_test_tfidf)[:, 1]
    else:
        prob_pos = model.decision_function(X_test_tfidf)

    class_labels = model.classes_

    # Column 1 of predict_proba and positive decision_function values both refer
    # to classes_[1]. Passing it as pos_label keeps the curve correct for labels
    # other than {0, 1} / {-1, 1} (e.g. strings), which would otherwise raise.
    precision, recall, _ = precision_recall_curve(
        y_test, prob_pos, pos_label=class_labels[1]
    )
    area_under_curve = auc(recall, precision)

    fig, (pr_ax, cm_ax) = plt.subplots(1, 2, figsize=(12, 6))

    # Left panel: precision-recall curve.
    pr_ax.plot(recall, precision, color='darkorange', lw=2,
               label=f'Area under PR Curve = {area_under_curve:.2f}')
    pr_ax.set_xlabel('Recall')
    pr_ax.set_ylabel('Precision')
    pr_ax.set_title(f'{model_name} Precision-Recall Curve')
    pr_ax.legend(loc='lower left')

    # Right panel: confusion matrix. Fixing the label order to classes_ keeps the
    # matrix rows/columns aligned with the tick labels even when a class is
    # absent from both y_test and y_pred.
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=class_labels, yticklabels=class_labels, ax=cm_ax)
    cm_ax.set_title(f'{model_name} Confusion Matrix')
    cm_ax.set_xlabel('Predicted')
    cm_ax.set_ylabel('True')

    # Add space between the two panels, then render the figure.
    fig.tight_layout()
    plt.show()

    # Accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy * 100:.2f}%")

    # Classification Report
    print(f"{model_name} Classification Report:")
    print(classification_report(y_test, y_pred))


In [None]:
# Cell 6 (Updated)
# Logistic Regression, once per feature representation. `fit` returns the
# estimator itself, so construction and training are chained.

# TF-IDF features
lr_tfidf = LogisticRegression().fit(X_train_tfidf, y_train)
evaluate_model(lr_tfidf, X_test_tfidf, y_test, 'Logistic Regression (TF-IDF)')

# Count features
lr_count = LogisticRegression().fit(X_train_count, y_train)
evaluate_model(lr_count, X_test_count, y_test, 'Logistic Regression (Count)')

In [None]:
# Cell 7 (Updated)
# Random forests are stochastic (bootstrap samples and random feature subsets),
# so the seed is fixed to make the reported metrics reproducible across
# Restart & Run All. 42 matches the seed used for the train/test split.

# Random Forest with TF-IDF
rf_tfidf = RandomForestClassifier(n_jobs=-1, random_state=42)
rf_tfidf.fit(X_train_tfidf, y_train)
evaluate_model(rf_tfidf, X_test_tfidf, y_test, 'Random Forest (TF-IDF)')

# Random Forest with CountVectorizer
rf_count = RandomForestClassifier(n_jobs=-1, random_state=42)
rf_count.fit(X_train_count, y_train)
evaluate_model(rf_count, X_test_count, y_test, 'Random Forest (Count)')

In [None]:
# Cell 8 (Updated)
# Gradient-boosted trees (XGBoost), once per feature representation, using all
# available cores. `fit` returns the estimator, so training is chained.

# TF-IDF features
xgboost_tfidf = xgb.XGBClassifier(n_jobs=-1).fit(X_train_tfidf, y_train)
evaluate_model(xgboost_tfidf, X_test_tfidf, y_test, 'XGBoost (TF-IDF)')

# Count features
xgboost_count = xgb.XGBClassifier(n_jobs=-1).fit(X_train_count, y_train)
evaluate_model(xgboost_count, X_test_count, y_test, 'XGBoost (Count)')

In [None]:
# Cell 9 (Updated)
# Multinomial Naive Bayes, once per feature representation. `fit` returns the
# estimator, so construction and training are chained.

# TF-IDF features
nb_tfidf = MultinomialNB().fit(X_train_tfidf, y_train)
evaluate_model(nb_tfidf, X_test_tfidf, y_test, 'Naive Bayes (TF-IDF)')

# Count features
nb_count = MultinomialNB().fit(X_train_count, y_train)
evaluate_model(nb_count, X_test_count, y_test, 'Naive Bayes (Count)')

In [None]:
# Cell 10 (Added)
# Support Vector Classifier with default settings, once per feature
# representation. `fit` returns the estimator, so training is chained.

# TF-IDF features
svc_tfidf = SVC().fit(X_train_tfidf, y_train)
evaluate_model(svc_tfidf, X_test_tfidf, y_test, 'Support Vector Classifier (TF-IDF)')

# Count features
svc_count = SVC().fit(X_train_count, y_train)
evaluate_model(svc_count, X_test_count, y_test, 'Support Vector Classifier (Count)')


In [None]:
# Cell 11 (Updated)
# Decision trees break ties between equally good splits at random, so the seed
# is fixed to make the reported metrics reproducible across Restart & Run All.
# 42 matches the seed used for the train/test split.

# Decision Tree with TF-IDF
dt_tfidf = DecisionTreeClassifier(random_state=42)
dt_tfidf.fit(X_train_tfidf, y_train)
evaluate_model(dt_tfidf, X_test_tfidf, y_test, 'Decision Tree (TF-IDF)')

# Decision Tree with CountVectorizer
dt_count = DecisionTreeClassifier(random_state=42)
dt_count.fit(X_train_count, y_train)
evaluate_model(dt_count, X_test_count, y_test, 'Decision Tree (Count)')