In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import PassiveAggressiveClassifier, SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier
from skmultiflow.trees import HoeffdingTree
import matplotlib.pyplot as plt

# Load and preprocess data
# NOTE: these are absolute, machine-specific paths; adjust them to wherever
# the ISOT CSV files live before running on another machine.
fake_news = "D:/Desktop/Fake_News_Dataset/ISOT_fake.csv"
true_news = "D:/Desktop/Fake_News_Dataset/ISOT_true.csv"

# Read each frame from its matching file. Previously the two paths were
# swapped, so every article ended up with the opposite label.
true_df = pd.read_csv(true_news)
fake_df = pd.read_csv(fake_news)
true_df["label"] = 1  # 1 for true news
fake_df["label"] = 0  # 0 for fake news

# Stack both classes into one frame, then shuffle with a fixed seed so the
# classes are interleaved and the row order is reproducible across runs.
combined_df = pd.concat([true_df, fake_df], ignore_index=True)
combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)


In [None]:

# Split data into train and test sets (80% train / 20% test, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(combined_df['text'], combined_df['label'], test_size=0.2, random_state=42)

# Initialize TF-IDF vectorizer.
# The vocabulary and IDF weights are learned from the training texts only;
# the test texts are merely transformed, so nothing leaks from the test set.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Initialize classifiers
pac = PassiveAggressiveClassifier(random_state=42)
sgd = SGDClassifier(loss='log_loss', random_state=42)
ht = HoeffdingTree()
# Seeded like the other estimators so the ensemble's results are repeatable.
gb = GradientBoostingClassifier(random_state=42)

# Initialize lists to store accuracy values.
# One entry is appended to each list per evaluation point; training_sizes
# records how many training samples had been seen at that point.
pac_accuracies = []
sgd_accuracies = []
ht_accuracies = []
gb_accuracies = []
training_sizes = []


In [None]:
# Stream the training split through the online learners in mini-batches and
# periodically score every model on the held-out test set.
batch_size = 1000
eval_every = 10   # evaluate after every 10 batches, i.e. every 10,000 samples
classes = [0, 1]  # partial_fit must be told the complete label set up front

# Batch over the TRAINING split only. Sizing the loop from len(combined_df)
# walked past the end of X_train_tfidf and produced empty batches.
n_train = X_train_tfidf.shape[0]
num_batches = -(-n_train // batch_size)  # ceiling division keeps the last partial batch

# y_train keeps the shuffled index from train_test_split; convert it to a
# plain array so labels are addressed by position, matching X_train_tfidf rows.
y_train_values = y_train.to_numpy()

# The Hoeffding tree is fed dense input; densify the test matrix once
# instead of on every evaluation.
X_test_dense = X_test_tfidf.toarray()

# Start from empty histories so re-running this cell does not stack results
# on top of an earlier run. (The models themselves keep their learned state;
# re-run the initialisation cell to reset them.)
for history in (pac_accuracies, sgd_accuracies, ht_accuracies, gb_accuracies, training_sizes):
    history.clear()

# Training data for the Gradient Boosting meta-classifier: base-model
# predictions on batches they have not been trained on yet, plus true labels.
stack_features = []
stack_labels = []

for i in range(num_batches):
    start_index = i * batch_size
    end_index = min(start_index + batch_size, n_train)

    X_partial = X_train_tfidf[start_index:end_index]
    y_partial = y_train_values[start_index:end_index]
    X_partial_dense = X_partial.toarray()

    # Test-then-train: record what the base models predict for this batch
    # BEFORE they learn from it. Skipped for the very first batch because the
    # models have not seen any data yet.
    if i > 0:
        stack_features.append(pd.DataFrame({
            'PAC': pac.predict(X_partial),
            'SGD': sgd.predict(X_partial),
            'HT': ht.predict(X_partial_dense),
        }))
        stack_labels.append(pd.Series(y_partial))

    # Incremental updates. fit() would discard everything learned from the
    # previous batches and retrain on these 1000 rows alone.
    pac.partial_fit(X_partial, y_partial, classes=classes)
    sgd.partial_fit(X_partial, y_partial, classes=classes)

    # Train the Hoeffding tree on the same training rows as the other models
    # (never on rows of combined_df, which also contains the test set).
    ht.partial_fit(X_partial_dense, y_partial, classes=classes)

    # Evaluate every `eval_every` batches. All histories are appended together
    # here, and only here, so they stay the same length as training_sizes.
    if (i + 1) % eval_every == 0:
        pac_pred = pac.predict(X_test_tfidf)
        pac_accuracies.append(accuracy_score(y_test, pac_pred))

        sgd_pred = sgd.predict(X_test_tfidf)
        sgd_accuracies.append(accuracy_score(y_test, sgd_pred))

        ht_pred = ht.predict(X_test_dense)
        ht_accuracies.append(accuracy_score(y_test, ht_pred))

        # The meta-classifier must be fitted before it can predict: train it
        # on the out-of-sample base predictions collected so far.
        gb.fit(
            pd.concat(stack_features, ignore_index=True),
            pd.concat(stack_labels, ignore_index=True),
        )
        X_test_features = pd.DataFrame({'PAC': pac_pred, 'SGD': sgd_pred, 'HT': ht_pred})
        gb_pred = gb.predict(X_test_features)
        gb_accuracies.append(accuracy_score(y_test, gb_pred))

        training_sizes.append(end_index)


In [None]:
# Draw one accuracy-vs-training-size curve per model on a shared set of axes.
fig, ax = plt.subplots()

curves = [
    (pac_accuracies, 'Passive Aggressive Classifier'),
    (sgd_accuracies, 'SGD Classifier'),
    (ht_accuracies, 'Hoeffding Tree'),
    (gb_accuracies, 'Gradient Boosting Ensemble'),
]
for accuracies, curve_label in curves:
    ax.plot(training_sizes, accuracies, label=curve_label)

ax.set_xlabel('Training Size')
ax.set_ylabel('Accuracy')
ax.set_title('Accuracy vs Training Size')
ax.legend()
plt.show()