In [11]:
import pandas as pd
import spacy
from sklearn.metrics.pairwise import cosine_similarity

# Load the data
fake_news = pd.read_csv(r"Dataset/DataSet_Misinfo_FAKE.csv", nrows=1000)
true_news = pd.read_csv(r"Dataset/DataSet_Misinfo_TRUE.csv", nrows=1000)

# Require GPU
spacy.require_gpu()

# Load the spaCy English language model with the GPU enabled
nlp = spacy.load('en_core_web_sm')

# Calculate the average cosine similarity between true and false news articles
total_sim_score = 0
count = 0
for i in range(len(true_news)):
    for j in range(len(fake_news)):
        doc1 = nlp(true_news.iloc[i]['text'])
        doc2 = nlp(fake_news.iloc[j]['text'])
        sim_score = cosine_similarity(doc1.vector.reshape(1, -1), doc2.vector.reshape(1, -1))[0][0]
        total_sim_score += sim_score
        count += 1

avg_sim_score = total_sim_score / count
print('Average cosine similarity between true and false news articles:', avg_sim_score)


ValueError: No GPU devices detected

In [None]:
import pandas as pd
import spacy
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Load the data
fake_news = pd.read_csv(r"Dataset/DataSet_Misinfo_FAKE.csv")
true_news = pd.read_csv(r"Dataset/DataSet_Misinfo_TRUE.csv")

# Load the spaCy English language model
nlp = spacy.load('en_core_web_sm')

# Set the similarity threshold
threshold = 0.8

# Find pairs of true and false news articles with a similarity score above the threshold
similar_pairs = []
for i in range(len(true_news)):
    for j in range(len(fake_news)):
        doc1 = nlp(true_news.iloc[i]['text'])
        doc2 = nlp(fake_news.iloc[j]['text'])
        sim_score = cosine_similarity(doc1.vector.reshape(1, -1), doc2.vector.reshape(1, -1))[0][0]
        if sim_score > threshold:
            similar_pairs.append((true_news.iloc[i]['text'], fake_news.iloc[j]['text'], sim_score))

# Print the number of similar pairs found
print(f"Found {len(similar_pairs)} pairs of true and false news articles with a similarity score above {threshold}.")

# Generate word clouds for the true and false news articles in each similar pair
for pair in similar_pairs:
    true_text = pair[0]
    false_text = pair[1]
    true_wordcloud = WordCloud().generate(true_text)
    false_wordcloud = WordCloud().generate(false_text)
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
    ax1.imshow(true_wordcloud, interpolation='bilinear')
    ax1.set_title('True News')
    ax1.axis('off')
    ax2.imshow(false_wordcloud, interpolation='bilinear')
    ax2.set_title('Fake News')
    ax2.axis('off')
    plt.show()


In [None]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the data
fake_news_df = pd.read_csv(r"Dataset/DataSet_Misinfo_FAKE.csv")
true_news_df = pd.read_csv(r"Dataset/DataSet_Misinfo_TRUE.csv")

# Concatenate the true and fake news dataframes
news_df = pd.concat([true_news_df, fake_news_df])

# Create a binary label column
news_df['label'] = news_df['type'].apply(lambda x: 1 if x == 'true' else 0)

# Create a document-term matrix using CountVectorizer
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(news_df['text'])
y = news_df['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a logistic regression classifier
lr_classifier = LogisticRegression(random_state=42)
lr_classifier.fit(X_train, y_train)

# Evaluate the classifier on the testing set
y_pred = lr_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Logistic Regression Accuracy:', accuracy)


In [None]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load the data
fake_news_df = pd.read_csv(r"Dataset/DataSet_Misinfo_FAKE.csv")
true_news_df = pd.read_csv(r"Dataset/DataSet_Misinfo_TRUE.csv")

# Concatenate the true and fake news dataframes
news_df = pd.concat([true_news_df, fake_news_df])

# Create a binary label column
news_df['label'] = news_df['type'].apply(lambda x: 1 if x == 'true' else 0)

# Create a document-term matrix using CountVectorizer
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(news_df['text'])
y = news_df['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a decision tree classifier
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

# Evaluate the classifier on the testing set
y_pred = dt_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Decision Tree Accuracy:', accuracy)


In [None]:
import matplotlib.pyplot as plt

# Train the model on the data
clf.fit(X_train, y_train)

# Get the feature importances
importances = clf.feature_importances_

# Sort the features by importance score
indices = importances.argsort()[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X_train.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Plot the feature importances
plt.figure(figsize=(10, 5))
plt.title("Feature importances")
plt.bar(range(X_train.shape[1]), importances[indices], color="r", align="center")
plt.xticks(range(X_train.shape[1]), indices)
plt.xlim([-1, X_train.shape[1]])
plt.show()


In [None]:
from sklearn.tree import export_graphviz
import graphviz

# Train the model on the data
clf.fit(X_train, y_train)

# Export the decision tree as a .dot file
dot_data = export_graphviz(clf, out_file=None, 
                           feature_names=vectorizer.get_feature_names(),  
                           class_names=['Fake', 'True'],  
                           filled=True, rounded=True,  
                           special_characters=True)  
graph = graphviz.Source(dot_data)  

# Show the decision tree
graph.render('decision_tree')  # Saves the tree as a PDF file
