<a href="https://colab.research.google.com/github/Kasunageswar/Turing/blob/main/Fake_News_Detection_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Set Up the Environment**

In [None]:
pip install pandas numpy scikit-learn nltk spacy gensim




**Import Libraries**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import nltk
import spacy
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from textblob import TextBlob




**Download necessary NLP resources**

In [None]:
# Tokenizer data required by nltk.word_tokenize, which the preprocessing and
# topic-modelling cells call. Newer NLTK releases look the tokenizer up under
# 'punkt_tab', so both resource names are requested to work across versions.
nltk.download('punkt')
nltk.download('punkt_tab')

# English spaCy pipeline, loaded later with spacy.load('en_core_web_sm')
spacy.cli.download('en_core_web_sm')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


**Loading the Dataset**

In [None]:
# Load the dataset
df = pd.read_csv('/content/news.csv')  # Adjust the filename if necessary

# Drop rows with missing values once, up front, so the text-processing
# steps that follow are not handed NaN instead of a string
df = df.dropna()

# Inspect the first few rows of the cleaned dataset
print(df.head())


   Unnamed: 0                                              title  \
0        8476                       You Can Smell Hillary’s Fear   
1       10294  Watch The Exact Moment Paul Ryan Committed Pol...   
2        3608        Kerry to go to Paris in gesture of sympathy   
3       10142  Bernie supporters on Twitter erupt in anger ag...   
4         875   The Battle of New York: Why This Primary Matters   

                                                text label  
0  Daniel Greenfield, a Shillman Journalism Fello...  FAKE  
1  Google Pinterest Digg Linkedin Reddit Stumbleu...  FAKE  
2  U.S. Secretary of State John F. Kerry said Mon...  REAL  
3  — Kaydee King (@KaydeeKing) November 9, 2016 T...  FAKE  
4  It's primary day in New York and front-runners...  REAL  


**Preprocess the Data**

In [None]:
def preprocess_text(text):
    """Tokenize `text`, keep purely alphabetic tokens, lowercase them, and
    return them joined by single spaces."""
    kept = []
    for token in nltk.word_tokenize(text):
        # isalpha() discards punctuation, numbers and mixed tokens
        if token.isalpha():
            kept.append(token.lower())
    return ' '.join(kept)

df['processed_text'] = df['text'].map(preprocess_text)



**Sentiment Analysis**

In [None]:
def get_sentiment(text):
    """Return the TextBlob sentiment polarity score of `text`."""
    return TextBlob(text).sentiment.polarity

df['sentiment'] = df['processed_text'].map(get_sentiment)


**Named Entity Recognition**

In [None]:
nlp = spacy.load('en_core_web_sm')

def extract_entities(text):
    """Return the text of every named entity spaCy finds in `text`,
    joined by single spaces (empty string when none are found)."""
    doc = nlp(text)
    return ' '.join(ent.text for ent in doc.ents)

# Run NER on the raw article text rather than 'processed_text': the processed
# column has been lowercased and stripped of punctuation, which removes the
# casing and sentence-boundary cues the entity recognizer depends on.
df['entities'] = df['text'].apply(extract_entities)


**Topic Modelling**

In [None]:
# Tokenization for topic modeling
def tokenize(text):
    """Split `text` into NLTK tokens and keep only the alphabetic ones."""
    return [word for word in nltk.word_tokenize(text) if word.isalpha()]

df['tokens'] = df['processed_text'].apply(tokenize)

# Create a dictionary and corpus for LDA
dictionary = corpora.Dictionary(df['tokens'])
corpus = [dictionary.doc2bow(tokens) for tokens in df['tokens']]

# Train LDA model. random_state fixes the model's random initialisation so the
# topic ids and weights (and every feature derived from them) are the same on
# each run; 7 matches the seed used for the train/test split.
lda_model = LdaModel(corpus, num_topics=10, id2word=dictionary, passes=10, random_state=7)

def get_topics(text):
    """Return the LDA topic distribution of `text` as a list of
    (topic_id, probability) pairs."""
    tokens = tokenize(text)
    bow = dictionary.doc2bow(tokens)
    return lda_model.get_document_topics(bow)

df['topics'] = df['processed_text'].apply(get_topics)


**Combining features**

In [None]:
def combine_features(row):
    """Build one text string per article for the TF-IDF vectorizer.

    The numeric features are turned into categorical word tokens instead of
    being interpolated raw: a float such as 0.0594 or a list of
    (topic_id, probability) tuples would otherwise be split by the vectorizer
    into arbitrary digit fragments that carry no usable signal.

    Parameters
    ----------
    row : mapping
        Must provide 'processed_text' (str), 'entities' (str),
        'sentiment' (number) and 'topics' (iterable of (topic_id, prob) pairs).

    Returns
    -------
    str
        The non-empty parts joined by single spaces, e.g.
        "some text Paris sentiment_positive topic_0 topic_3".
    """
    polarity = row['sentiment']
    if polarity > 0:
        sentiment_token = 'sentiment_positive'
    elif polarity < 0:
        sentiment_token = 'sentiment_negative'
    else:
        sentiment_token = 'sentiment_neutral'

    # One token per topic assigned to the document; the probability is dropped
    # because a bag-of-words model cannot use it as a number anyway.
    topic_tokens = ' '.join(f'topic_{topic_id}' for topic_id, _ in row['topics'])

    parts = [row['processed_text'], row['entities'], sentiment_token, topic_tokens]
    return ' '.join(part for part in parts if part)

# Build the combined feature string row by row and store it as a new column
combined_per_row = df.apply(combine_features, axis='columns')
df['combined_features'] = combined_per_row


**Feature Extraction and Model Training**

In [None]:
X = df['combined_features']
y = df['label']

# Hold out 20% of the articles for evaluation; the seed keeps the split stable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

# Fit the vocabulary on the training split only, then reuse it for the test
# split so no information from the held-out articles leaks into the features
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

# The classifier shuffles the training data between epochs; fixing
# random_state makes the fitted model and the reported accuracy reproducible
pac = PassiveAggressiveClassifier(max_iter=50, random_state=7)
pac.fit(tfidf_train, y_train)

y_pred = pac.predict(tfidf_test)

score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score*100, 2)}%')



Accuracy: 90.84%


**Confusion matrix**

In [None]:
# Rows are the true labels and columns the predictions, both in this order
class_order = ['FAKE', 'REAL']
cm = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=class_order)
print(cm)


[[575  63]
 [ 53 576]]
