<a href="https://colab.research.google.com/github/Ishanml/Fact-Filter-using-NLP/blob/main/FactFilter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Fact Filter using ML and NLP**


In [1]:
# 1) Data Loading and Exploring
import pandas as pd

# Load the datasets
fake_df = pd.read_csv('Fake.csv')
true_df = pd.read_csv('True.csv')

# Display the first few rows of each dataset
print("Fake News Data:")
print(fake_df.head())
print("\nTrue News Data:")
print(true_df.head())

# Display the shape of each dataset
print("\nFake News Data Shape:", fake_df.shape)
print("True News Data Shape:", true_df.shape)

Fake News Data:
                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject  \
0  Donald Trump just couldn t wish all Americans ...    News   
1  House Intelligence Committee Chairman Devin Nu...    News   
2  On Friday, it was revealed that former Milwauk...    News   
3  On Christmas day, Donald Trump announced that ...    News   
4  Pope Francis used his annual Christmas Day mes...    News   

                date  
0  December 31, 2017  
1  December 31, 2017  
2  December 30, 2017  
3  December 29, 2017  
4  December 25, 2017  

True News Data:
                                               title  \
0  As U.S. budget fight looms, Republicans flip t..

In [2]:
!pip install spacy
!pip install nltk
!pip install pandas




In [3]:
# 2) Data Preprocessing
import pandas as pd
import numpy as np
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from multiprocessing import Pool

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize the stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

# Define preprocessing function
def preprocess(text):
    # Tokenization
    tokens = word_tokenize(text)

    # Stop word removal
    tokens = [word for word in tokens if word.lower() not in stopwords.words('english')]

    # Stemming and Lemmatization
    tokens = [stemmer.stem(word) for word in tokens]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return ' '.join(tokens)

# Apply preprocessing using multiprocessing
def parallel_preprocess(text_series, num_partitions=10):
    data_split = np.array_split(text_series, num_partitions)
    pool = Pool(num_partitions)
    processed_data = pd.concat(pool.map(preprocess_partition, data_split))
    pool.close()
    pool.join()
    return processed_data

def preprocess_partition(text_series):
    return text_series.apply(preprocess)

# Load the datasets
fake_df = pd.read_csv('Fake.csv')
true_df = pd.read_csv('True.csv')

# Add a label column to each dataframe
fake_df['label'] = 0
true_df['label'] = 1

# Combine the datasets
data = pd.concat([fake_df, true_df], axis=0).reset_index(drop=True)

# Shuffle the data
data = data.sample(frac=1).reset_index(drop=True)

# Drop duplicates and missing values
data = data.drop_duplicates()
data = data.dropna()

# Apply parallel preprocessing to text data
data['processed_text'] = parallel_preprocess(data['text'])

# Save the preprocessed data to avoid reprocessing in the future
data.to_csv('preprocessed_data.csv', index=False)

print("Data preprocessing complete.")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
  return bound(*args, **kwds)


Data preprocessing complete.


In [4]:
#3) Model Training
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer # Import TfidfVectorizer for text vectorization

# Extract features (X) and labels (y) from the preprocessed data
X = data['processed_text']  # Assuming 'processed_text' column contains the features
y = data['label']  # Assuming 'label' column contains the labels

# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X) # Transform text data to numerical vectors

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize models
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Gradient Boosting": GradientBoostingClassifier()
}

# Train models
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)

Training Logistic Regression...
Training Random Forest...
Training Decision Tree...
Training Gradient Boosting...


In [5]:
# 4) Model Evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Evaluate models
for name, model in models.items():
    print(f"\n{name} Evaluation:")
    y_pred = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("F1 Score:", f1_score(y_test, y_pred))


Logistic Regression Evaluation:
Accuracy: 0.9898187513985232
Precision: 0.9880411384836164
Recall: 0.9901725790987536
F1 Score: 0.9891057105231653

Random Forest Evaluation:
Accuracy: 0.9946296710673529
Precision: 0.9951969260326609
Recall: 0.9932885906040269
F1 Score: 0.9942418426103646

Decision Tree Evaluation:
Accuracy: 0.9963078988588051
Precision: 0.9959261921878745
Recall: 0.9961649089165868
F1 Score: 0.9960455362492511

Gradient Boosting Evaluation:
Accuracy: 0.9966435444170956
Precision: 0.9954545454545455
Recall: 0.9973633748801534
F1 Score: 0.9964080459770116


In [6]:
# Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV

# Define parameter grid for Logistic Regression
param_grid_lr = {
    'C': [0.1, 1, 10],
    'solver': ['liblinear']
}

# Initialize GridSearchCV
grid_lr = GridSearchCV(LogisticRegression(), param_grid_lr, cv=5, scoring='accuracy')

# Train and tune the model
grid_lr.fit(X_train, y_train)

# Get the best model
best_lr = grid_lr.best_estimator_

# Evaluate the best model
y_pred = best_lr.predict(X_test)
print("\nTuned Logistic Regression Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))


Tuned Logistic Regression Evaluation:
Accuracy: 0.9942940255090624
Precision: 0.9930605407992342
Recall: 0.9947267497603068
F1 Score: 0.9938929469524608
