In [None]:
from google.colab import files
import zipfile
import os

# Upload the dataset manually if not done
uploaded = files.upload()

# Extract if it's a zip file
for file in uploaded.keys():
    if file.endswith(".zip"):
        with zipfile.ZipFile(file, 'r') as zip_ref:
            zip_ref.extractall("/content/dataset")

# Check extracted files
print("Extracted files:", os.listdir("/content/dataset"))


Saving archive (2).zip to archive (2).zip
Extracted files: ['Fake.csv', 'True.csv']


In [None]:
import pandas as pd

# Load data
fake_df = pd.read_csv("/content/dataset/Fake.csv")
true_df = pd.read_csv("/content/dataset/True.csv")

# Add labels
fake_df["label"] = 0  # Fake news
true_df["label"] = 1  # Real news

# Combine datasets
df = pd.concat([fake_df, true_df], ignore_index=True)
print(df.head())


                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject  \
0  Donald Trump just couldn t wish all Americans ...    News   
1  House Intelligence Committee Chairman Devin Nu...    News   
2  On Friday, it was revealed that former Milwauk...    News   
3  On Christmas day, Donald Trump announced that ...    News   
4  Pope Francis used his annual Christmas Day mes...    News   

                date  label  
0  December 31, 2017      0  
1  December 31, 2017      0  
2  December 30, 2017      0  
3  December 29, 2017      0  
4  December 25, 2017      0  


In [None]:
# Check for missing values
print(df.isnull().sum())

# Drop any empty rows if they exist
df.dropna(inplace=True)

# Shuffle the dataset to mix fake and real news
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Check dataset structure
print(df.head())


title      0
text       0
subject    0
date       0
label      0
dtype: int64
                                               title  \
0  Ben Stein Calls Out 9th Circuit Court: Committ...   
1  Trump drops Steve Bannon from National Securit...   
2  Puerto Rico expects U.S. to lift Jones Act shi...   
3   OOPS: Trump Just Accidentally Confirmed He Le...   
4  Donald Trump heads for Scotland to reopen a go...   

                                                text       subject  \
0  21st Century Wire says Ben Stein, reputable pr...       US_News   
1  WASHINGTON (Reuters) - U.S. President Donald T...  politicsNews   
2  (Reuters) - Puerto Rico Governor Ricardo Rosse...  politicsNews   
3  On Monday, Donald Trump once again embarrassed...          News   
4  GLASGOW, Scotland (Reuters) - Most U.S. presid...  politicsNews   

                  date  label  
0    February 13, 2017      0  
1       April 5, 2017       1  
2  September 27, 2017       1  
3         May 22, 2017      0  
4   

In [None]:
import re
import string
import nltk
from nltk.corpus import stopwords

# Download stopwords if not already available
nltk.download('stopwords')

# Function to clean text
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = ' '.join(word for word in text.split() if word not in stopwords.words('english'))  # Remove stopwords
    return text

# Apply text cleaning
df["clean_text"] = df["title"] + " " + df["text"]  # Combine title and body
df["clean_text"] = df["clean_text"].apply(clean_text)

# Check cleaned text
print(df[["clean_text", "label"]].head())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


KeyboardInterrupt: 

In [None]:
print(df.shape)  # Check dataset size


(44898, 6)


In [None]:
df["clean_text"] = df["statement"].apply(preprocess_text)


KeyError: 'statement'

In [None]:
print(df.columns)


Index(['title', 'text', 'subject', 'date', 'label', 'clean_text'], dtype='object')


In [None]:
print(df['text'].head())  # Check if 'text' contains the news content


0    21st Century Wire says Ben Stein, reputable pr...
1    WASHINGTON (Reuters) - U.S. President Donald T...
2    (Reuters) - Puerto Rico Governor Ricardo Rosse...
3    On Monday, Donald Trump once again embarrassed...
4    GLASGOW, Scotland (Reuters) - Most U.S. presid...
Name: text, dtype: object


In [None]:
print(df.columns)


Index(['title', 'text', 'subject', 'date', 'label', 'clean_text'], dtype='object')


In [None]:
df.rename(columns={'text': 'statement'}, inplace=True)


In [None]:
print(df.columns)


Index(['title', 'statement', 'subject', 'date', 'label', 'clean_text'], dtype='object')


In [None]:
print(df['statement'].head())  # Verify the first few rows


0    21st Century Wire says Ben Stein, reputable pr...
1    WASHINGTON (Reuters) - U.S. President Donald T...
2    (Reuters) - Puerto Rico Governor Ricardo Rosse...
3    On Monday, Donald Trump once again embarrassed...
4    GLASGOW, Scotland (Reuters) - Most U.S. presid...
Name: statement, dtype: object


In [None]:
print(df.columns)  # Ensure all expected columns exist


Index(['title', 'statement', 'subject', 'date', 'label', 'clean_text'], dtype='object')


In [None]:
print(df['statement'].head())


0    21st Century Wire says Ben Stein, reputable pr...
1    WASHINGTON (Reuters) - U.S. President Donald T...
2    (Reuters) - Puerto Rico Governor Ricardo Rosse...
3    On Monday, Donald Trump once again embarrassed...
4    GLASGOW, Scotland (Reuters) - Most U.S. presid...
Name: statement, dtype: object


In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download stopwords if not already downloaded
nltk.download('stopwords')
nltk.download('punkt')

# Define a function to clean text
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    tokens = word_tokenize(text)  # Tokenize words
    tokens = [word for word in tokens if word not in stopwords.words('english')]  # Remove stopwords
    return ' '.join(tokens)

# Apply the function to the 'statement' column
df['clean_text'] = df['statement'].apply(clean_text)

print("✅ Step 4 complete: Text data cleaned!")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
print(df.columns)



Index(['title', 'statement', 'subject', 'date', 'label', 'clean_text'], dtype='object')


In [None]:
print(df['statement'].isnull().sum())


0


In [None]:
df['statement'] = df['statement'].fillna("")


In [None]:
print(clean_text(df['statement'].iloc[0]))


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
df['clean_text'] = df['statement'].apply(clean_text)
print(df[['statement', 'clean_text']].head())  # Print cleaned text


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
df['clean_text'] = df['statement'].apply(clean_text)
print(df[['statement', 'clean_text']].head())  # Print cleaned text


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
df['clean_text'] = df['statement'].apply(clean_text)
print(df[['statement', 'clean_text']].head())  # Print cleaned text


NameError: name 'df' is not defined

In [None]:
import pandas as pd

# Load dataset (make sure the path is correct)
df = pd.read_csv("dataset/train.tsv", sep="\t", header=None, names=["id", "label", "statement", "category", "speaker", "job", "state", "party", "barely_true", "false", "half_true", "mostly_true", "pants_on_fire", "context"])

# Check if it's loaded correctly
print(df.head())


FileNotFoundError: [Errno 2] No such file or directory: 'dataset/train.tsv'

In [None]:
import os
print(os.listdir())


['.config', 'dataset', 'archive (2).zip', 'sample_data']


In [None]:
import os
print(os.listdir("dataset"))


['Fake.csv', 'True.csv']


In [None]:
import pandas as pd

# Load Fake News data
df_fake = pd.read_csv("dataset/Fake.csv")
df_fake["label"] = 0  # Assign label 0 for fake news

# Load True News data
df_true = pd.read_csv("dataset/True.csv")
df_true["label"] = 1  # Assign label 1 for true news

# Combine both datasets
df = pd.concat([df_fake, df_true], ignore_index=True)

# Display dataset info
print(df.head())
print(df.columns)


                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject  \
0  Donald Trump just couldn t wish all Americans ...    News   
1  House Intelligence Committee Chairman Devin Nu...    News   
2  On Friday, it was revealed that former Milwauk...    News   
3  On Christmas day, Donald Trump announced that ...    News   
4  Pope Francis used his annual Christmas Day mes...    News   

                date  label  
0  December 31, 2017      0  
1  December 31, 2017      0  
2  December 30, 2017      0  
3  December 29, 2017      0  
4  December 25, 2017      0  
Index(['title', 'text', 'subject', 'date', 'label'], dtype='object')


In [None]:
df['clean_text'] = df['text'].apply(clean_text)  # Use 'text' instead of 'statement'
print(df[['text', 'clean_text']].head())  # Verify the cleaned text


NameError: name 'clean_text' is not defined

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download NLTK stopwords if not already downloaded
nltk.download('stopwords')
nltk.download('punkt')

def clean_text(text):
    if isinstance(text, str):  # Check if the input is a string
        text = text.lower()  # Convert to lowercase
        text = re.sub(r'\W', ' ', text)  # Remove non-word characters
        text = re.sub(r'\d+', '', text)  # Remove numbers
        text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
        words = word_tokenize(text)  # Tokenize text
        words = [word for word in words if word not in stopwords.words('english')]  # Remove stopwords
        return ' '.join(words)
    return ""

print("✅ Text cleaning function is ready!")


✅ Text cleaning function is ready!


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
df['clean_text'] = df['text'].apply(clean_text)  # Use 'text' instead of 'statement'
print(df[['text', 'clean_text']].head())  # Verify the cleaned text


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Ensure stopwords and tokenizer are downloaded
nltk.download("stopwords")
nltk.download("punkt")

def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    tokens = word_tokenize(text)  # Tokenize text
    stop_words = set(stopwords.words("english"))
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    return " ".join(tokens)

# Apply the function to clean text data
df["clean_text"] = df["text"].apply(clean_text)

# Print sample cleaned text
print(df[["text", "clean_text"]].head())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
import nltk

nltk.download("punkt")
nltk.download("stopwords")
nltk.download("omw-1.4")  # Optional, for lemmatization


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Ensure stopwords and tokenizer are downloaded
nltk.download("stopwords")
nltk.download("punkt")

def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    tokens = word_tokenize(text)  # Tokenize text
    stop_words = set(stopwords.words("english"))
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    return " ".join(tokens)

# Apply the function to clean text data
df["clean_text"] = df["text"].apply(clean_text)

# Print sample cleaned text
print(df[["text", "clean_text"]].head())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
import nltk

nltk.download("all")


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |  

True

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Ensure stopwords and tokenizer are downloaded
nltk.download("stopwords")
nltk.download("punkt")

def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    tokens = word_tokenize(text)  # Tokenize text
    stop_words = set(stopwords.words("english"))
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    return " ".join(tokens)

# Apply the function to clean text data
df["clean_text"] = df["text"].apply(clean_text)

# Print sample cleaned text
print(df[["text", "clean_text"]].head())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


NameError: name 'df' is not defined

In [None]:
import pandas as pd

# Load dataset
df = pd.read_csv("Fake.csv")  # Change to "True.csv" if needed

# Verify data
print(df.head())


FileNotFoundError: [Errno 2] No such file or directory: 'Fake.csv'

In [None]:
import os
print(os.listdir())


['.config', 'dataset', 'archive (2).zip', 'sample_data']


In [None]:
import os
print(os.listdir("dataset"))


['Fake.csv', 'True.csv']


In [None]:
import pandas as pd

df = pd.read_csv("Fake.csv")  # Change to "True.csv" if needed
print(df.head())  # Check the first few rows


FileNotFoundError: [Errno 2] No such file or directory: 'Fake.csv'

In [None]:
import os

print(os.listdir("dataset"))  # Check the files inside the "dataset" folder


['Fake.csv', 'True.csv']


In [None]:
import pandas as pd

df_fake = pd.read_csv("./Fake.csv")  # Load Fake News dataset
df_true = pd.read_csv("./True.csv")  # Load True News dataset

print(df_fake.head())  # Check the first few rows of Fake.csv
print(df_true.head())  # Check the first few rows of True.csv


FileNotFoundError: [Errno 2] No such file or directory: './Fake.csv'

In [None]:
import os

print(os.listdir())  # List files in the current directory
print(os.listdir("dataset"))  # If your files are inside "dataset" folder


['.config', 'dataset', 'archive (2).zip', 'sample_data']
['Fake.csv', 'True.csv']


In [None]:
import pandas as pd

df_fake = pd.read_csv("dataset/Fake.csv")  # Load Fake News dataset
df_true = pd.read_csv("dataset/True.csv")  # Load True News dataset

print(df_fake.head())  # Check the first few rows of the Fake news dataset
print(df_true.head())  # Check the first few rows of the True news dataset


                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject  \
0  Donald Trump just couldn t wish all Americans ...    News   
1  House Intelligence Committee Chairman Devin Nu...    News   
2  On Friday, it was revealed that former Milwauk...    News   
3  On Christmas day, Donald Trump announced that ...    News   
4  Pope Francis used his annual Christmas Day mes...    News   

                date  
0  December 31, 2017  
1  December 31, 2017  
2  December 30, 2017  
3  December 29, 2017  
4  December 25, 2017  
                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept t

In [None]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download required NLTK datasets
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Load datasets
df_fake = pd.read_csv("Fake.csv")
df_true = pd.read_csv("True.csv")

# Add labels
df_fake["label"] = 0  # Fake news -> 0
df_true["label"] = 1  # True news -> 1

# Merge datasets
df = pd.concat([df_fake, df_true], axis=0)

# Define text cleaning function
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    words = word_tokenize(text)  # Tokenize
    words = [word for word in words if word not in stopwords.words('english')]  # Remove stopwords
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]  # Lemmatization
    return ' '.join(words)

# Apply cleaning to text column
df['clean_text'] = df['text'].apply(clean_text)

# Check cleaned data
print(df[['text', 'clean_text']].head())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


FileNotFoundError: [Errno 2] No such file or directory: 'Fake.csv'

In [None]:
import os
print(os.listdir())


['.config', 'dataset', 'archive (2).zip', 'sample_data']


In [None]:
import pandas as pd

df_fake = pd.read_csv("dataset/Fake.csv")
df_true = pd.read_csv("dataset/True.csv")

print(df_fake.head())  # Check the first few rows of the Fake news dataset
print(df_true.head())  # Check the first few rows of the True news dataset


                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject  \
0  Donald Trump just couldn t wish all Americans ...    News   
1  House Intelligence Committee Chairman Devin Nu...    News   
2  On Friday, it was revealed that former Milwauk...    News   
3  On Christmas day, Donald Trump announced that ...    News   
4  Pope Francis used his annual Christmas Day mes...    News   

                date  
0  December 31, 2017  
1  December 31, 2017  
2  December 30, 2017  
3  December 29, 2017  
4  December 25, 2017  
                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept t

In [None]:
# Add a 'label' column: 1 for Fake news, 0 for True news
df_fake["label"] = 1
df_true["label"] = 0

# Combine both datasets
df = pd.concat([df_fake, df_true], axis=0)

# Shuffle the dataset to mix fake and real news
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Check the first few rows
print(df.head())



                                               title  \
0  Ben Stein Calls Out 9th Circuit Court: Committ...   
1  Trump drops Steve Bannon from National Securit...   
2  Puerto Rico expects U.S. to lift Jones Act shi...   
3   OOPS: Trump Just Accidentally Confirmed He Le...   
4  Donald Trump heads for Scotland to reopen a go...   

                                                text       subject  \
0  21st Century Wire says Ben Stein, reputable pr...       US_News   
1  WASHINGTON (Reuters) - U.S. President Donald T...  politicsNews   
2  (Reuters) - Puerto Rico Governor Ricardo Rosse...  politicsNews   
3  On Monday, Donald Trump once again embarrassed...          News   
4  GLASGOW, Scotland (Reuters) - Most U.S. presid...  politicsNews   

                  date  label  
0    February 13, 2017      1  
1       April 5, 2017       0  
2  September 27, 2017       0  
3         May 22, 2017      1  
4       June 24, 2016       0  


In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

# Initialize stopwords & lemmatizer
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# Function to clean text
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    tokens = word_tokenize(text)  # Tokenize words
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Apply function to dataset
df["clean_text"] = df["text"].apply(clean_text)

# Verify the cleaned text
print(df[["text", "clean_text"]].head())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                                text  \
0  21st Century Wire says Ben Stein, reputable pr...   
1  WASHINGTON (Reuters) - U.S. President Donald T...   
2  (Reuters) - Puerto Rico Governor Ricardo Rosse...   
3  On Monday, Donald Trump once again embarrassed...   
4  GLASGOW, Scotland (Reuters) - Most U.S. presid...   

                                          clean_text  
0  21st century wire say ben stein reputable prof...  
1  washington reuters u president donald trump re...  
2  reuters puerto rico governor ricardo rossello ...  
3  monday donald trump embarrassed country accide...  
4  glasgow scotland reuters u presidential candid...  


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000)  # Use top 5000 words

# Transform text into numerical form
X = vectorizer.fit_transform(df["clean_text"])

# Convert labels to binary (1 = Fake, 0 = Real)
y = df["label"].apply(lambda x: 1 if x == "FAKE" else 0)

# Print shape of transformed data
print("Shape of TF-IDF matrix:", X.shape)
print("Sample feature names:", vectorizer.get_feature_names_out()[:10])  # Show first 10 words


Shape of TF-IDF matrix: (44898, 5000)
Sample feature names: ['00' '000' '10' '100' '11' '12' '120' '13' '14' '15']


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Split the dataset into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Logistic Regression model
model = LogisticRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))


ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0

In [None]:
import numpy as np
unique, counts = np.unique(y, return_counts=True)
print("Label distribution:", dict(zip(unique, counts)))


Label distribution: {0: 44898}


In [None]:
# Reload datasets
df_fake = pd.read_csv("Fake.csv")
df_true = pd.read_csv("True.csv")

# Assign labels
df_fake["label"] = 0  # Fake news → Label 0
df_true["label"] = 1  # True news → Label 1

# Combine both datasets
df = pd.concat([df_fake, df_true], axis=0)

# Shuffle the dataset
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Check distribution again
print("Label distribution:", df["label"].value_counts())


FileNotFoundError: [Errno 2] No such file or directory: 'Fake.csv'

In [None]:
import os
print(os.listdir())  # Check available files


['.config', 'dataset', 'archive (2).zip', 'sample_data']


In [None]:
import pandas as pd

df_fake = pd.read_csv("dataset/Fake.csv")
df_true = pd.read_csv("dataset/True.csv")

print(df_fake.head())  # Verify the data
print(df_true.head())


                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject  \
0  Donald Trump just couldn t wish all Americans ...    News   
1  House Intelligence Committee Chairman Devin Nu...    News   
2  On Friday, it was revealed that former Milwauk...    News   
3  On Christmas day, Donald Trump announced that ...    News   
4  Pope Francis used his annual Christmas Day mes...    News   

                date  
0  December 31, 2017  
1  December 31, 2017  
2  December 30, 2017  
3  December 29, 2017  
4  December 25, 2017  
                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept t

In [None]:
import pandas as pd

# Add labels: Fake = 0, True = 1
df_fake["label"] = 0
df_true["label"] = 1

# Combine both datasets
df = pd.concat([df_fake, df_true], axis=0)

# Shuffle the dataset
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

print(df["label"].value_counts())  # Check class distribution


label
0    23481
1    21417
Name: count, dtype: int64


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

nltk.download("stopwords")
nltk.download("punkt")

stop_words = set(stopwords.words("english"))

def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = text.translate(str.maketrans("", "", string.punctuation))  # Remove punctuation
    words = word_tokenize(text)  # Tokenize words
    words = [word for word in words if word not in stop_words]  # Remove stopwords
    return " ".join(words)

# Apply cleaning function to text column
df["clean_text"] = df["text"].apply(clean_text)

print(df[["text", "clean_text"]].head())  # Check sample cleaned text


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                                text  \
0  21st Century Wire says Ben Stein, reputable pr...   
1  WASHINGTON (Reuters) - U.S. President Donald T...   
2  (Reuters) - Puerto Rico Governor Ricardo Rosse...   
3  On Monday, Donald Trump once again embarrassed...   
4  GLASGOW, Scotland (Reuters) - Most U.S. presid...   

                                          clean_text  
0  21st century wire says ben stein reputable pro...  
1  washington reuters us president donald trump r...  
2  reuters puerto rico governor ricardo rossello ...  
3  monday donald trump embarrassed country accide...  
4  glasgow scotland reuters us presidential candi...  


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)  # Convert text to TF-IDF vectors
X = vectorizer.fit_transform(df["clean_text"])
y = df["label"]

print("TF-IDF Matrix Shape:", X.shape)  # Check matrix shape


TF-IDF Matrix Shape: (44898, 5000)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate model performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.984521158129176
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      4710
           1       0.98      0.99      0.98      4270

    accuracy                           0.98      8980
   macro avg       0.98      0.98      0.98      8980
weighted avg       0.98      0.98      0.98      8980



In [None]:
from google.colab import files
files.download("your_notebook.ipynb")  # Replace with your actual notebook filename


FileNotFoundError: Cannot find file: your_notebook.ipynb

In [None]:
import os
print(os.listdir())  # Lists all files in the current directory


['.config', 'dataset', 'archive (2).zip', 'sample_data']


In [None]:
from google.colab import drive
drive.mount('/content/drive')

import os
print(os.listdir('/content/drive/My Drive/Colab Notebooks'))  # Lists your saved notebooks


MessageError: Error: credential propagation was unsuccessful