In [42]:
import pandas as pd
import numpy as np
import zipfile
import os
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import subprocess

In [2]:
import zipfile
from nltk.corpus import wordnet

nltk.download('punkt')
nltk.download('stopwords')

# Define the directory and file paths
download_dir = '../Resources'
os.makedirs(download_dir, exist_ok=True)
wordnet_zip_path = os.path.join(download_dir, 'corpora/wordnet.zip')

# Download and unzip wordnet
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet', download_dir=download_dir)
    if os.path.exists(wordnet_zip_path):
        with zipfile.ZipFile(wordnet_zip_path, 'r') as zip_ref:
            zip_ref.extractall(download_dir)
    else:
        print(f"File not found: {wordnet_zip_path}")

# Now you can import the NLTK resources as usual
from nltk.corpus import wordnet

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\draxe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\draxe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to ../Resources...
[nltk_data]   Package wordnet is already up-to-date!


# Load and combine data

In [3]:
fake_df = pd.read_csv('../Dataset_Original/ISOT/Fake.csv')
true_df = pd.read_csv('../Dataset_Original/ISOT/True.csv')

In [4]:
fake_df['label'] = 1
true_df['label'] = 0

In [5]:
combined_df = pd.concat([fake_df, true_df], ignore_index=True)

In [6]:
combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [7]:
combined_df.head()

Unnamed: 0,title,text,subject,date,label
0,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",US_News,"February 13, 2017",1
1,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"April 5, 2017",0
2,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,politicsNews,"September 27, 2017",0
3,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",News,"May 22, 2017",1
4,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",politicsNews,"June 24, 2016",0


# Remove date column

In [8]:
combined_df.drop(columns=['date'], inplace=True)

In [9]:
combined_df.head()

Unnamed: 0,title,text,subject,label
0,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",US_News,1
1,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,0
2,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,politicsNews,0
3,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",News,1
4,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",politicsNews,0


# Combine column text and title

In [10]:
combined_df['combined_text'] = combined_df['title'] + " " + combined_df['text']

In [11]:
combined_df.head()

Unnamed: 0,title,text,subject,label,combined_text
0,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",US_News,1,Ben Stein Calls Out 9th Circuit Court: Committ...
1,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,0,Trump drops Steve Bannon from National Securit...
2,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,politicsNews,0,Puerto Rico expects U.S. to lift Jones Act shi...
3,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",News,1,OOPS: Trump Just Accidentally Confirmed He Le...
4,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",politicsNews,0,Donald Trump heads for Scotland to reopen a go...


# Remove duplicates

In [12]:
duplicates_df = combined_df[combined_df.duplicated()]
print(duplicates_df)

                                                   title  \
4106   Islamic State claims responsibility for Aden c...   
5547   Turkey seeks life sentences for 60 ex-military...   
5909   Highlights: The Trump presidency on March 31 a...   
6382   Britain preparing to transfer 400 million poun...   
6779   Israel ambassador asks to meet New Zealand pop...   
...                                                  ...   
44673  Kuwait says GCC to keep operating despite Qata...   
44721  Syrian rebels say discussing evacuation from t...   
44786  Russia may widen designation for media outlets...   
44849  Senate tax bill stalls on deficit-focused 'tri...   
44860  British PM May vows to stay as party plotters ...   

                                                    text       subject  label  \
4106   CAIRO (Reuters) - Militant group Islamic State...     worldnews      0   
5547   ISTANBUL (Reuters) - Sixty people including a ...     worldnews      0   
5909   (Reuters) - Highlights of the

In [13]:
combined_df.drop_duplicates(subset='text', keep='first', inplace=True)

# Remove outliers

## Remove rows with text length <= 10

In [14]:
combined_df = combined_df[combined_df['text'].str.len() > 10]

In [15]:
short_texts_df_after_removal = combined_df[combined_df['text'].str.len() <= 10]
print("\nNumber of rows with text length <= 10 after removal:")
print(len(short_texts_df_after_removal))


Number of rows with text length <= 10 after removal:
0


## Remove rows where title is "Homepage"

In [16]:
combined_df = combined_df[combined_df['title'].str.lower() != "homepage"]

In [17]:
print("\nNumber of rows after removing 'Homepage' title:", combined_df.shape[0])


Number of rows after removing 'Homepage' title: 38632


## Remove rows where text contains only YouTube URL (text length = 43)


In [18]:
combined_df = combined_df[combined_df['text'].str.len() != 43]

In [19]:
print("\nNumber of rows after removing YouTube URL only texts:", combined_df.shape[0])


Number of rows after removing YouTube URL only texts: 38592


# Remove special characters

## URL

In [20]:
def clean_text(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'www\S+', '', text)   # Remove URLs
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    return text

In [21]:
combined_df['combined_text'] = combined_df['combined_text'].apply(clean_text)

# Punctuation

In [22]:
combined_df['combined_text'] = combined_df['combined_text'].str.replace('['+string.punctuation+']', '', regex=True)

# Stopwords

In [23]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [24]:
def preprocess_text(text):
    # Tokenize
    words = word_tokenize(text)
    # Remove stopwords and lemmatize
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

In [26]:
combined_df['combined_text'] = combined_df['combined_text'].apply(preprocess_text)

# Split data into training and testing sets

In [35]:
combined_df = combined_df[['combined_text', 'label']]

# Chia dữ liệu thành tập huấn luyện và kiểm tra
X = combined_df['combined_text']  # Chỉ sử dụng cột 'combined_text'
y = combined_df['label']  # Nhãn

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tạo DataFrame cho tập huấn luyện và kiểm tra
train_df = pd.DataFrame({'combined_text': X_train, 'label': y_train})
test_df = pd.DataFrame({'combined_text': X_test, 'label': y_test})

# Save data

In [36]:
# Remove rows with empty text again after processing
combined_df['combined_text'].dropna(inplace=True)

In [37]:
output_dir = '../Dataset_Cleaned/ISOT'
os.makedirs(output_dir, exist_ok=True)
train_file = os.path.join(output_dir, 'clean_train_isot.csv')
test_file = os.path.join(output_dir, 'clean_test_isot.csv')

In [38]:
train_df.to_csv(train_file, index=False)
test_df.to_csv(test_file, index=False)

In [39]:
# Print the first few rows of the train and test DataFrames to confirm
print("\nFirst few rows of the train DataFrame:")
train_df.head()


First few rows of the train DataFrame:


Unnamed: 0,combined_text,label
33995,conservative terrorist given insanely light se...,1
21818,clinton ad slam trump disgusting insult toward...,1
22091,hardliner protest french labor reform macron c...,0
5456,nba kowtow racist order player stand anthem la...,1
1713,ben carson praise time trump compared child mo...,1


In [40]:
print("\nFirst few rows of the test DataFrame:")
test_df.head()


First few rows of the test DataFrame:


Unnamed: 0,combined_text,label
33661,republican actually support obama finding scal...,1
34806,mitt romneys effort stop trump end like 2012 b...,1
42318,houston problem houston mayor defends evacuati...,1
220,angry mark levin writes letter cnn anchor bria...,1
25034,gay voter blacklivesmatter obama kid cancernot...,1


In [41]:
num_train_rows = train_df.shape[0]
num_test_rows = test_df.shape[0]
print(f"Number of rows in the train file: {num_train_rows}")
print(f"Number of rows in the test file: {num_test_rows}")

Number of rows in the train file: 30873
Number of rows in the test file: 7719
