In [34]:
import pandas as pd
import numpy as np
import os
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
import subprocess

In [2]:
# Tokenizer models and the stop-word list.
nltk.download('punkt')
nltk.download('stopwords')

# WordNet is downloaded and extracted under the writable working directory.
NLTK_WORK_DIR = '/kaggle/working/'
wordnet_dir = os.path.join(NLTK_WORK_DIR, 'corpora', 'wordnet')

# Only fetch and unzip when no extracted copy exists yet. The previous check
# (nltk.data.find('wordnet.zip') inside a bare except) could never succeed, so
# every run downloaded and unzipped again and any unrelated error was hidden.
if not os.path.isdir(wordnet_dir):
    nltk.download('wordnet', download_dir=NLTK_WORK_DIR)
    # check=True turns a failed extraction into an error instead of continuing silently.
    subprocess.run(
        ['unzip', '/kaggle/working/corpora/wordnet.zip', '-d', '/kaggle/working/corpora'],
        check=True,
    )

# Register the directory on every run (not only right after a download) so
# nltk can locate the extracted corpus.
if NLTK_WORK_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_WORK_DIR)

# Now you can import the NLTK resources as usual
from nltk.corpus import wordnet

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /kaggle/working/...
Archive:  /kaggle/working/corpora/wordnet.zip
   creating: /kaggle/working/corpora/wordnet/
  inflating: /kaggle/working/corpora/wordnet/lexnames  
  inflating: /kaggle/working/corpora/wordnet/data.verb  
  inflating: /kaggle/working/corpora/wordnet/index.adv  
  inflating: /kaggle/working/corpora/wordnet/adv.exc  
  inflating: /kaggle/working/corpora/wordnet/index.verb  
  inflating: /kaggle/working/corpora/wordnet/cntlist.rev  
  inflating: /kaggle/working/corpora/wordnet/data.adj  
  inflating: /kaggle/working/corpora/wordnet/index.adj  
  inflating: /kaggle/working/corpora/wordnet/LICENSE  
  inflating: /kaggle/working/corpora/wordnet/citation.bib  
  inflating: /kaggle/working/corpor

# Load and combine data

In [3]:
# Read the two ISOT source files: fake articles first, genuine articles second.
fake_df, true_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/dataset-original/Dataset_Original/ISOT/Fake.csv',
        '/kaggle/input/dataset-original/Dataset_Original/ISOT/True.csv',
    )
)

In [4]:
# Attach the binary target column: 1 for every row of fake_df, 0 for every row of true_df.
fake_df = fake_df.assign(label=1)
true_df = true_df.assign(label=0)

In [5]:
# Stack the fake rows on top of the true rows; ignore_index=True renumbers the rows 0..n-1.
combined_df = pd.concat([fake_df, true_df], ignore_index=True)

In [6]:
# Shuffle all rows (frac=1 samples the whole frame) with a fixed seed so the order is
# reproducible, then discard the old index.
combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [7]:
# Preview the first rows of the shuffled, labelled frame.
combined_df.head()

Unnamed: 0,title,text,subject,date,label
0,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",US_News,"February 13, 2017",1
1,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"April 5, 2017",0
2,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,politicsNews,"September 27, 2017",0
3,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",News,"May 22, 2017",1
4,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",politicsNews,"June 24, 2016",0


In [8]:
# Class balance check: number of rows per label value.
print(combined_df['label'].value_counts())

label
1    23481
0    21417
Name: count, dtype: int64


In [9]:
# Discard every row that has a missing value in any column.
combined_df = combined_df.dropna()

# Remove duplicates

In [10]:
# Inspect rows that repeat an earlier row in every column.
# NOTE(review): the removal step that follows deduplicates on 'text' only, so it
# may drop more rows than are listed here — confirm that is intended.
duplicates_df = combined_df[combined_df.duplicated()]
print(duplicates_df)

                                                   title  \
4106   Islamic State claims responsibility for Aden c...   
5547   Turkey seeks life sentences for 60 ex-military...   
5909   Highlights: The Trump presidency on March 31 a...   
6382   Britain preparing to transfer 400 million poun...   
6779   Israel ambassador asks to meet New Zealand pop...   
...                                                  ...   
44673  Kuwait says GCC to keep operating despite Qata...   
44721  Syrian rebels say discussing evacuation from t...   
44786  Russia may widen designation for media outlets...   
44849  Senate tax bill stalls on deficit-focused 'tri...   
44860  British PM May vows to stay as party plotters ...   

                                                    text       subject  \
4106   CAIRO (Reuters) - Militant group Islamic State...     worldnews   
5547   ISTANBUL (Reuters) - Sixty people including a ...     worldnews   
5909   (Reuters) - Highlights of the day for U.S. Pre... 

In [11]:
# Keep only the first occurrence of each distinct article body.
combined_df = combined_df.drop_duplicates(subset='text', keep='first')

# Remove outliers

In [12]:
# Count titles that are at least 286 characters long (the cut-off used for title outliers).
num_long_rows = (combined_df['title'].str.len() >= 286).sum()
print(num_long_rows)

1


In [13]:
# Retain only rows whose title is shorter than 286 characters.
title_is_short = combined_df['title'].str.len().lt(286)
combined_df = combined_df.loc[title_is_short]

In [14]:
# Count article bodies that are at least 29781 characters long.
# NOTE(review): unlike the title outliers, no cell visible in this notebook filters
# these rows out afterwards — confirm whether they are meant to stay.
num_long_rows = (combined_df['text'].str.len() >= 29781).sum()
print(num_long_rows)

15


In [15]:
# Show which label each of the very long articles carries.
print(combined_df[combined_df['text'].str.len() >= 29781]['label'])

429      1
2354     1
2460     1
2529     1
5923     1
6016     1
8083     1
9033     1
10937    1
11969    1
13420    0
14844    1
16177    1
17013    1
25129    1
Name: label, dtype: int64


# Remove special characters

## URL

In [16]:
# Flag rows whose text contains something starting with "http" or "www".
url_pattern = r'http\S+|www\S+'
mask = combined_df['text'].str.contains(url_pattern, regex=True)

# Pull out the flagged rows and show a handful of them.
url_examples = combined_df.loc[mask]
url_examples.head()

Unnamed: 0,title,text,subject,date,label
3,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",News,"May 22, 2017",1
7,STAND UP AND CHEER! UKIP Party Leader SLAMS Ge...,He s been Europe s version of the outspoken Te...,left-news,"Mar 8, 2016",1
25,It Looks Like The Trump Campaign Just Got Thi...,While Donald Trump accuses Hillary Clinton of ...,News,"September 18, 2016",1
26,Asian American Organizations RAIN HELL Upon F...,Asian Americans are pissed off after Fox News ...,News,"October 6, 2016",1
56,WATCH: SEAN HANNITY FANS Make HILARIOUS Videos...,When Keurig decided to side with Soros against...,politics,"Nov 13, 2017",1


In [17]:
# Keep an untouched copy of the article body alongside the cleaned one.
combined_df = combined_df.assign(original_text=combined_df['text'])

# Strip links in two passes: first every run starting with "http", then every
# run starting with "www". The second pass goes through Series.replace with
# regex=True rather than the .str accessor.
text_without_http = combined_df['text'].str.replace(r'http\S+', '', regex=True)
combined_df.loc[:, 'text'] = text_without_http.replace(r'www\S+', '', regex=True)

In [18]:
# The comparison copy is no longer needed; remove it from the frame.
combined_df = combined_df.drop(columns=['original_text'])

## Special characters and letter case

In [19]:
# Rows whose text has at least one character that is not an ASCII letter, a digit, or whitespace.
has_special_char = combined_df['text'].str.contains(r'[^A-Za-z0-9\s]', regex=True)
special_char_rows = combined_df.loc[has_special_char]

In [20]:
# Show a sample of the rows selected into special_char_rows.
# NOTE(review): the heading mentions numbers, but the selection pattern treats
# digits as ordinary characters, so rows are not flagged for containing digits.
print("\nExamples of rows with special characters or numbers:")
special_char_rows.head()


Examples of rows with special characters or numbers:


Unnamed: 0,title,text,subject,date,label
0,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",US_News,"February 13, 2017",1
1,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"April 5, 2017",0
2,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,politicsNews,"September 27, 2017",0
3,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",News,"May 22, 2017",1
4,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",politicsNews,"June 24, 2016",0


In [21]:
# Text normalisation helper
def clean_text(text):
    """Return ``text`` lowercased and reduced to letters, digits, and whitespace.

    Every character other than A-Z, a-z, 0-9, or whitespace is deleted
    (digits are kept, not removed); what is left is converted to lowercase.
    Whitespace is left untouched, so deleting a character between two spaces
    leaves both spaces in place.
    """
    return re.sub(r'[^A-Za-z0-9\s]', '', text).lower()

In [22]:
# Run clean_text over every article body, then preview the result.
cleaned_text = combined_df['text'].map(clean_text)
combined_df['text'] = cleaned_text
combined_df.head()

Unnamed: 0,title,text,subject,date,label
0,Ben Stein Calls Out 9th Circuit Court: Committ...,21st century wire says ben stein reputable pro...,US_News,"February 13, 2017",1
1,Trump drops Steve Bannon from National Securit...,washington reuters us president donald trump ...,politicsNews,"April 5, 2017",0
2,Puerto Rico expects U.S. to lift Jones Act shi...,reuters puerto rico governor ricardo rossello...,politicsNews,"September 27, 2017",0
3,OOPS: Trump Just Accidentally Confirmed He Le...,on monday donald trump once again embarrassed ...,News,"May 22, 2017",1
4,Donald Trump heads for Scotland to reopen a go...,glasgow scotland reuters most us presidential...,politicsNews,"June 24, 2016",0


# Punctuation

In [25]:
# Remove any remaining ASCII punctuation. string.punctuation contains regex
# metacharacters (backslash, brackets, caret, hyphen), so it is escaped before
# being placed in a character class; unescaped, the backslash is consumed as an
# escape for ']' and is therefore never matched itself.
punctuation_pattern = '[' + re.escape(string.punctuation) + ']'
combined_df['text'] = combined_df['text'].str.replace(punctuation_pattern, '', regex=True)

# Stopwords

In [None]:
#stop_words = set(stopwords.words('english'))
#lemmatizer = WordNetLemmatizer()

In [None]:
#def preprocess_text(text):
    # Tokenize
    #words = word_tokenize(text)
    # Remove stopwords and lemmatize
    #words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    #return ' '.join(words)

In [None]:
#combined_df['text'] = combined_df['text'].apply(preprocess_text)

In [26]:
# Create a CountVectorizer with its default settings.
vectorizer = CountVectorizer()

# Fit it on the cleaned text and transform the same text in one call.
# NOTE(review): vectorized_text is not used by any later cell visible here.
vectorized_text = vectorizer.fit_transform(combined_df['text'])

In [30]:
# Preview the first rows after the text-cleaning steps.
combined_df.head()

Unnamed: 0,title,text,subject,date,label
0,Ben Stein Calls Out 9th Circuit Court: Committ...,21st century wire says ben stein reputable pro...,US_News,"February 13, 2017",1
1,Trump drops Steve Bannon from National Securit...,washington reuters us president donald trump ...,politicsNews,"April 5, 2017",0
2,Puerto Rico expects U.S. to lift Jones Act shi...,reuters puerto rico governor ricardo rossello...,politicsNews,"September 27, 2017",0
3,OOPS: Trump Just Accidentally Confirmed He Le...,on monday donald trump once again embarrassed ...,News,"May 22, 2017",1
4,Donald Trump heads for Scotland to reopen a go...,glasgow scotland reuters most us presidential...,politicsNews,"June 24, 2016",0


# Save data

In [31]:
# Remove rows whose text is missing or was reduced to nothing (empty or
# whitespace-only) by the cleaning steps. Calling dropna(inplace=True) on the
# column Series does not remove rows from the DataFrame, and cleaned-out text is
# '' rather than NaN, so the rows are filtered explicitly on the frame instead.
has_text = combined_df['text'].notna() & combined_df['text'].str.strip().ne('')
combined_df = combined_df.loc[has_text]

In [37]:
# Destination folder and file for the cleaned dataset.
output_dir = '../Dataset_Cleaned'
output_file = os.path.join(output_dir, 'clean_train_ISOT.csv')

# Create the folder if it is missing, then write the frame without its index column.
os.makedirs(output_dir, exist_ok=True)
combined_df.to_csv(output_file, index=False)