In [1]:
import pandas as pd

import re

The Fake News Classification Dataset is an English-language dataset containing just over 45,000 unique news articles. Each article is labeled as true (1) or false (0), making the dataset a valuable resource for researchers and practitioners working on fake news identification with Transformer models. This is the first version of the dataset aimed at studying fake news detection.

https://www.kaggle.com/datasets/aadyasingh55/fake-news-classification

In [2]:
# Render every column when a frame is displayed instead of eliding the middle ones.
pd.set_option("display.max_columns", None)

# The training split is stored as a semicolon-separated file.
df_dataset_3 = pd.read_csv(
    "csv_proyecto_personal/dataset_3/train_2.csv",
    sep=";",
)

df_dataset_3

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,Palestinians switch off Christmas lights in Be...,"RAMALLAH, West Bank (Reuters) - Palestinians s...",1
1,1,China says Trump call with Taiwan president wo...,BEIJING (Reuters) - U.S. President-elect Donal...,1
2,2,FAIL! The Trump Organization’s Credit Score W...,While the controversy over Trump s personal ta...,0
3,3,Zimbabwe military chief's China trip was norma...,BEIJING (Reuters) - A trip to Beijing last wee...,1
4,4,THE MOST UNCOURAGEOUS PRESIDENT EVER Receives ...,There has never been a more UNCOURAGEOUS perso...,0
...,...,...,...,...
24348,24348,Mexico Senate committee OK's air transport dea...,MEXICO CITY (Reuters) - A key committee in Mex...,1
24349,24349,BREAKING: HILLARY CLINTON’S STATE DEPARTMENT G...,IF SHE S NOT TOAST NOW THEN WE RE IN BIGGER TR...,0
24350,24350,trump breaks from stump speech to admire beaut...,kremlin nato was created for agression \nruss...,0
24351,24351,NFL PLAYER Delivers Courageous Message: Stop B...,Dallas Cowboys star wide receiver Dez Bryant t...,0


In [3]:
# Report the frame's dimensions; .shape is (rows, columns).
row_total, column_total = df_dataset_3.shape
print(f'There are {row_total} rows in Dataset3')
print(f'There are {column_total} columns in Dataset3')

There are 24353 rows in Dataset3
There are 4 columns in Dataset3


In [4]:
# Summary statistics for the numeric columns, one row per column.
df_dataset_3.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,24353.0,12176.0,7030.249889,0.0,6088.0,12176.0,18264.0,24352.0
label,24353.0,0.543917,0.498078,0.0,0.0,1.0,1.0,1.0


In [5]:
# Count / unique / most-frequent summary for the object (text) columns, one row per column.
df_dataset_3.describe(include="object").transpose()

Unnamed: 0,count,unique,top,freq
title,24353,24062,no title,105
text,24353,24352,"In its 109-year history, only one F.B.I. direc...",2


In [6]:
# List the distinct values of the 'title' column.
df_dataset_3['title'].unique()

array(['Palestinians switch off Christmas lights in Bethlehem in anti-Trump protest',
       "China says Trump call with Taiwan president won't change island's status",
       ' FAIL! The Trump Organization’s Credit Score Will Make You Laugh',
       ...,
       'trump breaks from stump speech to admire beautiful trump mask',
       'NFL PLAYER Delivers Courageous Message: Stop Blaming White People…Black People Are Holding Black People Back',
       'NORDSTROM STOCK TAKES NOSEDIVE After Trump Tweets About Their Decision To Discontinue Ivanka’s Brand'],
      shape=(24062,), dtype=object)

In [7]:
# Show the dtype pandas inferred for each column.
df_dataset_3.dtypes

Unnamed: 0     int64
title         object
text          object
label          int64
dtype: object

In [8]:
# Label encoding in the raw file: 1 is true, 0 is false.
#
# The 'label' column is numeric, so a row-by-row comparison against the strings
# 'TRUE' / 'FALSE' can never match and leaves every value untouched. Instead of
# looping over all rows for nothing, verify in one vectorized step that the
# column really contains only the two expected codes.
assert df_dataset_3['label'].isin([0, 1]).all(), \
    "Unexpected values in 'label': expected only 0 (false) and 1 (true)"

df_dataset_3.head()



Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,Palestinians switch off Christmas lights in Be...,"RAMALLAH, West Bank (Reuters) - Palestinians s...",1
1,1,China says Trump call with Taiwan president wo...,BEIJING (Reuters) - U.S. President-elect Donal...,1
2,2,FAIL! The Trump Organization’s Credit Score W...,While the controversy over Trump s personal ta...,0
3,3,Zimbabwe military chief's China trip was norma...,BEIJING (Reuters) - A trip to Beijing last wee...,1
4,4,THE MOST UNCOURAGEOUS PRESIDENT EVER Receives ...,There has never been a more UNCOURAGEOUS perso...,0


In [9]:
# Class balance: number of rows per label value.
df_dataset_3['label'].value_counts()

label
1    13246
0    11107
Name: count, dtype: int64

In [10]:
# Print column dtypes, non-null counts and memory usage.
df_dataset_3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24353 entries, 0 to 24352
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  24353 non-null  int64 
 1   title       24353 non-null  object
 2   text        24353 non-null  object
 3   label       24353 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 761.2+ KB


In [11]:
# Select the rows whose (title, text) pair already appeared earlier in the
# frame, so the repeated articles can be inspected.
duplicates_df_3 = df_dataset_3.loc[
    df_dataset_3.duplicated(subset=["title", "text"])
]

duplicates_df_3

Unnamed: 0.1,Unnamed: 0,title,text,label
18813,18813,MEDIA IGNORES Time That Bill Clinton FIRED His...,"In its 109-year history, only one F.B.I. direc...",0


In [12]:
# Remove the 'Unnamed: 0' column (it appears to be a saved copy of the row
# index - TODO confirm), keeping only title, text and label.
df_dataset_3 = df_dataset_3.drop(columns=['Unnamed: 0'])

df_dataset_3.head()

Unnamed: 0,title,text,label
0,Palestinians switch off Christmas lights in Be...,"RAMALLAH, West Bank (Reuters) - Palestinians s...",1
1,China says Trump call with Taiwan president wo...,BEIJING (Reuters) - U.S. President-elect Donal...,1
2,FAIL! The Trump Organization’s Credit Score W...,While the controversy over Trump s personal ta...,0
3,Zimbabwe military chief's China trip was norma...,BEIJING (Reuters) - A trip to Beijing last wee...,1
4,THE MOST UNCOURAGEOUS PRESIDENT EVER Receives ...,There has never been a more UNCOURAGEOUS perso...,0


In [13]:

# Leading prefix of the form "<words> (<letters/spaces>) ", e.g. a
# "BEIJING (Reuters) " style opener at the very start of the text.
pattern = r"^[A-Za-z,]+(?:\s[A-Za-z]+)*\s\([A-Za-z\s]+\)\s"

# Leading hyphen (optionally preceded by whitespace) at the start of the text.
pattern_hyphen = r"^\s*-"

# Strip each prefix in turn: the parenthesised opener first, then the hyphen.
# Every value is coerced to str and trimmed after each substitution.
for prefix_regex in (pattern, pattern_hyphen):
    df_dataset_3['text'] = df_dataset_3['text'].map(
        lambda raw, rx=prefix_regex: re.sub(rx, "", str(raw)).strip()
    )

In [14]:
# Preview the first rows after the text clean-up.
df_dataset_3.head()

Unnamed: 0,title,text,label
0,Palestinians switch off Christmas lights in Be...,Palestinians switched off Christmas lights at ...,1
1,China says Trump call with Taiwan president wo...,U.S. President-elect Donald Trump’s call with ...,1
2,FAIL! The Trump Organization’s Credit Score W...,While the controversy over Trump s personal ta...,0
3,Zimbabwe military chief's China trip was norma...,A trip to Beijing last week by Zimbabwe s mili...,1
4,THE MOST UNCOURAGEOUS PRESIDENT EVER Receives ...,There has never been a more UNCOURAGEOUS perso...,0


In [15]:
# Convert the numeric class codes into readable text labels (0 -> FAKE, 1 -> REAL).
#
# One vectorized replace builds a new column in a single step. This avoids
# writing strings cell-by-cell into an integer column (pandas warns that such
# incompatible-dtype writes are deprecated), does not rely on the index being
# 0..n-1, and is much faster than a Python loop over every row.
# Any value other than 0 or 1 is left unchanged.
df_dataset_3['label'] = df_dataset_3['label'].replace({0: 'FAKE', 1: 'REAL'})

df_dataset_3.head()

  df_dataset_3.at[idx, 'label'] = 'REAL'


Unnamed: 0,title,text,label
0,Palestinians switch off Christmas lights in Be...,Palestinians switched off Christmas lights at ...,REAL
1,China says Trump call with Taiwan president wo...,U.S. President-elect Donald Trump’s call with ...,REAL
2,FAIL! The Trump Organization’s Credit Score W...,While the controversy over Trump s personal ta...,FAKE
3,Zimbabwe military chief's China trip was norma...,A trip to Beijing last week by Zimbabwe s mili...,REAL
4,THE MOST UNCOURAGEOUS PRESIDENT EVER Receives ...,There has never been a more UNCOURAGEOUS perso...,FAKE


In [16]:
# Count rows that are exact repeats (all columns equal) of an earlier row.
df_dataset_3.duplicated().sum()

np.int64(2)

In [17]:
# Keep only the first occurrence of each fully identical row, then re-count
# the remaining repeats to confirm none are left.
df_dataset_3 = df_dataset_3.drop_duplicates(keep="first")
df_dataset_3.duplicated(keep="first").sum()

np.int64(0)

In [18]:
# Write the cleaned frame to 'dataset_3.csv' (path is relative to the current
# working directory), without the row index.
df_dataset_3.to_csv('dataset_3.csv', index=False)