### Preprocessing

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Bidirectional, Dense, Dropout, GlobalMaxPooling1D, Input, concatenate
from tensorflow.keras.optimizers import Adamax
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import re
import time
import random

nltk.download('stopwords')
nltk.download('punkt_tab')

# Download WordNet data
nltk.download('wordnet')
nltk.download('omw-1.4')  # For multilingual support

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ashis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ashis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ashis\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ashis\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
# Load dataset
true_df = pd.read_csv('true_1.csv')
fake_df = pd.read_csv('fake_1.csv')
true_df['label'] = 1
fake_df['label'] = 0

In [4]:
# Data preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenization
    tokens = text.split()
    # Remove stopwords
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

In [5]:
true_df['text'] = true_df['text'].apply(preprocess_text)

In [7]:
fake_df = fake_df.dropna(subset=['text'])
fake_df['text'] = fake_df['text'].apply(preprocess_text)

### Augmentation

In [10]:
!pip install deep_translator

Collecting deep_translator
  Using cached deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Collecting beautifulsoup4<5.0.0,>=4.9.1 (from deep_translator)
  Downloading beautifulsoup4-4.13.4-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4<5.0.0,>=4.9.1->deep_translator)
  Downloading soupsieve-2.7-py3-none-any.whl.metadata (4.6 kB)
Using cached deep_translator-1.11.4-py3-none-any.whl (42 kB)
Downloading beautifulsoup4-4.13.4-py3-none-any.whl (187 kB)
Downloading soupsieve-2.7-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4, deep_translator
Successfully installed beautifulsoup4-4.13.4 deep_translator-1.11.4 soupsieve-2.7


In [11]:
from deep_translator import GoogleTranslator
import pandas as pd
import random

In [12]:
# Compute required samples
num_to_generate = len(fake_df) - len(true_df)
extra_samples = true_df.sample(n=num_to_generate, replace=True, random_state=42).copy()

In [13]:
# Backtranslation function
counter = 0
def backtranslate(text, src='en', mid='de'):
    global counter
    try:
        translated = GoogleTranslator(source=src, target=mid).translate(text)
        back_translated = GoogleTranslator(source=mid, target=src).translate(translated)
        counter += 1
        print(f"Backtranslation successful: {counter}/{num_to_generate} completed.")
        return back_translated
    except Exception as e:
        print(f"Translation error: {e}")
        return text  # Return original if translation fails

In [15]:
# Apply backtranslation
extra_samples['text'] = extra_samples['text'].apply(lambda x: backtranslate(x))

Backtranslation successful: 14/2085 completed.
Backtranslation successful: 15/2085 completed.
Backtranslation successful: 16/2085 completed.
Backtranslation successful: 17/2085 completed.
Backtranslation successful: 18/2085 completed.
Backtranslation successful: 19/2085 completed.
Backtranslation successful: 20/2085 completed.
Backtranslation successful: 21/2085 completed.
Backtranslation successful: 22/2085 completed.
Backtranslation successful: 23/2085 completed.
Backtranslation successful: 24/2085 completed.
Backtranslation successful: 25/2085 completed.
Backtranslation successful: 26/2085 completed.
Backtranslation successful: 27/2085 completed.
Backtranslation successful: 28/2085 completed.
Backtranslation successful: 29/2085 completed.
Backtranslation successful: 30/2085 completed.
Backtranslation successful: 31/2085 completed.
Backtranslation successful: 32/2085 completed.
Backtranslation successful: 33/2085 completed.
Backtranslation successful: 34/2085 completed.
Backtranslati

In [29]:
augmented_true_df = pd.concat([true_df, extra_samples], ignore_index=True)

In [30]:
augmented_fake_df= fake_df.copy()

In [31]:
print(len(augmented_true_df))
print(len(augmented_fake_df))

23501
23501


In [32]:
df = pd.concat([augmented_true_df, augmented_fake_df])
df.head()

Unnamed: 0,title,text,subject,Unnamed: 4,label
0,"As U.S. budget fight looms, Republicans flip t...",washington reuters head conservative republica...,politicsNews,,1
1,U.S. military to accept transgender recruits o...,washington reuters transgender people allowed ...,politicsNews,,1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,washington reuters special counsel investigati...,politicsNews,,1
3,FBI Russia probe helped by Australian diplomat...,washington reuters trump campaign adviser geor...,politicsNews,,1
4,Trump wants Postal Service to charge 'much mor...,seattlewashington reuters president donald tru...,politicsNews,,1


In [33]:
df.drop(columns='subject', inplace=True)
df = df.drop(df.columns[2], axis=1)
df.drop(columns='title', inplace=True)

In [34]:
df.head()

Unnamed: 0,text,label
0,washington reuters head conservative republica...,1
1,washington reuters transgender people allowed ...,1
2,washington reuters special counsel investigati...,1
3,washington reuters trump campaign adviser geor...,1
4,seattlewashington reuters president donald tru...,1


In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47002 entries, 0 to 23556
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    47002 non-null  object
 1   label   47002 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.1+ MB


In [36]:
df.to_csv('1_augmented_df.csv', index=False)