In [1]:
import random

import pandas as pd
import tensorflow as tf
from sklearn.utils import shuffle
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Load dataset from CSV. The later cells read two columns from it:
# 'description' (the text to augment) and 'category' (the label).
input_file = "machineLearning_Dataset.csv"  # Replace with your file path
output_file = "augmented_dataset.csv"  # Destination of the combined original + augmented rows
data = pd.read_csv(input_file)

In [4]:
# Ensure the dataset has every column the later cells read:
# 'description' supplies the text to augment and 'category' supplies the labels.
# Checking both here fails fast with one clear message instead of a KeyError
# several cells further down.
required_columns = ('description', 'category')
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    raise ValueError(
        f"Dataset is missing required column(s): {missing_columns}. "
        f"Expected columns: {list(required_columns)}."
    )


In [5]:
# Tokenization with the Keras Tokenizer
# Create a Tokenizer with default settings and fit it on the 'description' column.
# NOTE(review): `tokenizer` is not referenced by any later cell visible here;
# confirm whether this cell is still needed.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['description'])

In [6]:
# Augmentation function using random shuffling of words
def augment_text(text):
    """Return ``text`` with its whitespace-separated words in random order.

    Args:
        text: The string to augment.

    Returns:
        A string containing the same words as ``text``, joined by single
        spaces, in a randomly permuted order. Because the text is split on
        whitespace and re-joined, leading/trailing whitespace is dropped and
        runs of whitespace collapse to one space. Texts with zero or one word
        are returned with only that whitespace normalisation applied.

    Note:
        The permutation is uniformly random, so the result can occasionally
        equal the input order (most likely for very short texts).
    """
    tokens = text.split()
    if len(tokens) > 1:
        # random.shuffle permutes the list in place. The previous call to
        # tf.random.shuffle returned a new tensor and left `tokens` untouched,
        # and that return value was discarded, so nothing was ever shuffled.
        random.shuffle(tokens)
    return ' '.join(tokens)

In [8]:
# Produce one augmented string per description, in the same row order as `data`
augmented_texts = [augment_text(description) for description in data['description']]

In [9]:
# Stack the original descriptions on top of their augmented counterparts.
# Labels are taken from the 'category' column and repeated once per copy, so
# row i and row i + len(data) carry the same label.
augmented_data = pd.DataFrame({
    'text': [*data['description'], *augmented_texts],
    'label': [*data['category']] * 2,
})


In [10]:
# Shuffle the dataset so original and augmented rows are mixed together,
# then rebuild a fresh 0..n-1 index (drop=True discards the shuffled one).
# NOTE(review): no random_state is passed, so the row order differs on every run.
augmented_data = shuffle(augmented_data).reset_index(drop=True)

In [11]:
# Save the combined dataset to `output_file` as a new CSV file.
# index=False keeps the DataFrame's row index out of the written file.
augmented_data.to_csv(output_file, index=False)