# Synthetic Dataset Generation for NLP and Robotic Kinematics

This notebook generates a synthetic dataset optimized for combined NLP and robotic kinematics tasks in a reinforcement learning model. The dataset includes both textual and sensor data, with feature engineering applied to enhance its suitability for machine learning tasks.

In [None]:
%%capture
!pip install markovify

In [None]:
import os
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
import nltk
from nltk.corpus import brown
import markovify
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from imblearn.under_sampling import RandomUnderSampler

nltk.download('brown')


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

## NLP Data Generation Using Brown Corpus

In [None]:

# Build the Markov-chain text model used by generate_text_data().
# brown.words() supplies the Brown corpus tokens; they are joined with single
# spaces into one string, which markovify.Text then turns into a model.
text = ' '.join(brown.words())
text_model = markovify.Text(text)

def generate_text_data(num_samples):
    """Generate ``num_samples`` short sentences from the module-level ``text_model``.

    ``make_short_sentence`` returns ``None`` when it cannot produce a sentence
    within the length limit, which would end up as a missing value in the
    dataset. Each sample is therefore retried a bounded number of times and
    falls back to an empty string, so the result never contains ``None``.

    Args:
        num_samples: Number of sentences to generate.

    Returns:
        A list of ``num_samples`` strings, each at most 100 characters long.
    """
    max_chars = 100
    max_attempts = 10  # bounded so a degenerate model cannot loop forever

    sentences = []
    for _ in range(num_samples):
        sentence = None
        for _attempt in range(max_attempts):
            sentence = text_model.make_short_sentence(max_chars)
            if sentence is not None:
                break
        sentences.append(sentence if sentence is not None else '')
    return sentences


## Synthetic Dataset Generation Function

In [None]:

def generate_synthetic_dataset(num_samples=1000, num_sensors=128, num_actions=10, noise_level=0.05):
    """Build a synthetic DataFrame of text, sensor, action and reward columns.

    Args:
        num_samples: Number of rows to generate.
        num_sensors: Number of ``sensor_<i>`` columns to generate.
        num_actions: Actions are drawn uniformly from ``[0, num_actions)``.
        noise_level: Standard deviation of the Gaussian noise added to each
            sensor signal.

    Returns:
        A DataFrame with the columns ``timestamp``, ``text_data``,
        ``sensor_0`` .. ``sensor_<num_sensors - 1>``, ``action`` and ``reward``.
    """
    # Capture the start time once so consecutive rows are exactly one second
    # apart; calling datetime.now() per row lets the microseconds drift.
    start_time = datetime.now()
    timestamps = [start_time + timedelta(seconds=i) for i in range(num_samples)]

    text_data = generate_text_data(num_samples)

    sensor_data = {}
    for i in range(num_sensors):
        # Number of full sine cycles spanned by this sensor over all samples.
        num_cycles = np.random.randint(10, 100)
        signal = np.sin(np.linspace(0, 2 * np.pi * num_cycles, num_samples))
        noise = np.random.normal(0, noise_level, num_samples)
        sensor_data[f'sensor_{i}'] = signal + noise

    action_data = np.random.randint(0, num_actions, num_samples)
    reward_data = np.random.uniform(-1, 1, num_samples)

    return pd.DataFrame({
        'timestamp': timestamps,
        'text_data': text_data,
        **sensor_data,
        'action': action_data,
        'reward': reward_data,
    })


## Feature Engineering

In [None]:

def feature_engineering(df):
    """Standardize every ``sensor_*`` column of ``df`` and return ``df``.

    Each matching column is fitted and transformed on its own with
    ``StandardScaler``. The columns are overwritten on the frame that was
    passed in, so the caller's DataFrame is modified and the returned object
    is that same frame.
    """
    standardizer = StandardScaler()
    for name in df.columns:
        if not name.startswith('sensor_'):
            continue
        single_column = df[[name]]
        df[name] = standardizer.fit_transform(single_column)
    return df

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

def wrangle_data(file_path):
    """Load a generated dataset CSV and turn it into numeric features.

    Steps: parse ``timestamp``, forward-fill missing values, replace
    ``text_data`` with up to 50 TF-IDF columns named ``text_<i>``, and
    standardize all ``sensor_*`` columns.

    Args:
        file_path: Path of the CSV file to read. It must contain the
            ``timestamp`` and ``text_data`` columns.

    Returns:
        The processed DataFrame, with ``text_data`` dropped and the
        ``text_<i>`` columns appended after the original columns.
    """
    # Load the dataset
    data = pd.read_csv(file_path)

    # Parse the timestamp
    data['timestamp'] = pd.to_datetime(data['timestamp'])

    # Handle missing values (if any)
    # This is a placeholder; you'll need to decide how to handle missing values based on your specific dataset
    # DataFrame.ffill() replaces the deprecated fillna(method='ffill').
    data = data.ffill()

    # A forward fill cannot fill missing values in the leading rows, and
    # TfidfVectorizer rejects NaN documents, so remaining gaps in the text
    # become empty strings.
    data['text_data'] = data['text_data'].fillna('').astype(str)

    # Encode the 'text_data' using TF-IDF
    # Note: You might want to adjust max_features based on the dataset
    tfidf = TfidfVectorizer(max_features=50)
    tfidf_features = tfidf.fit_transform(data['text_data']).toarray()
    tfidf_df = pd.DataFrame(
        tfidf_features,
        columns=[f'text_{i}' for i in range(tfidf_features.shape[1])],
        index=data.index,  # keep rows aligned when concatenating column-wise
    )
    data = pd.concat([data, tfidf_df], axis=1).drop('text_data', axis=1)

    # Normalize the sensor data (skipped when the file has no sensor columns,
    # because the scaler cannot be fitted on zero features)
    sensor_columns = [col for col in data.columns if col.startswith('sensor_')]
    if sensor_columns:
        scaler = StandardScaler()
        data[sensor_columns] = scaler.fit_transform(data[sensor_columns])

    return data

# Usage
# NOTE(review): this reads '/content/synthetic_datasetV3.csv', whereas the save
# cell in this notebook writes 'synthetic_dataset_advanced.csv' and
# '/content/drive/My Drive/synthetic_datasetV3.csv' — confirm the file exists
# at this path before running this cell.
file_path = '/content/synthetic_datasetV3.csv'
processed_data = wrangle_data(file_path)


## Dataset Reorganization for Reinforcement Learning

In [None]:

def reorganize_for_rl(df, num_sensors=128, random_state=None):
    """Balance ``df`` so that every action has the same number of rows.

    Each action is randomly down-sampled to the size of the rarest action.
    The sampled groups are concatenated in the order produced by
    ``value_counts`` and the original row index labels are kept.

    Args:
        df: DataFrame with an ``action`` column.
        num_sensors: Unused; kept so existing callers keep working.
        random_state: Optional seed (or random state object accepted by
            ``DataFrame.sample``) that makes the sampling reproducible.

    Returns:
        A new, balanced DataFrame. An empty input yields an empty copy.
    """
    # pd.concat raises on an empty list, so handle the empty frame up front.
    if df.empty:
        return df.copy()

    action_counts = df['action'].value_counts()
    rows_per_action = int(action_counts.min())

    # Share one generator across groups for an integer seed, so each action
    # gets an independent draw instead of the same seeded pattern.
    sampler = random_state
    if isinstance(random_state, (int, np.integer)):
        sampler = np.random.RandomState(random_state)

    sampled_groups = [
        df[df['action'] == action].sample(rows_per_action, random_state=sampler)
        for action in action_counts.index
    ]
    return pd.concat(sampled_groups)


## Directory Creation for File Saving

Before saving files, it's important to ensure that the target directory exists. The following cell will create the `/mnt/data` directory if it doesn't already exist. This prevents errors when saving files to this directory.

In [None]:
# Create the directory if it doesn't exist; exist_ok=True suppresses the
# error that would otherwise be raised when it is already present.
# NOTE(review): the save cell in this notebook writes to the working directory
# and to Google Drive, not to /mnt/data — confirm this directory is still needed.
os.makedirs('/mnt/data', exist_ok=True)

## Generate and Save the Dataset

In [None]:
# Generate the dataset using the default arguments of generate_synthetic_dataset()
dataset = generate_synthetic_dataset()

# Apply feature engineering (standardizes the sensor_* columns).
# feature_engineering() overwrites the columns on the frame it receives and
# returns that same frame, so `dataset` is modified as well.
engineered_dataset = feature_engineering(dataset)

# Reorganize the dataset for RL (down-samples so every action has the same row count)
rl_dataset = reorganize_for_rl(engineered_dataset)

# Save to CSV in Colab's local workspace (relative path, i.e. the current working directory)
rl_dataset.to_csv('synthetic_dataset_advanced.csv', index=False)

# Also save a copy to Google Drive. The lines below mount the drive and only
# work inside Google Colab; comment them out when running elsewhere.

from google.colab import drive
drive.mount('/content/drive')

# Then write the file to the chosen path inside the mounted drive
rl_dataset.to_csv('/content/drive/My Drive/synthetic_datasetV3.csv', index=False)


Mounted at /content/drive


In [None]:
# Reload the saved CSV and preview its first three rows as a sanity check.
# NOTE(review): the save cell writes 'synthetic_dataset_advanced.csv' with a
# relative path; this assumes the working directory was /content — confirm.
df = pd.read_csv('/content/synthetic_dataset_advanced.csv')
df.head(3)

Unnamed: 0,timestamp,text_data,sensor_0,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,...,sensor_120,sensor_121,sensor_122,sensor_123,sensor_124,sensor_125,sensor_126,sensor_127,action,reward
0,2024-02-06 00:53:50.668844,,-0.73309,-0.95224,-1.022493,-1.283032,0.450022,-0.284153,-0.911242,0.789324,...,1.287007,-0.979347,-1.007706,-0.778173,1.293042,-1.12106,0.745845,0.526928,6,-0.991363
1,2024-02-06 00:48:48.668504,7 the portion of the knife . **yc is defined b...,1.478311,-0.572608,0.995453,0.521949,-0.075699,1.381793,-0.511772,1.422475,...,-0.48974,0.40923,0.352797,1.500702,1.450614,1.414393,1.411396,-0.245006,6,-0.356302
2,2024-02-06 00:54:48.668907,2 . Plastic signs are economical . According t...,1.289525,-0.275127,-0.421727,-1.424338,-1.473373,1.157909,-0.2179,-0.820891,...,1.068416,1.485613,1.418063,1.284905,-1.401183,1.123872,-0.653912,-0.642793,6,0.974335
