In [4]:
import random
import csv
import re

# Variants for operations and asset types
operations_variants = {
    'Set': ['Set', 'Allocate', 'Place', 'Position', 'Designate', 'Assign'],
    'Update': ['Update', 'Modify', 'Adjust', 'Change', 'Revise', 'Amend', 'Alter']
}

asset_types_variants = {
    'LARGE_CAP_STOCKS': ['large-cap stocks', 'major stocks', 'big-cap stocks'],
    'MID_CAP_STOCKS': ['mid-cap stocks', 'medium stocks', 'mid-size stocks'],
    'SMALL_CAP_STOCKS': ['small-cap stocks', 'minor stocks', 'small-size stocks'],
    'BONDS': ['bonds', 'fixed-income securities', 'debt instruments']
}

# Variants for sentence structures
sentence_structures = [
    "{operation} {percentage} to {asset_type} in {portfolio}",
    "Could you {operation} {percentage} to {asset_type} in {portfolio}?",
    "Thinking of {operation}ing {percentage} {asset_type} into {portfolio}",
    "{operation} {portfolio} by adding {percentage} {asset_type}, please.",
    "Please {operation} {asset_type} by {percentage} in {portfolio}.",
    "I'm considering {operation}ing {portfolio} with an additional {percentage} of {asset_type}.",
    "How about we {operation} {percentage} of {asset_type} in {portfolio}?",
    "Is it possible to {operation} {portfolio}'s {asset_type} allocation by {percentage}?",
    "{operation}ing {portfolio} to include {percentage} more {asset_type}.",
    "We're moving to {operation} {percentage} more to {asset_type} in {portfolio}, correct?",
    "Plan to {operation} the {asset_type} in {portfolio} by {percentage}.",
    "Let's {operation} {percentage} more of {asset_type} to {portfolio}'s portfolio."
]

portfolios = [
    'myPortfolio', 'PortfolioA', 'PortfolioB',
    'GlobalEquityFund', 'TechGrowth', 'IncomeFund2024',
    'GreenEnergyInvest', 'BlueChipStocks', 'HighYieldBonds',
    'RealEstateHoldings', 'PreciousMetalsFund', 'EmergingMarkets',
    'VentureCapital', 'AlphaFund', 'BetaPortfolio',
    'CryptoAssets', 'DividendGrowers', 'ValueInvest'
]

def generate_percentage_variants():
    formats = ['{}%', '{} percent', '{} percentage', '{} proportion']
    percentages = []
    for i in range(1, 101):
        for form in formats:
            percentages.append(form.format(i))
    return percentages

percentages = generate_percentage_variants()

def choose_variant(variants_dict):
    return {key: random.choice(value) for key, value in variants_dict.items()}

# Track seen sentences to avoid duplicates
seen_sentences = set()

def generate_unique_samples(operations_variants, asset_types_variants, sentence_structures, portfolios, percentages, n=100):
    samples = []
    while len(samples) < n:
        operation = random.choice(list(operations_variants.keys()))
        asset_type_key = random.choice(list(asset_types_variants.keys()))
        
        operation_variants = choose_variant(operations_variants)
        asset_type_variants = choose_variant(asset_types_variants)

        data = {
            "operation": operation_variants[operation],
            "asset_type": asset_type_variants[asset_type_key],
            "portfolio": random.choice(portfolios),
            "percentage": random.choice(percentages)
        }

        sentence_structure = random.choice(sentence_structures)
        sentence = sentence_structure.format(**data)
        
        if sentence not in seen_sentences:
            seen_sentences.add(sentence)
            samples.append(sentence)
    return samples

generated_sentences = generate_unique_samples(operations_variants, asset_types_variants, sentence_structures, portfolios, percentages, n=100)

for sentence in generated_sentences[:10]:
    print(sentence)

def generate_dataset(num_samples, file_path, operations_variants, asset_types_variants, portfolios, percentages, sentence_structures):
    seen_training_samples = set()
    training_samples = []

    while len(training_samples) < num_samples:
        operation_key = random.choice(list(operations_variants.keys()))
        asset_type_key = random.choice(list(asset_types_variants.keys()))
        asset_type_variant = random.choice(asset_types_variants[asset_type_key])
        portfolio = random.choice(portfolios)
        percentage = random.choice(percentages)
        sentence_structure = random.choice(sentence_structures)
        
        input_text = sentence_structure.format(
            operation=random.choice(operations_variants[operation_key]).lower(),
            asset_type=asset_type_variant,
            portfolio=portfolio,
            percentage=percentage
        )
        
        # Extract only the numerical part for the DSL command
        percentage_value = re.search(r'\d+', percentage).group(0) + '%'  # Ensuring output is always 'number%'
        
        # Generate corresponding Python code
        if operation_key == 'Set':
            python_code = (
                f"if '{portfolio}' not in portfolio:\n"
                f"    portfolio['{portfolio}'] = {{}}\n"
                f"portfolio['{portfolio}']['{asset_type_key}'] = '{percentage_value}'"
            )
        elif operation_key == 'Update':
            python_code = (
                f"if '{portfolio}' in portfolio and '{asset_type_key}' in portfolio['{portfolio}']:\n"
                f"    portfolio['{portfolio}']['{asset_type_key}'] = '{percentage_value}'  # Update the allocation\n"
                f"else:\n"
                f"    raise ValueError('Portfolio or asset type not found')"
            )

        training_sample = {'input': input_text, 'output': python_code}
        training_sample_tuple = (input_text, python_code)
        
        if training_sample_tuple not in seen_training_samples:
            seen_training_samples.add(training_sample_tuple)
            training_samples.append(training_sample)
    
    # Display some examples to verify the correct formatting in outputs
    for sample in training_samples[:5]:
        print(sample)

    # Writing to CSV
    with open(file_path, mode='w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=['input', 'output'])
        writer.writeheader()
        for sample in training_samples:
            writer.writerow(sample)

generate_dataset(5000, 'D:\\XJTLU\\YEAR4\\FYP\\TrainPython.csv', operations_variants, asset_types_variants, portfolios, percentages, sentence_structures)


Positioning RealEstateHoldings to include 40 proportion more large-cap stocks.
Is it possible to Set PreciousMetalsFund's minor stocks allocation by 41 percent?
Could you Change 99 percent to minor stocks in CryptoAssets?
How about we Set 94 percent of major stocks in IncomeFund2024?
Could you Designate 20 proportion to mid-size stocks in EmergingMarkets?
Updateing RealEstateHoldings to include 95 proportion more bonds.
Plan to Adjust the mid-cap stocks in BetaPortfolio by 49 percent.
I'm considering Changeing PortfolioB with an additional 39% of fixed-income securities.
Thinking of Seting 55 proportion mid-cap stocks into GlobalEquityFund
Could you Alter 3 proportion to small-size stocks in HighYieldBonds?
{'input': 'How about we modify 1 percentage of fixed-income securities in AlphaFund?', 'output': "if 'AlphaFund' in portfolio and 'BONDS' in portfolio['AlphaFund']:\n    portfolio['AlphaFund']['BONDS'] = '1%'  # Update the allocation\nelse:\n    raise ValueError('Portfolio or asset 