In [1]:
import random
import csv
import re

# Variants for operations and asset types
operations_variants = {
    'Set': ['Set', 'Allocate', 'Place', 'Position', 'Designate', 'Assign'],
    'Update': ['Update', 'Modify', 'Adjust', 'Change', 'Revise', 'Amend', 'Alter']
}

asset_types_variants = {
    'LARGE_CAP_STOCKS': ['large-cap stocks', 'major stocks', 'big-cap stocks'],
    'MID_CAP_STOCKS': ['mid-cap stocks', 'medium stocks', 'mid-size stocks'],
    'SMALL_CAP_STOCKS': ['small-cap stocks', 'minor stocks', 'small-size stocks'],
    'BONDS': ['bonds', 'fixed-income securities', 'debt instruments']
}

# Variants for sentence structures including multiple asset types
sentence_structures = [
    "{operation} {percentage} to {asset_type1} and {another_percentage} to {asset_type2} in {portfolio}",
    "Could you {operation} {percentage} to {asset_type1} and {another_percentage} to {asset_type2} in {portfolio}?",
    "Thinking of {operation}ing {percentage} to {asset_type1} and {another_percentage} to {asset_type2} into {portfolio}",
    "{operation} {portfolio} by adding {percentage} to {asset_type1} and {another_percentage} to {asset_type2}, please.",
    "Please {operation} {asset_type1} by {percentage} and {asset_type2} by {another_percentage} in {portfolio}.",
    "I'm considering {operation}ing {portfolio} with an additional {percentage} to {asset_type1} and {another_percentage} to {asset_type2}.",
    "How about we {operation} {percentage} to {asset_type1} and {another_percentage} to {asset_type2} in {portfolio}?",
    "Is it possible to {operation} {portfolio}'s {asset_type1} allocation by {percentage} and {asset_type2} allocation by {another_percentage}?",
    "{operation}ing {portfolio} to include {percentage} more to {asset_type1} and {another_percentage} more to {asset_type2}.",
    "We're moving to {operation} {percentage} more to {asset_type1} and {another_percentage} more to {asset_type2} in {portfolio}, correct?",
    "Plan to {operation} the {asset_type1} in {portfolio} by {percentage} and {asset_type2} by {another_percentage}.",
    "Let's {operation} {percentage} more of {asset_type1} and {another_percentage} more of {asset_type2} to {portfolio}'s portfolio."
]

portfolios = [
    'myPortfolio', 'PortfolioA', 'PortfolioB',
    'GlobalEquityFund', 'TechGrowth', 'IncomeFund2024',
    'GreenEnergyInvest', 'BlueChipStocks', 'HighYieldBonds',
    'RealEstateHoldings', 'PreciousMetalsFund', 'EmergingMarkets',
    'VentureCapital', 'AlphaFund', 'BetaPortfolio',
    'CryptoAssets', 'DividendGrowers', 'ValueInvest'
]

def generate_percentage_variants():
    formats = ['{}%', '{} percent', '{} percentage', '{} proportion']
    percentages = []
    for i in range(1, 101):
        for form in formats:
            percentages.append(form.format(i))
    return percentages

percentages = generate_percentage_variants()

def choose_variant(variants_dict):
    return {key: random.choice(value) for key, value in variants_dict.items()}

# Track seen sentences to avoid duplicates
seen_sentences = set()

def generate_unique_samples(operations_variants, asset_types_variants, sentence_structures, portfolios, percentages, n=100):
    samples = []
    while len(samples) < n:
        operation = random.choice(list(operations_variants.keys()))
        asset_type_keys = list(asset_types_variants.keys())
        asset_type_key1 = random.choice(asset_type_keys)
        asset_type_keys.remove(asset_type_key1)  # Ensure the second asset type is different
        asset_type_key2 = random.choice(asset_type_keys)
        
        operation_variants = choose_variant(operations_variants)
        asset_type_variants = choose_variant(asset_types_variants)

        data = {
            "operation": operation_variants[operation],
            "asset_type1": asset_type_variants[asset_type_key1],
            "asset_type2": asset_type_variants[asset_type_key2],
            "portfolio": random.choice(portfolios),
            "percentage": random.choice(percentages),
            "another_percentage": random.choice(percentages)
        }

        sentence_structure = random.choice(sentence_structures)
        sentence = sentence_structure.format(**data)
        
        if sentence not in seen_sentences:
            seen_sentences.add(sentence)
            samples.append(sentence)
    return samples

generated_sentences = generate_unique_samples(operations_variants, asset_types_variants, sentence_structures, portfolios, percentages, n=100)

for sentence in generated_sentences[:10]:
    print(sentence)

def generate_dataset(num_samples, file_path, operations_variants, asset_types_variants, portfolios, percentages, sentence_structures):
    seen_training_samples = set()
    training_samples = []

    while len(training_samples) < num_samples:
        operation_key = random.choice(list(operations_variants.keys()))
        asset_type_keys = list(asset_types_variants.keys())
        asset_type_key1 = random.choice(asset_type_keys)
        asset_type_keys.remove(asset_type_key1)  # Ensure the second asset type is different
        asset_type_key2 = random.choice(asset_type_keys)
        asset_type_variant1 = random.choice(asset_types_variants[asset_type_key1])
        asset_type_variant2 = random.choice(asset_types_variants[asset_type_key2])
        portfolio = random.choice(portfolios)
        percentage = random.choice(percentages)
        another_percentage = random.choice(percentages)
        sentence_structure = random.choice(sentence_structures)
        
        input_text = sentence_structure.format(
            operation=random.choice(operations_variants[operation_key]).lower(),
            asset_type1=asset_type_variant1,
            asset_type2=asset_type_variant2,
            portfolio=portfolio,
            percentage=percentage,
            another_percentage=another_percentage
        )
        
        # Extract only the numerical parts for the command
        percentage_value1 = re.search(r'\d+', percentage).group(0) + '%'  # Ensuring output is always 'number%'
        percentage_value2 = re.search(r'\d+', another_percentage).group(0) + '%'  # Ensuring output is always 'number%'
        
        # Generate the fund strategy directly
        fund_strategy = (
            f"Strategy for {portfolio}:\n"
            f"1. {operation_key.capitalize()} {percentage_value1} of assets to {asset_type_variant1}.\n"
            f"2. {operation_key.capitalize()} {percentage_value2} of assets to {asset_type_variant2}.\n"
            f"3. Ensure the remaining balance is properly diversified.\n"
            f"4. Monitor the performance of {asset_type_variant1} and {asset_type_variant2} regularly.\n"
            f"5. Rebalance the portfolio quarterly to maintain the target allocation."
        )

        training_sample = {'input': input_text, 'output': fund_strategy}
        training_sample_tuple = (input_text, fund_strategy)
        
        if training_sample_tuple not in seen_training_samples:
            seen_training_samples.add(training_sample_tuple)
            training_samples.append(training_sample)

    with open(file_path, 'w', newline='') as csvfile:
        fieldnames = ['input', 'output']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for sample in training_samples:
            writer.writerow(sample)

# Generate the dataset and save to file
generate_dataset(5000, 'D:\\XJTLU\\YEAR4\\FYP\\TrainDirect.csv', operations_variants, asset_types_variants, portfolios, percentages, sentence_structures)

Thinking of Designateing 92 percent to minor stocks and 62% to fixed-income securities into PortfolioA
Place 91 proportion to large-cap stocks and 36 proportion to small-cap stocks in myPortfolio
We're moving to Set 43 percentage more to small-cap stocks and 34 percentage more to big-cap stocks in GlobalEquityFund, correct?
Could you Adjust 15 proportion to mid-size stocks and 43 percent to small-size stocks in IncomeFund2024?
Place BlueChipStocks by adding 90 percent to big-cap stocks and 7 proportion to mid-size stocks, please.
How about we Amend 68 proportion to small-cap stocks and 9 proportion to fixed-income securities in TechGrowth?
Thinking of Changeing 4% to medium stocks and 24 percentage to small-size stocks into GlobalEquityFund
Please Update small-cap stocks by 68 proportion and large-cap stocks by 9% in PortfolioA.
Assign CryptoAssets by adding 51 percentage to major stocks and 2 percentage to minor stocks, please.
Please Revise bonds by 2 percent and major stocks by 9% i