In [1]:
import random
import csv
import re

In [2]:
# Variants for operations and asset types
operations_variants = {
    'Set': ['Set', 'Allocate', 'Place', 'Position', 'Designate', 'Assign'],
    'Update': ['Update', 'Modify', 'Adjust', 'Change', 'Revise', 'Amend', 'Alter']
}

asset_types_variants = {
    'LARGE_CAP_STOCKS': ['large-cap stocks', 'major stocks', 'big-cap stocks'],
    'MID_CAP_STOCKS': ['mid-cap stocks', 'medium stocks', 'mid-size stocks'],
    'SMALL_CAP_STOCKS': ['small-cap stocks', 'minor stocks', 'small-size stocks'],
    'BONDS': ['bonds', 'fixed-income securities', 'debt instruments']
}

# Variants for sentence structures
sentence_structures = [
    "{operation} {percentage} to {asset_type} in {portfolio}",
    "Could you {operation} {percentage} to {asset_type} in {portfolio}?",
    "Thinking of {operation}ing {percentage} {asset_type} into {portfolio}",
    "{operation} {portfolio} by adding {percentage} {asset_type}, please.",
    "Please {operation} {asset_type} by {percentage} in {portfolio}.",
    "I'm considering {operation}ing {portfolio} with an additional {percentage} of {asset_type}.",
    "How about we {operation} {percentage} of {asset_type} in {portfolio}?",
    "Is it possible to {operation} {portfolio}'s {asset_type} allocation by {percentage}?",
    "{operation}ing {portfolio} to include {percentage} more {asset_type}.",
    "We're moving to {operation} {percentage} more to {asset_type} in {portfolio}, correct?",
    "Plan to {operation} the {asset_type} in {portfolio} by {percentage}.",
    "Let's {operation} {percentage} more of {asset_type} to {portfolio}'s portfolio."
]

portfolios = [
    'myPortfolio', 'PortfolioA', 'PortfolioB',
    'GlobalEquityFund', 'TechGrowth', 'IncomeFund2024',
    'GreenEnergyInvest', 'BlueChipStocks', 'HighYieldBonds',
    'RealEstateHoldings', 'PreciousMetalsFund', 'EmergingMarkets',
    'VentureCapital', 'AlphaFund', 'BetaPortfolio',
    'CryptoAssets', 'DividendGrowers', 'ValueInvest'
]

def generate_percentage_variants():
    # Define different formats for expressing percentages
    formats = ['{}%', '{} percent', '{} percentage', '{} proportion']  # Correct format for each style
    percentages = []
    for i in range(1, 101):
        for form in formats:
            percentages.append(form.format(i))  # Format string correctly based on type
    return percentages

percentages = generate_percentage_variants()

In [3]:
def choose_variant(variants_dict):
    return {key: random.choice(value) for key, value in variants_dict.items()}

# Track seen sentences to avoid duplicates
seen_sentences = set()

def generate_unique_samples(operations_variants, asset_types_variants, sentence_structures, portfolios, percentages, n=100):
    samples = []
    while len(samples) < n:
        operation = random.choice(list(operations_variants.keys()))
        asset_type_key = random.choice(list(asset_types_variants.keys()))
        
        operation_variants = choose_variant(operations_variants)
        asset_type_variants = choose_variant(asset_types_variants)

        data = {
            "operation": operation_variants[operation],
            "asset_type": asset_type_variants[asset_type_key],
            "portfolio": random.choice(portfolios),
            "percentage": random.choice(percentages)
        }

        sentence_structure = random.choice(sentence_structures)
        sentence = sentence_structure.format(**data)
        
        if sentence not in seen_sentences:
            seen_sentences.add(sentence)
            samples.append(sentence)
    return samples

generated_sentences = generate_unique_samples(operations_variants, asset_types_variants, sentence_structures, portfolios, percentages, n=100)

for sentence in generated_sentences[:10]:
    print(sentence)

Designate GreenEnergyInvest by adding 1 proportion debt instruments, please.
Place 91 percentage to large-cap stocks in VentureCapital
I'm considering Positioning EmergingMarkets with an additional 31 percent of bonds.
Set 58 proportion to medium stocks in BetaPortfolio
Let's Adjust 5 proportion more of minor stocks to PortfolioA's portfolio.
Plan to Designate the fixed-income securities in GreenEnergyInvest by 90 percent.
Assign 100 percent to major stocks in ValueInvest
Is it possible to Revise myPortfolio's big-cap stocks allocation by 1 proportion?
Please Assign fixed-income securities by 41 percent in CryptoAssets.
Please Revise debt instruments by 46 proportion in HighYieldBonds.


In [4]:
def generate_dataset(num_samples, file_path, operations_variants, asset_types_variants, portfolios, percentages, sentence_structures):
    seen_training_samples = set()
    training_samples = []

    while len(training_samples) < num_samples:
        operation_key = random.choice(list(operations_variants.keys()))
        asset_type_key = random.choice(list(asset_types_variants.keys()))
        asset_type_variant = random.choice(asset_types_variants[asset_type_key])
        portfolio = random.choice(portfolios)
        percentage = random.choice(percentages)
        sentence_structure = random.choice(sentence_structures)
        
        input_text = sentence_structure.format(
            operation=random.choice(operations_variants[operation_key]).lower(),
            asset_type=asset_type_variant,
            portfolio=portfolio,
            percentage=percentage
        )
        
        # Extract only the numerical part for the DSL command
        percentage_value = re.search(r'\d+', percentage).group(0) + '%'  # Ensuring output is always 'number%'
        
        dsl_command = f"{operation_key.upper()} ETF {portfolio} WITH {asset_type_key} = {percentage_value}"
        
        training_sample = {'input': input_text, 'output': dsl_command}
        training_sample_tuple = (input_text, dsl_command)
        
        if training_sample_tuple not in seen_training_samples:
            seen_training_samples.add(training_sample_tuple)
            training_samples.append(training_sample)
    
    # Display some examples to verify the correct formatting in outputs
    for sample in training_samples[:5]:
        print(sample)

    # Writing to CSV
    with open(file_path, mode='w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=['input', 'output'])
        writer.writeheader()
        for sample in training_samples:
            writer.writerow(sample)

In [6]:
generate_dataset(10000, 'D:\\XJTLU\\YEAR4\\FYP\\Train.csv', operations_variants, asset_types_variants, portfolios, percentages, sentence_structures)

{'input': 'Thinking of allocateing 33 percentage small-cap stocks into HighYieldBonds', 'output': 'SET ETF HighYieldBonds WITH SMALL_CAP_STOCKS = 33%'}
{'input': 'Please alter minor stocks by 6 percent in myPortfolio.', 'output': 'UPDATE ETF myPortfolio WITH SMALL_CAP_STOCKS = 6%'}
{'input': "We're moving to amend 11% more to fixed-income securities in BlueChipStocks, correct?", 'output': 'UPDATE ETF BlueChipStocks WITH BONDS = 11%'}
{'input': 'assigning GreenEnergyInvest to include 66% more big-cap stocks.', 'output': 'SET ETF GreenEnergyInvest WITH LARGE_CAP_STOCKS = 66%'}
{'input': 'Could you alter 23 proportion to fixed-income securities in HighYieldBonds?', 'output': 'UPDATE ETF HighYieldBonds WITH BONDS = 23%'}


In [10]:
data = [
    # Complex semantic structures
    ("In the scenario where we increase our holdings, could you possibly escalate the proportion of mid-cap stocks in GrowthFund2024 to 38%?", "UPDATE ETF GrowthFund2024 WITH MID_CAP_STOCKS = 38%"),
    ("Provided that market conditions favor larger companies, what would be the impact of boosting large-cap stocks in ValueMax by 20 percentage points?", "UPDATE ETF ValueMax WITH LARGE_CAP_STOCKS = 20%"),
    ("Given the recent uptick in the market, could we consider escalating our position in high-cap stocks in MarketIndex by 15%?", "UPDATE ETF MarketIndex WITH LARGE_CAP_STOCKS = 15%"),
    ("With the forecasted downturn, should we reduce our exposure in AsianMarkets by decreasing the bonds allocation by 20%?", "UPDATE ETF AsianMarkets WITH BONDS = 20%"),
    # Language variants and informal expressions
    ("Yo, just bump up the small-caps in EquityGrowth by like 45%, will ya?", "UPDATE ETF EquityGrowth WITH SMALL_CAP_STOCKS = 45%"),
    ("Hey, can you jack up the bonds portion to 50% in SafeHavenFund cuz it feels right?", "UPDATE ETF SafeHavenFund WITH BONDS = 50%"),
    ("Dude, let's crank up the bonds in RetirementFund to a solid 60%, alright?", "UPDATE ETF RetirementFund WITH BONDS = 60%"),
    ("Yo, drop the percentage of mid-caps in GrowthSector to just 18%, cool?", "UPDATE ETF GrowthSector WITH MID_CAP_STOCKS = 18%"),
    # Data containing noise and typographical errors
    ("Could you uupdate the allocaton of large-cap stoks in PremierPortfolio by 25%?", "UPDATE ETF PremierPortfolio WITH LARGE_CAP_STOCKS = 25%"),
    ("Set the bonds in Income2025 by 22% even with speling erors in input.", "SET ETF Income2025 WITH BONDS = 22%"),
    ("Could you pleas fix the alloc of smll-cap stocks in EmergingTech by 35%?", "UPDATE ETF EmergingTech WITH SMALL_CAP_STOCKS = 35%"),
    ("Update the bonds for Income2024 to 25% even with typos in the input.", "UPDATE ETF Income2024 WITH BONDS = 25%"),
    # Ambiguities and polysemies
    ("Adjust the investment in BlueChipStocks, focusing on 'stocks', by 33%.", "UPDATE ETF BlueChipStocks WITH LARGE_CAP_STOCKS = 33%"),
    ("It's time to shift 40 percent towards more secure assets in VolatileTimesFund, like bonds.", "UPDATE ETF VolatileTimesFund WITH BONDS = 40%"),
    ("Set the stakes in GlobalEquity to large-caps by 27%, especially focusing on stocks.", "UPDATE ETF GlobalEquity WITH LARGE_CAP_STOCKS = 27%"),
    ("Consider adjusting mid-cap exposure in BalancedFund to 30%, depending on market trends.", "UPDATE ETF BalancedFund WITH MID_CAP_STOCKS = 30%"),
    # Extreme and uncommon cases
    ("Divert 100% of assets in RiskyVentures to bonds immediately due to market crash.", "UPDATE ETF RiskyVentures WITH BONDS = 100%"),
    ("Zero out the small-cap stocks in TechInnovationFund as a strategic move to refocus.", "UPDATE ETF TechInnovationFund WITH SMALL_CAP_STOCKS = 0%"),
    ("Completely divest from bonds in RiskPortfolio given the new tax implications.", "UPDATE ETF RiskPortfolio WITH BONDS = 0%"),
    ("Max out the small-cap allocation in StartupVentures to capitalize on new market entrants.", "UPDATE ETF StartupVentures WITH SMALL_CAP_STOCKS = 100%"),
    ("As a bold move, set 100% of holdings in QuantumOpportunities to small-cap stocks.", "SET ETF QuantumOpportunities WITH SMALL_CAP_STOCKS = 100%"),
    ("For a major strategy overhaul in SpeculativeAssets, start by setting large-cap stocks to 50%.", "SET ETF SpeculativeAssets WITH LARGE_CAP_STOCKS = 50%"),
    ("To capitalize on the recent market upturn, set mid-cap stocks in GrowthFund2025 to 60%.", "SET ETF GrowthFund2025 WITH MID_CAP_STOCKS = 60%"),
    ("In an aggressive growth tactic, set bonds to 0% in HighRiskHighReturn to minimize fixed income.", "SET ETF HighRiskHighReturn WITH BONDS = 0%"),
    # Culturally or regionally specific expressions
    ("Can we up the ante on small-cap stocks in LondonInvestments by 15% post-Brexit?", "UPDATE ETF LondonInvestments WITH SMALL_CAP_STOCKS = 15%"),
    ("In light of recent EU regulations, decrease the bond holdings in EuroGrowth by 12%.", "UPDATE ETF EuroGrowth WITH BONDS = 12%"),
    ("Post-regulation adjustments, bump up bonds in EuroCapital by 19% to stay compliant.", "UPDATE ETF EuroCapital WITH BONDS = 19%"),
    ("Given the new fiscal policies, should we enhance our large-cap position in USGrowthFund by 25%?", "UPDATE ETF USGrowthFund WITH LARGE_CAP_STOCKS = 25%"),
    ("Allocate a new 24% to bonds in FutureSavings to diversify the portfolio.", "SET ETF FutureSavings WITH BONDS = 24%"),
    ("Let's start a position in GreenTech by setting up 35% in small-cap stocks.", "SET ETF GreenTech WITH SMALL_CAP_STOCKS = 35%"),
    ("Introduce mid-cap stocks to the mix in ValuePortfolio, starting with 22% allocation.", "SET ETF ValuePortfolio WITH MID_CAP_STOCKS = 22%"),
    ("Position 50% into large-cap stocks in NewEraInvestments as the initial setup.", "SET ETF NewEraInvestments WITH LARGE_CAP_STOCKS = 50%"),
    ("Kickoff the financial plan by placing 18% into bonds in SecureAssets.", "SET ETF SecureAssets WITH BONDS = 18%"),
    ("Can we initiate an exposure to small-cap stocks in DynamicGrowth by setting it to 40%?", "SET ETF DynamicGrowth WITH SMALL_CAP_STOCKS = 40%"),
    ("Considering the shift in market trends, let's establish a 45% bond position in StabilityFund.", "SET ETF StabilityFund WITH BONDS = 45%"),
    ("Begin the large-cap stocks investment in EquityPrime with a starting allocation of 30%.", "SET ETF EquityPrime WITH LARGE_CAP_STOCKS = 30%"),
    ("Given the robust growth, set the mid-cap stocks allocation in TechLeaders to 28% initially.", "SET ETF TechLeaders WITH MID_CAP_STOCKS = 28%"),
    ("Set the foundation for our new strategy in GlobalDiversified by allocating 55% to bonds.", "SET ETF GlobalDiversified WITH BONDS = 55%"),
    ("Under new management, let’s establish 33% bonds in AsiaPacificGrowth.", "SET ETF AsiaPacificGrowth WITH BONDS = 33%"),
    ("As part of our European expansion, set up 25% in small-cap stocks in EuroVentures.", "SET ETF EuroVentures WITH SMALL_CAP_STOCKS = 25%"),
    ("For our entry into emerging markets, let's set mid-cap stocks at 39% in EmergingMarketsFund.", "SET ETF EmergingMarketsFund WITH MID_CAP_STOCKS = 39%"),
    ("In preparation for the upcoming fiscal changes, set up 20% bonds in TaxAdvantage.", "SET ETF TaxAdvantage WITH BONDS = 20%"),
]

# Save to CSV file
with open('Test.csv', 'a', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["input", "output"])  # Write header
    for entry in data:
        writer.writerow(entry)
