In [1]:
import random
import pandas as pd
import numpy as np

class DynamicSentenceGenerator:
    def __init__(self):
        # Medical terminology lists
        self.medical_subjects = [
            "patient", "client", "individual", "person",
            "healthcare provider", "medical professional",
            "elderly patient", "adult", "young adult",
            "pediatric case", "outpatient", "inpatient"
        ]

        self.medical_modifiers = [
            "critical", "acute", "chronic", "mild", "severe",
            "preliminary", "ongoing", "complex", "secondary",
            "primary", "intermittent", "progressive"
        ]

        self.medical_actions = [
            "examined", "consulted", "assessed", "evaluated",
            "diagnosed", "interviewed", "screened",
            "monitored", "counseled", "treated",
            "referred", "investigated"
        ]

        self.medical_conditions = [
            "symptoms", "health status", "medical history",
            "current condition", "treatment progress",
            "chronic illness", "acute symptoms",
            "psychological state", "physiological markers",
            "preliminary diagnosis"
        ]

        self.medical_contexts = [
            "during consultation", "in recent examination",
            "after screening", "following assessment",
            "based on records", "per clinical review",
            "in emergency", "during checkup",
            "after evaluation", "in follow-up"
        ]

        self.outcome_indicators = [
            "requires investigation",
            "shows improvement",
            "needs monitoring",
            "indicates concern",
            "suggests treatment",
            "warrants attention",
            "demonstrates complexity",
            "needs approach",
            "shows response",
            "requires care"
        ]

    def generate_controlled_length_sentence(self):
        """Generate a sentence with controlled word length (6-20 words)."""
        while True:
            # Choose key components
            subject = random.choice(self.medical_subjects)
            modifier = random.choice(self.medical_modifiers)
            action = random.choice(self.medical_actions)
            condition = random.choice(self.medical_conditions)
            context = random.choice(self.medical_contexts)
            outcome = random.choice(self.outcome_indicators)

            # Generate sentence templates with varying structures
            sentence_templates = [
                f"The {modifier} {subject} was {action} {context}.",
                f"{subject} {action} {condition} {context}, noting {outcome}.",
                f"Medical review of {modifier} {condition} {context} revealed {outcome}.",
                f"Detailed {action} for {subject} shows {outcome} in {condition}.",
                f"{context.capitalize()}, {subject} demonstrates {modifier} {condition}.",
                f"Preliminary assessment indicates {outcome} for {subject}'s {condition}.",
                f"{subject}'s {modifier} {condition} requires {outcome}.",
                f"Medical team {action} {context} with {modifier} findings.",
                f"Comprehensive evaluation of {subject} suggests {outcome}.",
                f"{context.capitalize()} reveals {modifier} {condition} status."
            ]

            # Select a random template
            sentence = random.choice(sentence_templates)

            # Count words and validate
            word_count = len(sentence.split())
            if 6 <= word_count <= 20:
                return sentence.capitalize() + ('.' if not sentence.endswith('.') else '')

    def generate_large_scale_sentences(self, num_examples=10000):
        """Generate a large number of medical sentences with controlled length."""
        # Generate sentences
        sentences = [self.generate_controlled_length_sentence() for _ in range(num_examples)]

        # Create DataFrame
        df = pd.DataFrame({
            'text': sentences,
            'tag': 'OTHER'  # Default tag for all generated sentences
        })

        return df

def main():
    # Set random seed for reproducibility
    random.seed(42)
    np.random.seed(42)

    # Generate large-scale sentences
    generator = DynamicSentenceGenerator()
    medical_df = generator.generate_large_scale_sentences(num_examples=1500)

    # Display basic information
    print("Generated Medical Sentences DataFrame:")
    print(medical_df.info())

    # Show first few rows
    print("\nFirst 5 Rows:")
    print(medical_df.head())


    # Save to CSV
    medical_df.to_csv('medical_sentences_other_tag.csv', index=False)
    print("\nDataFrame saved to 'medical_sentences_other_tag.csv'")

if __name__ == "__main__":
    main()

Generated Medical Sentences DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    1500 non-null   object
 1   tag     1500 non-null   object
dtypes: object(2)
memory usage: 23.6+ KB
None

First 5 Rows:
                                                text    tag
0  Medical review of acute treatment progress fol...  OTHER
1  Inpatient's acute physiological markers requir...  OTHER
2  Following assessment reveals critical current ...  OTHER
3  Medical team evaluated in emergency with secon...  OTHER
4  In emergency, pediatric case demonstrates seve...  OTHER

DataFrame saved to 'medical_sentences_other_tag.csv'


In [2]:
import random
import pandas as pd
import numpy as np

class FormalSentenceGenerator:
    def __init__(self):
        # Formal subjects
        self.subjects = [
            "corporation", "institution", "organization",
            "financial entity", "automotive company",
            "insurance provider", "financial service",
            "investment firm", "transportation agency"
        ]

        self.professional_modifiers = [
            "comprehensive", "strategic", "systematic",
            "innovative", "advanced", "fundamental",
            "integrated", "specialized", "efficient"
        ]

        self.analytical_actions = [
            "evaluated", "analyzed", "assessed",
            "developed", "implemented", "researched",
            "established", "structured", "introduced"
        ]

        self.financial_aspects = [
            "risk management", "financial strategy",
            "investment portfolio", "market positioning",
            "operational efficiency", "asset allocation",
            "financial performance", "market expansion",
            "technological infrastructure"
        ]

        self.temporal_contexts = [
            "during fiscal year", "in current quarter",
            "based on annual report", "following market analysis",
            "after comprehensive review", "in strategic planning",
            "through systematic evaluation"
        ]

        self.outcome_descriptors = [
            "demonstrates significant potential",
            "indicates strategic alignment",
            "confirms operational effectiveness",
            "reveals comprehensive approach",
            "supports financial objectives",
            "enhances market competitiveness",
            "establishes operational framework",
            "provides strategic insight",
            "validates technological implementation"
        ]

    def generate_controlled_length_sentence(self):
        """Generate a formal sentence with controlled word length (6-20 words)."""
        while True:
            # Choose key components
            subject = random.choice(self.subjects)
            modifier = random.choice(self.professional_modifiers)
            action = random.choice(self.analytical_actions)
            aspect = random.choice(self.financial_aspects)
            context = random.choice(self.temporal_contexts)
            outcome = random.choice(self.outcome_descriptors)

            # Generate formal sentence templates
            sentence_templates = [
                f"The {modifier} {subject} {action} {aspect} {context}.",
                f"{subject} {action} {aspect} {context}, which {outcome}.",
                f"Comprehensive analysis of {subject}'s {aspect} {context} reveals {outcome}.",
                f"{context.capitalize()}, {subject} {action} {modifier} {aspect}.",
                f"Strategic evaluation indicates {subject}'s {aspect} {outcome}.",
                f"{subject}'s {modifier} approach to {aspect} {context} demonstrates potential.",
                f"Systematic {action} of {aspect} by {subject} {context} confirms objectives.",
                f"The {modifier} framework implemented by {subject} {context} enhances effectiveness.",
                f"{context.capitalize()} reveals {subject}'s {aspect} through {modifier} implementation.",
                f"{subject} {action} {aspect} {context}, establishing strategic alignment."
            ]

            # Select a random template
            sentence = random.choice(sentence_templates)

            # Count words and validate
            word_count = len(sentence.split())
            if 6 <= word_count <= 15:
                return sentence.capitalize() + ('.' if not sentence.endswith('.') else '')

    def generate_large_scale_sentences(self, num_examples=10000):
        """Generate a large number of formal sentences with controlled length."""
        # Generate sentences
        sentences = [self.generate_controlled_length_sentence() for _ in range(num_examples)]

        # Create DataFrame
        df = pd.DataFrame({
            'text': sentences,
            'tag': 'OTHER'  # Default tag for all generated sentences
        })

        return df

def main():
    # Set random seed for reproducibility
    random.seed(42)
    np.random.seed(42)

    # Generate large-scale sentences
    generator = FormalSentenceGenerator()
    sentences_df = generator.generate_large_scale_sentences(num_examples=1000)

    # Display basic information
    print("Generated Formal Sentences DataFrame:")
    print(sentences_df.info())

    # Show first few rows
    print("\nFirst 5 Rows:")
    print(sentences_df.head())

    # Save to CSV
    sentences_df.to_csv('formal_financial_sentences.csv', index=False)
    print("\nDataFrame saved to 'formal_financial_sentences.csv'")

if __name__ == "__main__":
    main()

Generated Formal Sentences DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    1000 non-null   object
 1   tag     1000 non-null   object
dtypes: object(2)
memory usage: 15.8+ KB
None

First 5 Rows:
                                                text    tag
0  Institution implemented market positioning in ...  OTHER
1  During fiscal year, transportation agency esta...  OTHER
2  Systematic evaluated of technological infrastr...  OTHER
3  Systematic implemented of risk management by f...  OTHER
4  Insurance provider assessed market positioning...  OTHER

DataFrame saved to 'formal_financial_sentences.csv'


In [3]:
import random
import pandas as pd
import numpy as np

class WebsiteSentenceGenerator:
    def __init__(self):
        # Digital and web-related subjects
        self.subjects = [
            "web platform", "digital service", "online application",
            "technology company", "software development team",
            "digital enterprise", "web solution provider",
            "internet platform", "digital interface"
        ]

        self.technical_modifiers = [
            "responsive", "scalable", "innovative",
            "adaptive", "advanced", "comprehensive",
            "secure", "dynamic", "intelligent"
        ]

        self.technical_actions = [
            "developed", "implemented", "optimized",
            "designed", "enhanced", "integrated",
            "transformed", "restructured", "launched"
        ]

        self.web_aspects = [
            "user experience", "digital infrastructure",
            "web architecture", "platform functionality",
            "interface design", "backend system",
            "performance metrics", "digital ecosystem",
            "technological framework"
        ]

        self.development_contexts = [
            "during system upgrade", "in recent development",
            "through agile methodology", "following user research",
            "after performance analysis", "in digital transformation",
            "using advanced technologies"
        ]

        self.outcome_descriptors = [
            "improves user engagement",
            "increases platform efficiency",
            "enhances digital capabilities",
            "streamlines web interactions",
            "optimizes system performance",
            "advances technological solutions",
            "creates seamless experience",
            "demonstrates technical innovation",
            "supports digital strategy"
        ]

    def generate_controlled_length_sentence(self):
        """Generate a technical sentence with controlled word length (6-20 words)."""
        while True:
            # Choose key components
            subject = random.choice(self.subjects)
            modifier = random.choice(self.technical_modifiers)
            action = random.choice(self.technical_actions)
            aspect = random.choice(self.web_aspects)
            context = random.choice(self.development_contexts)
            outcome = random.choice(self.outcome_descriptors)

            # Generate technical sentence templates
            sentence_templates = [
                f"The {modifier} {subject} {action} {aspect} {context}.",
                f"{subject} {action} {aspect} {context}, which {outcome}.",
                f"Technical analysis of {subject}'s {aspect} {context} reveals {outcome}.",
                f"{context.capitalize()}, {subject} {action} {modifier} {aspect}.",
                f"Digital evaluation indicates {subject}'s {aspect} {outcome}.",
                f"{subject}'s {modifier} approach to {aspect} {context} enhances performance.",
                f"Systematic {action} of {aspect} by {subject} {context} improves functionality.",
                f"The {modifier} framework developed by {subject} {context} transforms digital experience.",
                f"{context.capitalize()} reveals {subject}'s {aspect} through {modifier} implementation.",
                f"{subject} {action} {aspect} {context}, advancing digital capabilities."
            ]

            # Select a random template
            sentence = random.choice(sentence_templates)

            # Count words and validate
            word_count = len(sentence.split())
            if 6 <= word_count <= 15:
                return sentence.capitalize() + ('.' if not sentence.endswith('.') else '')

    def generate_large_scale_sentences(self, num_examples=10000):
        """Generate a large number of technical sentences with controlled length."""
        # Generate sentences
        sentences = [self.generate_controlled_length_sentence() for _ in range(num_examples)]

        # Create DataFrame
        df = pd.DataFrame({
            'text': sentences,
            'tag': 'OTHER'  # Default tag for all generated sentences
        })

        return df

def main():
    # Set random seed for reproducibility
    random.seed(42)
    np.random.seed(42)

    # Generate large-scale sentences
    generator = WebsiteSentenceGenerator()
    sentences_df = generator.generate_large_scale_sentences(num_examples=500)

    # Display basic information
    print("Generated Web Technology Sentences DataFrame:")
    print(sentences_df.info())

    # Show first few rows
    print("\nFirst 5 Rows:")
    print(sentences_df.head())

    # Save to CSV
    sentences_df.to_csv('web_technology_sentences.csv', index=False)
    print("\nDataFrame saved to 'web_technology_sentences.csv'")

if __name__ == "__main__":
    main()

Generated Web Technology Sentences DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    500 non-null    object
 1   tag     500 non-null    object
dtypes: object(2)
memory usage: 7.9+ KB
None

First 5 Rows:
                                                text    tag
0  Digital service enhanced platform functionalit...  OTHER
1  During system upgrade, digital interface trans...  OTHER
2  Systematic developed of technological framewor...  OTHER
3  Systematic enhanced of user experience by tech...  OTHER
4  Digital enterprise optimized platform function...  OTHER

DataFrame saved to 'web_technology_sentences.csv'


In [4]:
import pandas as pd

# Load the CSV files into pandas DataFrames
df1 = pd.read_csv('/content/formal_financial_sentences.csv')
df2 = pd.read_csv('/content/medical_sentences_other_tag.csv')
df3 = pd.read_csv('/content/web_technology_sentences.csv')

# Concatenate the DataFrames
merged_df = pd.concat([df1, df2, df3], ignore_index=True)

# Display the merged DataFrame (optional)
print(merged_df)

# Save the merged DataFrame to a new CSV file (optional)
merged_df.to_csv('merged_sentences.csv', index=False)

                                                   text    tag
0     Institution implemented market positioning in ...  OTHER
1     During fiscal year, transportation agency esta...  OTHER
2     Systematic evaluated of technological infrastr...  OTHER
3     Systematic implemented of risk management by f...  OTHER
4     Insurance provider assessed market positioning...  OTHER
...                                                 ...    ...
2995  Technical analysis of software development tea...  OTHER
2996  After performance analysis, digital service de...  OTHER
2997  Web solution provider's comprehensive approach...  OTHER
2998  Using advanced technologies reveals web platfo...  OTHER
2999  During system upgrade, software development te...  OTHER

[3000 rows x 2 columns]


In [5]:
import pandas as pd

# Load the merged CSV file
merged_df = pd.read_csv('merged_sentences.csv')

# Rename the columns
merged_df = merged_df.rename(columns={'text': 'Identifier', 'tag': 'Tag'})

# Display the updated DataFrame (optional)
print(merged_df)

# Save the updated DataFrame to a new CSV file
merged_df.to_csv('merged_sentences_updated.csv', index=False)

                                             Identifier    Tag
0     Institution implemented market positioning in ...  OTHER
1     During fiscal year, transportation agency esta...  OTHER
2     Systematic evaluated of technological infrastr...  OTHER
3     Systematic implemented of risk management by f...  OTHER
4     Insurance provider assessed market positioning...  OTHER
...                                                 ...    ...
2995  Technical analysis of software development tea...  OTHER
2996  After performance analysis, digital service de...  OTHER
2997  Web solution provider's comprehensive approach...  OTHER
2998  Using advanced technologies reveals web platfo...  OTHER
2999  During system upgrade, software development te...  OTHER

[3000 rows x 2 columns]


In [6]:
# Shuffle the merged DataFrame
merged_df = merged_df.sample(frac=1).reset_index(drop=True)

# Load the synthetic_phi_NER.csv file
synthetic_phi_df = pd.read_csv('/content/synthetic_phi_NER.csv')

# Concatenate the DataFrames
final_df = pd.concat([merged_df, synthetic_phi_df], ignore_index=True)

# Shuffle the final DataFrame
final_df = final_df.sample(frac=1).reset_index(drop=True)

# Save the final DataFrame to a new CSV file
final_df.to_csv('final_merged_sentences.csv', index=False)