In [37]:
import gensim
from gensim import corpora

# Your list of company description text (replace with real data)
documents = [
    ['CUST_ID', 'fintech'],
    ['ADR_KEY', 'fintech'],
    ['QWE_QWE', 'fintech'],
    ['CUST_ID', 'rakuten'],
    ['ADR_KEY', 'rakuten'],
    ['HHH_KOP', 'rakuten']
]

# Create a dictionary from the tokenized documents
dictionary = corpora.Dictionary(documents)

# Create a Document-Term Matrix (DTM)
corpus = [dictionary.doc2bow(doc) for doc in documents]

# Train the LDA model
lda_model = gensim.models.LdaModel(
    corpus,
    num_topics=3,  # Adjust the number of topics as needed
    id2word=dictionary,
    passes=15,  # Number of passes through the corpus
    alpha='auto',  # Automatic alpha estimation
    random_state=42
)

# Print the topics
topics = lda_model.print_topics(num_words=4)
for topic in topics:
    print(topic)


(0, '0.414*"rakuten" + 0.169*"ADR_KEY" + 0.168*"CUST_ID" + 0.165*"HHH_KOP"')
(1, '0.332*"ADR_KEY" + 0.329*"fintech" + 0.085*"CUST_ID" + 0.085*"rakuten"')
(2, '0.394*"fintech" + 0.221*"QWE_QWE" + 0.218*"CUST_ID" + 0.056*"ADR_KEY"')


In [31]:
import gensim
from gensim import corpora

# Example column names and predefined domains
column_names = ['product name', 'description', 'company name']
domains = [
    'ABC Inc. specializes in manufacturing products for various industries.',
    'XYZ Corp provides a wide range of software solutions for businesses.',
    'PQR Ltd is a leading provider of innovative technology services.'
]

# Tokenize and preprocess the column names
tokenized_column_names = [name.lower().split('_') for name in column_names]

# Create a dictionary from the tokenized column names
dictionary = corpora.Dictionary(tokenized_column_names)
print(dictionary)

# Create a Document-Term Matrix (DTM)
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_column_names]

# Train the LDA model
lda_model = gensim.models.LdaModel(
    corpus,
    num_topics=len(domains),  # Set the number of topics to match the predefined domains
    id2word=dictionary,
    passes=15,
    random_state=42
)

# Print the topics
topics = lda_model.print_topics(num_words=3)
for i, topic in enumerate(topics):
    print(f"Domain {i + 1}: {topic}")

# Associate column names with domains
for name in column_names:
    bow = dictionary.doc2bow(name.lower().split('_'))
    topic_distribution = lda_model.get_document_topics(bow)
    domain_idx = max(topic_distribution, key=lambda x: x[1])[0]
    print(f"Column Name: {name} => Domain: {domains[domain_idx]}")


Dictionary<3 unique tokens: ['product name', 'description', 'company name']>
Domain 1: (0, '0.662*"product name" + 0.169*"company name" + 0.169*"description"')
Domain 2: (1, '0.444*"description" + 0.443*"company name" + 0.113*"product name"')
Domain 3: (2, '0.336*"company name" + 0.335*"description" + 0.329*"product name"')
Column Name: product name => Domain: ABC Inc. specializes in manufacturing products for various industries.
Column Name: description => Domain: XYZ Corp provides a wide range of software solutions for businesses.
Column Name: company name => Domain: XYZ Corp provides a wide range of software solutions for businesses.
