In [None]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import Phrases
from gensim.corpora import Dictionary
from datasets import load_dataset 
from gensim.models import LdaModel
import matplotlib.pyplot as plt
from pprint import pprint
from gensim.models.coherencemodel import CoherenceModel
#import nltk
#nltk.download('wordnet')
import numpy as np 
import torch
import json
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import pyLDAvis.gensim_models as gensimvis
from sklearn.metrics import davies_bouldin_score
import pyLDAvis
import pandas as pd

#nltk.download('wordnet')
#nltk.download('punkt')
#nltk.download('stopwords')

def get_dataset(dataset_name, dataset_info_list):
    """Load a small sample of a dataset and return its configured text column.

    Args:
        dataset_name: Either a repository id (``str``) or a
            ``[repository, subset]`` list for repositories that are organised
            into named subsets.
        dataset_info_list: Mapping from repository id to a metadata dict whose
            ``"keys"`` entry lists column names; only the first one is used.

    Returns:
        The column ``keys[0]`` of the loaded split. For subset datasets the
        first 100 rows are requested and the whole split is used as a
        fallback; for plain datasets the first 100 rows are requested.

    Raises:
        ValueError: If ``dataset_name`` is a list whose repository is not one
            of the supported subset repositories.
        KeyError: If the repository has no entry in ``dataset_info_list``.
    """
    if isinstance(dataset_name, list):
        repo = dataset_name[0]
        if repo == "tasksource/mmlu":
            split, extra_kwargs = "test", {}
        elif repo == "tasksource/bigbench":
            split, extra_kwargs = "train", {"trust_remote_code": True}
        else:
            # Previously this fell through and crashed later with an
            # UnboundLocalError on ``traindata``.
            raise ValueError(
                f"Unsupported subset dataset {repo!r}; expected "
                "'tasksource/mmlu' or 'tasksource/bigbench'"
            )
        subset = dataset_name[1]
        try:
            traindata = load_dataset(
                repo, subset, split=f"{split}[0:100]", num_proc=8, **extra_kwargs
            )
        except Exception:
            # Best-effort fallback: retry with the whole split. ``Exception``
            # (not a bare except) so Ctrl-C still interrupts the download.
            traindata = load_dataset(
                repo, subset, split=split, num_proc=8, **extra_kwargs
            )
    else:
        repo = dataset_name
        if repo == "EleutherAI/truthful_qa_mc":
            split = "validation[0:100]"
        else:
            split = "train[0:100]"
        traindata = load_dataset(repo, split=split, num_proc=8)

    key = dataset_info_list[repo]["keys"]
    return traindata[key[0]]

def set_random_seed(seed=0):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed applied to every generator (default ``0``).
    """
    # Local import: the notebook's import cell does not import ``random``,
    # which is presumably why this seeding was commented out before.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

def get_dataset_list(dataset_list):
    """Flatten the dataset metadata mapping into a list of loadable names.

    Args:
        dataset_list: Mapping from repository id to a metadata dict. A
            metadata dict may carry a ``"subset"`` entry listing subset names.

    Returns:
        A list with one entry per loadable dataset: the repository id itself
        when no ``"subset"`` entry exists, otherwise one
        ``[repository, subset]`` pair per listed subset.
    """
    dataname = []
    for repo, info in dataset_list.items():
        if "subset" in info.keys():
            dataname.extend([repo, subset] for subset in info["subset"])
        else:
            dataname.append(repo)
    return dataname

# Seed the random number generators with the default seed (0) as soon as the
# helper cell has run, before any data loading or model training.
set_random_seed()

# Dataset

In [None]:
# NOTE(review): absolute, machine-specific path — point this at your own copy
# of dataset_info.json before running the notebook elsewhere.
DATASET_INFO_PATH = "/home/bhandk/MLNeuron/dataset_info.json"

with open(DATASET_INFO_PATH, 'r') as openfile:
    # Metadata for every dataset: text column names and optional subsets.
    dataset_info_list = json.load(openfile)
dataset_name_list = get_dataset_list(dataset_info_list)

# One "document" per dataset: all sampled texts joined into a single string.
questions = []
questions_name = []
for dataset_name in dataset_name_list:
    data = get_dataset(dataset_name, dataset_info_list)
    combinations = " ".join(data)
    questions.append(combinations)
    # [repository, subset] pairs are labelled by their subset name. Checking
    # the type (as get_dataset does) rather than ``len(...) == 2`` avoids
    # mislabelling a plain repository id that happens to be two characters.
    if isinstance(dataset_name, list):
        questions_name.append(dataset_name[-1])
    else:
        questions_name.append(dataset_name)

# Prepare Corpus and Dictionary

In [None]:
# Regex tokenizer that matches runs of word characters.
# NOTE(review): its only visible use is commented out in the preprocessing
# cell, so it appears to be unused — confirm before removing.
tokenizer = RegexpTokenizer(r'\w+')
from nltk.corpus import words

# Cache for the NLTK English word list: building it is expensive and the
# result never changes between calls.
_ENGLISH_VOCABULARY_CACHE = {}


def _english_vocabulary():
    """Return the NLTK ``words`` corpus as a frozenset, loading it only once."""
    if "vocab" not in _ENGLISH_VOCABULARY_CACHE:
        _ENGLISH_VOCABULARY_CACHE["vocab"] = frozenset(words.words())
    return _ENGLISH_VOCABULARY_CACHE["vocab"]


def tokenize_text(text, name):
    """Turn one document into a list of tokens with English stop words removed.

    The first word decides the processing path: when it is not in the NLTK
    English word list the text is treated as non-English and only split on
    whitespace; otherwise it is tokenized with NLTK and lemmatized.

    Args:
        text: Document text. The caller is expected to have lower-cased it.
        name: Dataset name. Unused; kept so existing callers keep working.

    Returns:
        List of tokens. English stop words are removed on both paths; on the
        English path non-alphanumeric tokens are dropped as well and the
        remaining tokens are lemmatized.
    """
    for val in [":", ",", "?", ".", "!"]:
        text = text.replace(val, "")
    # A newline separates words, so it becomes a space; deleting it (as the
    # code did before) glued the last word of a line to the first of the next.
    text = text.replace("\n", " ")

    stop_words = set(stopwords.words('english'))

    # ``split()`` without arguments skips leading and repeated whitespace, so
    # an empty first token can no longer force the non-English path.
    raw_tokens = text.split()
    first_word = raw_tokens[0] if raw_tokens else ""
    if first_word not in _english_vocabulary():
        # ``isalnum`` is deliberately not applied here: it is False for words
        # containing combining marks (common in Indic scripts), so it would
        # discard most non-English tokens.
        return [token for token in raw_tokens if token not in stop_words]

    lemmatizer = WordNetLemmatizer()
    # The old test ``token.isalnum() not in stop_words`` compared a bool with
    # a set of strings and was therefore always True. Stop words are checked
    # on the raw token, before lemmatization can alter it.
    return [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token.isalnum() and token not in stop_words
    ]

In [None]:
# Kept because other cells may rely on ``copy`` being in the namespace.
import copy

# Lower-case and tokenize each dataset's concatenated text. ``questions`` is
# left untouched so this cell can be re-run from the raw strings.
questions_copy = [
    tokenize_text(text.lower(), dataset_label)
    for text, dataset_label in zip(questions, questions_name)
]
# Filters that were tried and are currently disabled: dropping purely numeric
# tokens and dropping single-character tokens.

# Learn bigrams that occur at least 20 times and append every detected bigram
# (a token containing '_') to its document, alongside the unigrams.
bigram = Phrases(questions_copy, min_count=20)
for doc_tokens in questions_copy:
    detected_bigrams = [token for token in bigram[doc_tokens] if '_' in token]
    doc_tokens.extend(detected_bigrams)

# Create a dictionary representation of the documents.
dictionary = Dictionary(questions_copy)

# Drop tokens that appear in more than 50% of the documents; all other
# filter_extremes arguments keep their library defaults.
# (Previously tried: no_below=5, no_above=0.3.)
dictionary.filter_extremes(no_above=0.50)

# Bag-of-words vector for every document.
corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in questions_copy]

# LDA model

In [None]:
'''# Set training parameters.
num_topics = 20
#chunksize = 2000
passes = 500
iterations = 10000
#eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make an index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    passes=passes,
    iterations=iterations,
    random_state = 0
)'''
# Function to compute coherence values for different number of topics
def compute_coherence_values(dictionary, texts, corpus, limit, start=2, step=3,
                             passes=500, iterations=10000, random_state=0,
                             coherence='c_v'):
    """Train one LDA model per topic count and keep the most coherent one.

    Args:
        dictionary: gensim dictionary, passed as ``id2word`` to the model and
            as ``dictionary`` to the coherence computation.
        texts: Tokenized documents used by the coherence computation.
        corpus: Bag-of-words corpus the models are trained on.
        limit: Exclusive upper bound of the topic counts to try.
        start: First topic count to try.
        step: Increment between topic counts.
        passes: ``passes`` argument forwarded to ``LdaModel``.
        iterations: ``iterations`` argument forwarded to ``LdaModel``.
        random_state: ``random_state`` argument forwarded to ``LdaModel``.
        coherence: Coherence measure name forwarded to ``CoherenceModel``.

    Returns:
        ``(optimal_model, coherence_values)``. ``coherence_values`` holds one
        score per entry of ``range(start, limit, step)``, in order.
        ``optimal_model`` is the model with the highest score, or ``None``
        when the range is empty.
    """
    import math

    coherence_values = []
    optimal_model = None
    best_coherence = float("-inf")
    for num_topics in range(start, limit, step):
        model = LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            passes=passes,
            iterations=iterations,
            random_state=random_state
        )
        coherence_model = CoherenceModel(
            model=model, texts=texts, corpus=corpus,
            dictionary=dictionary, coherence=coherence
        )
        score = coherence_model.get_coherence()
        coherence_values.append(score)
        # Always accept the first model and replace a NaN "best": starting the
        # comparison at 0 used to return None whenever every score was
        # negative or NaN.
        if optimal_model is None or math.isnan(best_coherence) or score > best_coherence:
            best_coherence = score
            optimal_model = model
    return optimal_model, coherence_values

# Topic counts to explore: range(start, limit, step).
# Other ranges that were tried: (7, 8, 1) and (10, 11, 1).
start, limit, step = 4, 5, 1

# Train one model per topic count and collect the coherence scores.
model, coherence_values = compute_coherence_values(
    dictionary=dictionary,
    texts=questions_copy,
    corpus=corpus,
    start=start,
    limit=limit,
    step=step,
)

# Coherence score as a function of the number of topics.
x = range(start, limit, step)
fig, ax = plt.subplots()
ax.plot(x, coherence_values, label="coherence value")
ax.set_xlabel("Number of Topics")
ax.set_ylabel("Coherence Score")
ax.legend()
plt.show()

# The topic count whose model scored highest (first one on ties).
best_position = coherence_values.index(max(coherence_values))
optimal_num_topics = x[best_position]
print(f"Optimal number of topics: {optimal_num_topics}")


# Get Topics

In [None]:

# Step 6: infer the topic mixture of every dataset (one bag-of-words each).
topics = [model[doc] for doc in corpus]


def dominant(x):
    """Return the topic id of the (topic_id, weight) pair with the largest weight."""
    heaviest_pair = max(x, key=lambda item: item[1])
    return heaviest_pair[0]


# Step 7: label every dataset with its dominant topic.
dominant_topic = np.array([dominant(t) for t in topics]).astype(int)

# Bar chart: number of datasets per dominant topic (topics shown 1-based).
value, topic_index, topic_counts = np.unique(
    dominant_topic, return_index=True, return_counts=True
)
topic_labels = value + 1
plt.bar(topic_labels, topic_counts)
plt.xticks(topic_labels)
plt.xlabel('Dominant Topic')
plt.ylabel('Number of Datasets')
plt.title('Distribution of Dominant Topics in Datasets')
# Empty plot call kept as in the original cell: it draws nothing and leaves
# the figure open, so a savefig placed at the end of the cell still sees it.
plt.plot()
# To inspect the top words per topic, iterate
# model.show_topics(formatted=False, num_words=30).

# List the datasets assigned to each topic together with their mixtures.
questions_name = np.array(questions_name)
for topic, count in zip(value, topic_counts):
    indexes = [i for i, assigned in enumerate(dominant_topic) if assigned == topic]
    print("TOPIC ", topic + 1, " count: ", count)
    print("Topic Distribution: ", [topics[i] for i in indexes])
    print("DATASET ", questions_name[indexes])
    print("+" * 100)
#plt.savefig("Test.png")

In [None]:


# Switch pyLDAvis to notebook mode so the visualisation renders inline.
pyLDAvis.enable_notebook()
# Convert the gensim LDA model to a format compatible with pyLDAvis
# NOTE(review): sort_topics=False presumably keeps the topic numbering in the
# visualisation aligned with the model's own topic ids — confirm in the
# pyLDAvis documentation.
vis_data = gensimvis.prepare(model, corpus,dictionary=dictionary,sort_topics=False)
#print(vis_data[1][vis_data[1]["Category"]!="Default"])
'''for term, freq, category,total in zip(vis_data[1]['Term'],vis_data[1]['Freq'],vis_data[1]["Category"],vis_data[1]["Total"]):
    if category != "Default":
        print(term, freq, category, total)'''
# Visualize the topics
# This call must stay the last expression of the cell: its return value is
# what the notebook renders as the cell output.
pyLDAvis.display(vis_data)
#db_index_pca = davies_bouldin_score(vis_data[0]['x'], vis_data[0]['topics'])

In [None]:
# Write the prepared visualisation to "visualization.html" (relative to the
# current working directory) so it can be viewed outside the notebook.
pyLDAvis.save_html(vis_data,"visualization.html")

# WordCloud

In [None]:
from wordcloud import WordCloud

# One word cloud per topic, built from the topic's 200 highest-weighted terms.
for t in range(model.num_topics):
    term_weights = dict(model.show_topic(t, 200))
    cloud_image = WordCloud().fit_words(term_weights)
    plt.figure()
    plt.imshow(cloud_image)
    plt.axis("off")
    plt.title(f"Topic #{t+1}")
    plt.show()

# Analyze the topic

In [None]:
# Get topic weights and dominant topics ------------
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

# Dense (n_documents x n_topics) weight matrix. ``model[doc]`` leaves out
# low-probability topics, which produced ragged rows whose weights landed in
# the wrong columns (and NaN padding in the DataFrame). Here every weight is
# written to the column of its own topic id; topics that are not reported
# stay at 0.
topic_weights = []
for bow in corpus:
    weights = [0.0] * model.num_topics
    for topic_id, weight in model.get_document_topics(bow, minimum_probability=0.0):
        weights[topic_id] = float(weight)
    topic_weights.append(weights)
# Array of topic weights
arr = pd.DataFrame(topic_weights)

# Keep the well separated points (optional)
#arr = arr[np.amax(arr, axis=1) > 0.35]

# Dominant topic number in each doc
topic_num = np.argmax(arr.values, axis=1)
print(len(topic_num))

# tSNE Dimension Reduction
tsne_model = TSNE(n_components=2, random_state=0, angle=.99, init='pca')
tsne_lda = tsne_model.fit_transform(arr)

# Tableau palette; the modulo keeps the lookup valid when there are more
# topics than palette entries.
mycolors = np.array([color for name, color in mcolors.TABLEAU_COLORS.items()])
point_colors = mycolors[topic_num % len(mycolors)]

plt.figure()
plt.scatter(x=tsne_lda[:,0], y=tsne_lda[:,1], color=point_colors)

# Re-plot the first document of each topic with a label so the legend gets
# one entry per topic. The topic ids come from this cell's own ``uq_topics``
# instead of ``value`` from the dominant-topic cell, so ids and indices
# always belong together.
uq_topics, uq_idx = np.unique(topic_num, return_index=True)
print(uq_idx)
for topic_id, idx in zip(uq_topics, uq_idx):
    print(topic_id,idx)
    plt.scatter(x=tsne_lda[idx,0], y=tsne_lda[idx,1], color=point_colors[idx], label=f"Topic #{topic_id+1}: {questions_name[idx]}")
plt.title("t-SNE Clustering of {} LDA Topics".format(optimal_num_topics))
plt.legend()
plt.show()

In [None]:
def filterData(data, threshold):
    """Remove items that occur in too large a share of the entries.

    Args:
        data: Mapping from a key to a list of items.
        threshold: Fraction of entries. An item is removed everywhere when the
            number of entries containing it exceeds ``len(data) * threshold``.

    Returns:
        A new dict with the same keys whose lists keep their original order
        and duplicates, minus the removed items.
    """
    from collections import Counter

    # Number of entries each item appears in; np.unique collapses repeats
    # inside one entry so an item counts once per entry.
    entry_frequency = Counter()
    for key in data:
        for item in np.unique(data[key]):
            entry_frequency[item] += 1

    cutoff = len(data) * threshold
    too_common = {item for item, seen_in in entry_frequency.items() if seen_in > cutoff}

    filtered_data = {}
    for key, items in data.items():
        filtered_data[key] = [item for item in items if item not in too_common]
    return filtered_data
# Per-dataset skill lists produced elsewhere (path is relative to the working
# directory).
with open("result/dataCategory.json", 'r') as openfile:
    dataCategory = json.load(openfile)

# Total number of skill entries before filtering.
total_skill = sum(len(skills) for skills in dataCategory.values())
print(total_skill)

# Drop skills that occur in more than half of the datasets, then recount.
dataCategory = filterData(dataCategory, 0.50)
total_skill = sum(len(skills) for skills in dataCategory.values())
print(total_skill)

In [None]:
def getSkill(uniqueCategory, topic, top=20):
    """Collect the most frequent skills of every dataset, grouped by topic.

    Args:
        uniqueCategory: Mapping from dataset name to its list of skills
            (repeats allowed; they determine the frequency).
        topic: Mapping from a topic id to the dataset names in that topic.
        top: Maximum number of skills kept per dataset (default 20).

    Returns:
        ``{topic_id: {dataset_name: [skill, ...]}}`` where each skill list is
        ordered by descending frequency, ties in ascending skill order.

    Raises:
        KeyError: If a dataset named in ``topic`` is missing from
            ``uniqueCategory``.
    """
    from collections import Counter

    dict_of_lists = {}
    for comm in topic:
        print(comm)
        dict_of_lists[comm] = {}
        for node in topic[comm]:
            # One counting pass instead of a list.count() call per unique
            # skill; the keys are the original skill objects rather than
            # NumPy scalars.
            counts = Counter(uniqueCategory[node])
            ranked = sorted(counts.items(), key=lambda pair: (-pair[1], pair[0]))
            get_node = [skill for skill, _ in ranked[:top]]
            print("\t",node,"\t",get_node)
            dict_of_lists[comm][node] = get_node
    return dict_of_lists
def topSkills(dict_of_lists, top=10):
    """Return the most frequent elements across all lists of a mapping.

    Args:
        dict_of_lists: Mapping whose values are iterables of elements.
        top: Number of elements to return (default 10).

    Returns:
        A list of ``(element, count)`` tuples, most frequent first.
    """
    from collections import Counter

    # Tally every element of every list in one pass.
    tally = Counter()
    for members in dict_of_lists.values():
        for element in members:
            tally[element] += 1

    return tally.most_common(top)
# Group dataset names by dominant topic (1-based topic ids as keys), then
# look up the most frequent skills of every dataset in each group.
questions_name = np.array(questions_name)
topicDataset = {}
for topic, count in zip(value, topic_counts):
    indexes = [i for i, assigned in enumerate(dominant_topic) if assigned == topic]
    members = questions_name[indexes]
    topicDataset[topic + 1] = list(members)
topicSkills = getSkill(dataCategory, topicDataset)


In [None]:
# Print, for every topic, its 20 most frequent skills as "(skill count)".
for topicNumber, skills_by_dataset in topicSkills.items():
    skills = topSkills(skills_by_dataset, 20)
    formatted = [f"({skill_name} {occurrences})" for skill_name, occurrences in skills]
    print(topicNumber, formatted)