In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd



In [2]:
excel_path = "witcher_info.xlsx"  # Change this to your actual file path
xls = pd.ExcelFile(excel_path)

Dialogues_Names = [sheet_name for sheet_name in xls.sheet_names if sheet_name.startswith("D_")] 
Narrative_Objects_Names = [sheet_name for sheet_name in xls.sheet_names if sheet_name.startswith("NO_")]
Narration_Names = [sheet_name for sheet_name in xls.sheet_names if sheet_name.startswith("N_")]

research_list = [Dialogues_Names, Narrative_Objects_Names, Narration_Names]

print("Dialogues Names:", Dialogues_Names)
print("Narrative Objects Names:", Narrative_Objects_Names)
print("Narration Names:", Narration_Names)

Dialogues Names: ['D_NilfgardOficer', "D_Gounter O'Dim", 'D_AfterHym', 'D_LambertBoat', 'D_CiriGeralt', 'D_SkeligeWarriors', 'D_YennGeraltSkellige', 'D_Guslar', 'D_Trolls', 'D_Regis']
Narrative Objects Names: ['NO_Ghoul', 'NO_WampireInterview', 'NO_RedanianPaper', 'NO_OxenfurtBeast', 'NO_TresureLetter', 'NO_WidowerNote', 'NO_NonHuman', 'NO_ArenaCertificate', 'NO_TheaterPlay', 'NO_MapDescription']
Narration Names: ['N_Beginning', 'N_MeetingYennefer', 'N_MiddleOfKeira', 'N_CiriAndCrones', 'N_CorinneTilly', 'N_SkelligeExplosion', 'N_FindingUma', 'N_Battle', "N_Ge'els", 'N_Sunstone']


In [3]:
class ResearchObject:
    def __init__(self, context: str, Prompt: str, participants: list, text_game: str, text_llm: str):
        self.context = context
        self.Prompt = Prompt
        self.participants = participants
        self.text_game = text_game
        self.text_llm = text_llm

    def __repr__(self):
        return f"ResearchObject(context={self.context}, Prompt={self.Prompt}, participants={self.participants}, text_game={self.text_game}, text_llm={self.text_llm})"
    
    def get_texts(self):
        Participants_Text = "" 
        for i in range(len(self.participants)):
            Participants_Text.join(str(self.participants[i]))
        return [self.context, self.Prompt, self.text_game, self.text_llm, Participants_Text]


def create_research_object(xls, sheet_name: str) -> ResearchObject:
    df = pd.read_excel(xls, sheet_name=sheet_name)
    selected_columns = df[['Context', 'Prompt', 'Participants', 'Text Game', 'Text LLM']].copy()
    selected_columns.columns = ['Context', 'Prompt', 'Participants', 'Text Game', 'Text LLM']

    context = selected_columns['Context'].get(0)
    Prompt = selected_columns['Prompt'].get(0)
    text_game = selected_columns['Text Game'].get(0)
    text_llm = selected_columns['Text LLM'].get(0)
    participants = selected_columns['Participants']
    
    return ResearchObject(context, Prompt, participants, text_game, text_llm)






In [4]:
import chromadb
import os
from yake import KeywordExtractor



def get_keywords_similarity(keywords: list[str], main_text: str, collection: chromadb.Collection) -> float:
    if main_text.__class__ == "str" or len(keywords) <= 0 or main_text == None:
        return 0.0

    texts = [main_text]
    for keyword in keywords:
        # Add the keyword to the ChromaDB collection
        keyword_result = collection.query(
            query_texts=keyword[0], # Chroma will embed this for you
            n_results=1 # how many results to return
        )
        texts.append(str(keyword_result['documents'][0]))

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(texts)
    similarity_matrix = cosine_similarity(tfidf_matrix)
    avg_similarity = similarity_matrix[0][1:]
    avg_similarity = sum(avg_similarity) / len(avg_similarity)
    return avg_similarity
    

def extract_and_calculate_keywords_similarity(Text: str) -> float:
    kw_extractor = KeywordExtractor(top=5, stopwords=None, n=3, dedupLim=0.9, features=None)

    script_dir = os.getcwd()
    chroma_client = chromadb.PersistentClient(path=script_dir)
    collection = chroma_client.get_or_create_collection(name="witcher")

    keywords_game = kw_extractor.extract_keywords(Text)
    keywards_similarity = get_keywords_similarity(keywords_game, Text, collection)

    return keywards_similarity




In [5]:
# TF-IDF vectorization
from chromadb.config import Settings
from numpy import matrix, ndarray



def get_martix_for_texts(texts: list[str]) -> ndarray:
    texts = [ str(text) if isinstance(text, str) else "" for text in texts]
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(texts)

    # Cosine similarity matrix
    similarity_matrix = cosine_similarity(tfidf_matrix)
    # print("Cosine Similarity Matrix:\n", similarity_matrix)

    return similarity_matrix


def print_matrix_results(similarity_matrix: ndarray):
    i_Context = 0
    i_Prompt = 1
    i_game = 2
    i_llm = 3
    i_participants = 4
    
    print("Cosine Similarity Results:\n")

    print(f"Context           ↔ Text_Game : {similarity_matrix[i_Context][i_game]:.4f}")
    print(f"Context           ↔ Text_LLM  : {similarity_matrix[i_Context][i_llm]:.4f}")
    print(f"Prompt           ↔ Text_Game : {similarity_matrix[i_Prompt][i_game]:.4f}")
    print(f"Prompt           ↔ Text_LLM  : {similarity_matrix[i_Prompt][i_llm]:.4f}")
    print(f"Text_Game   ↔ Text_LLM  : {similarity_matrix[i_game][i_llm]:.4f}")
    print(f"Participants    ↔ Text_Game : {similarity_matrix[i_participants][i_game]:.4f}")
    print(f"Participants    ↔ Text_LLM  : {similarity_matrix[i_participants][i_llm]:.4f}")

    # Summary
    print("Summary of Cosine Similarities:\n")
    print(f"Context: G-{similarity_matrix[i_Context][i_game]:.4f} LLM-{similarity_matrix[i_Context][i_llm]:.4f}")
    print(f"Prompt: G-{similarity_matrix[i_Prompt][i_game]:.4f} LLM-{similarity_matrix[i_Prompt][i_llm]:.4f}")
    print(f"Participants: G-{similarity_matrix[i_participants][i_game]:.4f} LLM-{similarity_matrix[i_participants][i_llm]:.4f}")

def print_keywords_similarity(game_keywards_similarity, llm_keywards_similarity):
    print("Keywords Similarity Results:\n")

    print(f"Game Keywords Similarity: {game_keywards_similarity:.4f}")
    print(f"LLM Keywords Similarity: {llm_keywards_similarity:.4f}\n")
    print(f"Keywords Similarity: G-{game_keywards_similarity:.4f} LLM-{llm_keywards_similarity:.4f}\n")


def execute_research_on_group(sheet_names: list, xls: pd.ExcelFile, bPrintForAll: bool = True):
    general_matrix: ndarray = None
    for sheet_name in sheet_names:
        research_object = create_research_object(xls, sheet_name)
        game_keywards_similarity = extract_and_calculate_keywords_similarity(research_object.text_game)
        llm_keywards_similarity = extract_and_calculate_keywords_similarity(research_object.text_llm)
        texts = research_object.get_texts()
        similarity_matrix = get_martix_for_texts(texts)

        if general_matrix is None:
            general_matrix = similarity_matrix
        else:
            # Check if matrices have the same shape
            if general_matrix.shape == similarity_matrix.shape:
                general_matrix += similarity_matrix
            else:
                print(f"Skipping addition: matrix shapes do not match ({general_matrix.shape} vs {similarity_matrix.shape})")


        if bPrintForAll:
            print(f"Dialogue Name: {sheet_name}")
            print_matrix_results(similarity_matrix)
            print_keywords_similarity(game_keywards_similarity, llm_keywards_similarity)
        
    general_matrix /= len(sheet_names)
    print("General Cosine Similarity Results:\n")
    print_matrix_results(general_matrix)
    print_keywords_similarity(game_keywards_similarity, llm_keywards_similarity)


for research_group in research_list:
    print(f"\n\nResearch Group: {research_group}")
    execute_research_on_group(research_group, xls, False)





Research Group: ['D_NilfgardOficer', "D_Gounter O'Dim", 'D_AfterHym', 'D_LambertBoat', 'D_CiriGeralt', 'D_SkeligeWarriors', 'D_YennGeraltSkellige', 'D_Guslar', 'D_Trolls', 'D_Regis']
General Cosine Similarity Results:

Cosine Similarity Results:

Context           ↔ Text_Game : 0.3453
Context           ↔ Text_LLM  : 0.3996
Prompt           ↔ Text_Game : 0.2114
Prompt           ↔ Text_LLM  : 0.2533
Text_Game   ↔ Text_LLM  : 0.4856
Participants    ↔ Text_Game : 0.0000
Participants    ↔ Text_LLM  : 0.0000
Summary of Cosine Similarities:

Context: G-0.3453 LLM-0.3996
Prompt: G-0.2114 LLM-0.2533
Participants: G-0.0000 LLM-0.0000
Keywords Similarity Results:

Game Keywords Similarity: 0.1549
LLM Keywords Similarity: 0.2760

Keywords Similarity: G-0.1549 LLM-0.2760



Research Group: ['NO_Ghoul', 'NO_WampireInterview', 'NO_RedanianPaper', 'NO_OxenfurtBeast', 'NO_TresureLetter', 'NO_WidowerNote', 'NO_NonHuman', 'NO_ArenaCertificate', 'NO_TheaterPlay', 'NO_MapDescription']
General Cosine Simi