In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [28]:
import nltk
import pandas as pd
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Define custom old English stopwords
old_english_stopwords = {
    'thy', 'thou', 'thee', 'ye', 'thyself', 'thine', 'art', 'dost', 
    'hast', 'hast', 'wilt', 'shall', 'o', 'lo', 'say', 'come', 'unto','do','go'
}

# Define words to keep even if they have length 3 or less
keep_words = {'day', 'man', 'god'}  # Add any other important short words here

# Preprocessing function
def preprocess_text(text):
    if not isinstance(text, str):
        return []  # If the text is not a string (NaN, float), return an empty list

    # Tokenize the text and convert to lowercase
    tokens = word_tokenize(text.lower())  # Ensure all tokens are in lowercase

    # Combine modern and old English stopwords
    stop_words = set(stopwords.words('english'))
    stop_words.update(old_english_stopwords)

    # Remove stopwords, single-letter words, and non-alphabetical tokens
    # Also remove words with length <= 3, except those in keep_words
    tokens = [
        word for word in tokens 
        if word.isalpha() and 
        word not in stop_words and 
        (len(word) > 3 or word in keep_words)
    ]

    # Lemmatize using spaCy
    doc = nlp(" ".join(tokens))
    lemmatized_tokens = [token.lemma_ for token in doc]

    return lemmatized_tokens

# Load datasets
df_bible = pd.read_csv('/kaggle/input/pr-dataset/final_datasets/Bible.csv')
df_gita = pd.read_csv('/kaggle/input/pr-dataset/final_datasets/Gita.csv')
df_quran = pd.read_csv('/kaggle/input/pr-dataset/final_datasets/Quran.csv')

# Apply preprocessing to the 'Text' columns (checking column existence before processing)
if 'Text' in df_gita.columns:
    df_gita['processed'] = df_gita['Text'].apply(lambda x: " ".join(preprocess_text(x)))

if 't' in df_bible.columns:
    df_bible['processed'] = df_bible['t'].apply(lambda x: " ".join(preprocess_text(x)))

if 'Verse' in df_quran.columns:
    df_quran['processed'] = df_quran['Verse'].apply(lambda x: " ".join(preprocess_text(x)))

# Remove the words 'go' and 'do' from the 'processed' columns
words_to_remove = {'go', 'do'}

def remove_specific_words(text):
    words = text.split()  # Split the text into words
    filtered_words = [word for word in words if word not in words_to_remove]  # Remove 'go' and 'do'
    return " ".join(filtered_words)  # Join the remaining words back into a string

# Apply the removal step to the processed columns
df_gita['processed'] = df_gita['processed'].apply(remove_specific_words)
df_bible['processed'] = df_bible['processed'].apply(remove_specific_words)
df_quran['processed'] = df_quran['processed'].apply(remove_specific_words)

# Replace NaN values with empty string in the processed columns (if any)
df_gita['processed'].fillna("", inplace=True)
df_bible['processed'].fillna("", inplace=True)
df_quran['processed'].fillna("", inplace=True)


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_gita['processed'].fillna("", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_bible['processed'].fillna("", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values 

In [29]:
df_gita.head()

Unnamed: 0,Chapter,Verse,Text,processed
0,1,1,"DHRITARASHTRA: O Sanjaya, tell me what happen...",dhritarashtra sanjaya tell happen kurukshetra ...
1,1,2,SANJAYA: Having surveyed the forces of the Pa...,sanjaya survey force pandava array battle prin...
2,1,3,"O my teacher, look at this mighty army of the ...",teacher look mighty army pandava assemble gift...
3,1,4,There are heroic warriors and great archers wh...,heroic warrior great archer equal bhima arjuna...
4,1,5,"Dhrishtaketu, Chekitana, the valiant king of K...",dhrishtaketu chekitana valiant king kashi puru...


In [30]:
df_quran.head()

Unnamed: 0,Name,Surah,Ayat,Verse,processed
0,The Opening,1,1,"In the name of Allah, the Beneficent, the Merc...",name allah beneficent merciful
1,The Opening,1,2,"Praise be to Allah, Lord of the Worlds,",praise allah lord worlds
2,The Opening,1,3,"The Beneficent, the Merciful.",beneficent merciful
3,The Opening,1,4,"Owner of the Day of Judgment,",owner day judgment
4,The Opening,1,5,Thee (alone) we worship; Thee (alone) we ask f...,alone worship alone help


In [31]:
df_bible.head()

Unnamed: 0,b,c,v,t,processed
0,1,1,1,At the first God made the heaven and the earth.,first god make heaven earth
1,1,1,2,And the earth was waste and without form; and ...,earth waste without form dark face deep spirit...
2,1,1,3,"And God said, Let there be light: and there wa...",god say light light
3,1,1,4,"And God, looking on the light, saw that it was...",god look light good god make division light dark
4,1,1,5,"Naming the light, Day, and the dark, Night. An...",name light day dark night evening morning firs...


In [32]:
# Save the processed DataFrames to CSV files
df_gita.to_csv('/kaggle/working/processed_Gita.csv', index=False)
df_bible.to_csv('/kaggle/working/processed_Bible.csv', index=False)
df_quran.to_csv('/kaggle/working/processed_Quran.csv', index=False)


In [35]:
import pandas as pd

# Load the CSV files
df_gita = pd.read_csv('/kaggle/working/processed_Gita.csv')
df_bible = pd.read_csv('/kaggle/working/processed_Bible.csv')
df_quran = pd.read_csv('/kaggle/working/processed_Quran.csv')

# Function to check for words less than 4 letters in the 'processed' column
def check_short_words(df):
    short_words = []
    for text in df['processed']:  # Assuming 'processed' column contains the text data
        if isinstance(text, str):  # Ensure the text is a string before splitting
            words = text.split()  # Split text into words
            for word in words:
                if len(word) < 4:  # Check if word length is less than 4
                    short_words.append(word)
    return short_words

# Checking for short words in the 'processed' column of each dataframe
short_words_gita = check_short_words(df_gita)
short_words_bible = check_short_words(df_bible)
short_words_quran = check_short_words(df_quran)

# Print results
print(f"Words less than 4 letters in Gita: {short_words_gita}")
print(f"Words less than 4 letters in Bible: {short_words_bible}")
print(f"Words less than 4 letters in Quran: {short_words_quran}")


Words less than 4 letters in Gita: ['son', 'old', 'son', 'see', 'son', 'say', 'son', 'see', 'son', 'tie', 'son', 'son', 'not', 'say', 'son', 'eye', 'beg', 'not', 'son', 'sap', 'god', 'lie', 'see', 'die', 'one', 'man', 'bad', 'say', 'say', 'man', 'see', 'man', 'day', 'say', 'use', 'act', 'sin', 'man', 'law', 'law', 'one', 'one', 'arm', 'god', 'cle', 'act', 'fix', 'god', 'vow', 'man', 'eat', 'let', 'eye', 'act', 'sin', 'eye', 'eye', 'fix', 'eat', 'see', 'joy', 'try', 'try', 'man', 'man', 'man', 'low', 'man', 'see', 'low', 'god', 'god', 'god', 'die', 'fix', 'law', 'day', 'end', 'end', 'day', 'day', 'day', 'say', 'law', 'law', 'god', 'god', 'sag', 'god', 'god', 'sag', 'god', 'god', 'god', 'god', 'way', 'god', 'god', 'god', 'god', 'god', 'god', 'see', 'god', 'see', 'eye', 'sun', 'god', 'god', 'bow', 'god', 'arm', 'eye', 'eye', 'god', 'god', 'eye', 'arm', 'leg', 'eye', 'son', 'jaw', 'jaw', 'bow', 'sag', 'god', 'god', 'god', 'god', 'god', 'say', 'say', 'sit', 'eat', 'see', 'see', 'god', 'god'

# TOPIC MODELLING

In [36]:
!pip install bertopic


Collecting bertopic
  Downloading bertopic-0.16.4-py3-none-any.whl.metadata (23 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.39-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting sentence-transformers>=0.4.1 (from bertopic)
  Downloading sentence_transformers-3.3.0-py3-none-any.whl.metadata (10 kB)
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting pynndescent>=0.5 (from umap-learn>=0.5.0->bertopic)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Downloading bertopic-0.16.4-py3-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.7/143.7 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading hdbscan-0.8.39-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m56.5 MB/s[0m eta [36m0:00:00[0m00:0

# QURAN

In [71]:
from bertopic import BERTopic
import pandas as pd

# Load the Quran dataset
df_quran = pd.read_csv('/kaggle/input/processed-religious1/processed_Quran.csv')

# Remove NaN values and convert all values in 'processed' to strings
df_quran['processed'] = df_quran['processed'].fillna('').astype(str)

# Initialize BERTopic with optimizations
#topic_model = BERTopic(nr_topics=15,min_topic_size=15)
topic_model_quran = BERTopic(nr_topics=15,top_n_words=10)
# Apply BERTopic to the cleaned 'processed' column
topics_quran, probs_quran = topic_model_quran.fit_transform(df_quran['processed'].tolist())

topic_info_quran = topic_model_quran.get_topic_info()
print(topic_info_quran)


# Save topic information to a CSV file
topic_info_quran.to_csv('quran_topic_info.csv', index=False)

print("Topic information saved as quran_topic_info.csv")

    Topic  Count                                  Name  \
0      -1   2313                -1_allah_lord_hath_say   
1       0   2645                 0_allah_lord_say_hath   
2       1    758                 1_day_doom_fire_night   
3       2     94            2_deny_favour_promise_lord   
4       3     86            3_warner_cattle_bear_plain   
5       4     67  4_create_generation_destroy_creation   
6       5     55           5_fruit_grape_thereof_olive   
7       6     40            6_couch_blind_recline_deaf   
8       7     37            7_measure_draw_scale_weigh   
9       8     33           8_water_boiling_gush_spring   
10      9     30              9_hand_right_record_book   
11     10     26            10_nineteen_doer_also_full   
12     11     20          11_dust_bone_become_forsooth   
13     12     18         12_magic_wizard_serpent_fling   
14     13     14         13_household_stay_save_behind   

                                       Representation  \
0   [allah, lo

In [73]:
fig = topic_model_quran.visualize_topics()
fig.write_html("bertopic_quran_topics.html")

# Display link to open the HTML file
from IPython.display import FileLink
FileLink("bertopic_quran_topics.html")

In [75]:
# Generate the bar chart visualization and save it
bar_chart = topic_model_quran.visualize_barchart(top_n_topics=10)
bar_chart.write_html("bertopic_quran_barchart.html")

# Display a link to open the HTML file
from IPython.display import FileLink
FileLink("bertopic_quran_barchart.html")

# GITA

In [76]:
from bertopic import BERTopic
import pandas as pd

# Load the Gita dataset
df_gita = pd.read_csv('/kaggle/input/processed-religious1/processed_Gita.csv')

# Remove NaN values and convert all values in 'processed' to strings
df_gita['processed'] = df_gita['processed'].fillna('').astype(str)

# Initialize BERTopic with optimizations
topic_model_gita = BERTopic(nr_topics=15, top_n_words=10)

# Apply BERTopic to the cleaned 'processed' column
topics_gita, probs_gita = topic_model_gita.fit_transform(df_gita['processed'].tolist())

# Get and print topic information for Gita
topic_info_gita = topic_model_gita.get_topic_info()
print("Gita Topic Information:")
print(topic_info_gita)

# Save topic information to a CSV file
topic_info_gita.to_csv('gita_topic_info.csv', index=False)

print("Topic information saved as gita_topic_info.csv")


Gita Topic Information:
    Topic  Count                                Name  \
0      -1    178       -1_among_creature_action_path   
1       0    172        0_arjuna_krishna_yoga_divine   
2       1     12            1_rajas_tama_sattva_bind   
3       2     26         2_guna_prakriti_action_bear   
4       3     11       3_dharma_family_destroy_unity   
5       4     11         4_enemy_friend_destroy_evil   
6       5     16  5_wisdom_light_knowledge_ignorance   
7       6     27       6_love_devotion_faith_worship   
8       7     72    7_attachment_work_selfish_desire   
9       8     74    8_meditation_mind_practice_sense   
10      9     29    9_brahman_sacrifice_offer_attain   
11     10     50      10_lord_creature_birth_supreme   
12     11     21        11_fire_mouth_light_radiance   

                                       Representation  \
0   [among, creature, action, path, worship, death...   
1   [arjuna, krishna, yoga, divine, word, sanjaya,...   
2   [rajas, tama, sa

In [77]:
fig = topic_model_gita.visualize_topics()
fig.write_html("bertopic_gita_topics.html")

# Display link to open the HTML file
from IPython.display import FileLink
FileLink("bertopic_gita_topics.html")

In [78]:
# Generate the bar chart visualization and save it
bar_chart = topic_model_gita.visualize_barchart(top_n_topics=10)
bar_chart.write_html("bertopic_gita_barchart.html")

# Display a link to open the HTML file
from IPython.display import FileLink
FileLink("bertopic_gita_barchart.html")

# BIBLE

In [79]:
from bertopic import BERTopic
import pandas as pd

# Load the Bible dataset
df_bible = pd.read_csv('/kaggle/input/processed-religious1/processed_Bible.csv')

# Remove NaN values and convert all values in 'processed' to strings
df_bible['processed'] = df_bible['processed'].fillna('').astype(str)

# Initialize BERTopic with optimizations
#topic_model_bible = BERTopic(nr_topics=15, min_topic_size=15)
topic_model_bible = BERTopic(nr_topics=15, top_n_words=10)

# Apply BERTopic to the cleaned 'processed' column
topics_bible, probs_bible = topic_model_bible.fit_transform(df_bible['processed'].tolist())

# Get and print topic information for Bible
topic_info_bible = topic_model_bible.get_topic_info()
print("Bible Topic Information:")
print(topic_info_bible)

# Save topic information to a CSV file
topic_info_bible.to_csv('bible_topic_info.csv', index=False)

print("Topic information saved as bible_topic_info.csv")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Bible Topic Information:
    Topic  Count                               Name  \
0      -1  12261              -1_give_make_lord_say   
1       0  15500               0_say_lord_give_make   
2       1   1046            1_sheep_fruit_wine_bird   
3       2    433         2_gold_silver_stone_shekel   
4       3    378          3_tent_light_dark_meeting   
5       4    316    4_hundred_thousand_number_month   
6       5    301            5_water_boat_wind_river   
7       6    272      6_circumcision_robe_hair_head   
8       7    185       7_perfume_burn_fire_offering   
9       8    176            8_cubit_wide_side_board   
10      9    106  9_pilate_hivite_hittite_macedonia   
11     10     55    10_breast_part_body_breastplate   
12     11     49                  11_second_truly__   
13     12     13       12_weight_scale_unequal_true   
14     13     12   13_cord_frequently_twisted_twist   

                                       Representation  \
0   [give, make, lord, say, take, god

In [80]:
fig = topic_model_bible.visualize_topics()
fig.write_html("bertopic_bible_topics.html")

# Display link to open the HTML file
from IPython.display import FileLink
FileLink("bertopic_bible_topics.html")

In [81]:
# Generate the bar chart visualization and save it
bar_chart = topic_model_bible.visualize_barchart(top_n_topics=10)
bar_chart.write_html("bertopic_bible_barchart.html")

# Display a link to open the HTML file
from IPython.display import FileLink
FileLink("bertopic_bible_barchart.html")

# GITA vs BIBLE

In [82]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Load topic info for Bible and Gita (from your BERTopic outputs)
topic_info_bible = pd.read_csv('bible_topic_info.csv')  # Assuming you have saved this file
topic_info_gita = pd.read_csv('gita_topic_info.csv')    # You already saved this file

# Initialize the SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Extract relevant columns for Bible and Gita topic info
bible_topics = topic_info_bible['Topic'].tolist()
bible_keywords = topic_info_bible['Representation'].apply(eval).tolist()  # Convert string lists back to list

gita_topics = topic_info_gita['Topic'].tolist()
gita_keywords = topic_info_gita['Representation'].apply(eval).tolist()  # Convert string lists back to list

# Encode the keywords for Bible topics
bible_keywords_encoded = [model.encode(' '.join(keywords)) for keywords in bible_keywords]

# Encode the keywords for Gita topics
gita_keywords_encoded = [model.encode(' '.join(keywords)) for keywords in gita_keywords]

# Create an empty list to store the results
similarity_results = []

# Calculate cosine similarity between each Bible topic and each Gita topic
for bible_idx, bible_embedding in enumerate(bible_keywords_encoded):
    for gita_idx, gita_embedding in enumerate(gita_keywords_encoded):
        # Calculate the cosine similarity between the two topic embeddings
        similarity_score = cosine_similarity([bible_embedding], [gita_embedding])[0][0]
        
        # Add the results to the list
        similarity_results.append({
            'topic number of Bible': bible_topics[bible_idx],
            'key words of that topic (Bible)': ', '.join(bible_keywords[bible_idx]),
            'topic number of Gita': gita_topics[gita_idx],
            'key words of that topic (Gita)': ', '.join(gita_keywords[gita_idx]),
            'similarity score': similarity_score
        })

# Convert the list of results into a DataFrame
similarity_df = pd.DataFrame(similarity_results)

# Sort the DataFrame by similarity score in descending order
similarity_df_sorted = similarity_df.sort_values(by='similarity score', ascending=False)

# Display the sorted similarity dataframe
print(similarity_df_sorted)

# Optionally, save the sorted dataframe to a CSV
similarity_df_sorted.to_csv('sorted_topic_similarity_bible_gita.csv', index=False)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

     topic number of Bible                    key words of that topic (Bible)  \
64                       3  tent, light, dark, meeting, cloud, night, shin...   
150                     10  breast, part, body, breastplate, milk, birth, ...   
116                      7  perfume, burn, fire, offering, flame, sweet, s...   
14                       0  say, lord, give, make, take, god, man, king, c...   
1                       -1  give, make, lord, say, take, god, israel, king...   
..                     ...                                                ...   
121                      8  cubit, wide, side, board, cover, doorway, wing...   
82                       5  water, boat, wind, river, ship, limit, east, w...   
43                       2  gold, silver, stone, shekel, brass, pillar, pl...   
119                      8  cubit, wide, side, board, cover, doorway, wing...   
117                      8  cubit, wide, side, board, cover, doorway, wing...   

     topic number of Gita  

# GITA vs QURAN

In [83]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Load topic info for Gita and Quran (from your BERTopic outputs)
topic_info_gita = pd.read_csv('gita_topic_info.csv')    # Assuming you have saved this file
topic_info_quran = pd.read_csv('quran_topic_info.csv')  # Assuming you have saved this file

# Initialize the SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Extract relevant columns for Gita and Quran topic info
gita_topics = topic_info_gita['Topic'].tolist()
gita_keywords = topic_info_gita['Representation'].apply(eval).tolist()  # Convert string lists back to list

quran_topics = topic_info_quran['Topic'].tolist()
quran_keywords = topic_info_quran['Representation'].apply(eval).tolist()  # Convert string lists back to list

# Encode the keywords for Gita topics
gita_keywords_encoded = [model.encode(' '.join(keywords)) for keywords in gita_keywords]

# Encode the keywords for Quran topics
quran_keywords_encoded = [model.encode(' '.join(keywords)) for keywords in quran_keywords]

# Create an empty list to store the results
similarity_results = []

# Calculate cosine similarity between each Gita topic and each Quran topic
for gita_idx, gita_embedding in enumerate(gita_keywords_encoded):
    for quran_idx, quran_embedding in enumerate(quran_keywords_encoded):
        # Calculate the cosine similarity between the two topic embeddings
        similarity_score = cosine_similarity([gita_embedding], [quran_embedding])[0][0]
        
        # Add the results to the list
        similarity_results.append({
            'topic number of Gita': gita_topics[gita_idx],
            'key words of that topic (Gita)': ', '.join(gita_keywords[gita_idx]),
            'topic number of Quran': quran_topics[quran_idx],
            'key words of that topic (Quran)': ', '.join(quran_keywords[quran_idx]),
            'similarity score': similarity_score
        })

# Convert the list of results into a DataFrame
similarity_df = pd.DataFrame(similarity_results)

# Sort the DataFrame by similarity score in descending order
similarity_df_sorted = similarity_df.sort_values(by='similarity score', ascending=False)

# Display the sorted similarity dataframe
print(similarity_df_sorted)

# Optionally, save the sorted dataframe to a CSV
similarity_df_sorted.to_csv('sorted_topic_similarity_gita_quran.csv', index=False)



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

     topic number of Gita                     key words of that topic (Gita)  \
90                      5  wisdom, light, knowledge, ignorance, live, peo...   
108                     6  love, devotion, faith, worship, honor, dear, w...   
36                      1  rajas, tama, sattva, bind, fruit, prevail, pre...   
16                      0  arjuna, krishna, yoga, divine, word, sanjaya, ...   
91                      5  wisdom, light, knowledge, ignorance, live, peo...   
..                    ...                                                ...   
94                      5  wisdom, light, knowledge, ignorance, live, peo...   
23                      0  arjuna, krishna, yoga, divine, word, sanjaya, ...   
129                     7  attachment, work, selfish, desire, free, pleas...   
54                      2  guna, prakriti, action, bear, doer, purusha, s...   
173                    10  lord, creature, birth, supreme, creation, univ...   

     topic number of Quran             

# QURAN vs BIBLE

In [84]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Load topic info for Bible and Quran (from your BERTopic outputs)
topic_info_bible = pd.read_csv('bible_topic_info.csv')  # Assuming you have saved this file
topic_info_quran = pd.read_csv('quran_topic_info.csv')  # Assuming you have saved this file

# Initialize the SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Extract relevant columns for Bible and Quran topic info
bible_topics = topic_info_bible['Topic'].tolist()
bible_keywords = topic_info_bible['Representation'].apply(eval).tolist()  # Convert string lists back to list

quran_topics = topic_info_quran['Topic'].tolist()
quran_keywords = topic_info_quran['Representation'].apply(eval).tolist()  # Convert string lists back to list

# Encode the keywords for Bible topics
bible_keywords_encoded = [model.encode(' '.join(keywords)) for keywords in bible_keywords]

# Encode the keywords for Quran topics
quran_keywords_encoded = [model.encode(' '.join(keywords)) for keywords in quran_keywords]

# Create an empty list to store the results
similarity_results = []

# Calculate cosine similarity between each Bible topic and each Quran topic
for bible_idx, bible_embedding in enumerate(bible_keywords_encoded):
    for quran_idx, quran_embedding in enumerate(quran_keywords_encoded):
        # Calculate the cosine similarity between the two topic embeddings
        similarity_score = cosine_similarity([bible_embedding], [quran_embedding])[0][0]
        
        # Add the results to the list
        similarity_results.append({
            'topic number of Bible': bible_topics[bible_idx],
            'key words of that topic (Bible)': ', '.join(bible_keywords[bible_idx]),
            'topic number of Quran': quran_topics[quran_idx],
            'key words of that topic (Quran)': ', '.join(quran_keywords[quran_idx]),
            'similarity score': similarity_score
        })

# Convert the list of results into a DataFrame
similarity_df = pd.DataFrame(similarity_results)

# Sort the DataFrame by similarity score in descending order
similarity_df_sorted = similarity_df.sort_values(by='similarity score', ascending=False)

# Display the sorted similarity dataframe
print(similarity_df_sorted)

# Optionally, save the sorted dataframe to a CSV
similarity_df_sorted.to_csv('sorted_topic_similarity_bible_quran.csv', index=False)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

     topic number of Bible                    key words of that topic (Bible)  \
203                     12  weight, scale, unequal, true, measure, importa...   
36                       1  sheep, fruit, wine, bird, grain, tree, take, b...   
1                       -1  give, make, lord, say, take, god, israel, king...   
16                       0  say, lord, give, make, take, god, man, king, c...   
15                       0  say, lord, give, make, take, god, man, king, c...   
..                     ...                                                ...   
164                      9  pilate, hivite, hittite, macedonia, canaanite,...   
215                     13  cord, frequently, twisted, twist, loose, orion...   
137                      8  cubit, wide, side, board, cover, doorway, wing...   
102                      5  water, boat, wind, river, ship, limit, east, w...   
159                      9  pilate, hivite, hittite, macedonia, canaanite,...   

     topic number of Quran 