### **1. Loading Data**

In [21]:
import pandas as pd
import numpy as np
import os

In [22]:
import string
def preprocessText(text:str)->list[str]:
  """Split raw text into lower-cased, punctuation-free sentences.

  Newlines and full stops are both treated as sentence boundaries. Each
  fragment has every character in ``string.punctuation`` removed and its
  surrounding whitespace trimmed.

  Args:
    text: Raw text to split.

  Returns:
    The cleaned sentences in their original order. Fragments that are
    empty once punctuation and whitespace have been removed are dropped.
  """
  # Deletion table: every punctuation character maps to None.
  translate_table = str.maketrans('', '', string.punctuation)
  normalized = text.lower().replace('\n', '.')
  cleaned = (
    fragment.translate(translate_table).strip()
    for fragment in normalized.split('.')
  )
  # Filter AFTER cleaning so whitespace-only or punctuation-only fragments
  # (e.g. the " " between ". " and a following newline) do not survive as "".
  return [sentence for sentence in cleaned if sentence]

In [23]:
# Gather the sentences of every file first and build the DataFrame once;
# concatenating inside the loop re-copies all earlier rows on each iteration.
all_sentences = []
# os.listdir returns entries in arbitrary order; sorting keeps the row
# numbering (and everything derived from it) reproducible between runs.
for file in sorted(os.listdir('text_data')):
  path = os.path.join('text_data', file)
  if not os.path.isfile(path):
    # Skip sub-directories (for example notebook checkpoint folders).
    continue
  # The context manager closes the file handle as soon as it has been read.
  with open(path, 'r') as handle:
    text = handle.read()
  sentences = preprocessText(text)
  all_sentences.extend(sentences)
# A single construction yields a fresh 0..n-1 index, so no reset is needed.
df = pd.DataFrame(all_sentences, columns=['sentence'])
df.head(10)

Unnamed: 0,sentence
0,regular exercise is crucial for maintaining ov...
1,a balanced diet rich in fruits and vegetables ...
2,mental health is as important as physical heal...
3,good sleep hygiene including consistent sleep ...
4,practices like yoga and meditation are effecti...
5,climate change is one of the most pressing iss...
6,renewable energy sources like solar and wind a...
7,conservation efforts are vital to protect enda...
8,reducing plastic waste is critical to preservi...
9,sustainable agriculture practices can help ens...


### **2. Loading the model**

In [24]:
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
# Identifier of the pretrained sentence-embedding checkpoint to load.
MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
model = SentenceTransformer(MODEL_NAME)

In [25]:
# Hand encode() a plain list of strings rather than the pandas Series, so the
# call does not depend on the Series' index labels happening to be 0..n-1.
embeddings = model.encode(df['sentence'].tolist())
# Store one plain Python list per row so each cell holds that sentence's vector.
df['embeddings'] = embeddings.tolist()
df.head()

Unnamed: 0,sentence,embeddings
0,regular exercise is crucial for maintaining ov...,"[0.02462807297706604, 0.06498054414987564, 0.0..."
1,a balanced diet rich in fruits and vegetables ...,"[-0.016917672008275986, 0.04941205307841301, 0..."
2,mental health is as important as physical heal...,"[0.05661291629076004, 0.08747770637273788, 0.0..."
3,good sleep hygiene including consistent sleep ...,"[0.04327883943915367, 0.05800575762987137, 0.0..."
4,practices like yoga and meditation are effecti...,"[0.08963283151388168, 0.05326443538069725, 0.0..."


### **3. Ordering the sentences**

In [26]:
from sklearn.metrics.pairwise import cosine_similarity


def compute_similarity_matrix(embeddings):
    """Return the pairwise cosine-similarity matrix of ``embeddings``.

    Thin wrapper around ``cosine_similarity`` called with a single argument,
    so every row is compared against every row (including itself).
    """
    pairwise_scores = cosine_similarity(embeddings)
    return pairwise_scores


similarity_matrix = compute_similarity_matrix(embeddings)
similarity_matrix.shape

(15, 15)

In [27]:
from sklearn.preprocessing import MinMaxScaler

def normalize_similarities(similarity_matrix):
    """Min-max scale all entries of ``similarity_matrix`` together.

    The matrix is unrolled into a single column so the scaler sees every
    entry as one feature (one shared minimum and maximum), and the scaled
    column is then folded back into the matrix's original shape.
    """
    original_shape = similarity_matrix.shape
    single_column = similarity_matrix.flatten().reshape(-1, 1)
    scaled_column = MinMaxScaler().fit_transform(single_column)
    return scaled_column.reshape(original_shape)

normalized_values = normalize_similarities(similarity_matrix)

# One Python list per matrix row, so each DataFrame cell holds that
# sentence's scaled similarity to every sentence.
normalized_lists = list(map(list, normalized_values))
df["similarity"] = normalized_lists

df.head()

Unnamed: 0,sentence,embeddings,similarity
0,regular exercise is crucial for maintaining ov...,"[0.02462807297706604, 0.06498054414987564, 0.0...","[0.9999993, 0.37767592, 0.64685756, 0.33952233..."
1,a balanced diet rich in fruits and vegetables ...,"[-0.016917672008275986, 0.04941205307841301, 0...","[0.37767592, 0.9999997, 0.2972488, 0.2621151, ..."
2,mental health is as important as physical heal...,"[0.05661291629076004, 0.08747770637273788, 0.0...","[0.64685756, 0.2972488, 0.9999998, 0.3684806, ..."
3,good sleep hygiene including consistent sleep ...,"[0.04327883943915367, 0.05800575762987137, 0.0...","[0.33952233, 0.2621151, 0.3684806, 0.9999998, ..."
4,practices like yoga and meditation are effecti...,"[0.08963283151388168, 0.05326443538069725, 0.0...","[0.5264407, 0.25793368, 0.61127603, 0.39486274..."


In [28]:
# Object-typed placeholder columns; each row is filled in when the walk
# below reaches it.
df["most_similar"] = pd.Series(np.zeros(len(df)), dtype=object)
df["most_similar_score"] = pd.Series(np.zeros(len(df)), dtype=object)
most_similar = []
most_similar_score = []

visited = set()

# Greedy walk: start at sentence 0 and keep hopping to the not-yet-visited
# sentence with the highest similarity to the current one.
current_index = 0

while current_index is not None:
    visited.add(current_index)

    row_scores = df["similarity"][current_index]

    # Scan for the best unvisited candidate. The strict ">" keeps the
    # earliest index when several candidates share the top score.
    best_idx = None
    best_score = None
    for idx, sim in enumerate(row_scores):
        if idx == current_index or idx in visited or sim is None:
            continue
        if best_idx is None or sim > best_score:
            best_idx, best_score = idx, sim

    if best_idx is None:
        # Nothing left to visit: the final sentence points at itself with a
        # score of 1, which marks the end of the chain.
        best_idx, best_score = current_index, 1
        upcoming_index = None
    else:
        upcoming_index = best_idx

    most_similar.append(best_idx)
    most_similar_score.append(best_score)
    df.loc[current_index, 'most_similar'] = best_idx
    df.loc[current_index, 'most_similar_score'] = best_score
    current_index = upcoming_index


# Successor indices become a nullable integer column.
df['most_similar'] = df['most_similar'].astype('Int64')

df.head()

Unnamed: 0,sentence,embeddings,similarity,most_similar,most_similar_score
0,regular exercise is crucial for maintaining ov...,"[0.02462807297706604, 0.06498054414987564, 0.0...","[0.9999993, 0.37767592, 0.64685756, 0.33952233...",2,0.646858
1,a balanced diet rich in fruits and vegetables ...,"[-0.016917672008275986, 0.04941205307841301, 0...","[0.37767592, 0.9999997, 0.2972488, 0.2621151, ...",9,0.488635
2,mental health is as important as physical heal...,"[0.05661291629076004, 0.08747770637273788, 0.0...","[0.64685756, 0.2972488, 0.9999998, 0.3684806, ...",4,0.611276
3,good sleep hygiene including consistent sleep ...,"[0.04327883943915367, 0.05800575762987137, 0.0...","[0.33952233, 0.2621151, 0.3684806, 0.9999998, ...",1,0.262115
4,practices like yoga and meditation are effecti...,"[0.08963283151388168, 0.05326443538069725, 0.0...","[0.5264407, 0.25793368, 0.61127603, 0.39486274...",3,0.394863


In [29]:
# Follow the "most_similar" pointers starting from sentence 0 and stop at the
# sentence that points to itself, which marks the end of the chain.
new_order = [0]
current_index = 0
while (next_index := df["most_similar"][current_index]) != current_index:
    new_order.append(next_index)
    current_index = next_index
# Rows are laid out in the order in which the chain visits them.
df_reordered = df.reindex(new_order)
df_reordered.head()

Unnamed: 0,sentence,embeddings,similarity,most_similar,most_similar_score
0,regular exercise is crucial for maintaining ov...,"[0.02462807297706604, 0.06498054414987564, 0.0...","[0.9999993, 0.37767592, 0.64685756, 0.33952233...",2,0.646858
2,mental health is as important as physical heal...,"[0.05661291629076004, 0.08747770637273788, 0.0...","[0.64685756, 0.2972488, 0.9999998, 0.3684806, ...",4,0.611276
4,practices like yoga and meditation are effecti...,"[0.08963283151388168, 0.05326443538069725, 0.0...","[0.5264407, 0.25793368, 0.61127603, 0.39486274...",3,0.394863
3,good sleep hygiene including consistent sleep ...,"[0.04327883943915367, 0.05800575762987137, 0.0...","[0.33952233, 0.2621151, 0.3684806, 0.9999998, ...",1,0.262115
1,a balanced diet rich in fruits and vegetables ...,"[-0.016917672008275986, 0.04941205307841301, 0...","[0.37767592, 0.9999997, 0.2972488, 0.2621151, ...",9,0.488635


### **4. Adding color and generating HTML**

In [30]:
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
# Spread the reordered rows evenly over the colormap: the first row gets the
# colour at 0.0 and the last row the colour at 1.0.
num_rows = len(df_reordered)
# Size the gradient from the frame it is assigned to, so the colour list
# always matches the number of rows receiving a colour.
values = np.linspace(0, 1, num_rows)
colormap = plt.cm.coolwarm
# The colormap returns a colour tuple per value; convert each to a hex string.
hex_colors = [mcolors.to_hex(colormap(value)) for value in values]

df_reordered['color'] = hex_colors

df_reordered


Unnamed: 0,sentence,embeddings,similarity,most_similar,most_similar_score,color
0,regular exercise is crucial for maintaining ov...,"[0.02462807297706604, 0.06498054414987564, 0.0...","[0.9999993, 0.37767592, 0.64685756, 0.33952233...",2,0.646858,#3b4cc0
2,mental health is as important as physical heal...,"[0.05661291629076004, 0.08747770637273788, 0.0...","[0.64685756, 0.2972488, 0.9999998, 0.3684806, ...",4,0.611276,#506bda
4,practices like yoga and meditation are effecti...,"[0.08963283151388168, 0.05326443538069725, 0.0...","[0.5264407, 0.25793368, 0.61127603, 0.39486274...",3,0.394863,#6788ee
3,good sleep hygiene including consistent sleep ...,"[0.04327883943915367, 0.05800575762987137, 0.0...","[0.33952233, 0.2621151, 0.3684806, 0.9999998, ...",1,0.262115,#80a3fa
1,a balanced diet rich in fruits and vegetables ...,"[-0.016917672008275986, 0.04941205307841301, 0...","[0.37767592, 0.9999997, 0.2972488, 0.2621151, ...",9,0.488635,#9abbff
9,sustainable agriculture practices can help ens...,"[0.021508904173970222, 0.04652981832623482, -0...","[0.19085667, 0.48863482, 0.2066651, 0.2145147,...",7,0.481228,#b2ccfb
7,conservation efforts are vital to protect enda...,"[-0.025413181632757187, 0.10220841318368912, 0...","[0.18680322, 0.2520537, 0.26891437, 0.19891953...",5,0.442737,#c9d7f0
5,climate change is one of the most pressing iss...,"[0.027152128517627716, 0.05836827680468559, 0....","[0.11248568, 0.18628615, 0.20091856, 0.1333848...",6,0.500932,#dddcdc
6,renewable energy sources like solar and wind a...,"[-0.021640589460730553, 0.15694960951805115, 0...","[0.26903754, 0.19817859, 0.2787015, 0.21799856...",8,0.345602,#edd1c2
8,reducing plastic waste is critical to preservi...,"[-0.03975194692611694, 0.08236020803451538, 0....","[0.12452667, 0.16249985, 0.17878722, 0.1868470...",13,0.117061,#f6bfa6


In [31]:
import pandas as pd
from jinja2 import Environment, FileSystemLoader

# Templates are looked up relative to the current working directory.
env = Environment(loader=FileSystemLoader('.'))
template = env.get_template('./templates/template.html')

# (sentence, colour) pairs in display order, passed to the template.
sentences = list(zip(df_reordered['sentence'], df_reordered['color']))

html_content = template.render(sentences=sentences)

output_path = './output/output.html'
# Create the output directory on a fresh checkout instead of failing on open().
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# Write UTF-8 explicitly so the result does not depend on the platform's
# default encoding.
with open(output_path, 'w', encoding='utf-8') as f:
    f.write(html_content)

print("HTML file generated: output.html")


HTML file generated: output.html
