In [1]:
import pandas as pd
from bertopic import BERTopic

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_excel('GS1.xlsx')
df.sample(3)

Unnamed: 0,Year,Question,Subject,Sub-Topic
189,2014,The New Economic Policy – 1921 of Lenin had in...,History,Post-independe nce Period
18,2023,Why is caste identity in India both fluid and ...,Indian Society,"Communalism, Regionalism & Secularism"
218,2013,“American Revolution was an economic revolt ag...,History,Colonization and decolonization


In [6]:
questions = df['Question'].values.tolist()
subjects = df['Subject'].values.tolist()

In [7]:
unique_subjects = list(set(subjects))

In [8]:
unique_subjects

['Indian Art and Culture', 'History', 'Geography', 'Indian Society']

In [14]:
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
sub_topics_encoded = labelencoder.fit_transform(subjects)


In [15]:
sub_topics_encoded

array([1, 1, 1, 0, 0, 0, 0, 3, 3, 3, 1, 1, 1, 0, 0, 0, 0, 3, 3, 3, 1, 1,
       1, 0, 0, 0, 0, 3, 3, 3, 1, 1, 1, 0, 0, 0, 0, 1, 3, 3, 1, 1, 1, 0,
       0, 3, 0, 0, 3, 3, 1, 1, 1, 0, 0, 0, 0, 3, 3, 3, 1, 1, 1, 0, 0, 0,
       0, 3, 3, 3, 1, 1, 1, 0, 0, 0, 0, 3, 3, 3, 1, 1, 1, 0, 0, 0, 0, 3,
       3, 3, 1, 1, 1, 0, 0, 0, 0, 3, 3, 3, 2, 1, 1, 0, 0, 0, 0, 3, 3, 3,
       1, 1, 1, 0, 0, 0, 0, 3, 3, 3, 2, 1, 1, 1, 0, 0, 0, 0, 3, 3, 3, 1,
       1, 0, 0, 0, 0, 3, 3, 3, 2, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 0, 0,
       0, 0, 0, 0, 0, 0, 2, 2, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 0, 0, 0,
       0, 0, 0, 0, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [11]:
questions[0]

'Explain the role of geographical factors towards the development of Ancient India.'

In [12]:
model = BERTopic(language="english", calculate_probabilities=True)

In [13]:
from sentence_transformers import SentenceTransformer
sentence_model = SentenceTransformer('all-mpnet-base-v2')


from bertopic.representation import KeyBERTInspired
from bertopic.representation import MaximalMarginalRelevance
from bertopic.representation import PartOfSpeech
representation = [PartOfSpeech("en_core_web_sm"),MaximalMarginalRelevance(diversity=0.2),KeyBERTInspired(top_n_words=30)]


from sklearn.feature_extraction.text import CountVectorizer
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 3))

from bertopic.vectorizers import ClassTfidfTransformer
ctfidf_model = ClassTfidfTransformer(bm25_weighting=True,reduce_frequent_words=True)

model = BERTopic(embedding_model=sentence_model,vectorizer_model=vectorizer_model,representation_model=representation,ctfidf_model=ctfidf_model,verbose=True,calculate_probabilities=True, nr_topics='auto', min_topic_size=5)



In [16]:
topics, probs = model.fit_transform(questions, y = sub_topics_encoded)

2024-07-17 17:19:06,825 - BERTopic - Embedding - Transforming documents to embeddings.


Batches: 100%|██████████| 8/8 [00:15<00:00,  1.93s/it]
2024-07-17 17:19:22,352 - BERTopic - Embedding - Completed ✓
2024-07-17 17:19:22,352 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-17 17:19:52,895 - BERTopic - Dimensionality - Completed ✓
2024-07-17 17:19:52,897 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-07-17 17:19:52,957 - BERTopic - Cluster - Completed ✓
2024-07-17 17:19:52,958 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-07-17 17:20:00,678 - BERTopic - Representation - Completed ✓
2024-07-17 17:20:00,679 - BERTopic - Topic reduction - Reducing number of topics
2024-07-17 17:20:08,129 - BERTopic - Topic reduction - Reduced number of topics from 4 to 4


In [17]:
model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,93,0_resources india_india discuss_development in...,"[resources india, india discuss, development i...",[The interlinking of rivers can provide viable...
1,1,72,1_indian independence_india discuss_colonial i...,"[indian independence, india discuss, colonial ...",[Why did the armies of the British East India ...
2,2,60,2_indian society maintain_indian society_diver...,"[indian society maintain, indian society, dive...",[Globalisation is generally said to promote cu...
3,3,10,3_indus valley civilization_planning culture i...,"[indus valley civilization, planning culture i...",[The ancient civilization in Indian sub contin...


In [18]:
model.visualize_topics()

In [19]:
model.visualize_documents(questions)

In [20]:
model.save("GS1_model_supervised")



In [21]:
# model.reduce_topics(questions,nr_topics=10)

In [22]:
unique_subjects

['Indian Art and Culture', 'History', 'Geography', 'Indian Society']

In [27]:
history = df[df['Subject'] == 'History']['Question'].values.tolist()
geography = df[df['Subject'] == 'Geography']['Question'].values.tolist()
indian_society = df[df['Subject'] == 'Indian Society']['Question'].values.tolist()
indian_art_and_culture = df[df['Subject'] == 'Indian Art and Culture']['Question'].values.tolist()
questions = {}
questions['0'] = history
questions['1'] = geography
questions['2'] = indian_society
questions['3'] = indian_art_and_culture

In [29]:
questions['3']

['Safeguarding the Indian art heritage is the need of the moment.',
 'How do you justify the view that the level of excellence of the Gupta numismatic art is not at all noticeable in later times?',
 'Early Buddhist Stupa-art, while depicting folk motifs and narratives successfully expounds Buddhist ideals. Elucidate.',
 'The ancient civilization in Indian sub continent differed from those of Egypt , Mesopotamia and Greece in that its culture and traditions have been preserved without breakdown to the present day. Comment',
 'Mesolithic rock cut architecture of India not only reflects the cultural life of the times but also a fine aesthetic sense comparable to modern painting. Critically evaluate this comment.',
 'To what extent has the urban planning and culture of the Indus Valley Civilization provided inputs to the present day urbanization?\nDiscuss.',
 'Gandhara sculpture owed as much to the Romans as to the Greeks. Explain.',
 'Though not very useful from the point of view of a con

In [30]:
models = {}
topics = {}
probs = {}
for i in range(0,3):
    models['model'+str(i)] = BERTopic(embedding_model=sentence_model,vectorizer_model=vectorizer_model,representation_model=representation,ctfidf_model=ctfidf_model,verbose=True,calculate_probabilities=True, nr_topics='auto', min_topic_size=5)
    topics['topics'+str(i)], probs['probs'+str(i)] = models['model'+str(i)].fit_transform(questions[str(i)])



2024-07-17 17:31:47,749 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 3/3 [00:04<00:00,  1.67s/it]
2024-07-17 17:31:52,763 - BERTopic - Embedding - Completed ✓
2024-07-17 17:31:52,763 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-17 17:31:59,415 - BERTopic - Dimensionality - Completed ✓
2024-07-17 17:31:59,415 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-07-17 17:31:59,428 - BERTopic - Cluster - Completed ✓
2024-07-17 17:31:59,429 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-07-17 17:32:07,163 - BERTopic - Representation - Completed ✓
2024-07-17 17:32:07,163 - BERTopic - Topic reduction - Reducing number of topics
2024-07-17 17:32:14,713 - BERTopic - Topic reduction - Reduced number of topics from 4 to 4
2024-07-17 17:32:14,729 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 3/3 [00:04<00:00,

In [31]:
models['model0'].get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,12,-1_influence indian society_lenin influenced p...,"[influence indian society, lenin influenced po...",[The New Economic Policy – 1921 of Lenin had i...
1,0,26,0_policies british india_policies colonial ind...,"[policies british india, policies colonial ind...",[Why did the industrial revolution first occur...
2,1,17,1_monuments art india_indian society_medieval ...,"[monuments art india, indian society, medieval...",[The rock-cut architecture represents one of t...
3,2,17,2_mahatma gandhi discuss_mahatma gandhi strugg...,"[mahatma gandhi discuss, mahatma gandhi strugg...","[Defying the barriers of age, gender and relig..."


In [32]:
models['model1'].get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,23,-1_monsoon climate_monsoon asia_monsoon called...,"[monsoon climate, monsoon asia, monsoon called...",[What characteristics can be assigned to monso...
1,0,27,0_economic development india_potentials deccan...,"[economic development india, potentials deccan...",[Whereas the British planters had developed te...
2,1,17,1_problems droughts floods_related problems dr...,"[problems droughts floods, related problems dr...",[Identify and discuss the factors responsible ...
3,2,17,2_macro climatic changes_inversion meteorology...,"[macro climatic changes, inversion meteorology...",[Briefly mention the alignment of major mounta...
4,3,10,3_ocean currents role_influence ocean currents...,"[ocean currents role, influence ocean currents...",[Critically evaluate the various resources of ...


In [33]:
models['model2'].get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,17,-1_india communalism arises_india communalism_...,"[india communalism arises, india communalism, ...",[How do you explain the statistics that show t...
1,0,24,0_indian society_pluralism india_diversity ind...,"[indian society, pluralism india, diversity in...","[In the context of the diversity of India, can..."
2,1,12,1_inequalities poverty india_movement india ad...,"[inequalities poverty india, movement india ad...",[“Male membership needs to be encouraged in or...
3,2,6,2_urbanization india_urban life india_process ...,"[urbanization india, urban life india, process...",[How is the growth of Tier 2 cities related to...
