In [14]:
import os 
import requests

In [15]:
pdf_path = "world_geo.pdf"
if not os.path.exists(pdf_path):
    print('[INFO] The file does not exist, Downloading...')
    url  = "https://www.iipa.org.in/upload/world_geo.pdf"
    filename = pdf_path 
    response = requests.get(url)
    if response.status_code == 200:
        with open(filename, "wb") as file:
            file.write(response.content)
            print(f"[INFO] The file has been downloaded as  {filename}")
    else:
        print(f"[INFO] Failed to download the file. Status Code: {response.status_code}")
else:
    print("[INFO] File {pdf_path} already exists.")

[INFO] File {pdf_path} already exists.


PREPROCESS 

In [16]:
import fitz 
from tqdm.auto import tqdm 

def text_formatter(text: str) -> str:
    cleaned_text = text.replace("\n", "").strip()
    return cleaned_text 

def o_r_pdf(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)
    pages_and_text = []
    for page_number, page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formatter(text=text)
        pages_and_text.append({"page_number": page_number,
        "page_char_count" : len(text),
        "page_word_count" : len(text.split(" ")),
        "page_sentence_count" : len(text.split(". ")),
        "page_token_count" : len(text) / 4,
        "text": text}) # One token = 4 characters

    return pages_and_text

pages_and_text = o_r_pdf(pdf_path = pdf_path)
pages_and_text[:2]

0it [00:00, ?it/s]

[{'page_number': 0,
  'page_char_count': 80,
  'page_word_count': 20,
  'page_sentence_count': 1,
  'page_token_count': 20.0,
  'text': '1      IIPA2022  22 World Geography  Short Answers PKP-03 by Dr Amit Kumar Singh'},
 {'page_number': 1,
  'page_char_count': 4596,
  'page_word_count': 163,
  'page_sentence_count': 43,
  'page_token_count': 1149.0,
  'text': '2  Contents 1.  Origin of Solar System and Sun ........................................................................................................... 6 2. Stars, Sun & Planets ............................................................................................................................. 8 Sun Statistics ....................................................................................................................................... 8 Solar Eclipse ........................................................................................................................................ 9 Lunar Eclipse .......

In [17]:
import random 

random.sample(pages_and_text, k=3)

[{'page_number': 120,
  'page_char_count': 1322,
  'page_word_count': 182,
  'page_sentence_count': 14,
  'page_token_count': 330.5,
  'text': '121  30. Urban Heat Islands  In large urban settlements, human activities greatly modify the environment, creating unique meteorological and climatological characteristics. The agglomeration of tall buildings, roadways, green spaces, and concrete surfaces produces intricate rain, wind, heat, and air-quality patterns. The hard surfaces can shape water flow and aggravate flood risks. The alignment of buildings can create local wind tunnels. Tiny particles emitted by traffic and industry can reduce air quality. The urban heat-island effect can raise temperatures by 5oC to 10oC, exacerbating heat waves. Urban Heat Islands (UHI) are significantly warmer urban areas than its surrounding rural areas due to human activities. Urban Heat Island is a major problem associated with rapid urbanisation. The temperature increase is attributed to deforestation 

In [18]:
import pandas as pd 

df = pd.DataFrame(pages_and_text)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count,text
0,0,80,20,1,20.0,1 IIPA2022 22 World Geography Short Ans...
1,1,4596,163,43,1149.0,2 Contents 1. Origin of Solar System and Sun...
2,2,4448,208,40,1112.0,3 Types of Volcanoes ...........................
3,3,4560,203,44,1140.0,4 Continental Slope ............................
4,4,3474,178,31,868.5,5 Insolation or Incoming Solar Radiation .......


In [19]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count
count,129.0,129.0,129.0,129.0,129.0
mean,64.0,1655.45,262.89,16.95,413.86
std,37.38,732.36,96.85,7.75,183.09
min,0.0,80.0,20.0,1.0,20.0
25%,32.0,1146.0,189.0,12.0,286.5
50%,64.0,1699.0,264.0,16.0,424.75
75%,96.0,2055.0,341.0,21.0,513.75
max,128.0,4596.0,424.0,44.0,1149.0


In [20]:
from spacy.lang.en import English

nlp = English()
#Adding a sentencizer pipeline
nlp.add_pipe("sentencizer")

#Creating a documnet instance as an example

doc = nlp("I am a shitty person. I am a good man. I am ugly.")
assert len(list(doc.sents)) == 3

list(doc.sents)


[I am a shitty person., I am a good man., I am ugly.]

In [21]:
pages_and_text[0]

{'page_number': 0,
 'page_char_count': 80,
 'page_word_count': 20,
 'page_sentence_count': 1,
 'page_token_count': 20.0,
 'text': '1      IIPA2022  22 World Geography  Short Answers PKP-03 by Dr Amit Kumar Singh'}

In [22]:
for item in tqdm(pages_and_text):
    item["sentences"] = list(nlp(item["text"]).sents)

    item["sentences"] = [str(sentence) for sentence in item["sentences"]]

    item["page_sentence_count_spacy"] = len(item["sentences"])


  0%|          | 0/129 [00:00<?, ?it/s]

In [23]:
random.sample(pages_and_text, k=1)

[{'page_number': 22,
  'page_char_count': 1938,
  'page_word_count': 334,
  'page_sentence_count': 17,
  'page_token_count': 484.5,
  'text': '23  Second stage – “the period of dominant volcanism” or ‘the period of the evolution of the earth’s interior and the evolution of continents and ocean basins’.  Third stage – ‘the actual geological period’ or ‘the period of the formation of the folds and faults, mountains and plateaux etc.  These stages of the evolution of the earth are separated from each other only for the sake of convenience; otherwise these are so interlinked with each other that it is quite difficult to differentiate one stage from the other. The planet earth initially was a barren, rocky and hot object with a thin atmosphere of hydrogen and helium. This is far from the present day picture of the earth. Hence, there must have been some events–processes, which may have caused this change from rocky, barren and hot earth to a beautiful planet with ample amount of water and c

In [24]:
df = pd.DataFrame(pages_and_text)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count,page_sentence_count_spacy
count,129.0,129.0,129.0,129.0,129.0,129.0
mean,64.0,1655.45,262.89,16.95,413.86,15.51
std,37.38,732.36,96.85,7.75,183.09,6.25
min,0.0,80.0,20.0,1.0,20.0,1.0
25%,32.0,1146.0,189.0,12.0,286.5,10.0
50%,64.0,1699.0,264.0,16.0,424.75,16.0
75%,96.0,2055.0,341.0,21.0,513.75,20.0
max,128.0,4596.0,424.0,44.0,1149.0,31.0


CHUNKING : Splitting sentecnces in groups of 10 or less

In [25]:
num_sentence_chunk_size = 10 
def split_list(input_list: list, slice_size: int = num_sentence_chunk_size) -> list[list[str]]:
    return [input_list[i:i+slice_size] for i in range(0, len(input_list), slice_size)]

text_list = list(range(70))
split_list(text_list)


[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
 [30, 31, 32, 33, 34, 35, 36, 37, 38, 39],
 [40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
 [50, 51, 52, 53, 54, 55, 56, 57, 58, 59],
 [60, 61, 62, 63, 64, 65, 66, 67, 68, 69]]

In [26]:
for item in tqdm(pages_and_text):
    item["sentence_chunks"] = split_list(input_list=item["sentences"], slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/129 [00:00<?, ?it/s]

In [27]:
random.sample(pages_and_text, k = 1)

[{'page_number': 60,
  'page_char_count': 2048,
  'page_word_count': 357,
  'page_sentence_count': 18,
  'page_token_count': 512.0,
  'text': '61  \uf0b7 The ‘S’ shape of the ocean indicates the fact that landmasses (continents) on its either side were once a contiguous part. \uf0b7 The Atlantic Ocean was formed due to drifting of North and South Americas to the west due to plate tectonics. \uf0b7 The ocean widens to the south of equator and attains the maximum width of 5,920 km at 35°S latitude. \uf0b7 It narrows down towards the equator. It is only 2560 km wide between Liberian coast and Cape Sao Roque. \uf0b7 The width further increases northward and it becomes 4800 km at 40°N latitude. \uf0b7 It narrows down in the extreme north where it maintains its contact with the Arctic Ocean through Norwegian Sea, Denmark Strait and Davis Bay. \uf0b7 The average depth of the ocean is less than the Pacific Ocean because of extensive continental shelves and marginal and enclosed seas. \uf0b7 Ab

In [28]:
df = pd.DataFrame(pages_and_text)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count,page_sentence_count_spacy,num_chunks
count,129.0,129.0,129.0,129.0,129.0,129.0,129.0
mean,64.0,1655.45,262.89,16.95,413.86,15.51,1.97
std,37.38,732.36,96.85,7.75,183.09,6.25,0.71
min,0.0,80.0,20.0,1.0,20.0,1.0,1.0
25%,32.0,1146.0,189.0,12.0,286.5,10.0,1.0
50%,64.0,1699.0,264.0,16.0,424.75,16.0,2.0
75%,96.0,2055.0,341.0,21.0,513.75,20.0,2.0
max,128.0,4596.0,424.0,44.0,1149.0,31.0,4.0


In [29]:
import re 

pages_and_chunks = []
for item in tqdm(pages_and_text):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]

        # join the sentences together into paragraph like structure aka join the list of sentences into one paragraph 

        joined_sentence_chunk = "".join(sentence_chunk).replace("  "," ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)

        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4

        pages_and_chunks.append(chunk_dict)

len(pages_and_chunks)

  0%|          | 0/129 [00:00<?, ?it/s]

254

In [30]:
random.sample(pages_and_chunks, k=1)

[{'page_number': 72,
  'sentence_chunk': 'The Red Sea occupies a rift-valley between the continent of Africa and the',
  'chunk_char_count': 74,
  'chunk_word_count': 13,
  'chunk_token_count': 18.5}]

In [31]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,254.0,254.0,254.0,254.0
mean,62.39,838.0,131.24,209.5
std,37.55,527.47,60.81,131.87
min,0.0,5.0,1.0,1.25
25%,29.25,600.5,95.25,150.12
50%,61.0,832.5,137.5,208.12
75%,94.0,1050.25,174.0,262.56
max,128.0,4446.0,321.0,1111.5


In [32]:
min_token_length = 15
for index, row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    # Example processing code
    print(f"Processing row {index}: {row.to_dict()}")

Processing row 145: {'page_number': 70, 'sentence_chunk': 'wide.', 'chunk_char_count': 5, 'chunk_word_count': 1, 'chunk_token_count': 1.25}
Processing row 173: {'page_number': 85, 'sentence_chunk': 'It is the layer of the ocean', 'chunk_char_count': 28, 'chunk_word_count': 7, 'chunk_token_count': 7.0}
Processing row 17: {'page_number': 8, 'sentence_chunk': 'The earth is the only planet where some special', 'chunk_char_count': 47, 'chunk_word_count': 9, 'chunk_token_count': 11.75}
Processing row 188: {'page_number': 93, 'sentence_chunk': 'It is largely responsible', 'chunk_char_count': 25, 'chunk_word_count': 4, 'chunk_token_count': 6.25}
Processing row 117: {'page_number': 56, 'sentence_chunk': 'The deepest section is less than 55m (180 feet) underwater.', 'chunk_char_count': 59, 'chunk_word_count': 10, 'chunk_token_count': 14.75}


In [33]:
pages_and_chunks_over_min_token_length = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_length[:2]

[{'page_number': 0,
  'sentence_chunk': '1   IIPA2022 22 World Geography Short Answers PKP-03 by Dr Amit Kumar Singh',
  'chunk_char_count': 75,
  'chunk_word_count': 15,
  'chunk_token_count': 18.75},
 {'page_number': 1,
  'sentence_chunk': '2 Contents 1. Origin of Solar System and Sun ........................................................................................................... 6 2. Stars, Sun & Planets ............................................................................................................................. 8 Sun Statistics ....................................................................................................................................... 8 Solar Eclipse ........................................................................................................................................ 9 Lunar Eclipse ..................................................................................................................................

In [34]:
random.sample(pages_and_chunks_over_min_token_length, k=1)

[{'page_number': 43,
  'sentence_chunk': '44 Meanders and oxbow lakes Meanders are loop-like channel patterns develop over the flood and delta plains. They are actually not a landform but only a type of channel pattern formed as a result of deposition. They are formed basically because of three reasons: (i) propensity of water flowing over very gentle gradient to work laterally on the banks; (ii) unconsolidated nature of alluvial deposits making up the bank with many irregularities; (iii) Coriolis force acting on fluid water deflecting it like deflecting the wind. The concave bank of a meander is known as cut-off bank and the convex bank is known as a slip-off As meanders grow into deep loops, the same may get cut-off due to erosion at the inflection point and are left as oxbow lakes. For large rivers, the sediments deposited in a linear fashion at the depositional side of a meander are called as Point Bars or Meander Bars. Braided Channels When selective deposition of coarser material

Embedding : 

A useful numerical representation and it is a learned representation. 


{
    the: 0,
    a : 1,
    ...
}

In [35]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2", device="cuda")

#create a list of sentences
sentences = ["The sentence transformer provides an easier way to create embeddings",
"Sentences can be embedded one by one in  a list.","I like chimps!"]

embeddings = embedding_model.encode(sentences)
embedding_dict = dict(zip(sentences, embeddings))

for sentence, embedding in embedding_dict.items():
    print(f"Sentence: {sentence}")
    print(f"Embedding: {embedding}")




Sentence: The sentence transformer provides an easier way to create embeddings
Embedding: [-6.87679201e-02 -3.37028950e-02  3.68130989e-02  1.77186299e-02
  4.59097028e-02  7.25906417e-02 -2.06882395e-02  5.50199710e-02
  9.44427997e-02 -4.44253236e-02  5.07963374e-02  1.03461444e-02
  1.73806529e-02  6.51760772e-02  1.94573663e-02  8.53721723e-02
  1.59757342e-02  1.02003135e-01 -7.83813596e-02 -1.06738113e-01
  6.44138968e-03  3.28693874e-02  2.74742004e-02 -6.96152002e-02
  5.03769144e-02  6.34706244e-02 -3.52961123e-02  1.27846748e-02
  7.83551186e-02 -6.58562314e-03 -1.30270229e-04 -5.40848039e-02
 -4.96062152e-02  9.75598395e-02 -3.67665216e-02  2.29739249e-02
  4.46881577e-02  7.04804584e-02 -4.62974198e-02 -1.74799338e-02
  2.25367267e-02  6.00746088e-03  6.89678416e-02  5.03606573e-02
  4.12279367e-02 -3.38686816e-02 -4.16970365e-02 -9.08160862e-03
 -2.03304663e-02  1.73994489e-02 -4.90085185e-02 -2.83182859e-02
 -9.14116317e-05  6.86822906e-02 -4.36346792e-02  3.69178839e-02


In [36]:
embeddings[0].shape

(384,)

In [37]:
embedding = embedding_model.encode("My favourite animal is the donkey")
embedding

array([-5.49600944e-02,  7.83105660e-03,  4.53742221e-02,  1.39820976e-02,
        6.11190312e-03,  3.69757041e-02,  5.68162575e-02,  8.16690922e-03,
        6.46038875e-02,  3.74894068e-02, -3.10413763e-02, -6.54169694e-02,
        1.80261582e-02,  6.76760450e-02,  4.60805856e-02, -7.25531066e-03,
        2.34833471e-02, -2.68905889e-02,  1.29957236e-02, -7.20567480e-02,
       -9.44685787e-02,  5.52156903e-02,  1.91131514e-02, -3.16299610e-02,
       -1.05778761e-01, -7.45278820e-02,  8.34870897e-03,  2.31317915e-02,
       -3.54249147e-04, -7.11231530e-02, -5.96878007e-02, -5.54633066e-02,
        5.56221940e-02, -1.30626876e-02, -3.61372605e-02,  3.70618515e-02,
        2.22166744e-03, -6.46472052e-02,  8.49302113e-02,  5.51789813e-02,
        2.26267688e-02,  1.08027216e-02,  5.81755601e-02, -3.67338434e-02,
        4.32512118e-03,  3.97136435e-02, -4.81586643e-02, -4.30390388e-02,
        8.91654491e-02,  2.15335749e-02,  1.61640681e-02, -1.13695599e-02,
       -2.48992108e-02, -

In [39]:
%%time 

embedding_model.to("cuda")

for item in tqdm(pages_and_chunks_over_min_token_length):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])



  0%|          | 0/249 [00:00<?, ?it/s]

CPU times: total: 969 ms
Wall time: 1.68 s


In [42]:
%%time 

text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_length]
text_chunks[100]

CPU times: total: 0 ns
Wall time: 0 ns


'Deflation Hollows Deflation is the removal of loose particles from the ground by the action of wind. When deflation causes a shallow depression by persistent movements of wind, they are called as deflation hollows. Mushroom Tables Ventifacts are rocks that have been abraded, pitted, etched, grooved, or polished by wind-driven sand or ice crystals. These geomorphic features are most typically found in arid environments where there is little vegetation to interfere with aeolian particle transport, where there are frequently strong winds, and where there is a steady but not overwhelming supply of sand. Mushroom Tables / Mushroom rocks are Ventifacts in the shape of a mushroom. In deserts, a greater amount of sand and rock particles are transported close to the ground by the winds which cause more bottom erosion in overlying rocks than the top. This'

In [45]:
len(text_chunks)

249

In [44]:
%%time 

text_chunk_embeddings = embedding_model.encode(text_chunks, batch_size=32, convert_to_tensor=True)
text_chunk_embeddings

CPU times: total: 375 ms
Wall time: 498 ms


tensor([[ 0.0561,  0.0737,  0.0419,  ..., -0.0218, -0.1246,  0.0038],
        [-0.0754,  0.0494, -0.0252,  ...,  0.0150, -0.0143,  0.0286],
        [ 0.0264, -0.0255,  0.0595,  ..., -0.0240, -0.0227, -0.0092],
        ...,
        [-0.0200,  0.0131,  0.1068,  ..., -0.0089,  0.0557, -0.0017],
        [-0.0557,  0.0520,  0.1518,  ..., -0.0607, -0.0696, -0.0524],
        [-0.0396,  0.0280,  0.1182,  ..., -0.0675, -0.1298, -0.0064]],
       device='cuda:0')

In [48]:
## Saving embeddings to file
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_length)
embeddings_df_save_path = " text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [50]:
## viewing the saved file

text_chunks_and_embeddings_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embeddings_df_load.head()


Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,0,1 IIPA2022 22 World Geography Short Answers ...,75,15,18.75,[ 5.60791939e-02 7.36621842e-02 4.18552086e-...
1,1,2 Contents 1. Origin of Solar System and Sun ....,3711,128,927.75,[-7.53976032e-02 4.93803062e-02 -2.51827892e-...
2,1,"Temperature, Pressure and Density of the Earth...",879,30,219.75,[ 2.63881050e-02 -2.54815072e-02 5.94870374e-...
3,2,3 Types of Volcanoes ............................,4446,206,1111.5,[ 7.03721046e-02 -3.47200967e-02 5.47501445e-...
4,3,4 Continental Slope .............................,3865,167,966.25,[ 1.10680657e-02 4.12390829e-04 -1.85156371e-...
