In [147]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [148]:
# Corpus for the first retrieval example: six passages about the founding of
# RGUKT. Each list element is treated as one document by the later cells.
# NOTE(review): the fifth passage starts with "he first batch", which looks
# like a truncated "The first batch"; it is kept verbatim because the text is
# the data being indexed.
documents = [
    "The idea to create RGUKT germinated in April 2007, at a meeting between Prof. D.Raj Reddy and Prof. K.C. Reddy who then took idea to the then Chief Minister of Andhra Pradesh, Dr. Y.S.Rajashekar Reddy.",
    "Dr. YSR by then was not happy with the opportunities available to the rural youth and wanted to start a University modeled on IIIT to exclusively concentrate on rural youth in imparting technology oriented higher education for bettering their employment potential.",
    "A Task Force was created under the Chairmanship of Prof. K.C. Reddy, the then chairman APSCHE which recommended for establishing a technological University to benefit rural youth.",
    "In March of 2008, the AP Government created RGUKT with three campuses through an Act of the Legislature as a full-fledged university which would initially admit approximately the top 1% of the rural students into the three residential campuses located at RK Valley, Basara and Nuzvid.",
    "he first batch of about 6,500 tenth class graduates was admitted into the six year integrated B.Tech program in August of 2008. The initial selection was based on the 'Mandal Best' model wherein students securing the top marks in SSC examination in every mandal were selected.",
    "Prof. Raj Reddy as the founding Chancellor of the University for a decade guided the educational philosophy of the University and also moulded the academic journey to maintain not only its uniqueness but create significant impact on the quality of outcome and the student employment profile that later transformed their lives.",
]

In [149]:
# Search query for the first example. The literal begins with a space, which is
# kept as written.
query=" AP Government created RGUKT with three campuses through an Act"

In [150]:
import re
def preprocess_text(text):
    """Lowercase ``text`` and delete every period, comma and hyphen.

    Only the three characters ``.``, ``,`` and ``-`` are removed; any other
    symbol (for example ``%`` or an apostrophe) is left in place. The
    characters are deleted rather than replaced by a space, so the fragments
    on either side are joined: ``"keyword-based"`` becomes ``"keywordbased"``
    and ``"6,500"`` becomes ``"6500"``.

    Args:
        text: The string to normalize.

    Returns:
        The lowercased string with periods, commas and hyphens stripped out.
    """
    lowered = text.lower()
    return re.sub(r'[.,-]', '', lowered)

In [151]:
# Normalize every document with preprocess_text, keeping the original order so
# that positions still line up with `documents`.
preprocess_documents = list(map(preprocess_text, documents))

In [152]:
# Display the cleaned documents as the cell output to check the preprocessing.
preprocess_documents

['the idea to create rgukt germinated in april 2007 at a meeting between prof draj reddy and prof kc reddy who then took idea to the then chief minister of andhra pradesh dr ysrajashekar reddy',
 'dr ysr by then was not happy with the opportunities available to the rural youth and wanted to start a university modeled on iiit to exclusively concentrate on rural youth in imparting technology oriented higher education for bettering their employment potential',
 'a task force was created under the chairmanship of prof kc reddy the then chairman apsche which recommended for establishing a technological university to benefit rural youth',
 'in march of 2008 the ap government created rgukt with three campuses through an act of the legislature as a fullfledged university which would initially admit approximately the top 1% of the rural students into the three residential campuses located at rk valley basara and nuzvid',
 "he first batch of about 6500 tenth class graduates was admitted into the

In [153]:
# Print a header followed by each cleaned document on its own line.
print("Preprocessed Documents:", *preprocess_documents, sep="\n")


Preprocessed Documents:
the idea to create rgukt germinated in april 2007 at a meeting between prof draj reddy and prof kc reddy who then took idea to the then chief minister of andhra pradesh dr ysrajashekar reddy
dr ysr by then was not happy with the opportunities available to the rural youth and wanted to start a university modeled on iiit to exclusively concentrate on rural youth in imparting technology oriented higher education for bettering their employment potential
a task force was created under the chairmanship of prof kc reddy the then chairman apsche which recommended for establishing a technological university to benefit rural youth
in march of 2008 the ap government created rgukt with three campuses through an act of the legislature as a fullfledged university which would initially admit approximately the top 1% of the rural students into the three residential campuses located at rk valley basara and nuzvid
he first batch of about 6500 tenth class graduates was admitted in

In [154]:
# Show the raw query string before it is preprocessed.
query

' AP Government created RGUKT with three campuses through an Act'

In [155]:
# Clean the query with the same routine that was applied to the documents so
# both sides are normalized identically, then print the result under its
# header. Previously only the header was printed and the value was never shown.
preprocessed_query = preprocess_text(query)
print("Preprocessed Query:")
print(preprocessed_query)


Preprocessed Query:


In [156]:
# Create a TF-IDF vectorizer; no arguments are passed, so library defaults apply.
vector=TfidfVectorizer()

In [157]:
# Fit the vectorizer on the cleaned documents and build their TF-IDF matrix.
X=vector.fit_transform(preprocess_documents)

In [158]:
# Convert the TF-IDF matrix to a dense array so the individual weights are shown.
X.toarray()

array([[0.17768813, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.10541511, 0.17768813, 0.        , 0.        , 0.17768813,
        0.        , 0.        , 0.14570681, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.17768813, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.17768813,
        0.        , 0.        , 0.14570681, 0.        , 0.        ,
        0.14570681, 0.17768813, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.17768813,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.35537625, 0.        , 0.        , 0.        ,
        0.10541511, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.14570681, 0.  

In [159]:
# Project the cleaned query into the TF-IDF space learned from the documents
# (transform only, so the fitted vocabulary is reused), then print the dense
# vector followed by the raw, un-preprocessed query.
query_embedding = vector.transform([preprocessed_query])
print(query_embedding.toarray(), query, sep="\n")

[[0.         0.         0.         0.         0.         0.33301397
  0.         0.         0.         0.33301397 0.         0.
  0.33301397 0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.33301397 0.         0.         0.         0.         0.
  0.         0.         0.27307622 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.33301397 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.      

In [160]:
# Cosine similarity between each document row of X and the query vector.
similarities = cosine_similarity(X, query_embedding)

In [161]:
# Inspect the raw similarity scores, listed in the same order as `documents`.
similarities

array([[0.03978906],
       [0.03821485],
       [0.053806  ],
       [0.56919629],
       [0.        ],
       [0.        ]])

In [162]:
# argsort orders the scores ascending along the document axis; flipping that
# axis gives best-match-first, and flatten() yields a 1-D array of document
# positions.
ranked_indices = np.flip(np.argsort(similarities, axis=0), axis=0).flatten()

In [163]:
# Reorder the original (un-preprocessed) documents by descending similarity
# and display the resulting list.
ranked_documents = [documents[i] for i in ranked_indices]
ranked_documents

['In March of 2008, the AP Government created RGUKT with three campuses through an Act of the Legislature as a full-fledged university which would initially admit approximately the top 1% of the rural students into the three residential campuses located at RK Valley, Basara and Nuzvid.',
 'A Task Force was created under the Chairmanship of Prof. K.C. Reddy, the then chairman APSCHE which recommended for establishing a technological University to benefit rural youth.',
 'The idea to create RGUKT germinated in April 2007, at a meeting between Prof. D.Raj Reddy and Prof. K.C. Reddy who then took idea to the then Chief Minister of Andhra Pradesh, Dr. Y.S.Rajashekar Reddy.',
 'Dr. YSR by then was not happy with the opportunities available to the rural youth and wanted to start a University modeled on IIIT to exclusively concentrate on rural youth in imparting technology oriented higher education for bettering their employment potential.',
 "he first batch of about 6,500 tenth class graduate

In [164]:
# Print the documents best match first, numbering the ranks from 1.
for rank, doc in enumerate(ranked_documents, start=1):
    print(f"Rank {rank}: {doc}")


Rank 1: In March of 2008, the AP Government created RGUKT with three campuses through an Act of the Legislature as a full-fledged university which would initially admit approximately the top 1% of the rural students into the three residential campuses located at RK Valley, Basara and Nuzvid.
Rank 2: A Task Force was created under the Chairmanship of Prof. K.C. Reddy, the then chairman APSCHE which recommended for establishing a technological University to benefit rural youth.
Rank 3: The idea to create RGUKT germinated in April 2007, at a meeting between Prof. D.Raj Reddy and Prof. K.C. Reddy who then took idea to the then Chief Minister of Andhra Pradesh, Dr. Y.S.Rajashekar Reddy.
Rank 4: Dr. YSR by then was not happy with the opportunities available to the rural youth and wanted to start a University modeled on IIIT to exclusively concentrate on rural youth in imparting technology oriented higher education for bettering their employment potential.
Rank 5: he first batch of about 6,50

In [165]:
# Re-display the query so it can be compared with the ranking.
query

' AP Government created RGUKT with three campuses through an Act'

**Practicing with some other examples:**

In [166]:
# Corpus for the second retrieval example. This rebinds `documents`, replacing
# the corpus used in the first example.
# NOTE(review): "containig" in the first entry looks like a typo for
# "containing"; it is kept verbatim because the text is the data being indexed.
documents = [
    "This is a list which containig sample documents.",
    "Keywords are important for keyword-based search.",
    "Document analysis involves extracting keywords.",
    "Keyword-based search relies on sparse embeddings.",
]


In [167]:
import re
def preprocess_text(t):
    """Return ``t`` lowercased with every period, comma and hyphen deleted.

    The three characters are removed without inserting a space, so
    ``"keyword-based"`` becomes ``"keywordbased"``. All other characters are
    left untouched.

    Args:
        t: The string to normalize.

    Returns:
        The normalized string.
    """
    return re.sub(r'[.,-]', "", t.lower())

In [168]:
# Normalize each document of the second corpus, preserving order, and display
# the cleaned strings.
preprocessed_text = list(map(preprocess_text, documents))
preprocessed_text

['this is a list which containig sample documents',
 'keywords are important for keywordbased search',
 'document analysis involves extracting keywords',
 'keywordbased search relies on sparse embeddings']

In [169]:
# Search query for the second example; this rebinds `query`.
query="keyword search"

In [170]:
# Apply the same cleaning to the query as to the documents, then display it.
preprocessed_query=preprocess_text(query)
preprocessed_query

'keyword search'

In [171]:
# Fresh TF-IDF vectorizer for the second corpus; no arguments are passed, so
# library defaults apply.
vector=TfidfVectorizer()

In [172]:
# Fit on the cleaned second corpus and build its TF-IDF matrix, then display
# the matrix object's summary.
document_embedding=vector.fit_transform(preprocessed_text)
document_embedding

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 24 stored elements and shape (4, 21)>

In [173]:
# Dense view of the TF-IDF matrix so the individual weights are shown.
document_embedding.toarray()

array([[0.        , 0.        , 0.37796447, 0.        , 0.37796447,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.37796447, 0.        , 0.        , 0.37796447, 0.        ,
        0.        , 0.37796447, 0.        , 0.        , 0.37796447,
        0.37796447],
       [0.        , 0.4533864 , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.4533864 , 0.4533864 , 0.        ,
        0.        , 0.35745504, 0.35745504, 0.        , 0.        ,
        0.        , 0.        , 0.35745504, 0.        , 0.        ,
        0.        ],
       [0.46516193, 0.        , 0.        , 0.46516193, 0.        ,
        0.        , 0.46516193, 0.        , 0.        , 0.46516193,
        0.        , 0.        , 0.36673901, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.43671931, 0.        , 0.        , 0.       

In [174]:
# Vectorize the cleaned query with the already-fitted vocabulary and show its
# single row as a dense 1-D array.
# NOTE(review): no cleaned document contains the token "keyword" (only
# "keywords" and the fused "keywordbased"), so "search" appears to be the only
# query term that can receive a weight here.
query_embedding=vector.transform([preprocessed_query])
query_embedding.toarray()[0]


array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0.])

In [175]:
# Cosine similarity of each document row against the query vector, then
# display the scores (same order as `documents`).
similar=cosine_similarity(document_embedding,query_embedding)
similar

array([[0.        ],
       [0.35745504],
       [0.        ],
       [0.34431452]])

In [176]:
# Sort the scores ascending along the document axis, flip that axis to get
# best-match-first, and flatten to a 1-D array of document positions.
index = np.flip(np.argsort(similar, axis=0), axis=0).flatten()

In [177]:
# Reorder the original (un-preprocessed) documents by descending similarity.
document_rank=[ documents[i] for i in index ]

In [178]:
# Display the ranked documents, best match first.
document_rank

['Keywords are important for keyword-based search.',
 'Keyword-based search relies on sparse embeddings.',
 'Document analysis involves extracting keywords.',
 'This is a list which containig sample documents.']

In [179]:
# Print each ranked document with a rank label numbered from 1.
for rank, doc in enumerate(document_rank, start=1):
    print('rank{} : {}'.format(rank, doc))

rank1 : Keywords are important for keyword-based search.
rank2 : Keyword-based search relies on sparse embeddings.
rank3 : Document analysis involves extracting keywords.
rank4 : This is a list which containig sample documents.
