**IMPORTING REQUIRED MODULES AND PACKAGES**

In [0]:
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt') # one-time download per environment; presumably the tokenizer data that sent_tokenize (used later in this notebook) relies on

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

**INPUT TEXT**

In [0]:
# Paragraph to summarise; every later cell works on this text or on data derived from it.
s='Malaria is a "mosquito-borne" infectious disease that affects humans and other animals. Malaria causes symptoms that typically include fever, tiredness, vomiting, and headaches. In severe cases it can cause yellow skin, seizures, coma & death. Symptoms usually begin 10 - 15 days after being bitten by an infected mosquito.'

In [0]:
# Echo the raw input so it can be compared with the processed versions further down.
print("Input text:",s)

Input text: Malaria is a "mosquito-borne" infectious disease that affects humans and other animals. Malaria causes symptoms that typically include fever, tiredness, vomiting, and headaches. In severe cases it can cause yellow skin, seizures, coma & death. Symptoms usually begin 10 - 15 days after being bitten by an infected mosquito.


**TOKENIZATION OF SENTENCES**

In [0]:
from nltk.tokenize import sent_tokenize
# Split the paragraph into a flat list holding one string per sentence.
sentences = [sentence for sentence in sent_tokenize(s)]

In [0]:
# Show the list of sentence strings produced by the tokenizer.
print("After tokenization:",sentences)

After tokenization: ['Malaria is a "mosquito-borne" infectious disease that affects humans and other animals.', 'Malaria causes symptoms that typically include fever, tiredness, vomiting, and headaches.', 'In severe cases it can cause yellow skin, seizures, coma & death.', 'Symptoms usually begin 10 - 15 days after being bitten by an infected mosquito.']


**REMOVE PUNCTUATION, NUMBERS AND SPECIAL CHARACTERS**

In [0]:
# Replace every character that is not an ASCII letter (punctuation, digits,
# symbols) with a single space.
# regex=True must be explicit: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, which would leave the text untouched.
clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ", regex=True)
print("clean sentences:")
print(clean_sentences)

clean sentences:
0    Malaria is a  mosquito borne  infectious disea...
1    Malaria causes symptoms that typically include...
2    In severe cases it can cause yellow skin  seiz...
3    Symptoms usually begin         days after bein...
dtype: object


In [0]:
# make alphabets lowercase
# Lower-case each cleaned sentence; the result is a plain Python list of strings.
clean_sentences = [sentence.lower() for sentence in clean_sentences]

In [0]:
# Show the lower-cased sentences.
print(clean_sentences)

['malaria is a  mosquito borne  infectious disease that affects humans and other animals ', 'malaria causes symptoms that typically include fever  tiredness  vomiting  and headaches ', 'in severe cases it can cause yellow skin  seizures  coma   death ', 'symptoms usually begin         days after being bitten by an infected mosquito ']


**REMOVAL OF STOPWORDS**

In [0]:
# Fetch the NLTK stop-word corpus and load its English word list.
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# Print the sentences before filtering so they can be compared with the filtered result.
print("before removing stop words")
print(clean_sentences)
def remove_stopwords(sen, stopword_collection=None):
    """Drop stop words from a tokenized sentence and re-join the remaining tokens.

    Parameters
    ----------
    sen : iterable of str
        The tokens of one sentence, in order.
    stopword_collection : iterable of str, optional
        Words to remove. When omitted, the notebook-level ``stop_words`` list
        is used, so existing single-argument calls behave as before.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ("" if none survive).
    """
    source = stop_words if stopword_collection is None else stopword_collection
    # A set gives constant-time membership tests instead of scanning a list per token.
    excluded = frozenset(source)
    return " ".join(token for token in sen if token not in excluded)
# Split each sentence on whitespace, filter out the stop words and keep the re-joined text.
clean_sentences = [
    remove_stopwords(sentence.split())
    for sentence in clean_sentences
]
print("removing stop words:")
print(clean_sentences)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
before removing stop words
['malaria is a  mosquito borne  infectious disease that affects humans and other animals ', 'malaria causes symptoms that typically include fever  tiredness  vomiting  and headaches ', 'in severe cases it can cause yellow skin  seizures  coma   death ', 'symptoms usually begin         days after being bitten by an infected mosquito ']
removing stop words:
['malaria mosquito borne infectious disease affects humans animals', 'malaria causes symptoms typically include fever tiredness vomiting headaches', 'severe cases cause yellow skin seizures coma death', 'symptoms usually begin days bitten infected mosquito']


**LEMMATIZATION OF SENTENCES**

In [0]:
# Download the tagger model and WordNet data; presumably required by pos_tag
# and WordNetLemmatizer, which this cell uses further down.
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
# Show the stop-word-free sentences that go into lemmatization.
print("Before lemmatization:")
print(clean_sentences)

from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


# First letter of a POS tag -> part-of-speech code handed to the lemmatizer
# (noun, verb, adjective, adverb).
part = dict(N='n', V='v', J='a', R='r')

# Single lemmatizer instance shared by every tag_and_lem call.
wnl = WordNetLemmatizer()

def convert_tag(penn_tag):
    """Translate a tag initial into a lemmatizer POS code.

    Looks ``penn_tag`` up in the ``part`` mapping and falls back to ``'n'``
    (noun) for any key that is not present.
    """
    return part.get(penn_tag, 'n')


def tag_and_lem(element):
    """Lemmatize every token of the sentence string ``element``.

    The sentence is tokenized and POS-tagged; each token is then lemmatized
    using the POS code derived from the first character of its tag, and the
    lemmas are joined back together with single spaces.
    """
    tagged_tokens = pos_tag(word_tokenize(element))
    lemmas = []
    for token, tag in tagged_tokens:
        lemmas.append(wnl.lemmatize(token, convert_tag(tag[0])))
    return ' '.join(lemmas)
# Lemmatize each stop-word-free sentence, preserving sentence order.
result_sentence = [tag_and_lem(sentence) for sentence in clean_sentences]
lemmatized_sentences = result_sentence
print("After lemmatization:")
print(lemmatized_sentences)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
Before lemmatization:
['malaria mosquito borne infectious disease affects humans animals', 'malaria causes symptoms typically include fever tiredness vomiting headaches', 'severe cases cause yellow skin seizures coma death', 'symptoms usually begin days bitten infected mosquito']
After lemmatization:
['malaria mosquito borne infectious disease affect humans animal', 'malaria cause symptom typically include fever tiredness vomit headache', 'severe case cause yellow skin seizure coma death', 'symptom usually begin day bitten infect mosquito']


In [0]:
# Number of lemmatized sentences (one entry per element of clean_sentences).
len(lemmatized_sentences)

4

**SIMILARITY OF SENTENCES USING COSINE SIMILARITY**

In [0]:
import math

In [0]:
# Square matrix of zeros with one row and one column per sentence;
# the similarity values are written into it in a later cell.
sim_mat = np.zeros((len(sentences), len(sentences)))

In [0]:
# Display the freshly allocated matrix.
sim_mat

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [0]:
def _term_counts(words):
    """Map each word to its number of occurrences, keeping first-seen order."""
    counts = {}
    for word in words:
        counts[word] = counts.get(word, 0) + 1
    return counts


def _cosine_similarity(words_a, words_b):
    """Cosine similarity between the term-count vectors of two token lists.

    Returns 0.0 when either list is empty: its vector has zero length, so the
    cosine is undefined and dividing by the norm would raise ZeroDivisionError.
    """
    counts_a = _term_counts(words_a)
    counts_b = _term_counts(words_b)
    norm_a = math.sqrt(sum(count * count for count in counts_a.values()))
    norm_b = math.sqrt(sum(count * count for count in counts_b.values()))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    similarity = 0
    # Words missing from either sentence contribute exactly 0, so walking the
    # words of the first sentence is enough. The explicit accumulation keeps
    # the same left-to-right floating-point order as a plain running sum.
    for word, count in counts_a.items():
        similarity += (count / norm_a) * (counts_b.get(word, 0) / norm_b)
    return similarity


# Tokenize each lemmatized sentence once, then fill every (x, y) cell of the
# pre-allocated sim_mat with the similarity of sentence x and sentence y.
token_lists = [sentence.split() for sentence in lemmatized_sentences]
for x, words_x in enumerate(token_lists):
    for y, words_y in enumerate(token_lists):
        sim_mat[x][y] = _cosine_similarity(words_x, words_y)

In [0]:
# Display the matrix after the pairwise similarities have been written into it.
sim_mat

array([[1.        , 0.11785113, 0.        , 0.13363062],
       [0.11785113, 1.        , 0.11785113, 0.12598816],
       [0.        , 0.11785113, 1.        , 0.        ],
       [0.13363062, 0.12598816, 0.        , 1.        ]])

In [0]:
# Number of rows of the similarity matrix.
m = sim_mat.shape[0]

In [0]:
# Show the row count of the similarity matrix.
print(m)

4


**CALCULATING TEXTRANK FOR SENTENCES**

In [0]:
# Column vector (m rows, 1 column) with every entry set to 0.85.
damping_factor_matrix = [[0.85] for _ in range(m)]
print(damping_factor_matrix)

[[0.85], [0.85], [0.85], [0.85]]


In [0]:
# Print the column vector one row per line (value followed by a space).
for row in damping_factor_matrix:
    print(row[0], end = " ")
    print()

0.85 
0.85 
0.85 
0.85 


In [0]:
# Transposed view of the similarity matrix, used for the score updates.
transpose_matrix = sim_mat.T

In [0]:
# Multiply the score vector by the transposed similarity matrix three times.
# The running scores are kept in `res` only; `damping_factor_matrix` is left
# untouched so re-running this cell always restarts from the same initial
# vector and reproduces the same scores.
# NOTE: the 0.85 values act purely as the starting scores here; no damping
# term or normalisation is applied between iterations.
res = np.asarray(damping_factor_matrix, dtype=float)
for _ in range(3):
    res = np.dot(transpose_matrix, res)

In [0]:
# Show the final score column vector (one row per sentence).
print(res)

[[1.70507085]
 [1.98720431]
 [1.26685247]
 [1.73086887]]


In [0]:
# Map each sentence index to its score (the single column of `res`).
# A comprehension keeps the index variable local, so the notebook-level name
# `s` (the input text) is not overwritten with an integer and the earlier
# cells remain re-runnable.
rank_dict = {index: res[index][0] for index in range(len(sentences))}
print(rank_dict)

{0: 1.705070850611419, 1: 1.9872043089314073, 2: 1.2668524673255812, 3: 1.7308688729793822}


**SORTING THE SENTENCES BASED ON THEIR RANKS**

In [0]:
# Pair every original sentence with its score, then order the pairs from the
# highest score to the lowest (tuples compare by score first, then by text).
ranked_sentences = [
    (rank_dict[position], text)
    for position, text in enumerate(sentences)
]
ranked_sentences.sort(reverse=True)
print(ranked_sentences)

[(1.9872043089314073, 'Malaria causes symptoms that typically include fever, tiredness, vomiting, and headaches.'), (1.7308688729793822, 'Symptoms usually begin 10 - 15 days after being bitten by an infected mosquito.'), (1.705070850611419, 'Malaria is a "mosquito-borne" infectious disease that affects humans and other animals.'), (1.2668524673255812, 'In severe cases it can cause yellow skin, seizures, coma & death.')]


**PRINTING THE TOP-RANKED SENTENCES OF THE PARAGRAPH**

In [0]:
# Print the text of the highest-ranked sentences.
# Slicing (instead of indexing positions 0 and 1 directly) avoids an
# IndexError when the input contains fewer sentences than requested.
top_n = 2
for _, sentence in ranked_sentences[:top_n]:
    print(sentence)

Malaria causes symptoms that typically include fever, tiredness, vomiting, and headaches.
Symptoms usually begin 10 - 15 days after being bitten by an infected mosquito.
