In [1]:
from bs4 import BeautifulSoup
import os
import re
from nltk.util import ngrams

import warnings
warnings.filterwarnings('ignore')

### Reading Data

In [2]:
# Collect the text of every <body> element found in the .sgm files under data/.
documents = []
for file in os.listdir("data/"):

    # skip anything that is not an .sgm file
    if not file.endswith(".sgm"):
        continue

    # for each sgm file, read it; the context manager closes the handle
    # even if reading raises, so no file descriptors are leaked
    filename = os.path.join("data", file)
    with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
        dataFile = f.read()

    # pass it to BeautifulSoup
    soup = BeautifulSoup(dataFile, 'html.parser')
    contents = soup.find_all('body')

    # for each body tag, extract its text
    for content in contents:
        documents.append(content.text)
print('We have {} documents'.format(len(documents)))
print(documents[0])

We have 19043 documents
Inco Ltd said it did not expect its
earlier reported removal from the Dow Jones industrial index to
make a major impact on the company's stock.
    "We don't think that individuals or institutions buy our
shares because we were one of the Dow Jones industrials,"
spokesman Ken Cherney said in reply to a query.
    Inco closed 1-3/8 lower at 19-3/8 in second most active
trading on the Toronto Stock Exchange.
    The Wall Street Journal, which selects the index, said Inco
was dropped to make the index more representative of the
market. Inco, the non-Communist world's largest nickel
producer, was a member of the index since 1928.
    Replacing Inco and Owens-Illinois Inc will be Coca-Cola Co
and Boeing Co, effective tomorrow.
    Nickel analyst Ilmar Martens at Walwyn Stodgell Cochran
Murray Ltd said Inco's removal from the index would likely
spark short-term selling pressure on the stock.
    "Some investors who have Inco may suddenly say, 'well,
because it's not n

### Data Cleaning

#### Remove all links

In [3]:
def removeLinks(text):
    """Return ``text`` with every ``http``-prefixed token deleted.

    A link is the literal ``http`` followed by one or more non-whitespace
    characters; each match is replaced by the empty string, so the
    whitespace around it is left in place.
    """
    return re.sub(r'http\S+', '', text, flags=re.MULTILINE)

# strip links from every document
documents = [removeLinks(doc) for doc in documents]

#### Remove all HTML tags

In [4]:
def removeHTMLTags(text):
    """Return ``text`` with every ``<...>`` span deleted.

    The match is non-greedy, so each span ends at the first ``>`` after
    its ``<``. Because ``.`` does not match a newline here, a span that
    is broken across lines is left untouched.
    """
    tag_pattern = r'<.*?>'
    without_tags = re.sub(tag_pattern, '', text, flags=re.MULTILINE)
    return without_tags

# strip <...> spans from every document
documents = [removeHTMLTags(doc) for doc in documents]

#### Expand English contractions
won't --> will not

In [5]:
def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    The irregular forms "won't" and "can't" are handled first, then the
    regular suffixes (n't, 're, 's, 'd, 'll, 't, 've, 'm).

    A suffix is only expanded when it is attached to a preceding word
    character and ends at a word boundary. Without those anchors an
    opening quote followed by a word was rewritten as well
    (``'some`` -> `` isome``, ``'the`` -> `` nothe``).

    Matching is case-insensitive because this step runs before the text
    is lower-cased; previously "Won't" became "Wo not".

    Note: ``'s`` is always expanded to " is", so possessives such as
    "company's" become "company is".

    Parameters
    ----------
    phrase : str
        Text that may contain contractions.

    Returns
    -------
    str
        The text with the recognised contractions expanded.
    """
    def _keep_case(expansion):
        # Capitalise the expansion when the matched contraction started
        # with an upper-case letter ("Won't" -> "Will not").
        def _replace(match):
            if match.group(0)[0].isupper():
                return expansion.capitalize()
            return expansion
        return _replace

    # specific
    phrase = re.sub(r"\bwon't\b", _keep_case("will not"), phrase, flags=re.IGNORECASE)
    phrase = re.sub(r"\bcan't\b", _keep_case("can not"), phrase, flags=re.IGNORECASE)

    # general -- order matters: "n't" must be handled before the bare "'t"
    suffix_expansions = (
        ("n't", " not"),
        ("'re", " are"),
        ("'s", " is"),
        ("'d", " would"),
        ("'ll", " will"),
        ("'t", " not"),
        ("'ve", " have"),
        ("'m", " am"),
    )
    for suffix, expansion in suffix_expansions:
        phrase = re.sub(r"(?<=\w)" + suffix + r"\b", expansion, phrase, flags=re.IGNORECASE)
    return phrase

# expand contractions in every document
documents = [decontracted(doc) for doc in documents]

#### Convert to lower case

In [6]:
def convertToLowerCase(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

# lower-case every document
documents = [convertToLowerCase(doc) for doc in documents]

#### Remove words with numbers

In [7]:
def removeWordsWithNumbers(text):
    """Delete every whitespace-delimited token that contains a digit.

    Each run of non-whitespace characters holding at least one digit is
    replaced by the empty string (the surrounding whitespace stays), and
    the result is stripped of leading and trailing whitespace.
    """
    digit_free = re.sub(r"\S*\d\S*", "", text)
    return digit_free.strip()

# drop digit-bearing tokens from every document
documents = [removeWordsWithNumbers(doc) for doc in documents]

#### Remove special characters and punctuation

In [8]:
def removePuctuations(text):
    """Replace each run of non-alphanumeric characters with one space.

    Anything other than ASCII letters and digits counts as a separator,
    so punctuation, symbols and all whitespace collapse to a single
    space per run.
    """
    non_alphanumeric_run = r'[^A-Za-z0-9]+'
    return re.sub(non_alphanumeric_run, ' ', text)

# replace punctuation and special characters in every document
documents = [removePuctuations(doc) for doc in documents]

#### Strip leading and trailing whitespace

In [9]:
def removeWhiteSpaces(text):
    """Return ``text`` without leading or trailing whitespace.

    Whitespace inside the text is not touched.
    """
    trimmed = text.strip()
    return trimmed

# trim leading/trailing whitespace from every document
documents = [removeWhiteSpaces(doc) for doc in documents]

In [10]:
# Show the first document after all of the cleaning steps above have run
documents[0]

'inco ltd said it did not expect its earlier reported removal from the dow jones industrial index to make a major impact on the company is stock we do not think that individuals or institutions buy our shares because we were one of the dow jones industrials spokesman ken cherney said in reply to a query inco closed lower at in second most active trading on the toronto stock exchange the wall street journal which selects the index said inco was dropped to make the index more representative of the market inco the non communist world is largest nickel producer was a member of the index since replacing inco and owens illinois inc will be coca cola co and boeing co effective tomorrow nickel analyst ilmar martens at walwyn stodgell cochran murray ltd said inco is removal from the index would likely spark short term selling pressure on the stock some investors who have inco may suddenly say well because it is not now a dow stock we should eliminate that investment said martens although he add

### Generating N-Grams

In [11]:
def generateNGrams(n):
    """Build the set of distinct n-grams for every entry in ``documents``.

    Each document is split on single spaces, empty tokens are dropped,
    and the resulting token list is passed to ``ngrams``. The n-grams of
    a document are stored as a set, so duplicates within a document are
    counted once.

    Returns a list of sets, one per document, in the same order as the
    global ``documents`` list.
    """
    per_document_ngrams = []
    for text in documents:
        # filter(None, ...) discards the empty strings left by repeated spaces
        words = list(filter(None, text.split(" ")))
        per_document_ngrams.append(set(ngrams(words, n)))
    return per_document_ngrams

In [12]:
# Build the 2-gram sets for every document and print the set of the first one
n_gram = generateNGrams(2)
print(n_gram[0])

{('on', 'the'), ('added', 'the'), ('member', 'of'), ('index', 'more'), ('our', 'shares'), ('said', 'inco'), ('closed', 'lower'), ('the', 'toronto'), ('tomorrow', 'nickel'), ('a', 'dow'), ('net', 'earnings'), ('may', 'suddenly'), ('say', 'well'), ('stock', 'we'), ('unlikely', 'to'), ('index', 'since'), ('selling', 'pressure'), ('toronto', 'stock'), ('we', 'do'), ('one', 'of'), ('in', 'reply'), ('said', 'martens'), ('the', 'company'), ('and', 'boeing'), ('industrial', 'index'), ('stock', 'inco'), ('reported', 'removal'), ('inco', 'is'), ('impact', 'on'), ('inco', 'closed'), ('institutions', 'buy'), ('was', 'a'), ('cola', 'co'), ('query', 'inco'), ('that', 'investment'), ('jones', 'industrial'), ('in', 'from'), ('some', 'investors'), ('inco', 'ltd'), ('stodgell', 'cochran'), ('because', 'we'), ('a', 'member'), ('index', 'said'), ('most', 'active'), ('martens', 'although'), ('did', 'not'), ('more', 'representative'), ('company', 'is'), ('the', 'move'), ('individuals', 'or'), ('martens', 'a

### Applying Jaccard Similarity

In [13]:
def getSimilarityForDoc(docID, n):
    """Return the five documents most similar to document ``docID``.

    Similarity is the Jaccard index of the two documents' n-gram sets,
    |A & B| / |A | B|, expressed as a percentage.

    Parameters
    ----------
    docID : int
        Index of the query document in the global ``documents`` list.
        Negative indices count from the end, as with list indexing.
    n : int
        Size of the n-grams to compare.

    Returns
    -------
    list of (int, float)
        Up to five ``(document index, similarity in percent)`` pairs,
        highest similarity first. The query document itself is excluded.

    Raises
    ------
    IndexError
        If ``docID`` is outside the range of ``documents``.
    """
    similarity_map_for_doc = {}

    # generating all the n_grams (recomputed for the whole corpus on every call)
    n_gram = generateNGrams(n)
    d1 = n_gram[docID]

    # Normalise a negative index so the self-comparison check below still
    # excludes the query document.
    if docID < 0:
        docID += len(n_gram)

    # applying Jaccard Similarity
    for i, d2 in enumerate(n_gram):
        if i == docID:
            continue
        union_size = len(d1 | d2)
        if union_size == 0:
            # Both documents have no n-grams (fewer than n tokens each).
            # The index is undefined here; score it 0 instead of dividing by zero.
            JS = 0.0
        else:
            JS = len(d1 & d2) / union_size
        similarity_map_for_doc[i] = JS * 100

    # extracting top 5 according to similarity
    sorted_sim_map_for_doc = sorted(
        similarity_map_for_doc.items(), key=lambda x: x[1], reverse=True
    )
    top_five_sim_docs = sorted_sim_map_for_doc[:5]

    return top_five_sim_docs

In [14]:
# Cleaned text of document 543, which is used as the query document below
documents[543]

'the world bank said it has approved a mln dlr loan for india to help lessen that country is dependence on imported oil and spur development of its own petroleum resources the bank said the loan will be used to boost production by injecting gas in the partially depleted assam oil fields and to assist exploration in other areas including drilling exploratory wells the bank said the recipient of the loan will be oil india ltd oil which is the smaller of two public indian petroleum exploration and production companies reuter'

In [15]:
# Top five documents most similar to document 543, compared on 2-grams
res = getSimilarityForDoc(543, 2)
res

[(547, 96.42857142857143),
 (7074, 11.76470588235294),
 (18273, 10.416666666666668),
 (15921, 9.45945945945946),
 (15932, 9.45945945945946)]

In [16]:
# Cleaned text of document 547, the highest-scoring match for document 543
documents[547]

'the world bank said it approved a mln dlr loan for india to help lessen that country is dependence on imported oil and spur development of its own petroleum resources the bank said the loan will be used to boost production by injecting gas in the partially depleted assam oil fields and to assist exploration in other areas including drilling exploratory wells the bank said the recipient of the loan will be oil india ltd oil which is the smaller of two public indian petroleum exploration and production companies reuter'