# Name: Indra Sai Kiran Valluru
# CiteSeer Data Set Collection Processing (CS582: Information Retrieval)

In [1]:
# Import necessary modules
import os
import pandas as pd
import re
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import collections

In [2]:
# Import gensim library to deal with stopwords
import gensim
from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS
print(STOPWORDS)

frozenset({'perhaps', 'be', 'nevertheless', 'once', 'mill', 'would', 'per', 'thence', 'mostly', 'ie', 'than', 'been', 'whole', 'herself', 'two', 'whatever', 'nowhere', 'also', 'can', 'hers', 'fire', 'latterly', 'before', 'side', 'noone', 'fill', 'because', 'eight', 'ltd', 'amongst', 'twelve', 'it', 'should', 'herein', 'please', 'beside', 'he', 'another', 'below', 'formerly', 'is', 'behind', 'too', 'being', 'now', 'whom', 'therein', 'serious', 'de', 'with', 'why', 'anything', 'did', 'this', 'whose', 'everywhere', 'me', 'due', 'such', 'something', 'on', 'between', 'very', 'never', 'along', 'used', 'move', 'anyway', 'other', 'same', 'but', 'using', 'around', 'cry', 're', 'sincere', 'beyond', 'himself', 'onto', 'are', 'throughout', 'twenty', 'how', 'made', 'these', 'nobody', 'during', 'wherever', 'up', 'empty', 'since', 'upon', 'whereas', 'them', 'put', 'until', 'had', 'several', 'amoungst', 'un', 'eleven', 'enough', 'somewhere', 'the', 'bottom', 'if', 'some', 'by', 'thus', 'last', 'they',

In [3]:
# File-path
path = 'citeseer'
filenames = os.listdir(path)

In [4]:
# Parsing and merging all files into merge.txt
with open('merge.txt', 'w') as outfile:
    for fname in filenames:
        filepath = 'citeseer/'+fname
        with open(filepath) as infile:
            outfile.write(infile.read())
        infile.close()
outfile.close()

## Task 1: Tokenize on whitespace and remove punctuation.


In [5]:
# Tokenize on whitespace and remove punctuation.
infile = open('merge.txt','r')
text = infile.read()
lines = text.lower() 

# Eliminate punctuation using regex
lines = re.sub(r'[^\w\s]+','',lines)     
tokens = lines.split()

# Writing tokens to a file
with open('tokens_1.txt','w') as outfile:            
    for listitem in tokens:
        outfile.write("%s " % listitem)

## Task 2:

In [6]:
# Frequency of words in the collection
count = {}
for word in open('tokens_1.txt').read().split(' '):
    if word in count:
        count[word] += 1
    else:
        count[word] = 1
        
print(count)

{'positioning': 6, 'a': 13345, 'coarsecalibrated': 1, 'camera': 55, 'with': 3200, 'respect': 104, 'to': 11536, 'an': 3281, 'unknown': 35, 'object': 369, 'by': 2765, '2d': 47, '12': 48, 'visual': 279, 'servoing': 13, 'in': 10067, 'this': 4446, 'paper': 2208, 'we': 5138, 'propose': 382, 'new': 976, 'visionbased': 37, 'robot': 305, 'control': 445, 'approach': 1167, 'halfway': 1, 'between': 639, 'the': 25662, 'classical': 88, 'positionbased': 2, 'and': 14131, 'imagebased': 2, 'servoings': 1, 'it': 1541, 'allows': 308, 'avoid': 46, 'their': 920, 'respective': 14, 'disadvantages': 13, 'homography': 1, 'some': 608, 'planar': 8, 'feature': 268, 'points': 97, 'extracted': 58, 'from': 1909, 'two': 704, 'images': 207, 'corresponding': 65, 'current': 340, 'desired': 44, 'poses': 22, 'is': 6577, 'computed': 36, 'at': 858, 'each': 587, 'iteration': 10, 'then': 429, 'approximate': 78, 'partialpose': 1, 'where': 383, 'translational': 3, 'term': 78, 'known': 127, 'only': 422, 'up': 162, 'scale': 76, 'f

In [7]:
# Total number of words in the collection
file = open('tokens_1.txt')
read_data = file.read()
words = read_data.split()
len(words)

476203

In [8]:
# Total number of unique words in the collection
len(count)

19890

In [9]:
# Top 20 words and their frequencies in the collection
import operator
from itertools import islice

top20_words = dict(sorted(count.items(), key=operator.itemgetter(1),reverse=True)[:20])
print(top20_words)

{'the': 25662, 'of': 18638, 'and': 14131, 'a': 13345, 'to': 11536, 'in': 10067, 'for': 7379, 'is': 6577, 'we': 5138, 'that': 4820, 'this': 4446, 'are': 3737, 'on': 3656, 'an': 3281, 'with': 3200, 'as': 3057, 'by': 2765, 'data': 2691, 'be': 2500, 'information': 2322}


In [10]:
# Stopwords in the Top 20 words in the collection
for i in top20_words.keys():
    if i in STOPWORDS:
        print(i)

the
of
and
a
to
in
for
is
we
that
this
are
on
an
with
as
by
be


In [11]:
# Minimum number of unique words accounting for 15% of the total number of words in the collection
c=0
no_of_words=0
count = dict(sorted(count.items(), key=operator.itemgetter(1),reverse=True))
limit = len(words)*0.15
for i in count:
    if c < limit and count[i] <= (limit - c):
        print(i,':', count[i])
        c += count[i]
        no_of_words += 1   
print('Number of words:',no_of_words)
print('Total count:',c)

the : 25662
of : 18638
and : 14131
to : 11536
web : 1432
factor : 31
Number of words: 6
Total count: 71430


## Task 3:

In [12]:
# Tokenize on whitespace and remove punctuation.
infile = open('merge.txt','r')
text = infile.read()
lines = text.lower() 
lines = re.sub(r'[^\w\s]+',' ',lines)

# Remove stopwords
t=remove_stopwords(lines)
clean_tokens = [word for word in t if word.lower() not in STOPWORDS]

# Initiallizing Porter Stemmer object
st = PorterStemmer()

# Stemming the words
stem_tokens = [st.stem(word) for word in clean_tokens]

# Writing tokens to a file
with open('tokens_3.txt','w') as outfile:                                  
    for listitem in t:
        outfile.write("%s" % listitem)

In [13]:
# Frequency of words in the collection
count = {}
for word in open('tokens_3.txt').read().split():
    if word in count:
        count[word] += 1
    else:
        count[word] = 1

print(count)

{'positioning': 7, 'coarse': 11, 'calibrated': 2, 'camera': 59, 'respect': 104, 'unknown': 37, 'object': 646, '2d': 37, '1': 1783, '2': 485, 'visual': 291, 'servoing': 13, 'paper': 2211, 'propose': 382, 'new': 977, 'vision': 173, 'based': 2534, 'robot': 370, 'control': 453, 'approach': 1169, 'halfway': 1, 'classical': 91, 'positionbased': 1, 'image': 325, 'servoings': 1, 'allows': 308, 'avoid': 48, 'respective': 14, 'disadvantages': 13, 'homography': 1, 'planar': 8, 'feature': 286, 'points': 98, 'extracted': 58, 'images': 209, 'corresponding': 65, 'current': 342, 'desired': 44, 'poses': 22, 'computed': 38, 'iteration': 10, 'approximate': 78, 'partial': 101, 'pose': 49, 'translational': 3, 'term': 116, 'known': 174, 'scale': 159, 'factor': 32, 'deduced': 1, 'designed': 198, 'closed': 26, 'loop': 18, 'law': 23, 'controlling': 26, 'd': 131, 'o': 142, 'f': 69, 'contrarily': 2, 'position': 99, 'scheme': 165, 'need': 313, 'geometric': 31, '3d': 145, 'model': 1241, 'furthermore': 103, 'ensure

In [14]:
# Total number of words in the collection
file = open('tokens_3.txt')
data = file.read()
words = data.split()
len(words)

284504

In [15]:
# Total number of unique words in the collection
len(count)

17029

In [16]:
# Top 20 words and their frequencies in the collection
top20_words = dict(sorted(count.items(), key=operator.itemgetter(1),reverse=True)[:20])
print(top20_words)

{'data': 2763, 'based': 2534, 'information': 2359, 'paper': 2211, 'systems': 1830, '1': 1783, 'agent': 1688, 'agents': 1565, 'web': 1556, 'learning': 1499, 'user': 1306, 'model': 1241, 'approach': 1169, 'query': 1168, 'search': 1032, 'problem': 1023, 'new': 977, 'use': 959, 'introduction': 952, 'time': 930}


In [17]:
# Stopwords in the Top 20 words in the collection
for i in top20_words.keys():
    if i in STOPWORDS:
        print(i,'')

In [18]:
# Minimum number of unique words accounting for 15% of the total number of words in the collection
c = 0
no_of_words =0
count = dict(sorted(count.items(), key=operator.itemgetter(1),reverse=True))
limit = len(words)*0.15
for i in count:
    if c < limit and count[i] <= (limit - c):
        print(i,':', count[i])
        c += count[i]
        no_of_words += 1   
print('\nNumber of words:',no_of_words)
print('Total count:',c)

data : 2763
based : 2534
information : 2359
paper : 2211
systems : 1830
1 : 1783
agent : 1688
agents : 1565
web : 1556
learning : 1499
user : 1306
model : 1241
approach : 1169
query : 1168
search : 1032
problem : 1023
new : 977
use : 959
introduction : 952
time : 930
results : 926
knowledge : 915
algorithm : 884
applications : 876
database : 853
s : 817
techniques : 798
present : 784
language : 782
design : 779
performance : 736
mobile : 709
algorithms : 694
research : 693
set : 693
speech : 191

Number of words: 36
Total count: 42675
