# TP Text Mining

In [1]:
# Connecteur

class Document:
    """A raw document: where it came from and its full text content."""

    def __init__(self, url, text):
        # Text is stored first, then its location; the two are independent.
        self.text_ = text
        self.url_ = url

In [2]:
import os

def fetch(path, recursive=True):
    """Read the text files found under ``path`` into ``Document`` objects.

    Args:
        path: Directory to scan.
        recursive: When true, walk the whole tree below ``path``; otherwise
            only look at the regular files directly inside ``path``.

    Returns:
        A list of ``Document`` whose ``url_`` is the file path (joined with
        ``path``) and whose ``text_`` is the file content. Notebook files
        (``.ipynb``) are skipped, as are files that cannot be opened or
        decoded with the default text encoding.
    """
    if recursive:
        files = [
            os.path.join(root, name)
            for root, _dirs, names in os.walk(path)
            for name in names
        ]
    else:
        # Mirror os.walk, which silently yields nothing for an unreadable
        # directory, so both modes behave the same on a bad path.
        try:
            names = os.listdir(path)
        except OSError:
            names = []
        candidates = [os.path.join(path, name) for name in names]
        files = [candidate for candidate in candidates if os.path.isfile(candidate)]

    documents = []
    for file in files:
        if file.endswith(".ipynb"):
            continue
        try:
            # The context manager closes the handle even when read() fails.
            with open(file, "r") as handle:
                txt = handle.read()
        except (OSError, UnicodeDecodeError):
            # Best effort: unreadable or binary files are simply not indexed.
            continue
        documents.append(Document(file, txt))
    return documents

In [3]:
# Recursively load every readable, non-.ipynb file below the current directory.
data_documents = fetch("./", True)

In [4]:
class TokenizedDocument:
    """A document reduced to its list of words, still tagged with its URL."""

    def __init__(self, url, words):
        # Words are stored first, then the location; the two are independent.
        self.words_ = words
        self.url_ = url
        

In [5]:
class Processor():
    """Base class for word processors used by the analysis step.

    Subclasses override ``process`` to transform one word into another.
    """

    def process(self, word):
        """Transform ``word``; must be overridden by subclasses.

        Raises:
            NotImplementedError: always, since this base class has no
                behavior of its own. The message names the concrete class.
        """
        # The original formatted an undefined name (`cls`), which surfaced as
        # a NameError instead of the intended "abstract class" error.
        raise NotImplementedError(
            "This is an abstract class : %s" % type(self).__name__
        )

In [6]:
import unicodedata
class Normalizer(Processor):
    """Processor that strips accents and lower-cases a word."""

    def process(self, word):
        """Return ``word`` reduced to lower-case ASCII.

        The word is decomposed (NFD), characters that have no ASCII
        encoding are dropped, and the remainder is lower-cased.
        """
        decomposed = unicodedata.normalize('NFD', word)
        ascii_bytes = decomposed.encode('ascii', 'ignore')
        lowered = ascii_bytes.lower()
        return lowered.decode('utf-8')
        

In [7]:
# Quick check of the normalizer on an accented, mixed-case string.
norm = Normalizer()
norm.process('Télécharger c"est mal')

'telecharger c"est mal'

In [8]:
import re
def analyze(documents, processors):
    """Tokenize each document and run every word through the processors.

    Args:
        documents: Iterable of objects exposing ``url_`` and ``text_``.
        processors: Sequence of objects exposing ``process(word)``; they are
            applied in order to every word.

    Returns:
        A list of ``TokenizedDocument``, one per input document, in order.
    """
    def tokenize(document):
        # A word is a run of word characters and apostrophes.
        tokens = re.findall(r"[\w']+", document.text_)
        for stage in processors:
            tokens = [stage.process(token) for token in tokens]
        return TokenizedDocument(document.url_, tokens)

    return [tokenize(document) for document in documents]

In [9]:
# Tokenize and normalize only the first two documents to keep the demo small,
# then display the URL and the word list of the second one.
sample_doc = data_documents[:2]
norm = Normalizer()
sample_tokenized_doc = analyze(sample_doc, [norm])
sample_tokenized_doc[1].url_, sample_tokenized_doc[1].words_


('./20news-bydate-test/alt.atheism/53639',
 ['from',
  'mandtbacka',
  'finabo',
  'abo',
  'fi',
  'mats',
  'andtbacka',
  'subject',
  're',
  'after',
  '2000',
  'years',
  'can',
  'we',
  'say',
  'that',
  'christian',
  'morality',
  'is',
  'in',
  'reply',
  'to',
  'frank',
  'd012s658',
  "uucp's",
  'message',
  'of',
  '21',
  'apr',
  '1993',
  '09',
  '38',
  '43',
  'gmt',
  'organization',
  'unorganized',
  'usenet',
  'postings',
  'uninc',
  'x',
  'news',
  'reader',
  'vms',
  'news',
  '1',
  '24',
  'lines',
  '151',
  'in',
  '1r34n3',
  'hfj',
  'horus',
  'ap',
  'mchp',
  'sni',
  'de',
  'frank',
  'd012s658',
  'uucp',
  'writes',
  'deletia',
  'in',
  'case',
  'anybody',
  "hadn't",
  'noticed',
  'frank',
  'and',
  'i',
  'are',
  'debating',
  'objective',
  'morality',
  'and',
  'seemingly',
  'hitting',
  'semantics',
  'secondly',
  'how',
  'can',
  'i',
  'refute',
  'your',
  'definition',
  'i',
  'can',
  'only',
  'point',
  'up',
  'its'

In [10]:
class Posting():
    """One word together with the URLs of the documents that contain it."""

    def __init__(self, word, urls):
        # URLs are stored first, then the word; the two are independent.
        self.urls_ = urls
        self.word_ = word

In [11]:
def index(tokenized_documents):
    """Group document URLs by the words the documents contain.

    Args:
        tokenized_documents: Iterable of objects exposing ``url_`` and
            ``words_``.

    Returns:
        A list of ``Posting``, one per distinct word, each holding the URLs
        of the documents in which the word occurs (each URL once per
        document, in document order).
    """
    urls_by_word = {}
    for document in tokenized_documents:
        # A set, so a word repeated inside one document counts only once.
        for term in set(document.words_):
            urls_by_word.setdefault(term, []).append(document.url_)
    return [Posting(term, urls) for term, urls in urls_by_word.items()]

In [12]:
# Build the postings for the sample documents and print each word with its URLs.
res = index(sample_tokenized_doc)
for e in res:
    print(e.word_, e.urls_)

mangoe ['./20news-bydate-test/alt.atheism/53420']
why ['./20news-bydate-test/alt.atheism/53420', './20news-bydate-test/alt.atheism/53639']
isn't ['./20news-bydate-test/alt.atheism/53420', './20news-bydate-test/alt.atheism/53639']
some ['./20news-bydate-test/alt.atheism/53420', './20news-bydate-test/alt.atheism/53639']
stuff ['./20news-bydate-test/alt.atheism/53420']
essentially ['./20news-bydate-test/alt.atheism/53420']
text ['./20news-bydate-test/alt.atheism/53420']
nt ['./20news-bydate-test/alt.atheism/53420']
about ['./20news-bydate-test/alt.atheism/53420', './20news-bydate-test/alt.atheism/53639']
texts ['./20news-bydate-test/alt.atheism/53420']
etc ['./20news-bydate-test/alt.atheism/53420']
happened ['./20news-bydate-test/alt.atheism/53420']
up ['./20news-bydate-test/alt.atheism/53420', './20news-bydate-test/alt.atheism/53639']
religion ['./20news-bydate-test/alt.atheism/53420']
by ['./20news-bydate-test/alt.atheism/53420', './20news-bydate-test/alt.atheism/53639']
1935 ['./20news

In [13]:
class Index():
    """Container for the three lookup tables that make up an index.

    Attributes are stored exactly as passed in:
    ``urlToDid_``, ``wordToDids_`` and ``DidToUrl_``.
    """

    def __init__(self, urlToDid, wordToDids, DidToUrl):
        # Original author's note (translated): "I don't see how to write
        # search() in a not-too-ugly way without [this]" -- presumably
        # referring to the reverse DidToUrl table.
        self.DidToUrl_ = DidToUrl
        self.wordToDids_ = wordToDids
        self.urlToDid_ = urlToDid

In [14]:
def build(postings):
    """Build an ``Index`` from a sequence of postings.

    Every distinct URL gets an integer document id, assigned in order of
    first appearance starting at 0. Each word is then mapped to the ids of
    its URLs.

    Args:
        postings: Sequence of objects exposing ``word_`` and ``urls_``.
            It is traversed twice.

    Returns:
        ``Index(url -> id, word -> [ids], id -> url)``.
    """
    did_by_url = {}
    url_by_did = {}

    # First pass: number the URLs in order of first appearance.
    for posting in postings:
        for url in posting.urls_:
            if url in did_by_url:
                continue
            did = len(did_by_url)
            did_by_url[url] = did
            url_by_did[did] = url

    # Second pass: translate each posting's URLs into document ids.
    dids_by_word = {
        posting.word_: [did_by_url[url] for url in posting.urls_]
        for posting in postings
    }

    return Index(did_by_url, dids_by_word, url_by_did)
    

In [15]:
import pickle

def save(index, path):
    """Pickle ``index`` into the file at ``path``, overwriting it.

    Args:
        index: Any picklable object.
        path: Destination file path.
    """
    # The context manager closes the file even if pickling fails midway.
    with open(path, 'wb') as file:
        pickle.dump(index, file)

def load(path):
    """Unpickle and return the object stored in the file at ``path``.

    Args:
        path: Path of a file containing pickled data.

    Returns:
        The unpickled object.
    """
    # SECURITY: unpickling can execute arbitrary code; only load files that
    # were produced by a trusted source.
    # The context manager closes the file even if unpickling fails.
    with open(path, 'rb') as file:
        return pickle.load(file)

# TOPO

> **TODO** : Soigner la sélection des fichiers

In [16]:
# Turn the postings into an Index and pickle it to the file "savedIndex".
myindex = build(res)
save(myindex, "savedIndex")

In [17]:
# Inspect the URL -> document-id table.
myindex.urlToDid_

{'./20news-bydate-test/alt.atheism/53420': 0,
 './20news-bydate-test/alt.atheism/53639': 1}

In [44]:
class MySearcher():
    """Word lookup over one or more pickled indexes."""

    def __init__(self, paths):
        """Load the index stored at each path in ``paths``."""
        self.indexs_ = [load(path) for path in paths]

    def search(self, word):
        """Return the URLs of the documents containing ``word``.

        Results from every loaded index are concatenated in index order;
        an unknown word yields an empty list.
        """
        res = []
        for index in self.indexs_:
            if word in index.wordToDids_:
                res += [index.DidToUrl_[did] for did in index.wordToDids_[word]]
        return res

    def searchOneOf(self, words):
        """Return the URLs of documents containing at least one of ``words``.

        The result has no duplicates; its order is unspecified.
        """
        uniques = set()
        for word in words:
            uniques.update(self.search(word))
        return list(uniques)

    def searchAllOf(self, words):
        """Return the URLs of documents containing every word in ``words``.

        The result has no duplicates; its order is unspecified. An empty
        ``words`` yields an empty list.
        """
        words = list(words)
        if not words:
            return []
        common = set(self.search(words[0]))
        for word in words[1:]:
            # In-place intersection. The original called
            # set.intersection(), discarded its result, and passed it a
            # single URL string, so later words never narrowed the result.
            common &= set(self.search(word))
            if not common:
                break  # nothing can match any more
        return list(common)


In [50]:
# Reload the pickled index from disk and look up a single word.
mySearcher = MySearcher(['savedIndex'])
mySearcher.search('in')

['./20news-bydate-test/alt.atheism/53420',
 './20news-bydate-test/alt.atheism/53639']

In [46]:
# Inspect the word -> document-ids table.
myindex.wordToDids_

{'mangoe': [0],
 'why': [0, 1],
 "isn't": [0, 1],
 'some': [0, 1],
 'stuff': [0],
 'essentially': [0],
 'text': [0],
 'nt': [0],
 'about': [0, 1],
 'texts': [0],
 'etc': [0],
 'happened': [0],
 'up': [0, 1],
 'religion': [0],
 'by': [0, 1],
 '1935': [0],
 'wingate': [0],
 'world': [0],
 'qumram': [0],
 'differences': [0],
 'though': [0, 1],
 'readings': [0],
 'particulars': [0],
 'an': [0, 1],
 'wpd': [0],
 'greek': [0],
 'story': [0],
 'between': [0, 1],
 'interpreted': [0],
 'only': [0, 1],
 'having': [0],
 'abundance': [0],
 'theological': [0],
 'obviously': [0],
 'and': [0, 1],
 'back': [0, 1],
 'know': [0, 1],
 'we': [0, 1],
 'analysis': [0],
 'real': [0, 1],
 'these': [0, 1],
 'any': [0, 1],
 'com': [0],
 'textual': [0],
 'umd': [0],
 'on': [0, 1],
 'reflect': [0],
 'must': [0],
 'latter': [0],
 'scroll': [0],
 'reference': [0],
 'cs': [0],
 'passages': [0],
 'great': [0, 1],
 'two': [0, 1],
 'hebrew': [0],
 'host': [0],
 'no': [0, 1],
 'is': [0, 1],
 'corrupted': [0],
 'time': [

In [49]:
# Compare the "any of these words" and "all of these words" queries on the same pair.
mySearcher.searchOneOf(['mangoe', 'why']), mySearcher.searchAllOf(['mangoe', 'why'])

(['./20news-bydate-test/alt.atheism/53420',
  './20news-bydate-test/alt.atheism/53639'],
 ['./20news-bydate-test/alt.atheism/53420'])