In [1]:
import numpy as np
import xml.etree.ElementTree as ET
import re
import pandas as pd
from lxml import etree
from nltk.stem import PorterStemmer
import pickle

# Чтение topics

In [2]:
xml = ""
with open('Data/CLEF3/topics/Top-en03.txt') as f:
    xml = f.read()
tree = ET.fromstring("<root>" + xml + "</root>")

In [3]:
nums, titles = [], []
for top in tree.getchildren():
    num = top.find("num").text.strip()
    num = num[num.find("C") + 1:]
    title = top.find("EN-title").text.strip().lower()
    nums.append(num)
    titles.append(title)
    
nums = np.array(nums, dtype=np.int)
titles = np.array(titles)

In [4]:
topics = pd.DataFrame(np.hstack([
    nums.reshape(-1, 1), 
    titles.reshape(-1, 1)
    ]),
    columns=['num', 'title']
)

# Чтение qrels

In [5]:
qrels = pd.read_csv('Data/CLEF3/qrels/biling_qrels_03', sep=' ', 
                    names=['query', 'stage', 'docno', 'target'])
qrels.drop('stage', inplace=True, axis=1)

In [6]:
qrels.head()

Unnamed: 0,query,docno,target
0,141,GH950106-000047,0
1,141,GH950109-000115,0
2,141,GH950118-000104,0
3,141,GH950121-000124,0
4,141,GH950121-000159,0


# Чтение docs и их processing

In [15]:
def clean_text(text):
    regex = "[^a-zA-Z ]"
    cleaned_text = re.sub(regex, "", text).lower()
    
    stemmer = PorterStemmer()
    stemmed_text = ""
    for word in cleaned_text.split(' '):
        stemmed_text += stemmer.stem(word) + ' '
    cleaned_text = stemmed_text
    
    return cleaned_text.strip()

### Загрузка корпуса LAT94

In [9]:
xml = ""
with open('Data/CLEF3/corpus/LAT94-utf.all', 'r', encoding='utf-8') as f:
    xml = f.read()
xml = re.sub('&', ' ', xml)
xml = re.sub('&amp;', ' ', xml)
root = etree.fromstring("<root>" + xml + "</root>")

In [10]:
del xml

In [12]:
docs_array = []

for doc in root.getchildren():
    docno = doc.find("DOCNO").text.strip()
    docid = doc.find("DOCID").text.strip()
    textElement = doc.find("TEXT")
    if textElement is None:
        continue
    text = ""
    for textChild in textElement.getchildren():
        text += ' '.join(textChild.text.lower().split('\n')) + ' '
    
    splitted = list(filter(lambda x: len(x) > 0, clean_text(text).split(' ')))
    text = ' '.join(splitted)
    length = len(splitted)
    docs_array.append([docno, docid, text, length])

### Загрузка корпуса GH95

In [55]:
xml = ""
with open('Data/CLEF3/corpus/GH95-utf.all', 'r', encoding='utf-8') as f:
    xml = f.read()
xml = re.sub('&', ' ', xml)
xml = re.sub('&amp;', ' ', xml)
# unsupported tag <#LIN+ E> was erased in line 2865393, column 70
# unsupported tag <\n was erased in line 3019020, column 62
root = etree.fromstring("<root>" + xml + "</root>")

In [56]:
del xml

In [57]:
docs_array_gh95 = []

for doc in root.getchildren():
    docno = doc.find("DOCNO").text.strip()
    docid = doc.find("DOCID").text.strip()
    textElement = doc.find("TEXT")
    if textElement is None:
        continue
    text = ' '.join(textElement.text.lower().split('\n'))
    
    for textChild in textElement.getchildren():
        print('found tag')
        
    splitted = list(filter(lambda x: len(x) > 0, clean_text(text).split(' ')))
    text = ' '.join(splitted)
    length = len(splitted)
    docs_array_gh95.append([docno, docid, text, length])

In [58]:
docs_array = docs_array +  docs_array_gh95

In [59]:
with open('Data/CLEF3/cleaned-corpus-utf', 'wb') as f:
    pickle.dump(docs_array, f)

### Загрузка сохраненных данных

In [7]:
docs_array = None
with open('Data/CLEF3/cleaned-corpus-utf', 'rb') as f:
    docs_array = pickle.load(f)

In [8]:
docs = pd.DataFrame(
    docs_array,
    columns=['docno', 'docid', 'text', 'len']
)

# Создание features

In [62]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

In [63]:
vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 1), stop_words = 'english')
term_count = vectorizer.fit_transform(docs['text'])

In [64]:
term_count

<166745x345455 sparse matrix of type '<class 'numpy.int64'>'
	with 28852198 stored elements in Compressed Sparse Row format>

In [73]:
with open('Data/CLEF3/vectorizer', 'wb') as f:
    pickle.dump(vectorizer, f)

In [65]:
with open('Data/CLEF3/BoW-corpus-utf', 'wb') as f:
    pickle.dump(term_count, f)

### Загрузка

In [9]:
term_count = None
vectorizer = None
with open('Data/CLEF3/BoW-corpus-utf', 'rb') as f:
    term_count = pickle.load(f)
with open('Data/CLEF3/vectorizer', 'rb') as f:
    vectorizer = pickle.load(f)

In [10]:
document_average_size = np.mean(docs['len'])

In [11]:
norm_coefs = np.log(1 + docs['len'].values / document_average_size).reshape(-1, 1)
X_values = term_count.multiply(norm_coefs)

In [12]:
Y_values = np.sum(term_count > 0, axis=0) / term_count.shape[0]
Y_values = np.array(Y_values).flatten()

In [13]:
X_values.shape, Y_values.shape

((166745, 345455), (345455,))

In [16]:
topics['title'] = topics['title'].apply(clean_text)

In [17]:
np.sum(vectorizer.transform(topics['title']) > 1)

0

In [38]:
with open('Data/CLEF3/X_values', 'wb') as f:
    pickle.dump(X_values, f)
with open('Data/CLEF3/Y_values', 'wb') as f:
    pickle.dump(Y_values, f)

Каждое слово в запросе встречается по одному разу.
Забавное замечание - мы выкинули все стоп-слова, а часть слов из запросов никогда не встречается в итоговом сете.

In [39]:
X_values, Y_values = None, None
with open('Data/CLEF3/X_values', 'rb') as f:
    X_values = pickle.load(f)
with open('Data/CLEF3/Y_values', 'rb') as f:
    Y_values = pickle.load(f)

## Важно!
### Теперь преобразуем наши значения так, чтобы (tf, idf) значения слов из запроса шли подряд. Дополнительно создадим массив числа слов в запросе распознанных.

In [33]:
from tqdm import tqdm

In [34]:
queries = []
query_ids = np.unique(qrels['query'])

for query_id in tqdm(query_ids):
    
    # Получаем target
    subtable = qrels[qrels['query'] == query_id]
    subtable = subtable[subtable['target'] == 1]
    
    query_target = np.zeros(docs.shape[0], dtype=np.bool)
    for docno in subtable['docno']:
        query_target |= (docs['docno'] == docno).values
    
    topic_text = topics[topics['num'].astype(np.int) == query_id]['title'].values[0]
    
    bow = vectorizer.transform([topic_text])
    feature_index = np.arange(bow.shape[1])[bow.toarray()[0] > 0]
    
    query_characteristics = []
    
    for index in feature_index:
        query_characteristics.append([
            X_values.getcol(index).toarray().flatten(),
            Y_values[index]
        ])
    queries.append([query_characteristics, query_target])
    
queries = np.array(queries)

100%|██████████| 60/60 [04:02<00:00,  4.04s/it]


In [35]:
with open('Data/CLEF3/queries-corpus-utf', 'wb') as f:
    pickle.dump(queries, f)

### Загрузка сохраненных данных

In [36]:
queries = None
with open('Data/CLEF3/queries-corpus-utf', 'rb') as f:
    queries = pickle.load(f)

In [40]:
from sklearn.cluster import KMeans

In [45]:
kmeans = KMeans(n_clusters=4, verbose=1, n_jobs=8)

In [46]:
kmeans.fit(X_values)

Initialization complete
Initialization complete
Initialization complete
Initialization complete
Initialization complete
Initialization complete
Initialization complete
Initialization complete
Iteration  0, inertia 287317770.935
Iteration  0, inertia 293271712.036
Iteration  0, inertia 296312317.463
Iteration  0, inertia 284189513.060
Iteration  0, inertia 266764425.668
Iteration  0, inertia 281776801.568
Iteration  0, inertia 285508693.466
Iteration  0, inertia 262899670.083
Iteration  1, inertia 238671854.168
Iteration  1, inertia 258685799.837
Iteration  1, inertia 260257729.422
Iteration  1, inertia 255274371.554
Iteration  1, inertia 236883859.483
Iteration  1, inertia 238317788.138
Iteration  1, inertia 256533769.915
Iteration  1, inertia 241767333.488
Iteration  2, inertia 234038229.537
Iteration  2, inertia 255729127.409
Iteration  2, inertia 248368623.902
Iteration  2, inertia 257656538.090
Iteration  2, inertia 232217986.045
Iteration  2, inertia 235869092.090
Iteration  2, in

Iteration 27, inertia 235184324.836
Iteration 28, inertia 244858293.001
Iteration 28, inertia 231349547.204
Iteration 28, inertia 253488975.471
Iteration 28, inertia 231691934.556
Iteration 28, inertia 231352441.749
Iteration 28, inertia 251626175.357
Iteration 28, inertia 230901457.275
Iteration 28, inertia 235184283.579
Iteration 29, inertia 244858282.845
Iteration 29, inertia 231349546.315
Iteration 29, inertia 253488779.787
Iteration 29, inertia 231672896.680
Iteration 29, inertia 231351695.873
Iteration 29, inertia 251623950.961
Iteration 29, inertia 230896344.253
Iteration 29, inertia 235184256.064
Iteration 30, inertia 244858277.464
Iteration 30, inertia 231349546.036
Converged at iteration 30: center shift 0.000000e+00 within tolerance 4.908211e-07
Initialization complete
Iteration 30, inertia 253488317.068
Iteration 30, inertia 231656540.679
Iteration 30, inertia 231351211.150
Iteration 30, inertia 251622696.145
Iteration 30, inertia 230891342.336
Iteration 30, inertia 2351842

KeyboardInterrupt: 