In [1]:
import os
import pandas as pd
import numpy as np
import networkx as nx
from sklearn.linear_model import Lasso

from pathlib import Path

In [2]:
project_path = str(Path(os.getcwd()).parent.absolute())
print("Current directory : " + os.getcwd() + ", Project directory : " + project_path)

Current directory : d:\Documents\Info\INF554\INF554_Kaggle_Project\Exploration, Project directory : d:\Documents\Info\INF554\INF554_Kaggle_Project


In [3]:
os.chdir(project_path)

In [23]:

# read training data
df_train = pd.read_csv('data/train.csv', dtype={'author': np.int64, 'hindex': np.float32})
n_train = df_train.shape[0]

# read test data
df_test = pd.read_csv('data/test.csv', dtype={'author': np.int64})
n_test = df_test.shape[0]

# load the graph    
G = nx.read_edgelist('data/coauthorship.edgelist', delimiter=' ', nodetype=int)
n_nodes = G.number_of_nodes()
n_edges = G.number_of_edges() 
print('Number of nodes:', n_nodes)
print('Number of edges:', n_edges)


# computes structural features for each node
core_number = nx.core_number(G)

# create the training matrix. each node is represented as a vector of 3 features:
# (1) its degree, (2) its core number 
X_train = np.zeros((n_train, 2))
y_train = np.zeros(n_train)
for i,row in df_train.iterrows():
    node = row['author']
    X_train[i,0] = G.degree(node)
    X_train[i,1] = core_number[node]
    y_train[i] = row['hindex']

# create the test matrix. each node is represented as a vector of 3 features:
# (1) its degree, (2) its core number
X_test = np.zeros((n_test, 2))
for i,row in df_test.iterrows():
    node = row['author']
    X_test[i,0] = G.degree(node)
    X_test[i,1] = core_number[node]
    
# train a regression model and make predictions
reg = Lasso(alpha=0.1)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

# write the predictions to file
df_test['hindex'] = pd.Series(np.round_(y_pred, decimals=3))


df_test.loc[:,["author","hindex"]].to_csv('submission.csv', index=False)







Number of nodes: 217801
Number of edges: 1718164


In [5]:
nx.draw_networkx(G, node_size=10)
plt.title("Raw graph")
plt.show()

# Texts manipulation

## NLP

In [10]:
import os, json

os.chdir(project_path)

In [11]:
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
import gensim
from gensim.models import Word2Vec
import re
import string
from tqdm.notebook import tqdm
import nltk
from nltk.corpus import stopwords
import pandas as pd

In [7]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nathan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [8]:
with open("data/abstracts.txt") as file:
    texts = file.readlines()

FileNotFoundError: [Errno 2] No such file or directory: 'data/abstracts.txt'

In [5]:
len(texts)

624181

In [6]:
file_str = texts[0]
print(type(file_str))
file_str

<class 'str'>


'3603----{"IndexLength":122,"InvertedIndex":{"In":[0],"this":[1],"paper,":[2],"we":[3,98],"describe":[4],"a":[5,16,41,58,62,67,74,108],"new":[6,17],"bitmap":[7,42,54,69,76],"indexing":[8,43],"technique":[9],"to":[10,73],"cluster":[11],"XML":[12,14,34,63],"documents.":[13],"is":[15,71],"standard":[18,89],"for":[19,60],"exchanging":[20],"and":[21,38,49,56,91,104,115],"representing":[22],"information":[23],"on":[24,95,107],"the":[25,47,85,101,117],"Internet.":[26],"Documents":[27],"can":[28,111],"be":[29,112],"hierarchically":[30],"represented":[31,37],"by":[32],"XML-elements.":[33],"documents":[35],"are":[36],"indexed":[39],"using":[40],"technique.":[44],"We":[45,80],"define":[46,81,100],"similarity":[48],"popularity":[50],"operations":[51,106],"available":[52],"in":[53,84],"indexes":[55],"propose":[57],"method":[59],"partitioning":[61],"document":[64,120],"set.":[65],"Furthermore,":[66],"2-dimensional":[68],"index":[70],"extended":[72],"3dimensional":[75],"index,":[77],"called":[78],"Bi

In [7]:
def get_id(file_str):
    return int(file_str.split("----")[0])

def get_json(file_str):
    new_file_str = file_str.replace("-","")
    json_str = new_file_str.split(str(get_id(file_str)))[-1]

    return json.loads(json_str)

def get_descritpion(file_str):

    json_file = get_json(file_str)
    words = [""]*int(json_file["IndexLength"])

    for word in json_file["InvertedIndex"].keys():
        indexes = json_file["InvertedIndex"][word]

        for index in indexes:
            words[int(index)] = word
        
        
    return words #' '.join(words).replace("\n", " ")


def get_line(file_str):
    
    words = get_descritpion(file_str)

    return ' '.join(words).replace("\n", " ")

def clean_text(text, tokenizer, stopwords):
    """Pre-process text and generate tokens

    Args:
        text: Text to tokenize.

    Returns:
        Tokenized text.
    """
    text = str(text).lower()  # Lowercase words
    text = re.sub(r"\[(.*?)\]", "", text)  # Remove [+XYZ chars] in content
    text = re.sub(r"\s+", " ", text)  # Remove multiple spaces in content
    text = re.sub(r"\w+…|…", "", text)  # Remove ellipsis (and last word)
    text = re.sub(r"(?<=\w)-(?=\w)", " ", text)  # Replace dash between words
    text = re.sub(
        f"[{re.escape(string.punctuation)}]", "", text
    )  # Remove punctuation

    tokens = tokenizer(text)  # Get tokens from text
    tokens = [t for t in tokens if not t in stopwords]  # Remove stopwords
    tokens = ["" if t.isdigit() else t for t in tokens]  # Remove digits
    tokens = [t for t in tokens if len(t) > 1]  # Remove short tokens
    return tokens


def vectorize(list_of_docs, model):
    """Generate vectors for list of documents using a Word Embedding

    Args:
        list_of_docs: List of documents
        model: Gensim's Word Embedding

    Returns:
        List of document vectors
    """
    features = []

    for tokens in tqdm(list_of_docs):
        zero_vector = np.zeros(model.vector_size)
        vectors = []
        for token in tokens:
            if token in model.wv:
                try:
                    vectors.append(model.wv[token])
                except KeyError:
                    continue
        if vectors:
            vectors = np.asarray(vectors)
            avg_vec = vectors.mean(axis=0)
            features.append(avg_vec)
        else:
            features.append(zero_vector)
    return features


In [8]:
tokenizer = word_tokenize
stpwds = stopwords.words("english")

In [9]:
import gensim.downloader
model = gensim.downloader.load('glove-wiki-gigaword-200')

In [None]:
data = [[ get_id(file_str), get_line(file_str)] for i,file_str in tqdm(enumerate(texts))]#[:100000]

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [None]:
df = pd.DataFrame(data=data, columns=["id", "text"])

In [None]:
df["tokens"] =  df["text"].map(lambda x: clean_text(x, tokenizer, stpwds))
tokenized_docs = df["tokens"].tolist()

In [None]:
df["tokens"]

0         [paper, describe, new, bitmap, indexing, techn...
1         [paper, starts, observation, inclusionbased, a...
2         [contribution, describes, approach, integrate,...
3         [cleaneval, shared, task, competitive, evaluat...
4         [xax, browser, plugin, model, enables, develop...
                                ...                        
624176    [xray, polarimetry, sometimes, alone, sometime...
624177    [recent, years, underwater, wireless, sensor, ...
624178    [todays, cyber, physical, systems, cps, well, ...
624179    [software, service, cloud, computing, model, f...
624180    [penetration, testing, wellestablished, practi...
Name: tokens, Length: 624181, dtype: object

In [None]:
model.get_vector("math")

array([ 3.6594e-01,  6.6349e-01,  2.6309e-01, -5.4793e-01,  1.0673e+00,
       -9.0648e-02, -1.8309e-01,  4.4575e-01, -4.4378e-01,  2.6572e-01,
       -6.3668e-02,  3.0498e-01, -4.1506e-01, -2.9739e-01, -1.8309e-01,
        8.4494e-02,  3.8472e-01, -9.2856e-01,  4.4563e-01,  3.2183e-01,
        3.8717e-01,  1.6766e+00,  3.8729e-01,  3.9076e-01,  6.2466e-01,
        1.4462e-01, -6.5783e-02, -4.3507e-01,  8.1413e-01,  5.2090e-02,
        2.5516e-01, -3.9658e-02, -5.0672e-01, -2.9341e-01,  2.3707e-01,
        1.2230e-01, -1.1576e-01, -2.7695e-01,  4.1394e-01,  4.3311e-02,
        4.8973e-01, -4.4549e-01, -6.1259e-02,  1.9547e-02,  5.2209e-01,
        9.1658e-01,  3.5733e-01, -4.9494e-01, -9.8224e-01,  2.0034e-01,
       -3.0868e-02, -1.7801e-02, -2.3831e-01,  2.8774e-01,  7.4100e-02,
        1.5456e-01,  3.3661e-01,  3.4376e-01, -5.9647e-01, -9.4255e-02,
       -3.4176e-01,  2.5309e-02, -2.3511e-02,  1.2011e+00, -2.1605e-01,
       -4.8390e-01,  4.3511e-01,  5.4461e-01,  1.2633e+00,  2.59

In [None]:
model.vocab

{'the': <gensim.models.keyedvectors.Vocab at 0x7fb767b7c0d0>,
 ',': <gensim.models.keyedvectors.Vocab at 0x7fb767b67e10>,
 '.': <gensim.models.keyedvectors.Vocab at 0x7fb767b7c110>,
 'of': <gensim.models.keyedvectors.Vocab at 0x7fb767b67e90>,
 'to': <gensim.models.keyedvectors.Vocab at 0x7fb767b7c190>,
 'and': <gensim.models.keyedvectors.Vocab at 0x7fb767b67dd0>,
 'in': <gensim.models.keyedvectors.Vocab at 0x7fb767b7c250>,
 'a': <gensim.models.keyedvectors.Vocab at 0x7fb781ca8450>,
 '"': <gensim.models.keyedvectors.Vocab at 0x7fb767b7c290>,
 "'s": <gensim.models.keyedvectors.Vocab at 0x7fb781ca8a90>,
 'for': <gensim.models.keyedvectors.Vocab at 0x7fb767b7c2d0>,
 '-': <gensim.models.keyedvectors.Vocab at 0x7fb781ca8490>,
 'that': <gensim.models.keyedvectors.Vocab at 0x7fb767b7c310>,
 'on': <gensim.models.keyedvectors.Vocab at 0x7fb767b7c090>,
 'is': <gensim.models.keyedvectors.Vocab at 0x7fb767b7c390>,
 'was': <gensim.models.keyedvectors.Vocab at 0x7fb744d2d410>,
 'said': <gensim.models

In [None]:
vectorized_docs = vectorize(tokenized_docs, model=model)
len(vectorized_docs), len(vectorized_docs[0])

HBox(children=(FloatProgress(value=0.0, max=624181.0), HTML(value='')))






(624181, 200)

## Clustering 

In [None]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score

In [None]:
def mbkmeans_clusters(
	X, 
    k, 
    mb, 
    print_silhouette_values, 
):
    """Generate clusters and print Silhouette metrics using MBKmeans

    Args:
        X: Matrix of features.
        k: Number of clusters.
        mb: Size of mini-batches.
        print_silhouette_values: Print silhouette values per cluster.

    Returns:
        Trained clustering model and labels based on X.
    """
    km = MiniBatchKMeans(n_clusters=k, batch_size=mb).fit(X)
    #print(f"For n_clusters = {k}")
    #print(f"Silhouette coefficient: {silhouette_score(X, km.labels_):0.2f}")
    #print(f"Inertia:{km.inertia_}")

    if print_silhouette_values:
        sample_silhouette_values = silhouette_samples(X, km.labels_)
        print(f"Silhouette values:")
        silhouette_values = []
        for i in range(k):
            cluster_silhouette_values = sample_silhouette_values[km.labels_ == i]
            silhouette_values.append(
                (
                    i,
                    cluster_silhouette_values.shape[0],
                    cluster_silhouette_values.mean(),
                    cluster_silhouette_values.min(),
                    cluster_silhouette_values.max(),
                )
            )
        silhouette_values = sorted(
            silhouette_values, key=lambda tup: tup[2], reverse=True
        )
        for s in silhouette_values:
            print(
                f"    Cluster {s[0]}: Size:{s[1]} | Avg:{s[2]:.2f} | Min:{s[3]:.2f} | Max: {s[4]:.2f}"
            )
    return km, km.labels_


For n_clusters = 50


KeyboardInterrupt: 

In [None]:
clustering, cluster_labels = mbkmeans_clusters(
	X=vectorized_docs,
    k=50,
    mb=500,
    print_silhouette_values=False,
)
df_clusters = pd.DataFrame({
    "text": df["text"],
    "tokens": [" ".join(text) for text in tokenized_docs],
    "cluster": cluster_labels
}) 

In [None]:
clustering = MiniBatchKMeans(n_clusters=50, batch_size=500).fit(vectorized_docs)
predictions = clustering.predict(vectorized_docs)

In [None]:
print(clustering.cluster_centers_)
np.save("cluster_centers.npy", clustering.cluster_centers_)

[[ 0.20588301  0.24067159  0.19165999 ... -0.01171241 -0.03965203
  -0.08586896]
 [ 0.16592961  0.19839012  0.03009064 ...  0.05206217 -0.00211239
   0.08112168]
 [ 0.12366415  0.22503281  0.09871089 ... -0.01294877 -0.00832571
  -0.02904039]
 ...
 [ 0.14220501 -0.00394353 -0.00162601 ...  0.14857209 -0.06222622
   0.00346095]
 [ 0.08820026  0.14914491  0.06434108 ... -0.03626029  0.01993634
  -0.00894024]
 [-0.02962896  0.09460649  0.02092578 ... -0.04177928 -0.07266515
  -0.01178901]]


In [None]:
ids = np.array(data)[:,0]
del data, tokenized_docs, model, texts
#


In [None]:
#results["cluster_centers"] = clustering.cluster_centers_
results = {}

for i,id_ in tqdm(enumerate(ids)):
    results[id_] = {}
    results[id_]["vector"] = str(list(vectorized_docs[i]))
    results[id_]["cluster"] = int(predictions[i])

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [None]:
with open('Vectors_and_clusters.json', 'w') as f:
    json.dump(results, f)

Error: Kernel is dead

In [12]:
import json
import numpy as np

s = {"1":{"vector":str(list(np.array([0.00000200304002001000001000000000000001,1],dtype=np.float32))),"pred":1}}
print(s)
with open('Vectors_and_clusters.json', 'w') as f:
    json.dump(s, f)

{'1': {'vector': '[2.00304e-06, 1.0]', 'pred': 1}}


In [None]:
print("Most representative terms per cluster (based on centroids):")
for i in range(50):
    tokens_per_cluster = ""
    most_representative = model.wv.most_similar(positive=[clustering.cluster_centers_[i]], topn=5)
    for t in most_representative:
        tokens_per_cluster += f"{t[0]} "
    print(f"Cluster {i}: {tokens_per_cluster}")

Most representative terms per cluster (based on centroids):
Cluster 0: example specific particular instance similar 
Cluster 1: example instance same this well 
Cluster 2: en que la con de 
Cluster 3: system using data use example 
Cluster 4: example instance same this use 
Cluster 5: particular specific example certain instance 
Cluster 6: any specific instance example means 
Cluster 7: und eine der über „ 


  after removing the cwd from sys.path.


Cluster 8: example instance particular similar specific 
Cluster 9: provide example instance provided use 
Cluster 10: using uses system allows use 
Cluster 11: system example provide use creating 
Cluster 12: la de et du en 
Cluster 13: using instance specific data allows 
Cluster 14: using allows available use system 
Cluster 15: specific example particular instance use 
Cluster 16: using system allows directly use 
Cluster 17: example specific i.e. particular function 
Cluster 18: example specific using uses particular 
Cluster 19: example similar usually using typically 
Cluster 20: example specific particular instance certain 
Cluster 21: example specific use using system 
Cluster 22: provide specific example system particular 
Cluster 23: focus particular well example important 
Cluster 24: specific furthermore particular certain similar 
Cluster 25: provided provide instance system use 
Cluster 26: system allows direct using directly 
Cluster 27: same example this instance simil

In [None]:
for test_cluster in [10,1]:
    print(f"-------------------{test_cluster}-------------------")
    most_representative_docs = np.argsort(
        np.linalg.norm(vectorized_docs - clustering.cluster_centers_[test_cluster], axis=1)
    )
    for d in most_representative_docs[:10]:
        print(df["text"][d])
        print("-------------")

-------------------10-------------------
Multimedia applications targeting batteryoperated wearable devices must be poweraware to exploit the capabilities of variable voltage processors. This paper presents a feedback (FB) controller for video decoding that regulates the voltage for individual frames. The decoding complexity of (parts of) individual frames is estimated using a simple frame length decoding time correlation obtained from statistics gathered on the target hardware (StrongARM processor). Experiments with a modified H.263 decoder show that the FB controller closely approaches ( 10%) the optimal case in which each frame is decoded at the minimal frequency/voltage. Furthermore, we observe that incorporating additional complexity information in the video stream will only be useful when the energy consumption of the (fixed) memory subsystem is significantly reduced.
-------------
A system for videoconferencing that offers, among other features, extremely low endtoend delay as w

In [None]:
clustering.cluster_centers_

array([[ 0.41603511,  0.02993672,  0.08477567, ...,  0.42157972,
         0.05389575,  0.08879889],
       [ 0.2547176 ,  0.12962069,  0.05133368, ...,  0.41265002,
         0.03695938,  0.02680716],
       [ 0.21775689,  0.58536683, -0.57444827, ..., -0.38275401,
         0.88557918,  1.01202863],
       ...,
       [ 0.32258523,  0.02653912,  0.13975534, ...,  0.44815915,
         0.07009276,  0.15324835],
       [ 0.39768368, -0.00493577,  0.35454605, ...,  0.29803471,
         0.02942713,  0.12025491],
       [ 0.38876871, -0.02372196,  0.03457274, ...,  0.50404936,
         0.14686806,  0.10216383]])

In [None]:
np.save("Clusters.npy",clustering.cluster_centers_)

In [None]:
predictions = clustering.predict(vectorized_docs)

In [None]:
df_clusters = pd.DataFrame()
df_clusters["id"] = data[:,0]
df_clusters["prediction"] = predictions
pd.DataFrame(df_clusters).to_csv("Clusters.csv", index=False)

In [7]:
from json import JSONDecodeError
for text in texts:
    try:
        j = get_json(text)
    except JSONDecodeError:
        print(text)
        break
    if j['IndexLength']==0:
        print(j)

KeyboardInterrupt: 

In [8]:
files_str = [get_descritpion(file_str) for file_str in tqdm(texts)]

HBox(children=(FloatProgress(value=0.0, max=624181.0), HTML(value='')))




In [None]:
model = Word2Vec(sentences=files_str,size=100, window=5, min_count=1, workers=4)

In [13]:
model.train(sentences=files_str, total_examples=len(files_str), epochs=1)

TypeError: train() got an unexpected keyword argument 'min_count'

In [36]:
help(Word2Vec)

Help on class Word2Vec in module gensim.models.word2vec:

class Word2Vec(gensim.models.base_any2vec.BaseWordEmbeddingsModel)
 |  Word2Vec(sentences=None, corpus_file=None, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=<built-in function hash>, iter=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=10000, compute_loss=False, callbacks=(), max_final_vocab=None)
 |  
 |  Train, use and evaluate neural networks described in https://code.google.com/p/word2vec/.
 |  
 |  Once you're finished training a model (=no more updates, only querying)
 |  store and use only the :class:`~gensim.models.keyedvectors.KeyedVectors` instance in `self.wv` to reduce memory.
 |  
 |  The model can be stored/loaded via its :meth:`~gensim.models.word2vec.Word2Vec.save` and
 |  :meth:`~gensim.models.word2vec.Word2Vec.load` methods.
 |  
 |  The trained word vectors can a

In [None]:
model.save("word2vecPaperDescription.model")

# Autres

In [None]:
import os
import pandas as pd
import numpy as np
import networkx as nx
from sklearn.linear_model import Lasso

In [None]:
os.getcwd()

'/Users/maximebonnin/Notebooks/3A Notebook/INF554/INF554_Kaggle_Project/Exploration'

In [None]:
os.chdir("/Users/maximebonnin/Notebooks/3A Notebook/INF554/INF554_Kaggle_Project/")

In [None]:
# read training data
df_train = pd.read_csv('data/train.csv', dtype={'author': np.int64, 'hindex': np.float32})
n_train = df_train.shape[0]

# read test data
df_test = pd.read_csv('data/test.csv', dtype={'author': np.int64})
n_test = df_test.shape[0]

# load the graph    
G = nx.read_edgelist('data/coauthorship.edgelist', delimiter=' ', nodetype=int)
n_nodes = G.number_of_nodes()
n_edges = G.number_of_edges() 
print('Number of nodes:', n_nodes)
print('Number of edges:', n_edges)


# computes structural features for each node
core_number = nx.core_number(G)

# create the training matrix. each node is represented as a vector of 3 features:
# (1) its degree, (2) its core number 
X_train = np.zeros((n_train, 2))
y_train = np.zeros(n_train)
for i,row in df_train.iterrows():
    node = row['author']
    X_train[i,0] = G.degree(node)
    X_train[i,1] = core_number[node]
    y_train[i] = row['hindex']

# create the test matrix. each node is represented as a vector of 3 features:
# (1) its degree, (2) its core number
X_test = np.zeros((n_test, 2))
for i,row in df_test.iterrows():
    node = row['author']
    X_test[i,0] = G.degree(node)
    X_test[i,1] = core_number[node]
    
# train a regression model and make predictions
reg = Lasso(alpha=0.1)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

# write the predictions to file
df_test['hindex'] = pd.Series(np.round_(y_pred, decimals=3))


df_test.loc[:,["author","hindex"]].to_csv('submission.csv', index=False)







Number of nodes: 217801
Number of edges: 1718164


In [None]:
nx.draw_networkx(G, node_size=10)
plt.title("Raw graph")
plt.show()

# Texts manipulation

with open("data/abstracts.txt") as 

In [None]:
print(get_id(texts[1000]))
print(get_json(texts[1000]))
get_descritpion(file_str)

8289804
{'IndexLength': 82, 'InvertedIndex': {'For': [0], 'the': [1, 48, 51, 73], 'feature': [2], 'analysis': [3], 'of': [4, 47, 50, 65, 71], 'vector': [5, 11, 23, 30, 59], 'fields': [6, 60], 'we': [7], 'decompose': [8], 'a': [9, 16, 18, 21, 29, 33, 79], 'given': [10], 'field': [12, 31], 'into': [13], 'three': [14], 'components:': [15], 'divergence-free,': [17], 'rotation-free,': [19], 'and': [20, 36, 42, 63, 78], 'harmonic': [22], 'field.': [24], 'This': [25], 'Hodge-type': [26], 'decomposition': [27], 'splits': [28], 'using': [32], 'variational': [34], 'approach,': [35], 'allows': [37], 'to': [38, 56, 75], 'locate': [39], 'sources,': [40], 'sinks,': [41], 'vortices': [43], 'as': [44], 'extremal': [45], 'points': [46], 'potentials': [49], 'components.': [52], 'Our': [53], 'method': [54, 74], 'applies': [55], 'discrete': [57], 'tangential': [58], 'on': [61], 'surfaces,': [62], 'is': [64], 'global': [66], 'nature.': [67], 'Results': [68], 'are': [69], 'presented': [70], 'applying': [72]

['In',
 'this',
 'paper,',
 'we',
 'describe',
 'a',
 'new',
 'bitmap',
 'indexing',
 'technique',
 'to',
 'cluster',
 'XML',
 'documents.',
 'XML',
 'is',
 'a',
 'new',
 'standard',
 'for',
 'exchanging',
 'and',
 'representing',
 'information',
 'on',
 'the',
 'Internet.',
 'Documents',
 'can',
 'be',
 'hierarchically',
 'represented',
 'by',
 'XML-elements.',
 'XML',
 'documents',
 'are',
 'represented',
 'and',
 'indexed',
 'using',
 'a',
 'bitmap',
 'indexing',
 'technique.',
 'We',
 'define',
 'the',
 'similarity',
 'and',
 'popularity',
 'operations',
 'available',
 'in',
 'bitmap',
 'indexes',
 'and',
 'propose',
 'a',
 'method',
 'for',
 'partitioning',
 'a',
 'XML',
 'document',
 'set.',
 'Furthermore,',
 'a',
 '2-dimensional',
 'bitmap',
 'index',
 'is',
 'extended',
 'to',
 'a',
 '3dimensional',
 'bitmap',
 'index,',
 'called',
 'BitCube.',
 'We',
 'define',
 'statistical',
 'measurements',
 'in',
 'the',
 'BitCube:',
 'mean,',
 'mode,',
 'standard',
 'derivation,',
 'and',

In [None]:
model = Word2Vec(size=100, window=5, min_count=1, workers=4)

In [None]:
files_str = [get_descritpion(file_str) for file_str in texts]
    

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
from gensim.test.utils import common_texts

In [None]:
common_texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]