In [5]:
# imports
import os
import json
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from elasticsearch import helpers
import numpy as np
import pandas as pd
import nbconvert
import nbformat
import zipfile
import csv
import time

In [2]:
# connect to local elastic search host
HOST = 'http://localhost:9200/'
es = Elasticsearch(hosts=[HOST])

# global vars
INDEX="notebookindex"
TYPE= "notebook"
COLUMNS = ['nb_id', 'html_url', 'name', 'language', 'markdown', 'comments', 'code']

In [113]:
# Initialise the elastic search index
init_md_index = {
    "settings" : {
        "number_of_shards": 5,
        "number_of_replicas": 1,
        "analysis": {
            "analyzer": {
                "text_analyzer": {
                    "type": "standard",
                    "stopwords": "_english_"
                }
            }
        }
    },
    'mappings': {
            'properties': {
                'nb_id': {'type': 'integer'},
                'markdown': {'type': 'text', 
                             "analyzer":"text_analyzer"
                            },
        }
    }
}
es.indices.create(index = 'markdown', body = init_md_index)

{'acknowledged': True, 'index': 'markdown', 'shards_acknowledged': True}

In [117]:
# Initialise the elastic search index
init_comments_index = {
    "settings" : {
        "number_of_shards": 5,
        "number_of_replicas": 1,
        "analysis": {
            "analyzer": {
                "text_analyzer": {
                    "type": "standard",
                    "stopwords": "_english_"
                }
            }
        }
    },
    'mappings': {
            'properties': {
                'nb_id': {'type': 'integer'},
                'comments': {'type': 'text', 
                             "analyzer":"text_analyzer"
                            },
        }
    }
}
es.indices.create(index = 'comments', body = init_comments_index)

{'acknowledged': True, 'index': 'comments', 'shards_acknowledged': True}

In [98]:
# Initialise the elastic search index
init_info_index = {
    "settings" : {
        "number_of_shards": 5,
        "number_of_replicas": 1,
    },
    'mappings': {
            'properties': {
                'nb_id': {'type': 'integer'},
                'html_url': {'type': 'keyword'},
                'name': {'type': 'keyword'},
                'language': {'type': 'keyword'},
        }
    }
}
es.indices.create(index = 'info', body = init_info_index)

{'acknowledged': True, 'index': 'info', 'shards_acknowledged': True}

In [53]:
# tes_df = pd.read_csv('df_bb2733859v_2_1_old.csv') # DONE
#tes_df = pd.read_pickle('df_bb2733859v_2_1.pkl') # DONE
#tes_df = pd.read_pickle('bb2733859v_3_1_new.pkl') # DONE
#tes_df = pd.read_pickle('bb2733859v_4_1(1)_new.pkl') # DONE
#tes_df = pd.read_pickle('bb2733859v_5_1(1)_new.pkl') # DONE
tes_df = pd.read_pickle('bb2733859v_6_1(1)_new.pkl') # DONE
#tes_df = pd.read_pickle('bb2733859v_7_1(1)_new.pkl') # DONE
hes_df = tes_df.head()

In [23]:
es_df = tes_df

def gen(es_df):
    for i, row in es_df.iterrows():
        t = {
            "_id": row['nb_id'],
            "html_url": row['html_url'],
            'name': row['name'],
            'language': row['language']
            }
        yield t

errors = []
for ok, action in helpers.parallel_bulk(client=es, index="info", actions=gen(es_df)):
    if not ok:
        errors.append(action)

In [54]:
es_df = tes_df
s = 'markdown'
def gen_md(es_df):
    for i, row in es_df.iterrows():
        t = {
            "_id": row['nb_id'],
            s: row[s]
                }
        yield t

errors = []
for ok, action in helpers.parallel_bulk(client=es, index=s, actions=gen_md(es_df)):
    if not ok:
        errors.append(action)

In [25]:
es_df = tes_df
s = 'comments' #'comments' 'markdown'
def gen_md(es_df):
    for i, row in es_df.iterrows():
        t = {
            "_id": row['nb_id'],
            s: row[s]
            }
        yield t

errors = []
for ok, action in helpers.parallel_bulk(client=es, index=s, actions=gen_md(es_df)):
    if not ok:
        errors.append(action)

In [97]:
es.search(body={"query": {"match_all": {}}}, index = INDEX)

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 5, 'total': 5},
 'hits': {'hits': [],
  'max_score': None,
  'total': {'relation': 'eq', 'value': 0}},
 'timed_out': False,
 'took': 0}

In [8]:
es.ping()

[]


In [52]:
broken = '1250687'
es_df = tes_df
test = es_df['nb_id'] == broken
print(test)

0         False
1         False
2         False
3         False
4         False
          ...  
258184    False
258185    False
258186    False
258187    False
258188    False
Name: nb_id, Length: 258189, dtype: bool


In [None]:
# old non bulk
es_df = tes_df.head()

for i, row in es_df.iterrows():
    t = {
        "html_url": row['html_url'],
        'name': row['name'],
        'language': row['language']
        }
    res = es.index(index="info",id=row['nb_id'], body=t)
    if res['result'] != 'created':
        print(res)