In [55]:
import pandas as pd
from pyelasticsearch.client import ElasticSearch
import requests, json

In [44]:
# Read both spreadsheets in one pass: training set first, then test set.
# Later cells read the 'Label' and 'Tweets' columns of each frame.
df_train, df_test = (
    pd.read_excel(path)
    for path in (
        'data/overall-tf-idf.xlsx',    # Your training dataset here
        'data/overall-test-bsn.xlsx',  # Your test set here
    )
)

In [45]:
# Transposing before to_dict() keys the result by row index:
# {row_index: {column_name: value}}.
train_dict = df_train.T.to_dict()
test_dict = df_test.T.to_dict()

In [46]:
def _rows_to_documents(rows):
    """Turn a ``{row_id: row}`` mapping into a list of document dicts.

    Each ``row`` must provide the 'Label' and 'Tweets' keys. The resulting
    documents carry 'id' (the row id), 'label' and 'tweet', in the iteration
    order of ``rows``.
    """
    return [
        {'id': row_id, 'label': row['Label'], 'tweet': row['Tweets']}
        for row_id, row in rows.items()
    ]


# Same conversion for both data sets; only the source mapping differs.
list_of_train_document = _rows_to_documents(train_dict)
list_of_test_document = _rows_to_documents(test_dict)

In [59]:
# Index your document in ES
def index_documents_in_ES(index, documents):
    """Index every document into the given Elasticsearch index.

    Parameters
    ----------
    index : str
        Name of the target index.
    documents : iterable of dict
        Documents to store. Each must carry an 'id' key, which is used as the
        document id; all of them are stored under doc_type 'tweet'.

    Every response that does not report a truthy ``created`` field is printed
    so it can be inspected; nothing is returned.
    """
    es = ElasticSearch()
    for document in documents:
        res = es.index(index=index, doc_type='tweet', id=document['id'], doc=document)
        # .get() instead of ['created']: a response lacking that field is
        # reported like any other non-creation instead of aborting the whole
        # run with a KeyError.
        if not res.get('created'):
            # Call form of print so the cell runs on both Python 2 and 3.
            print(res)

# Build your more like this query
def build_mlt(nb, doc_id):
    """Return the search body for a ``more_like_this`` query.

    nb     -- value placed in "size" (number of hits requested, from offset 0).
    doc_id -- ``_id`` of the reference document, addressed in index "test"
              with type "tweet"; similarity is computed on the "tweet" field.
    """
    reference_doc = {"_index" : "test", "_type" : "tweet", "_id" : doc_id}
    more_like_this = {
        "fields": ["tweet"],
        "like": [reference_doc],
        "min_term_freq": 1,
        "max_query_terms": 50,
        "minimum_should_match": "25%",
    }
    return {
        "from": 0,
        "size": nb,
        "query": {"more_like_this": more_like_this},
    }


# Extract as a list the result from Elasticsearch
def extract_from_json(json):
    """Return the ``_source`` of every hit in a decoded search response.

    ``json`` is the already-parsed response body; hits are read from
    ``json['hits']['hits']`` and their sources returned in the same order.
    """
    sources = []
    for entry in json['hits']['hits']:
        sources.append(entry['_source'])
    return sources

# TODO: this function needs to be changed in order to be more versatile
def get_max_label(documents):
    """Return the majority label among ``documents``.

    Each document's 'label' is tallied as 'sarcasm', 'normal', or -- for any
    other value -- bullying. The result is one of 'bully', 'sarcasm' or
    'normal'.

    Ties are broken in the fixed order bully, sarcasm, normal; consequently an
    empty ``documents`` (all counts zero) yields 'bully'.
    """
    tally = {'bully': 0, 'sarcasm': 0, 'normal': 0}
    for doc in documents:
        tag = doc['label']
        # Every label other than the two known ones counts as bullying.
        bucket = tag if tag in ('sarcasm', 'normal') else 'bully'
        tally[bucket] += 1

    top = max(tally.values())
    # First match wins, which encodes the tie-break priority.
    for name in ('bully', 'sarcasm', 'normal'):
        if tally[name] == top:
            return name
        
# Send the request to elasticsearch and extract the result
def get_similar(nb, doc_id):
    """Return the sources of up to ``nb`` hits similar to document ``doc_id``.

    Builds the query body with ``build_mlt``, POSTs it as JSON to the local
    ``train/tweet/_search`` endpoint and returns what ``extract_from_json``
    extracts from the decoded response.

    Raises whatever ``response.raise_for_status()`` raises when the server
    answers with an HTTP error status.
    """
    mlt = build_mlt(nb, doc_id)
    response = requests.post(
        "http://localhost:9200/train/tweet/_search",
        data=json.dumps(mlt),
        # Declare the body type explicitly; Elasticsearch 6+ rejects request
        # bodies sent without a JSON Content-Type.
        headers={"Content-Type": "application/json"},
    )
    # Surface HTTP errors here instead of a later KeyError on 'hits'.
    response.raise_for_status()
    return extract_from_json(json.loads(response.text))

In [None]:
index_documents_in_ES('train', list_of_train_document)
index_documents_in_ES('test', list_of_test_document)

# Freshly indexed documents are only searchable after an index refresh; force
# one so the similarity queries below run against the complete training set.
ElasticSearch().refresh(['train', 'test'])

NB_NEIGHBOURS = 25  # how many similar training tweets vote on each label

# Annotate every test document in place with the majority label of its
# most similar training tweets.
for document in list_of_test_document:
    similar_documents = get_similar(NB_NEIGHBOURS, document['id'])
    label = get_max_label(similar_documents)
    document['auto_label'] = label

In [62]:
# Key every test document by its id; a later duplicate id replaces an earlier one.
classifier_result = dict(
    (doc['id'], doc) for doc in list_of_test_document
)

In [64]:
# A dict of dicts becomes one column per document id; transpose so each row
# is a document before writing the CSV. The transpose is computed once -- the
# previous bare `df_result.transpose()` line discarded its result.
df_result = pd.DataFrame(classifier_result)
df_result.transpose().to_csv('result.csv', encoding='utf8')