# Get BM25 recommendation

In [1]:
import logging
logging.basicConfig(filename='error.log', level=logging.ERROR)
from datetime import datetime
FILES = [
    "grandmaster_nl_pl_only_plot.json",
    "master_nl_pl_only_plot.json",
    "expert_nl_pl_only_plot.json",
]

CLASS_NAME = {
    "grandmaster": "GrandMasterCode",
    "master": "MasterCode",
    "expert": "ExpertCode",
}


Weaviate Instance

In [None]:
import json
from json import loads
import weaviate
from elasticsearch import Elasticsearch


weaviate_client = weaviate.Client("http://202.151.177.149:81")  # Replace with your endpoint
some_objects = weaviate_client.data_object.get()
if (json.dumps(some_objects)):
    print(True)
else:
    print(False)

print(json.dumps(some_objects))


elastic_client = Elasticsearch("http://202.151.177.154:9200")

response = str(elastic_client.info())
print(response)

In [3]:
def getMLRecommendation(text: str, target_class: str) -> dict:
    # md_text = obj['markdown']
    # cur_class = "grandmaster"
    near_text = {"concepts": [text]}
    fetched = (weaviate_client.query
                      .get(CLASS_NAME[target_class], ["code"])
                      .with_near_text(near_text)
                      .with_limit(1)
                      .do()
                      )
    data = fetched['data']['Get'][CLASS_NAME[target_class]]
    return data[0]

def getElasticQuery(query, type=["base", "processed"]):
    if (type == "processed"):
        result = {
        "query": {
            "match": {
                "processed": {
                    "query": query
                }
            }
        }
    }
    else:
        result = {
        "query": {
            "match": {
                "markdown": {
                    "query": query
                }
            }
        }
    }
    return result 

def getElasticRecommendation(index, queryBody):
    response = elastic_client.search(index=index, body=queryBody)

    if response and response["hits"]["hits"]:
        result = response["hits"]["hits"][0]["_source"]
        return result['code']
    return []

Load recommendation result into JSON files

In [20]:
for file_name in FILES:
    with open(f'data/{file_name}', 'r') as file:
        data = json.load(file)
        data_rank = file_name[:-21]

        data_length = len(data)
        testing_accumulate = []
        count = 0
        for row in data:
            markdown = "".join(row['markdown'])
            
            if (not markdown):
                count =  count + 1
        print(f'Total Empty Markdown in rank {data_rank} = {count}')

Total Empty Markdown in rank grandmaster = 22
Total Empty Markdown in rank master = 116
Total Empty Markdown in rank expert = 528


In [None]:
for file_name in FILES:
    with open(f'data/{file_name}', 'r') as file:
        data = json.load(file)
        data_rank = file_name[:-21]

        data_length = len(data)
        testing_accumulate = []
        count = 0
        for row in data:
            markdown = "".join(row['markdown'])
            
            try:
                temp_result = {
                    "original_md": markdown,
                    "original_code": row['code'],
                    "count": count,
                    "data_rank": data_rank
                }

                # Get ML Recommendation
                recommended_code_ml = getMLRecommendation(markdown, data_rank)
                temp_result["recommended_code_ml"] = recommended_code_ml['code']
            
                # Get Elastic Recommendation
                query_base = getElasticQuery(markdown)
                query_processed = getElasticQuery(markdown, "processed")

                recommended_code_es_base = getElasticRecommendation(data_rank, query_base)
                recommended_code_es_processed = getElasticRecommendation(data_rank, query_processed)

                temp_result['recommended_code_es_base'] = recommended_code_es_base
                temp_result['recommended_code_es_processed'] = recommended_code_es_processed

                # Append to the collection
                testing_accumulate.append(temp_result)

                print(f'Count: {count}/{data_length}')
                
            except Exception as e:
                print(f'[{datetime.now().strftime("%d/%m/%y %H:%M:%S")}] Skipping item #{count} due to an error.')
                logging.error(f"Error occurred for item: {markdown}\nError message: {str(e)}\n>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n")
                continue
            count =  count + 1
        
        with open(f"recommendation result/{data_rank}_result.json", "w") as file:
            json.dump(testing_accumulate, file)
            print(f"successfully write to: {data_rank}_result.json ")

Comparing original with recommended ones

In [21]:
FILE_NAMES = [
    "grandmaster_result.json",
    "master_result.json",
    "expert_result.json",
]

for name in FILE_NAMES:
    path = f"recommendation result/{name}"
    data_rank = name[:-12]

    with open(path, 'r') as file:
        original_file_path = open(f'data/{data_rank}_nl_pl_only_plot.json')
        original_data = json.load(original_file_path)

        data = json.load(file)

        print(f"rank {data_rank}: {len(data)} items || original : {len(original_data)} items")

  original_file_path = open(f'data/{data_rank}_nl_pl_only_plot.json')
  original_file_path = open(f'data/{data_rank}_nl_pl_only_plot.json')


rank grandmaster: 2670 items || original : 2670 items
rank master: 4098 items || original : 4098 items


  original_file_path = open(f'data/{data_rank}_nl_pl_only_plot.json')


rank expert: 10851 items || original : 10855 items


In [44]:
output = {}
metadata = {
    
}
for name in FILE_NAMES:
    path = f"recommendation result/{name}"
    data_rank = name[:-12]

    with open(path, 'r') as file:
        original_file_path = open(f'data/{data_rank}_nl_pl_only_plot.json')
        original_data = json.load(original_file_path)
        data = json.load(file)
        print(f"rank {data_rank}: {len(data)} items || original : {len(original_data)} items")

        comparison_collection = {
            "ml": 0,
            "es": 0,
            "es-processed": 0,
        }
        incorrect_summarize = {
            # "ml": 0,
            "es": 0,
            # "es-processed": 0,
        }

        incorrect_pairs = []
        incorrect = {

        }
        i = 0
        count_empty = 0
        for item in data:
            if (not item['original_md']):
                count_empty = count_empty + 1
                continue

            original_code = item['original_code']
            ml_code = item['recommended_code_ml']
            es_base_code = item['recommended_code_es_base']
            es_processed_code = item['recommended_code_es_processed']

            if (item['original_code'] == item['recommended_code_ml']):
                comparison_collection["ml"] = comparison_collection["ml"] + 1

            if (item['original_code'] == item['recommended_code_es_processed']):
                comparison_collection["es-processed"] = comparison_collection["es-processed"] + 1
                
            if (item['original_code'] == item['recommended_code_es_base']):
                comparison_collection["es"] = comparison_collection["es"] + 1
                
            if (item['original_code'] != item['recommended_code_es_base']):
                incorrect_summarize["es"] = incorrect_summarize["es"] + 1
                incorrect_pairs.append({
                    "originalMarkdown": item['original_md'],
                    "originalCode": item['original_code'],
                    "bm25Code": item['recommended_code_es_base']
                })
        
        print(f"Skipped {count_empty} empty Markdown items in rank {data_rank}")
        
        output[data_rank] = {
            "correct_pairs_summarize": comparison_collection,
            "incorrect_summarize": {
                "es": len(data)-count_empty-comparison_collection["es"]
            },
            "incorrect_pairs_items": incorrect_pairs,
        }

        metadata[data_rank] = {
            "total_items": len(data),
            "total_skipped_empty_markdown": count_empty,
            "total_es_correct_pairs_": comparison_collection["es"],
            "total_es_incorrect_pairs_": incorrect_summarize["es"]
        }
        
with open(f"comparison_result.json", "w") as file:
    output = json.dump(output, file)
    print(f'successfully write to /result/comparison_result.json')
print("Done!!")

print(f"""
Summarize:

Rank Grandmaster
- Total items: {metadata['grandmaster']["total_items"]}
- Total skipped empty Markdown: {metadata['grandmaster']["total_skipped_empty_markdown"]}
# Below is only for Elasticsearch base approach #
- Total correct original-recommended pairs: {metadata['grandmaster']["total_es_correct_pairs_"]}
- Total incorrect original-recommended pairs: {metadata['grandmaster']["total_es_incorrect_pairs_"]}

Rank Master
- Total items: {metadata['master']["total_items"]}
- Total skipped empty Markdown: {metadata['master']["total_skipped_empty_markdown"]}
# Below is only for Elasticsearch base approach #
- Total correct original-recommended pairs: {metadata['master']["total_es_correct_pairs_"]}
- Total incorrect original-recommended pairs: {metadata['master']["total_es_incorrect_pairs_"]}

Rank Expert
- Total items: {metadata['expert']["total_items"]}
- Total skipped empty Markdown: {metadata['expert']["total_skipped_empty_markdown"]}
# Below is only for Elasticsearch base approach #
- Total correct original-recommended pairs: {metadata['expert']["total_es_correct_pairs_"]}
- Total incorrect original-recommended pairs: {metadata['expert']["total_es_incorrect_pairs_"]}
""")



  original_file_path = open(f'data/{data_rank}_nl_pl_only_plot.json')


rank grandmaster: 2670 items || original : 2670 items
Skipped 22 empty Markdown items in rank grandmaster
rank master: 4098 items || original : 4098 items
Skipped 116 empty Markdown items in rank master


  original_file_path = open(f'data/{data_rank}_nl_pl_only_plot.json')
  original_file_path = open(f'data/{data_rank}_nl_pl_only_plot.json')


rank expert: 10851 items || original : 10855 items
Skipped 528 empty Markdown items in rank expert
successfully write to /result/comparison_result.json
Done!!

Summarize:

Rank Grandmaster
- Total items: 2670
- Total skipped empty Markdown: 22
# Below is only for Elasticsearch base approach #
- Total correct original-recommended pairs: 2512
- Total incorrect original-recommended pairs: 136

Rank Master
- Total items: 4098
- Total skipped empty Markdown: 116
# Below is only for Elasticsearch base approach #
- Total correct original-recommended pairs: 3572
- Total incorrect original-recommended pairs: 410

Rank Expert
- Total items: 10851
- Total skipped empty Markdown: 528
# Below is only for Elasticsearch base approach #
- Total correct original-recommended pairs: 9187
- Total incorrect original-recommended pairs: 1136



Filter and take some sample of data for analysis

In [45]:
import random 
import json
RANKS = ['grandmaster', 'master', 'expert']
MAX_SAMPLE = 30

with open("comparison_result.json", "r", encoding="utf-8") as f:
    data = json.load(f)
    for rank in RANKS:
        incorrect_pairs = data[rank]['incorrect_pairs_items']
        with open(f'./analysis-result/{rank}_analysis.txt', 'w', encoding="utf-8") as file:
            random_range = list(range(0, len(incorrect_pairs)))
            random.shuffle(random_range)

            i = 0
            while i<MAX_SAMPLE:
                if (i>=MAX_SAMPLE):
                    break
                file.write(f"ITEM #{i+1}\n\n")
                file.write(f"ORIGINAL MARKDOWN:\n{'-'*99}\n")
                file.write(str(incorrect_pairs[i]['originalMarkdown']))
                file.write("\n================================================================\n")
                file.write("RECOMMENDED CODE:\n----------------------------------------------------\n")
                file.write(str(incorrect_pairs[i]['bm25Code']))
                file.write("\n================================================================\n")
                file.write("ORIGINAL CODE:\n----------------------------------------------------\n")
                file.write(str(incorrect_pairs[i]['originalCode']))
                file.write("\n" + "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n"*3 + "\n")

                i = i+1

    