In [1]:
!pip install  langchain
!pip install  langchain_openai
!pip install  langchain_neo4j
!pip install  fastapi[all]
!pip install  requests
!pip install  python-dotenv
!pip install  langchain_community
!pip install  jq
!pip install  certainty-estimator
!pip install  transformers

Collecting langchain
  Downloading langchain-0.3.13-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.26 (from langchain)
  Downloading langchain_core-0.3.28-py3-none-any.whl.metadata (6.3 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.3 (from langchain)
  Downloading langchain_text_splitters-0.3.4-py3-none-any.whl.metadata (2.3 kB)
Collecting langsmith<0.3,>=0.1.17 (from langchain)
  Downloading langsmith-0.2.6-py3-none-any.whl.metadata (14 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.4.0,>=0.3.26->langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting httpx<1,>=0.23.0 (from langsmith<0.3,>=0.1.17->langchain)
  Downloading httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting orjson<4.0.0,>=3.9.14 (from langsmith<0.3,>=0.1.17->langchain)
  Downloading orjson-3.10.13-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import numpy as np
import pandas as pd
from langchain_openai import OpenAIEmbeddings
from langchain_neo4j import Neo4jGraph
from langchain.text_splitter import CharacterTextSplitter
from certainty_estimator.predict_certainty import CertaintyEstimator
from dotenv import load_dotenv
from neo4j import GraphDatabase
import os
import re
import json

In [3]:
# Connection settings come from the environment so that no secret is stored in
# the notebook. Set NEO4J_URI / NEO4J_USERNAME / NEO4J_PASSWORD before running
# this cell (or keep them in a .env file and call load_dotenv() first).
# SECURITY: the password that used to be hardcoded here was published together
# with the notebook and should be rotated.
NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j+s://89193cc4.databases.neo4j.io")
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "")
if not NEO4J_PASSWORD:
    # Fail loudly here instead of with an opaque authentication error later.
    print("NEO4J_PASSWORD is not set; connecting to Neo4j will fail until it is provided.")
# OPENAI_API_KEY=""

In [4]:
# Node labels of the review graph. REVIEW_NODE, RATING_NODE, CONFIDENCE_NODE,
# RATING_SCORE_NODE and CONFIDENCE_SCORE_NODE are interpolated into the Cypher
# built by add_reviews.
# NOTE(review): META_REVIEW_NODE is not referenced in the visible cells — confirm it is still needed.
META_REVIEW_NODE = "MetaReview"
REVIEW_NODE = "Review"
RATING_NODE = "Rating"
CONFIDENCE_NODE = "Confidence"
RATING_SCORE_NODE = "RatingScore"
CONFIDENCE_SCORE_NODE = "ConfidenceScore"

# Relationship type names.
# NOTE(review): add_reviews spells HAS_RATING / HAS_CONFIDENCE out literally in
# its query instead of reading these constants — keep the two in sync.
CONTAINS_RELATIONSHIP = "CONTAINS"
HAS_RATING_RELATIONSHIP = "HAS_RATING"
HAS_CONFIDENCE_RELATIONSHIP = "HAS_CONFIDENCE"

In [5]:
# Neo4j driver; add_reviews opens its sessions from this object.
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

# LangChain graph wrapper pointed at the same URI with the same credentials.
graph = Neo4jGraph(url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD)
# Text splitter configured with chunk_size=1000 and no overlap between chunks.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
# embeddings = OpenAIEmbeddings()
# Sentence-level certainty model; add_reviews calls estimator.predict on each cleaned review.
# NOTE(review): cuda=True presumably requires a GPU runtime — confirm before running on a CPU-only kernel.
estimator = CertaintyEstimator('sentence-level',cuda=True)



tokenizer_config.json:   0%|          | 0.00/423 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/712 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [6]:
from pydantic import BaseModel
from typing import List, Optional

class Review(BaseModel):
    """One review entry: the review text plus its rating and confidence strings."""
    review: str  # review body; add_reviews strips special characters before storing it
    rating: str  # rating text; add_reviews takes the first number in it as the score
    confidence: str  # confidence text; add_reviews takes the first number in it as the score

class MetaReview(BaseModel):
    """An identifier together with its list of reviews; the input type of add_reviews."""
    id: str  # written as paper_id on every Review node created by add_reviews
    reviews: List[Review]  # one graph write per entry

class Rating(BaseModel):
    """A rating string paired with an identifier.

    NOTE(review): not referenced in the visible cells — confirm it is still needed.
    """
    rating: str
    id: str

class Confidence(BaseModel):
    """A confidence string paired with an identifier.

    NOTE(review): not referenced in the visible cells — confirm it is still needed.
    """
    confidence: str
    id: str

class MetaReviewRequest(BaseModel):
    """Request payload: an id, an optional meta-review text, and a list of reviews.

    ``metaReview`` defaults to ``None``. Under Pydantic v2 an ``Optional[str]``
    annotation without a default is still a *required* field (it merely accepts
    ``None``), so payloads that omit the key would fail validation.
    """
    id: str
    metaReview: Optional[str] = None
    reviews: List[Review]

class ReviewRequest(BaseModel):
    """Single-review payload; declares the same three string fields as Review.

    NOTE(review): not referenced in the visible cells — confirm it is still needed.
    """
    review: str
    rating: str
    confidence: str

In [7]:
def clean_text(text: str) -> str:
    """Keep only ASCII letters, digits and spaces; drop everything else.

    Dropped characters are deleted, not replaced, so punctuation, newlines and
    accented letters disappear and the words around them may be joined.
    """
    disallowed = re.compile(r'[^A-Za-z0-9 ]')
    return disallowed.sub('', text)

def extract_first_number(text: str) -> Optional[int]:
    """Return the first run of decimal digits in ``text`` as an int.

    Args:
        text: string to scan, e.g. a rating such as "7: Good paper".

    Returns:
        The integer value of the first digit run, or ``None`` when ``text``
        contains no digit. Signs and decimal points are not interpreted:
        "-3.5" yields 3.
    """
    found = re.search(r'\d+', text)
    if found is None:
        return None
    return int(found.group())

def add_reviews(meta_review_request: MetaReview):
    """Write every review of one request to Neo4j.

    For each review this creates a Review node (paper id, cleaned text and
    certainty score), a Rating node and a Confidence node, links them with
    HAS_RATING / HAS_CONFIDENCE, and MERGEs RatingScore / ConfidenceScore nodes
    keyed by the first number found in the raw rating and confidence strings.

    Values are sent as query parameters rather than formatted into the Cypher
    text, so an id containing a quote can neither break nor alter the
    statement. Only the node labels (module constants) are interpolated,
    because Cypher does not accept labels as parameters.

    Args:
        meta_review_request: the id and the list of reviews to store.

    Returns:
        dict: ``{"reviews_count": <number of reviews in the request>}``.

    Raises:
        ValueError: when a rating or confidence string contains no digit, so
            no score can be derived for the MERGE.
        Exception: anything raised by the certainty estimator or the Neo4j
            driver is propagated unchanged.
    """
    # The statement is the same for every review; only the parameters change.
    review_query = f"""
    CREATE (review:{REVIEW_NODE} {{paper_id: $paper_id, review: $review, certainty: $certainty}})
    CREATE (rating:{RATING_NODE} {{rating: $rating}})
    CREATE (confidence:{CONFIDENCE_NODE} {{confidence: $confidence}})
    MERGE (rating_score:{RATING_SCORE_NODE} {{value: $rating_score}})
    MERGE (confidence_score:{CONFIDENCE_SCORE_NODE} {{value: $confidence_score}})
    CREATE (review)-[:HAS_RATING]->(rating)
    CREATE (review)-[:HAS_CONFIDENCE]->(confidence)
    MERGE (rating)-[:HAS_RATING_SCORE]->(rating_score)
    MERGE (confidence)-[:HAS_CONFIDENCE_SCORE]->(confidence_score)
    """

    with driver.session() as session:
        for review_data in meta_review_request.reviews:
            # Scores are taken from the raw strings, before cleaning.
            rating_score = extract_first_number(review_data.rating)
            confidence_score = extract_first_number(review_data.confidence)
            if rating_score is None or confidence_score is None:
                # Previously a missing score was rendered as the Python literal
                # `None` inside the query text, which is not valid Cypher.
                raise ValueError(
                    "Review has no numeric rating/confidence: "
                    f"rating={review_data.rating!r}, confidence={review_data.confidence!r}"
                )

            cleaned_review = clean_text(review_data.review)
            # Cast so the driver receives a plain Python float
            # (the estimator may return a numpy scalar — TODO confirm).
            certainty_score = float(estimator.predict(cleaned_review)[0])

            params = {
                "paper_id": meta_review_request.id,
                "review": cleaned_review,
                "certainty": certainty_score,
                "rating": clean_text(review_data.rating),
                "confidence": clean_text(review_data.confidence),
                "rating_score": rating_score,
                "confidence_score": confidence_score,
            }
            # consume() waits for the write to finish so that a server-side
            # error is raised here, for the review that caused it.
            session.run(review_query, params).consume()

        return {"reviews_count": len(meta_review_request.reviews)}


In [8]:
def _percentage(part, whole):
    """Return ``part`` as a percentage of ``whole``, or 0 when ``whole`` is 0."""
    return (part / whole) * 100 if whole > 0 else 0


def process_dataset_files(filepaths, num_files):
    """Load review JSON files from each folder into the graph and report success rates.

    For every folder the ``*.json`` file names are sorted as strings (so
    "..._10_..." sorts after "..._109_..."), the first ``num_files`` are
    parsed into ``MetaReview`` and passed to ``add_reviews``. A file that
    fails is reported and counted; it does not stop the run. Folders that are
    missing or contain no JSON file are reported and skipped.

    Args:
        filepaths: list of dataset folder paths.
        num_files: maximum number of JSON files to take from each folder.

    Returns:
        dict with "status", "total_files_processed", "overall_success_rate"
        and "overall_failure_rate" (both rates are percentages).

    Raises:
        fastapi.HTTPException: status 500, for an empty ``filepaths`` or any
            error outside the per-file handling.
        RuntimeError: same situations, when fastapi cannot be imported.
    """
    try:
        if not filepaths:
            raise ValueError("The filepaths list is empty.")

        overall_success = 0
        overall_failure = 0

        for folder_path in filepaths:
            # isdir also rejects a path that exists but is a regular file,
            # which would otherwise make os.listdir abort the whole run.
            if not os.path.isdir(folder_path):
                print(f"Dataset folder not found: {folder_path}")
                continue

            json_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.json'))

            if not json_files:
                print(f"No JSON files found in the dataset folder: {folder_path}")
                continue

            succeeded = 0
            failed = 0

            for file_name in json_files[:num_files]:  # Limit the files to process
                file_path = os.path.join(folder_path, file_name)
                try:
                    with open(file_path, 'r', encoding='utf-8') as file:
                        data = json.load(file)

                    add_reviews(MetaReview(**data))

                    print(f"File {file_name} processed successfully from folder {folder_path}.")
                    succeeded += 1
                except Exception as e:
                    # Deliberate best effort: record the failure and move on.
                    print(f"Error processing file {file_name} from folder {folder_path}. Error: {str(e)}")
                    failed += 1

            folder_total = succeeded + failed
            overall_success += succeeded
            overall_failure += failed

            print(
                f"Folder: {folder_path}, "
                f"Success Rate: {_percentage(succeeded, folder_total):.2f}%, "
                f"Failure Rate: {_percentage(failed, folder_total):.2f}%"
            )

        total_processed_files = overall_success + overall_failure

        return {
            "status": "completed",
            "total_files_processed": total_processed_files,
            "overall_success_rate": _percentage(overall_success, total_processed_files),
            "overall_failure_rate": _percentage(overall_failure, total_processed_files)
        }

    except Exception as e:
        # HTTPException is not imported anywhere in this notebook, so raising
        # it by bare name replaced the real error with a NameError. Resolve it
        # here, and fall back to RuntimeError when fastapi is unavailable.
        try:
            from fastapi import HTTPException
        except ImportError:
            raise RuntimeError(str(e)) from e
        raise HTTPException(status_code=500, detail=str(e)) from e

In [9]:
# Dataset folders to ingest, one per ICLR year; each is scanned for *.json files.
filepaths = [
    "/kaggle/input/convictiondataset/dataset/ICLR_2017/ICLR_2017_review",
    "/kaggle/input/convictiondataset/dataset/ICLR_2018/ICLR_2018_review",
    "/kaggle/input/convictiondataset/dataset/ICLR_2019/ICLR_2019_review"
]
# Upper bound on the number of JSON files taken from EACH folder (not in total).
num_files = 50


In [10]:
# Run the ingestion over the configured folders and show the summary dict
# (status, number of files processed, overall success/failure percentages).
result = process_dataset_files(filepaths, num_files)
print(result)

File ICLR_2017_100_review.json processed successfully from folder /kaggle/input/convictiondataset/dataset/ICLR_2017/ICLR_2017_review.
File ICLR_2017_101_review.json processed successfully from folder /kaggle/input/convictiondataset/dataset/ICLR_2017/ICLR_2017_review.
File ICLR_2017_102_review.json processed successfully from folder /kaggle/input/convictiondataset/dataset/ICLR_2017/ICLR_2017_review.
File ICLR_2017_103_review.json processed successfully from folder /kaggle/input/convictiondataset/dataset/ICLR_2017/ICLR_2017_review.
File ICLR_2017_104_review.json processed successfully from folder /kaggle/input/convictiondataset/dataset/ICLR_2017/ICLR_2017_review.
File ICLR_2017_105_review.json processed successfully from folder /kaggle/input/convictiondataset/dataset/ICLR_2017/ICLR_2017_review.
File ICLR_2017_106_review.json processed successfully from folder /kaggle/input/convictiondataset/dataset/ICLR_2017/ICLR_2017_review.
File ICLR_2017_107_review.json processed successfully from fol

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


File ICLR_2017_109_review.json processed successfully from folder /kaggle/input/convictiondataset/dataset/ICLR_2017/ICLR_2017_review.
File ICLR_2017_10_review.json processed successfully from folder /kaggle/input/convictiondataset/dataset/ICLR_2017/ICLR_2017_review.
File ICLR_2017_110_review.json processed successfully from folder /kaggle/input/convictiondataset/dataset/ICLR_2017/ICLR_2017_review.
File ICLR_2017_111_review.json processed successfully from folder /kaggle/input/convictiondataset/dataset/ICLR_2017/ICLR_2017_review.
File ICLR_2017_112_review.json processed successfully from folder /kaggle/input/convictiondataset/dataset/ICLR_2017/ICLR_2017_review.
File ICLR_2017_113_review.json processed successfully from folder /kaggle/input/convictiondataset/dataset/ICLR_2017/ICLR_2017_review.
File ICLR_2017_114_review.json processed successfully from folder /kaggle/input/convictiondataset/dataset/ICLR_2017/ICLR_2017_review.
File ICLR_2017_115_review.json processed successfully from fold