In [2]:
%pip install qdrant-client
%pip install nltk
%pip install -U sentence-transformers
%pip install langdetect
%pip install flask
%pip install flask_restful
%pip install pandas

In [3]:
# Generates 90,000 log entries and pushes them to the server in batches

import requests
import random
import json

# Address of the server (server.py) that receives the generated log batches.
# Every batch is POSTed to exactly this URL, so replace it with the host and
# port where server.py is actually running.
SERVER_URL = "http://localhost:3000/"

# Function to generate random log data
def generate_log_data():
    """Build one synthetic log record with randomly chosen field values.

    Returns:
        dict: a record with the keys ``level``, ``message``, ``resourceId``,
        ``timestamp``, ``traceId``, ``spanId``, ``commit`` and a nested
        ``metadata`` dict holding ``parentResourceId``. The timestamp is a
        fixed string; every other value is drawn with ``random.choice`` from
        a small pool of candidates.
    """
    resource_pool = ["server-1234", "server-5678", "server-91011"]

    # The draws are made in the same order as the keys appear in the record,
    # so seeding ``random`` beforehand reproduces the same entry.
    severity = random.choice(["error", "info", "warning"])
    # The message names its own randomly drawn resource; it is drawn
    # independently of the resourceId field and may differ from it.
    text = f"Log message for {random.choice(resource_pool)}"
    resource = random.choice(resource_pool)
    trace = random.choice(["abc-xyz-123", "def-uvw-456", "ghi-jkl-789"])
    span = random.choice(["span-111", "span-222", "span-333"])
    revision = random.choice(["5e5342f", "a1b2c3d", "x0y1z2"])
    parent = random.choice(["parent-1", "parent-2", "parent-3"])

    return {
        "level": severity,
        "message": text,
        "resourceId": resource,
        "timestamp": "2023-09-15T08:00:00Z",
        "traceId": trace,
        "spanId": span,
        "commit": revision,
        "metadata": {"parentResourceId": parent},
    }

# Generate a single log entry and save it as a sample payload
log_data = generate_log_data()

with open('log_data.json', 'w') as json_file:
    json.dump(log_data, json_file, indent=2)

# Generate the bulk data set of 90,000 log entries and keep a copy on disk
bulk_log_data = [generate_log_data() for _ in range(90000)]

with open('bulk_log_data.json', 'w') as json_file:
    json.dump(bulk_log_data, json_file, indent=2)

# Split the list into batches of chunk_size entries, each serialized as one
# JSON array, so no single request carries the whole data set
chunk_size = 5000
chunks = [
    json.dumps(bulk_log_data[start:start + chunk_size])
    for start in range(0, len(bulk_log_data), chunk_size)
]

# (connect, read) timeout in seconds. Without a timeout requests waits
# forever on an unresponsive server and the cell never finishes. The read
# timeout is generous because the server has to process 5,000 entries per call.
request_timeout = (10, 600)

# Send log data in batches to the receiver
for chunk in chunks:
    response = requests.post(
        SERVER_URL,
        data=chunk,
        headers={'Content-Type': 'application/json'},
        timeout=request_timeout,
    )

    # Print the decoded JSON reply, or the raw body if it is not JSON.
    # ValueError is the common base class of every error Response.json() can
    # raise (json.JSONDecodeError, the simplejson variant, and the plain
    # ValueError of older requests versions); catching only
    # json.JSONDecodeError misses the simplejson-based one.
    try:
        print(response.json())
    except ValueError:
        print(response.text)


In [4]:
# Query instances
#
# Example input:
# 4
# Find all logs with the level set to "error".
# Search for logs with the message containing the term "Failed to connect".
# Retrieve all logs related to resourceId "server-1234".
# Filter logs between the timestamp "2023-09-10T00:00:00Z" and "2023-09-15T23:59:59Z".

# Ask how many queries will follow. Re-prompt instead of crashing when the
# answer is not a whole number, and reject negative counts, which would
# otherwise silently produce an empty query list.
while True:
    raw_count = input("Enter the number of Queries:")
    try:
        n = int(raw_count)
    except ValueError:
        print(f"'{raw_count}' is not a whole number, please try again.")
        continue
    if n < 0:
        print("The number of queries cannot be negative, please try again.")
        continue
    break

# Read exactly n queries, one per input line
print("***Enter your Queries one by one in new line***")
sample_queries = [input() for _ in range(n)]
print("The list of Queries are:")
print(sample_queries)

In [5]:
import pandas as pd
import re

# Load the logs to be searched from a CSV file into a DataFrame; filter_logs
# below reads this module-level frame.
# NOTE(review): presumably logs.csv is exported by the log server with one
# column per log field — confirm where the file comes from and its schema.
csv_file_path = 'logs.csv'
logs_df = pd.read_csv(csv_file_path)

def parse_query(query):
    """Extract field filters from a free-text log query.

    Recognised phrasings:

    * equality fields (``level``, ``resourceId``, ``traceId``, ``spanId``,
      ``commit``, ``metadata.parentResourceId``): the field name followed by
      a double-quoted value, optionally joined by ``=``, ``:``, ``is``,
      ``set to``, ``equal to`` or ``equals`` -- e.g. ``level = "error"``,
      ``level set to "error"`` or ``resourceId "server-1234"``;
    * ``message containing the term "<text>"``;
    * ``between the timestamp "<start>" and "<end>"``.

    Matching is case-sensitive. Only the first occurrence of each field is
    used.

    Args:
        query (str): the query text.

    Returns:
        dict: maps each recognised field name to its quoted value (``str``);
        ``'timestamp'`` maps to a ``(start, end)`` tuple of strings. The dict
        is empty when nothing in the query is recognised.
    """
    # Optional connector between a field name and its quoted value. Requiring
    # a literal "=" rejected natural phrasings such as `level set to "error"`.
    connector = r'\s*(?:=|:|is|set\s+to|equal\s+to|equals)?\s*'

    def equals(field):
        # re.escape keeps the "." in metadata.parentResourceId literal
        # instead of letting it match any character.
        return re.escape(field) + connector + r'"(.*?)"'

    # Define regex patterns for filters
    filter_patterns = {
        'level': equals('level'),
        'message': r'message\s*containing\s*the\s*term\s*"(.*?)"',
        'resourceId': equals('resourceId'),
        'timestamp': r'between\s*the\s*timestamp\s*"(.*?)"\s*and\s*"(.*?)"',
        'traceId': equals('traceId'),
        'spanId': equals('spanId'),
        'commit': equals('commit'),
        'metadata.parentResourceId': equals('metadata.parentResourceId'),
    }

    filters = {}
    for filter_name, pattern in filter_patterns.items():
        match = re.search(pattern, query)
        if not match:
            continue
        if filter_name == 'timestamp':
            # A range filter carries both bounds
            filters[filter_name] = (match.group(1), match.group(2))
        else:
            filters[filter_name] = match.group(1)

    return filters

def filter_logs(filters, logs=None):
    """Return the log rows that satisfy every filter.

    Args:
        filters (dict): field name -> filter value, as produced by
            ``parse_query``. ``'timestamp'`` holds a ``(start, end)`` pair and
            keeps rows whose timestamp lies in that inclusive range;
            ``'message'`` keeps rows whose message contains the value as a
            literal, case-sensitive substring; every other field must equal
            the value exactly.
        logs (pandas.DataFrame, optional): frame to filter. Defaults to the
            module-level ``logs_df``.

    Returns:
        pandas.DataFrame: a new frame with the matching rows; the input frame
        is not modified. With no filters, a copy of the whole frame.

    Raises:
        KeyError: if a filter names a column that is not in the frame.
    """
    if logs is None:
        logs = logs_df

    filtered_logs = logs.copy()

    for filter_name, filter_value in filters.items():
        column = filtered_logs[filter_name]

        if filter_name == 'timestamp':
            start_time, end_time = filter_value
            # Compares the column against the bounds as given.
            # NOTE(review): if the column holds strings this is a
            # lexicographic comparison, which is chronological only for
            # uniformly formatted ISO-8601 values — confirm the CSV format.
            mask = (column >= start_time) & (column <= end_time)
        elif filter_name == 'message':
            # The query asks for messages *containing* a term, so match a
            # literal substring (regex=False) rather than requiring equality.
            # notna() keeps missing messages from matching via their "nan"
            # string form.
            mask = column.notna() & column.astype(str).str.contains(filter_value, regex=False)
        else:
            mask = column == filter_value

        filtered_logs = filtered_logs[mask]

    return filtered_logs


# Run every query: parse it into filters, apply them to the logs,
# de-duplicate the matches and export them to Query<i>.csv
for i, query in enumerate(sample_queries, start=1):
    filters = parse_query(query)

    # An empty filter dict means nothing in the query was recognised. The
    # "result" is then the complete, unfiltered log set, so flag it instead
    # of presenting it as a normal match.
    if not filters:
        print(f"\nWarning: no filters recognised in query {i}; the result contains all logs.")

    # Apply filters to the DataFrame
    filtered_logs = filter_logs(filters)

    # Remove duplicates and save to a file
    output_file_path = f'Query{i}.csv'
    unique_rows = filtered_logs.drop_duplicates()
    unique_rows.to_csv(output_file_path, index=False)

    print(f"\nQuery: {query}")
    print(f"Unique Rows (without duplicates) for Query {i}:\n{unique_rows}")
    print(f"Saved output to {output_file_path}")
