## Client Generation

In [None]:
from neumai.Client.NeumClient import NeumClient
import pandas as pd
import os

# Build the Neum AI client; the API key is read from the NEUM_API_KEY
# environment variable (a KeyError is raised if it is not set).
client = NeumClient(
    api_key=os.environ['NEUM_API_KEY'],
)

## Helper functions

In [None]:
from datetime import datetime, timezone

def calculate_time_differences_unix(unix_timestamp):
    """Render a Unix timestamp as a short "time ago" label relative to now (UTC).

    Args:
        unix_timestamp: POSIX timestamp, as accepted by
            ``datetime.fromtimestamp``.

    Returns:
        ``"N secs ago"``, ``"N mins ago"``, ``"N hours ago"`` or
        ``"N days ago"`` when the timestamp is less than 15 days old;
        otherwise the UTC calendar date formatted as ``YYYY-MM-DD``.
    """
    current = datetime.now(timezone.utc)
    moment = datetime.fromtimestamp(unix_timestamp, timezone.utc)

    elapsed_seconds = (current - moment).total_seconds()
    # Each coarser unit is the floor of the previous one, so partial units
    # are discarded rather than rounded.
    elapsed_minutes = elapsed_seconds // 60
    elapsed_hours = elapsed_minutes // 60
    elapsed_days = elapsed_hours // 24

    # Guard clauses from the finest unit to the coarsest.
    if elapsed_seconds < 60:
        return f"{int(elapsed_seconds)} secs ago"
    if elapsed_minutes < 60:
        return f"{int(elapsed_minutes)} mins ago"
    if elapsed_hours < 24:
        return f"{int(elapsed_hours)} hours ago"
    if elapsed_days < 15:
        return f"{int(elapsed_days)} days ago"

    # Older than two weeks: show the plain date instead.
    return moment.strftime('%Y-%m-%d')


In [None]:
from datetime import datetime, timezone

def calculate_time_differences_iso(iso_timestamp):
    """Describe how long ago an ISO 8601 timestamp occurred, relative to now (UTC).

    Args:
        iso_timestamp: ISO 8601 date/time string parseable by
            ``datetime.fromisoformat``. A trailing ``Z`` is accepted and
            treated as UTC. Strings that carry a UTC offset are converted to
            UTC; strings without an offset are assumed to already be in UTC.

    Returns:
        ``"N secs ago"``, ``"N mins ago"``, ``"N hours ago"`` or
        ``"N days ago"`` when the timestamp is less than 15 days old;
        otherwise the UTC calendar date formatted as ``YYYY-MM-DD``.

    Raises:
        ValueError: if the string cannot be parsed by
            ``datetime.fromisoformat``.
    """
    now = datetime.now(timezone.utc)

    # Older Python versions' ``fromisoformat`` reject the "Z" UTC designator,
    # so rewrite it as an explicit zero offset before parsing.
    if iso_timestamp.endswith('Z'):
        iso_timestamp = iso_timestamp[:-1] + '+00:00'
    given_time = datetime.fromisoformat(iso_timestamp)

    if given_time.tzinfo is None:
        # Naive value: interpret the wall-clock time as UTC.
        given_time = given_time.replace(tzinfo=timezone.utc)
    else:
        # Aware value: convert the instant to UTC. Overwriting tzinfo here
        # would shift the instant by the original offset.
        given_time = given_time.astimezone(timezone.utc)

    diff_in_seconds = (now - given_time).total_seconds()
    # Each coarser unit is the floor of the previous one.
    diff_in_minutes = diff_in_seconds // 60
    diff_in_hours = diff_in_minutes // 60
    diff_in_days = diff_in_hours // 24

    if diff_in_seconds < 60:
        return f"{int(diff_in_seconds)} secs ago"
    elif diff_in_minutes < 60:
        return f"{int(diff_in_minutes)} mins ago"
    elif diff_in_hours < 24:
        return f"{int(diff_in_hours)} hours ago"
    elif diff_in_days < 15:
        return f"{int(diff_in_days)} days ago"
    else:
        return given_time.strftime('%Y-%m-%d')

## Pipelines data -> My Pipelines

API to see the number of file collections (pipelines) created.

In [None]:
# Fetch the pipelines payload and keep the value stored under 'pipelines_v2'.
pipelines_response = client.get_pipelines()
results = pipelines_response['pipelines_v2']

In [None]:
# Table view of the pipelines: skip deleted ones, hide the bulky columns and
# show 'created' as a relative "time ago" label.
df = (
    pd.DataFrame(results)
    .loc[lambda frame: frame["is_deleted"] != True]
    .drop(columns=[
        "sources", "embed", "sink", "trigger_schedule",
        "latest_run", "available_metadata", "is_deleted", "owner",
    ])
    .assign(created=lambda frame: frame["created"].apply(calculate_time_differences_unix))
)
display(df)

In [None]:
import matplotlib.pyplot as plt
# Rebuild the frame from the raw results so 'created' is a real datetime
# (parsed as seconds since the epoch), then count pipelines per calendar day.
df = (
    pd.DataFrame(results)
    .loc[lambda frame: frame["is_deleted"] != True]
    .drop(columns=[
        "sources", "embed", "sink", "trigger_schedule",
        "latest_run", "available_metadata", "is_deleted", "owner",
    ])
    .assign(created=lambda frame: pd.to_datetime(frame["created"], unit='s'))
)
df_grouped = df.groupby(df['created'].dt.date).size()

# Bar chart: one bar per creation day.
fig, ax = plt.subplots(figsize=(10, 6))
df_grouped.plot(kind='bar', ax=ax)
ax.set_xlabel('Day')
ax.set_ylabel('Number of pipelines')
ax.set_title('Pipelines created by day')
plt.xticks(rotation=45)
plt.show()

## Files data -> My Files

API to see the number of files synchronized, syncing, or failed for each collection.

In [None]:
# Fetch the files payload for the example pipeline and keep the value under 'files'.
files_response = client.get_files(
    pipeline_id="590a6e0d-5444-44d2-8efc-1aadffb72e62",
)
files = files_response['files']

In [None]:
# Table view of the files: hide the identifier/metadata columns and show both
# time columns as relative "time ago" labels.
df_files = pd.DataFrame(files).drop(
    columns=["metadata", "pipeline_id", "pipeline_run_id"]
)
for time_column in ('created_time', 'modified_time'):
    df_files[time_column] = df_files[time_column].apply(calculate_time_differences_iso)
display(df_files)

In [None]:
# Count files per status value.
df_files_status_grouped = df_files.groupby('status').size()

# Bar chart: one bar per status.
fig, ax = plt.subplots(figsize=(10, 6))
df_files_status_grouped.plot(kind='bar', ax=ax)
ax.set_xlabel('Status')
ax.set_ylabel('Number of files with status')
ax.set_title(f'Status of files for pipeline {"590a6e0d-5444-44d2-8efc-1aadffb72e62"}')
plt.xticks(rotation=45)
plt.show()

In [None]:
import matplotlib.pyplot as plt
# Rebuild from the raw file records so 'modified_time' can be parsed as a
# datetime, then count file updates per calendar day.
df_files_graph = pd.DataFrame(files).assign(
    modified_time=lambda frame: pd.to_datetime(frame['modified_time'])
)
df_files_grouped = df_files_graph.groupby(df_files_graph['modified_time'].dt.date).size()

# Bar chart: one bar per day.
fig, ax = plt.subplots(figsize=(10, 6))
df_files_grouped.plot(kind='bar', ax=ax)
ax.set_xlabel('Day')
ax.set_ylabel('Files updates')
ax.set_title('File updates tracking')
plt.xticks(rotation=45)
plt.show()

## Search results by pipeline

API to see the number of times a file collection has been searched over time.

In [None]:
# Fetch the retrieval records for the example pipeline.
retrievals = client.get_retrievals_by_pipeline_id(
    pipeline_id="590a6e0d-5444-44d2-8efc-1aadffb72e62",
)

In [None]:
# Table view of the retrievals: hide the bulky columns and show 'timestamp'
# as a relative "time ago" label.
df_retrievals = (
    pd.DataFrame(retrievals)
    .drop(columns=["pipeline_id", "results", "status"])
    .assign(timestamp=lambda frame: frame['timestamp'].apply(calculate_time_differences_unix))
)
display(df_retrievals)

In [None]:
import matplotlib.pyplot as plt

# Rebuild from the raw retrievals so 'timestamp' is a real datetime, then
# count retrievals per calendar day.
df_retrievals_graph = (
    pd.DataFrame(retrievals)
    .drop(columns=["pipeline_id", "results", "status"])
    # unit='s' treats the values as seconds; use 'ms' if they are milliseconds.
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp'], unit='s'))
)
df_retrievals_grouped = df_retrievals_graph.groupby(
    df_retrievals_graph['timestamp'].dt.date
).size()

# Bar chart: one bar per day.
fig, ax = plt.subplots(figsize=(10, 6))
df_retrievals_grouped.plot(kind='bar', ax=ax)
ax.set_xlabel('Day')
ax.set_ylabel('Retrievals')
ax.set_title(f'Pipeline retrievals for {"590a6e0d-5444-44d2-8efc-1aadffb72e62"}')
plt.xticks(rotation=45)
plt.show()

### Number of retrievals for a given file

API to see the number of times a file in a file collection has been searched over time.

In [None]:
# One row per (retrieval, file) pair; rows whose file is the placeholder
# value 'File not defined' are dropped.
df_retrievals_exploded = (
    df_retrievals
    .explode('files')
    .loc[lambda frame: frame["files"] != 'File not defined']
)
display(df_retrievals_exploded)

In [None]:
import matplotlib.pyplot as plt
# Count retrievals per file.
df_retrievals_exploded_grouped = df_retrievals_exploded.groupby('files').size()

# Bar chart: one bar per file.
fig, ax = plt.subplots(figsize=(10, 6))
df_retrievals_exploded_grouped.plot(kind='bar', ax=ax)
ax.set_xlabel('File')
ax.set_ylabel('Retrievals')
ax.set_title(f'File retrievals for {"590a6e0d-5444-44d2-8efc-1aadffb72e62"}')
plt.xticks(rotation=45)
plt.show()

## Basic Auditing - Pipeline and File
Number of searches performed by a user against a pipeline

Number of searches performed by a user against a file

In [None]:
import matplotlib.pyplot as plt

# Count pipeline retrievals per requesting user.
df_retrievals_user = pd.DataFrame(retrievals)
df_retrievals_user_grouped = df_retrievals_user.groupby('requested_by').size()

# Bar chart: one bar per user.
fig, ax = plt.subplots(figsize=(10, 6))
df_retrievals_user_grouped.plot(kind='bar', ax=ax)
ax.set_xlabel('User')
ax.set_ylabel('Retrievals')
ax.set_title(f'Retrievals for pipeline {"590a6e0d-5444-44d2-8efc-1aadffb72e62"} per user')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Fetch the retrieval records for a single file of the example pipeline.
file_retrievals = client.get_retrievals_by_file_id(
    pipeline_id="590a6e0d-5444-44d2-8efc-1aadffb72e62",
    file_id="590a6e0d-5444-44d2-8efc-1aadffb72e62_Fabulous Frozen Delights FAQ/Fabulous Frozen Delights - General FAQ.docx",
)

In [None]:
# Table view of the file's retrievals: hide the bulky columns and show
# 'timestamp' as a relative "time ago" label.
df_file_retrievals = (
    pd.DataFrame(file_retrievals)
    .drop(columns=["pipeline_id", "results", "status", "files"])
    .assign(timestamp=lambda frame: frame['timestamp'].apply(calculate_time_differences_unix))
)
display(df_file_retrievals)

In [None]:
import matplotlib.pyplot as plt
# Count retrievals of the file per requesting user.
df_file_retrievals_user = df_file_retrievals.groupby('requested_by').size()

# Bar chart: one bar per user.
fig, ax = plt.subplots(figsize=(10, 6))
df_file_retrievals_user.plot(kind='bar', ax=ax)
ax.set_xlabel('User')
ax.set_ylabel('Retrievals')
ax.set_title(f'Retrievals for file {"590a6e0d-5444-44d2-8efc-1aadffb72e62_Fabulous Frozen Delights FAQ/Fabulous Frozen Delights - General FAQ.docx"}')
plt.xticks(rotation=45)
plt.show()

## Search by user

Searches by user

Files accessed by user

In [None]:
# TODO: not implemented yet. Planned views:
#   - Total searches per user
#   - Files accessed by user