In [2]:
# Import the data
import pandas as pd

# Load the training subset from the Excel workbook.
df = pd.read_excel('TrainSubset.xlsx')

# Each of these columns is interpreted as a day offset from 1899-12-30 and
# converted to a datetime; values that cannot be converted become NaT
# because of errors='coerce'.
for date_column in (
    'Completion Date',
    'Match Activation Date',
    'Match Closure Meeting Date',
    'Little Birthdate',
    'Big Birthdate',
    'Big Approved Date',
):
    df[date_column] = pd.to_datetime(
        df[date_column], origin='1899-12-30', unit='D', errors='coerce'
    )

# Order rows by match, then chronologically within each match.
sort_keys = ['Match ID 18Char', 'Completion Date']
df_sorted = df.sort_values(by=sort_keys, ascending=[True, True])

# Missing contact notes get the placeholder 'No Updates'; the column is then
# cast to str so later text processing always receives strings.
contact_notes = df_sorted['Match Support Contact Notes']
df_sorted['Match Support Contact Notes'] = contact_notes.fillna('No Updates').astype(str)

df_sorted.head()

# Tokenizer to shorten sentences as well as remove unwanted stuff like URLs and spaces
import re
from nltk import sent_tokenize

def shorten_text(text):
    """Clean a free-text note and condense any dated blocks it contains.

    Cleaning removes URLs, collapses runs of whitespace and drops runs of
    four or more underscores.

    The note is then split into paragraphs (separated by blank lines).
    A paragraph that starts with a ``YYYY-MM-DD`` date is treated as a dated
    block: "Question ... Answer: ..." lines whose answer is empty or a
    placeholder (``na``, ``n/a``, ``-``, ``_`` ...) are dropped, and the block
    is reduced to its first two sentences that do not contain "See notes".
    Paragraphs that do not start with a date are skipped.

    Newlines are preserved until after the paragraph/line split. Collapsing
    *all* whitespace first (including newlines) would make the ``'\\n\\n'``
    and ``'\\n'`` splits no-ops and disable the per-block logic.

    Args:
        text: The raw note text (a ``str``).

    Returns:
        The condensed dated blocks joined by blank lines, each prefixed with
        its date. If the note contains no dated block, the whole cleaned note
        flattened onto a single line is returned.
    """
    placeholder_answers = ['na', 'n/a', '-', '_', '__', '___', '']

    # Remove URLs
    text = re.sub(r'http[s]?://[^\s]+', '', text)

    # Single-line form of the whole note, returned when no dated block exists.
    flattened = re.sub(r'_{4,}', '', re.sub(r'\s+', ' ', text).strip())

    # Structured form: normalise line endings, collapse only horizontal
    # whitespace so paragraph and line boundaries survive, drop underscores.
    structured = text.replace('\r\n', '\n').replace('\r', '\n')
    structured = re.sub(r'[^\S\n]+', ' ', structured)
    structured = re.sub(r'_{4,}', '', structured)

    shortened_blocks = []

    # Paragraphs are separated by one or more blank (or whitespace-only) lines.
    for block in re.split(r'\n\s*\n', structured):
        block = block.strip()

        # Extract the date (assuming YYYY-MM-DD format at the block start)
        match = re.match(r'(\d{4}-\d{2}-\d{2})', block)
        if not match:
            continue
        date = match.group(1)
        content = block[len(date):].strip()

        lines = [line.strip() for line in content.split('\n')]
        lines = [line for line in lines if line]
        filtered_lines = []

        for line in lines:
            if "Answer:" in line:
                question, answer = line.split("Answer:", 1)
                answer = answer.strip()
                # Keep only questions that were actually answered.
                if answer.lower() not in placeholder_answers:
                    filtered_lines.append(f"{question.strip()} Answer: {answer}")
            else:
                filtered_lines.append(line)

        if not filtered_lines:
            # Every line was an unanswered question: keep the first line as-is.
            shortened_content = lines[0] if lines else ''
        else:
            combined_content = ' '.join(filtered_lines)
            sentences = sent_tokenize(combined_content)
            key_sentences = [s for s in sentences if "See notes" not in s][:2]
            shortened_content = ' '.join(key_sentences) if key_sentences else combined_content

        shortened_blocks.append(f"{date} {shortened_content}")

    return '\n\n'.join(shortened_blocks) if shortened_blocks else flattened


# Re-coerce 'Completion Date' to datetime; unparseable values become NaT.
completion_dates = pd.to_datetime(df_sorted['Completion Date'], errors='coerce')
df_sorted['Completion Date'] = completion_dates

# Condense every contact note with shorten_text.
short_notes = df_sorted['Match Support Contact Notes'].apply(shorten_text)
df_sorted['Short Notes'] = short_notes

# Prefix each condensed note with its completion date rendered as text.
completion_text = df_sorted['Completion Date'].astype(str)
short_notes_text = df_sorted['Short Notes'].fillna('').astype(str)
df_sorted['Dated Short Notes'] = completion_text + ' ' + short_notes_text

# Display the result
print(df_sorted.head(5))


       Match ID 18Char Match Activation Date Completion Date  \
16  a1v2J000002uR0JQAU            2018-04-12      2018-04-30   
15  a1v2J000002uR0JQAU            2018-04-12      2018-05-21   
14  a1v2J000002uR0JQAU            2018-04-12      2018-06-29   
18  a1v2J000002uR0JQAU            2018-04-12      2018-07-30   
10  a1v2J000002uR0JQAU            2018-04-12      2018-08-23   

                          Match Support Contact Notes   Stage  \
16  Question: Activities:           Answer: - Ques...  Closed   
15  Question: Activities:           Answer: MEC wa...  Closed   
14  Question: Activities:           Answer: MEC ha...  Closed   
18  Question: Activities:           Answer: Suppor...  Closed   
10  Question: Activities:           Answer: Very n...  Closed   

             Little ID           Big ID  Big County  State  Big Age  ...  \
16  0032J00003PfZ6OQAV  0032J00003PgoV1  Washington    NaN       65  ...   
15  0032J00003PfZ6OQAV  0032J00003PgoV1  Washington    NaN       65  ...

In [None]:
import os
from openai import OpenAI
import json
from typing import List, Dict, Any
from IPython.display import display, JSON  # For nice JSON display in Jupyter

# SECURITY: the xAI API key must be provided through the XAI_API_KEY
# environment variable (e.g. `export XAI_API_KEY=...` before starting Jupyter,
# or `%env XAI_API_KEY=...` in a cell that is never saved or committed).
# A literal key used to be assigned on this line; anything that was shared or
# committed with it should be treated as leaked and revoked/rotated.
if not os.environ.get("XAI_API_KEY"):
    print("XAI_API_KEY is not set; set it in the environment before calling the Grok API.")

# Instruction text describing the flag-detection task: the role the model
# should play, the abbreviations used in the match notes, and the lists of
# green and red flags to look for. process_grok_queries uses this string as
# the content of the "system" message of every request.
# NOTE: this is runtime prompt text; editing any character changes what is
# sent to the model.
context = '''
Follow prompt instruction explicitly without exceptions. You are a machine processing text. Your only task is to identify potential events, green flags (e.g., factors likely to enhance relationship quality and duration) and red flags (e.g., risks of early termination or poor outcomes) in the mentorship program by big brothers big sisters of america.

Background:
BBB or Agency = Big brothers’ big sisters of America organization
LB/LS = Little Brother/Sister (Mentee Or Child)
BB/BS = Big Brother/Sister (Mentor Or Volunteer)
MEC or MSC = Match coordinator from BBB
PG = Parent of LB/LS
Current Service area = Minnesota

Flags :
Green Flag indicates any events falling near to the categories below , with a positive impact on match
Red Flag indicates any events falling near to the categories below , with a negative impact on match

Green Flags to Detect:
•Any positive Events identified in the Rationale for Match
•Indication that Mentor completed BBB training pre-match
•Commitment to 18-month match
•Shared interests/preferences in match
•Monthly in-person/phone support from agency to mentor, youth, parent
•High mentor satisfaction, realistic expectations
•Youth reports positive relationship, frequent meetings
•Demographic alignment (race, gender, religion)
•Close geographic proximity or good transportation access
•Positive youth traits (5 Cs: competence, confidence, connection, care, character)
•Older, experienced mentor with empathy, flexibility, multicultural competence
•Younger mentee (elementary–early adolescence), good relational history

Red Flags to Detect:
•No pre-match training or ongoing support
•Mismatched interests, ignored mentor preferences
•Infrequent/superficial staff check-ins (<6 min)
•Mentor frustration, unrealistic expectations, youth resistance
•No closure plan for early termination
•Match ends <6 months (34–50% risk)
•Younger mentor (18–25), negative attitudes, low commitment
•Older mentee seeking autonomy, severe risk factors
•No monthly staff support (email-only contact)
•Inadequate BBB training, excessive/scanty staff involvement
•Parental dissatisfaction/interference
•Match ends <13–18 months


'''

# Response-format instructions appended to every user message by
# process_grok_queries: the JSON schema to return, the list of event names to
# score, and the output constraints (severity range, token budget).
# The event names and JSON keys below are what the model is asked to echo back,
# so they must be spelled exactly as downstream code expects them:
#   - "red_flag_count" has no trailing space inside the key;
#   - "Moved out of service area" / "Unrealistic expectations" are two separate,
#     complete event names;
#   - the schema example has no trailing comma, so it does not contradict
#     "Always return a valid JSON response".
output = '''
Response Guidence:

Always return a valid JSON response:
[
  {
    "date": "YYYY-MM-DD",
    "green_flag_count": <number>,
    "red_flag_count": <number>,
    "events": {
      "Child/Family: Unrealistic expectations": <score>,
      "Volunteer: Unrealistic expectations": <score>,
      ......
    }
  }
]

Events to Detect :
•Match closure Discussed
•Changing Match Type
•COVID impact
•Child/Family: Feels incompatible with volunteer
•Child/Family: Moved
•Child/Family: Lost contact with agency
•Child/Family: Lost contact with volunteer/agency
•Child/Family: Lost contact with volunteer
•Child/Family: Moved out of service area
•Child/Family: Unrealistic expectations
•Child/Family: Time constraints
•Child/Family: Infraction of match rules/agency policies
•Child/Family: Moved within service area
•Child: Graduated
•Child: Transportation Issues
•Child: Changed school/site
•Child: Lost interest
•Child: Family structure changed
•Child: Severity of challenges
•Volunteer: Transportation Issues
•Volunteer: Moved out of service area
•Volunteer: Moved within service area
•Volunteer: Lost contact with agency
•Volunteer: Lost contact with child/agency
•Volunteer: Feels incompatible with child/family
•Volunteer: Time constraint
•Volunteer: Deceased
•Volunteer: Lost contact with child/family
•Volunteer: Infraction of match rules/agency policies
•Volunteer: Unrealistic expectations
•Volunteer: Pregnancy
•Volunteer: Changed workplace/school partnership
•Agency: Challenges with program/partnership
•Agency: Concern with Volunteer re: child safety

Critical Instructions:
-Output format: JSON with "date" and "events" containing event-specific severity scores.
-Severity scores: Assign scores (0–5) to each event type (e.g., "Volunteer: Time constraint_severity"), using your best judgement where 1 is little impact, and 5 is implicates an immediate end to the match.
-Output dates should match input exactly
-Exclude events where Severity == 0
-The response must not exceed 1000 tokens.
-If the full JSON exceeds 1000 tokens, trim earlier dates and include only the most recent dates, starting from the latest date and working backward, until the output fits within 1000 tokens.
-Do not include any explanatory text or additional content beyond the JSON unless it fits within the token limit.
-Assume 1 token ≈ 4 characters (including spaces and punctuation) for token estimation.

'''

# Running total of completed Grok API calls; process_grok_queries increments it.
count = 0

def parse_json_message(json_string: str) -> List[Dict[str, Any]]:
    """Extract a JSON array from a model reply, repairing truncated output.

    Everything before the first ``[`` is discarded. If a complete JSON array
    starts there it is returned and any trailing text is ignored. Otherwise
    the reply is assumed to have been cut off mid-array: characters are
    trimmed from the end until what remains, closed with ``]``, parses.

    Args:
        json_string: Raw reply text expected to contain a JSON array.

    Returns:
        The decoded array (normally a list of dicts).

    Raises:
        ValueError: If the input is not a string, contains no ``[``, or no
            element can be recovered from a truncated array.
    """
    if not isinstance(json_string, str):
        raise ValueError("Input is not a string: invalid JSON array format.")

    bracket_position = json_string.find('[')
    if bracket_position == -1:
        raise ValueError("Input does not contain '[': invalid JSON array format.")
    candidate = json_string[bracket_position:]

    # Fast path: a complete array, possibly followed by extra text such as a
    # closing markdown fence. raw_decode stops at the end of the first value.
    try:
        data, _ = json.JSONDecoder().raw_decode(candidate)
        return data
    except json.JSONDecodeError:
        pass

    # Truncated reply: drop trailing characters until closing the array works.
    # Stop once only the opening bracket is left, because "[" + "]" would
    # always parse and silently hide a reply with nothing recoverable.
    while candidate.rstrip() != "[":
        try:
            return json.loads(candidate + "]")
        except json.JSONDecodeError:
            candidate = candidate[:-1]

    raise ValueError("Couldn't fix JSON")

def process_row(row: pd.Series, error_df: pd.DataFrame) -> List[Dict[str, Any]]:
    """Send one row's rationale and dated notes to Grok and parse the reply.

    Args:
        row: A row providing "Rationale for Match", "Dated Short Notes" and
            "Match ID 18Char".
        error_df: The error log accumulated so far. On failure a new frame
            (this one plus one row) is stored in the module-level name
            ``error_df``; the frame passed in is not modified in place.

    Returns:
        The parsed JSON array on success, or ``None`` when the request or the
        parsing raised ``ValueError`` / ``json.JSONDecodeError`` (the failure
        is then recorded in the error log).
    """
    rationale = row["Rationale for Match"]
    query = row["Dated Short Notes"]
    matchid = row["Match ID 18Char"]

    def _as_text(value: Any) -> str:
        # Missing cells (NaN/None/NA) become "" so that building the prompt
        # never raises TypeError on str + float and aborts the whole apply().
        if isinstance(value, str):
            return value
        try:
            is_missing = bool(pd.isna(value))
        except (TypeError, ValueError):
            is_missing = False
        return "" if is_missing else str(value)

    prompt = "Rationale For Match " + _as_text(rationale) + " Dated Short Notes : " + _as_text(query)

    try:
        # process_grok_queries returns the model's reply as text.
        json_response = process_grok_queries(prompt)
        parsed_response = parse_json_message(json_response)

        print("------------------------------")
        print(parsed_response)

        return parsed_response
    except (ValueError, json.JSONDecodeError) as e:
        error_row = pd.DataFrame({
            "Match ID 18Char": [matchid],
            "Rationale for Match": [rationale],
            "Dated Short Notes": [query],
            "Error": [str(e)]
        })

        # Rebind the module-level error log. Callers that pass the global
        # `error_df` on every call therefore see all errors accumulated so far.
        globals()["error_df"] = pd.concat([error_df, error_row], ignore_index=True)
        return None
    
            
def process_grok_queries(query_str, model="grok-beta", max_tokens=1000):
    """Send one query to the Grok chat-completions endpoint and return the reply.

    The module-level ``context`` string is sent as the system message, and the
    query followed by the module-level ``output`` instructions as the user
    message. The module-level ``count`` is incremented after each completed
    request.

    Args:
        query_str: The text to analyse; converted with ``str()`` before use.
        model: Model name passed to the API.
        max_tokens: Upper bound on the length of the reply, passed to the API.

    Returns:
        The content of the first choice in the API response.

    Raises:
        ValueError: If the XAI_API_KEY environment variable is not set.
    """
    global count

    # Retrieve the API key from the environment variable
    api_key = os.getenv("XAI_API_KEY")
    if not api_key:
        raise ValueError("XAI_API_KEY environment variable is not set.")

    query_text = str(query_str)
    print("Processing: " + query_text)

    # OpenAI-compatible client pointed at the xAI base URL.
    client = OpenAI(
        api_key=api_key,
        base_url="https://api.x.ai/v1"
    )

    messages = [
        {"role": "system", "content": context},
        {"role": "user", "content": query_text + output},
    ]

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=max_tokens
    )

    count += 1
    answer = response.choices[0].message.content

    print("Processed: " + str(count))
    print("------------------------------")
    print(answer)

    return answer

# Run every row of merged_df through Grok and store the parsed reply.
# NOTE(review): merged_df is not defined in the cells shown here; it must come
# from an earlier cell and needs the "Rationale for Match", "Dated Short Notes"
# and "Match ID 18Char" columns that process_row reads. Confirm before a
# Restart & Run All.

error_df = pd.DataFrame(columns=["Match ID 18Char", "Rationale for Match", "Dated Short Notes", "Error"])  # Empty error log; process_row adds one row (with the match ID) per failed row

# One API call per row. The lambda looks up the global `error_df` each time it
# is called, and process_row rebinds that global when it records a failure, so
# errors accumulate across rows. Rows that fail get None in "JSON Response".
merged_df["JSON Response"] = merged_df.apply(
    lambda row: process_row(row, error_df), axis=1
)
# Write the annotated rows to Excel. error_df is not written anywhere in this
# cell; inspect or save it separately to see which rows failed.
merged_df.to_excel('combined_notes.xlsx', index=False)