In [80]:
# Import the data
import pandas as pd

# Load the raw match-support export.
df = pd.read_excel('Test-Truncated-Restated.xlsx')

# The date columns ('Completion Date', 'Match Activation Date',
# 'Match Closure Meeting Date', 'Little Birthdate', 'Big Birthdate',
# 'Big Approved Date') are intentionally left as loaded here; they do not
# matter to the model. Should Excel serial day numbers ever need converting:
#   pd.to_datetime(df[col], origin='1899-12-30', unit='D', errors='coerce')

# Order contacts chronologically within each match and keep only the last 10
# (i.e. the latest by 'Completion Date') per match.
# .copy() detaches the result from the sorted frame so the column assignment
# below writes to an independent DataFrame instead of a slice (which is what
# triggers pandas' SettingWithCopyWarning).
df_sorted = (
    df.sort_values(by=['Match ID 18Char', 'Completion Date'], ascending=[True, True])
    .groupby('Match ID 18Char')
    .tail(10)
    .copy()
)

# Missing notes become an explicit placeholder so every row carries a string.
df_sorted['Match Support Contact Notes'] = (
    df_sorted['Match Support Contact Notes'].fillna('No Updates').astype(str)
)

# Tokenizer to shorten sentences as well as remove unwanted stuff like URLs and spaces
import re
from nltk import sent_tokenize

def shorten_text(text):
    """Condense a free-text match-support note.

    URLs and runs of four or more underscores are removed and whitespace is
    normalised. If the note contains paragraphs (separated by blank lines)
    that start with a ``YYYY-MM-DD`` date, each such paragraph is reduced to
    its date plus at most two key sentences; "Question ... Answer:" lines
    whose answer is a placeholder (na, n/a, -, _, ...) are dropped and
    sentences containing "See notes" are skipped. Paragraphs that do not
    start with a date are ignored in that case.

    If no dated paragraph is found, the whole cleaned note is returned on a
    single line.

    Args:
        text: The raw note as a string.

    Returns:
        The shortened note. Dated paragraphs are joined with a blank line.
    """
    # Remove URLs
    text = re.sub(r'http[s]?://[^\s]+', '', text)

    # Single-line form of the note, used as the fallback for undated notes.
    # The order of operations (collapse whitespace, then drop underscore runs)
    # is kept as-is so undated notes produce exactly the same text as before.
    flattened = re.sub(r'\s+', ' ', text).strip()
    flattened = re.sub(r'_{4,}', '', flattened)

    # Structure-preserving form used for block parsing. Collapsing *all*
    # whitespace first (as was done previously) erased every newline, so the
    # '\n\n' block split and the per-line Answer filter below could never see
    # more than one block / one line.
    structured = text.replace('\r\n', '\n').replace('\r', '\n')
    structured = re.sub(r'_{4,}', '', structured)
    structured = re.sub(r'[^\S\n]+', ' ', structured)   # collapse spaces/tabs, keep newlines
    structured = re.sub(r' ?\n ?', '\n', structured)    # trim spaces around line breaks
    structured = re.sub(r'\n{3,}', '\n\n', structured).strip()

    shortened_blocks = []

    # Split into dated blocks
    for block in structured.split('\n\n'):
        # Extract the date (assuming YYYY-MM-DD format at the start of the block)
        match = re.match(r'(\d{4}-\d{2}-\d{2})', block)
        if not match:
            continue
        date = match.group(1)
        content = block[len(date):].strip()

        filtered_lines = []
        for line in content.split('\n'):
            if "Answer:" in line:
                question, answer = line.split("Answer:", 1)
                answer = answer.strip()
                # Skip questions that were effectively left unanswered.
                if answer.lower() not in ['na', 'n/a', '-', '_', '__', '___', '']:
                    filtered_lines.append(f"{question.strip()} Answer: {answer}")
            else:
                filtered_lines.append(line.strip())

        if not filtered_lines:
            # Everything was a placeholder answer: keep the first raw line.
            shortened_content = content.split('\n')[0]
        else:
            combined_content = ' '.join(filtered_lines)
            sentences = sent_tokenize(combined_content)
            key_sentences = [s for s in sentences if "See notes" not in s][:2]
            shortened_content = ' '.join(key_sentences) if key_sentences else combined_content

        shortened_blocks.append(f"{date} {shortened_content}")

    return '\n\n'.join(shortened_blocks) if shortened_blocks else flattened


# Coerce 'Completion Date' to datetime64; values that cannot be parsed become NaT.
completion_dates = pd.to_datetime(df_sorted['Completion Date'], errors='coerce')
df_sorted['Completion Date'] = completion_dates

# Condense every raw note with shorten_text.
short_notes = df_sorted['Match Support Contact Notes'].apply(shorten_text)
df_sorted['Short Notes'] = short_notes

# Prefix each condensed note with its completion date; this is the text sent to the model.
date_prefix = completion_dates.astype(str)
note_text = short_notes.fillna('').astype(str)
df_sorted['Dated Short Notes'] = date_prefix + ' ' + note_text

# Display the result
print(df_sorted.head(5))


      Match ID 18Char Completion Date  \
0  a1v2J0000027CXKQA2      2017-12-20   
1  a1v2J0000027CXKQA2      2018-01-19   
2  a1v2J0000027CXKQA2      2018-02-18   
4  a1v2J0000027JFCQA2      2019-02-14   
5  a1v2J0000027JFCQA2      2019-05-21   

                         Match Support Contact Notes           Little ID  \
0  Question: Activities:           Answer: BB and...  0032J00003PfqfqQAB   
1  Question: Activities:           Answer: BB and...  0032J00003PfqfqQAB   
2                                         No Updates  0032J00003PfqfqQAB   
4  Question: Activities:           Answer: MEC sp...  0032J00003PfqfDQAR   
5  MEC started the conversation by asking how thi...  0032J00003PfqfDQAR   

            Big ID Big County  Big Age        Big Occupation Big: Military  \
0  0032J00003Pgbpe     Ramsey       25  Student: High School           NaN   
1  0032J00003Pgbpe     Ramsey       25  Student: High School           NaN   
2  0032J00003Pgbpe     Ramsey       25  Student: High School  

In [85]:
import os
from openai import OpenAI
import json
from typing import List, Dict, Any
from IPython.display import display, JSON  # For nice JSON display in Jupyter

# SECURITY: an xAI API key used to be hard-coded on this line. It has been
# removed from the notebook; the key that was committed here must be treated
# as leaked and revoked/rotated. The key is now read from the XAI_API_KEY
# environment variable, with a hidden interactive prompt as a fallback so the
# secret is never stored in the notebook source or its outputs.
if not os.environ.get("XAI_API_KEY"):
    from getpass import getpass

    os.environ["XAI_API_KEY"] = getpass("xAI API key: ")

# System-prompt preamble for the model: the task statement, a glossary of the
# abbreviations used in the notes, and the green/red flag criteria.
# query_grok concatenates this text with the row's "Rationale for Match" to
# build the system message of each new conversation.
# NOTE: this is a runtime string sent verbatim to the API; edit with care.
context = '''
Follow prompt instruction explicitly without exceptions. You are a machine processing text. Your only task is to identify potential events, green flags (e.g., factors likely to enhance relationship quality and duration) and red flags (e.g., risks of early termination or poor outcomes) in the mentorship program by big brothers big sisters of america.

Background:
BBB or Agency = Big brothers’ big sisters of America organization
LB/LS = Little Brother/Sister (Mentee Or Child)
BB/BS = Big Brother/Sister (Mentor Or Volunteer)
MEC or MSC = Match coordinator from BBB
PG = Parent of LB/LS
Current Service area = Minnesota

Flags :
Green Flag indicates any events falling near to the categories below , with a positive impact on match
Red Flag indicates any events falling near to the categories below , with a negative impact on match

Green Flags to Detect:
•Any positive Events identified in the Rationale for Match
•Indication that Mentor completed BBB training pre-match
•Commitment to 18-month match
•Shared interests/preferences in match
•Monthly in-person/phone support from agency to mentor, youth, parent
•High mentor satisfaction, realistic expectations
•Youth reports positive relationship, frequent meetings
•Demographic alignment (race, gender, religion)
•Close geographic proximity or good transportation access
•Positive youth traits (5 Cs: competence, confidence, connection, care, character)
•Older, experienced mentor with empathy, flexibility, multicultural competence
•Younger mentee (elementary–early adolescence), good relational history

Red Flags to Detect:
•No pre-match training or ongoing support
•Mismatched interests, ignored mentor preferences
•Infrequent/superficial staff check-ins (<6 min)
•Mentor frustration, unrealistic expectations, youth resistance
•No closure plan for early termination
•Match ends <6 months (34–50% risk)
•Younger mentor (18–25), negative attitudes, low commitment
•Older mentee seeking autonomy, severe risk factors
•No monthly staff support (email-only contact)
•Inadequate BBB training, excessive/scanty staff involvement
•Parental dissatisfaction/interference
•Match ends <13–18 months

'''

# Response-format instructions: the closed list of event names the model may
# report and the JSON shape it must return. query_grok appends this text to
# every user message (query_str + output).
# The example response is a JSON *array*; parse_json_message locates the first
# '[' in the reply, so the array form matters.
# NOTE(review): the event names here are expected to match the `columns` list
# used when the responses are flattened later in the notebook - keep the two
# in sync.
output = '''
Response Guidance
Events to Detect (Used Exclusively for JSON "events" Field):
(Ensure detected events are exactly as listed below)

Match-Level Events:
Match closure Discussed,
Changing Match Type,
COVID impact

Child/Family-Related Events:
Child/Family: Feels incompatible with volunteer,
Child/Family: Moved,
Child/Family: Lost contact with agency,
Child/Family: Lost contact with volunteer/agency,
Child/Family: Lost contact with volunteer,
Child/Family: Moved out of service,
Child/Family: Unrealistic expectations,
Child/Family: Time constraints,
Child/Family: Infraction of match rules/agency policies
Child/Family: Moved within service area
Child: Graduated
Child: Transportation Issues
Child: Changed school/site
Child: Lost interest
Child: Family structure changed
Child: Severity of challenges

Volunteer-Related Events:
Volunteer: Transportation Issues,
Volunteer: Moved out of service area,
Volunteer: Moved within service area,
Volunteer: Lost contact with agency,
Volunteer: Lost contact with child/agency,
Volunteer: Feels incompatible with child/family,
Volunteer: Time constraints,
Volunteer: Deceased,
Volunteer: Lost contact with child/family,
Volunteer: Infraction of match rules/agency policies,
Volunteer: Unrealistic expectations,
Volunteer: Pregnancy,
Volunteer: Changed workplace/school partnership,

Agency-Level Events:
Agency: Challenges with program/partnership
Agency: Concern with Volunteer re: child safety

JSON Response Format (Strictly Follow This Structure)

Always return a valid JSON object containing:
green_flag_count: Number of detected positive indicators (if any)
red_flag_count: Number of detected concerning events
events: A dictionary where:
Keys are detected event names (must match exactly from the provided list)
Values are assigned severity scores (1–5)

Example JSON Response:
[
  {
    "green_flag_count": 2,
    "red_flag_count": 3,
    "events": {
      "Child/Family: Unrealistic expectations": 3,
      "Volunteer: Unrealistic expectations": 4,
      "Agency: Concern with Volunteer re: child safety": 5
    }
  }
]
'''


def parse_json_message(json_string: str) -> List[Dict[str, Any]]:
    """Extract the JSON array embedded in a model reply.

    Everything before the first '[' is discarded (chatty preambles, code
    fences). The array is then decoded; trailing text after a complete array
    is ignored. If the array is incomplete (e.g. the reply was cut off by the
    token limit), characters are trimmed from the end until the remainder
    plus a closing ']' parses, which recovers the complete leading elements.

    Args:
        json_string: Raw text returned by the model.

    Returns:
        The decoded JSON array (a list; an explicit "[]" yields an empty list).

    Raises:
        ValueError: If the input is not a string, contains no '[', or nothing
            after the opening bracket can be decoded.
    """
    if not isinstance(json_string, str) or '[' not in json_string:
        raise ValueError("Input does not start with '[': invalid JSON array format.")

    payload = json_string[json_string.index('['):]

    # Fast path: a well-formed array, possibly followed by extra text
    # (closing code fence, commentary). raw_decode stops at the array's end.
    try:
        data, _ = json.JSONDecoder().raw_decode(payload)
        return data
    except json.JSONDecodeError:
        pass

    # Repair path for truncated output. Stop before the payload shrinks to a
    # bare '[': "[" + "]" always parses, which previously turned arbitrary
    # garbage into a silent empty result and made the error below unreachable.
    while len(payload) > 1:
        try:
            return json.loads(payload + "]")
        except json.JSONDecodeError:
            payload = payload[:-1]

    raise ValueError("Couldn't fix JSON")

def process_row(row: pd.Series, error_df: pd.DataFrame) -> List[Dict[str, Any]]:
    """Send one contact note to the model and return its parsed JSON reply.

    Args:
        row: A row providing "Rationale for Match", "Dated Short Notes" and
            "Match ID 18Char".
        error_df: Error log with the columns "Match ID 18Char",
            "Rationale for Match", "Dated Short Notes" and "Error". A row is
            appended to this frame IN PLACE when processing fails, so the
            caller's frame (including a module-level one passed in) reflects
            the failure without any global being re-bound.

    Returns:
        The parsed response (list of dicts), or None if the request could not
        be made or the reply could not be parsed (ValueError).
    """
    rationale = row["Rationale for Match"]
    query = row["Dated Short Notes"]
    matchid = row["Match ID 18Char"]

    try:
        # query_grok returns the model's raw text reply.
        json_response = query_grok(str(query), matchid, str(rationale))
        parsed_response = parse_json_message(json_response)
    except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
        # Record the failure on the frame we were given. Assigning a Series to
        # a new .loc label aligns it on the column names and enlarges the
        # frame in place.
        error_df.loc[len(error_df)] = pd.Series({
            "Match ID 18Char": matchid,
            "Rationale for Match": rationale,
            "Dated Short Notes": query,
            "Error": str(e),
        })
        return None

    print("------------------------------")
    print(parsed_response)

    return parsed_response

    
# Module-level state shared with query_grok, which updates it via `global`.
last_matchid = None  # match id whose conversation is currently held in conversation_history
conversation_history: List[Dict[str, str]] = []  # chat messages; rebuilt whenever the match id changes
count = 0  # running number of API responses received

def query_grok(query_str: str, matchid: str, rationale: str) -> str:
    """Ask the xAI chat model to analyse one note and return its raw reply.

    A running conversation is kept per match in the module-level
    `conversation_history`: when `matchid` differs from `last_matchid` a new
    conversation is started whose system message is `context + rationale`;
    otherwise the note is appended to the existing conversation. Each user
    message is `query_str + output`.

    The shared state (`conversation_history`, `last_matchid`, `count`) is only
    updated after the API call has succeeded, so a failed or interrupted call
    leaves no dangling user message behind and can simply be retried.

    Args:
        query_str: The dated, shortened note text.
        matchid: Identifier of the match the note belongs to.
        rationale: The "Rationale for Match" text for this match.

    Returns:
        The assistant's reply text ("" if the API returned no content).

    Raises:
        ValueError: If the XAI_API_KEY environment variable is not set.
        Exception: Whatever the API client raises on request failure.
    """
    global count, conversation_history, last_matchid

    # Retrieve the API key
    api_key = os.getenv("XAI_API_KEY")
    if not api_key:
        raise ValueError("XAI_API_KEY environment variable is not set.")

    # Initialize the OpenAI-compatible client pointed at the xAI endpoint
    client = OpenAI(
        api_key=api_key,
        base_url="https://api.x.ai/v1"
    )

    # Start a fresh conversation when the match changes; otherwise continue.
    if matchid != last_matchid:
        print("---------------------------- New Convo Context --------------------------")
        history = [{"role": "system", "content": context + rationale}]
        print(history)
    else:
        history = conversation_history

    # Build the outgoing message list WITHOUT touching the shared history yet.
    user_message = {"role": "user", "content": query_str + output}
    messages = history + [user_message]

    # Make API call with the full conversation history
    response = client.chat.completions.create(
        model="grok-beta",
        messages=messages,
        max_tokens=1000
    )

    # The API may return no content; keep the declared `str` return type.
    answer = response.choices[0].message.content or ""

    # Commit the exchange only now that the call has succeeded.
    count += 1
    conversation_history = messages + [{"role": "assistant", "content": answer}]
    last_matchid = matchid

    print("Processed: " + str(count))
    print("------------------------------")
    print(answer)

    token_count = response.usage.total_tokens if response.usage else 0
    print("------------------------------")
    print("Token Count: " + str(token_count))

    return answer


# Initialize error_df
# Empty error log; process_row records one row here for every note whose
# reply could not be obtained or parsed.
error_df = pd.DataFrame(columns=["Match ID 18Char", "Rationale for Match", "Dated Short Notes", "Error"])

# Process each row and handle errors
# Rows are visited in df_sorted order. query_grok restarts its conversation
# whenever the match id changes, so this relies on all rows of a match being
# adjacent (df_sorted is sorted by 'Match ID 18Char').
# The lambda looks up the module-level `error_df` each time it is called
# rather than capturing a snapshot, so every row sees the current error log.
# Rows that fail with a ValueError get None in "JSON Response".
df_sorted["JSON Response"] = df_sorted.apply(
    lambda row: process_row(row, error_df), axis=1
)

---------------------------- New Convo Context --------------------------
[{'role': 'system', 'content': '\nFollow prompt instruction explicitly without exceptions. You are a machine processing text. Your only task is to identify potential events, green flags (e.g., factors likely to enhance relationship quality and duration) and red flags (e.g., risks of early termination or poor outcomes) in the mentorship program by big brothers big sisters of america.\n\nBackground:\nBBB or Agency = Big brothers’ big sisters of America organization\nLB/LS = Little Brother/Sister (Mentee Or Child)\nBB/BS = Big Brother/Sister (Mentor Or Volunteer)\nMEC or MSC = Match coordinator from BBB\nPG = Parent of LB/LS\nCurrent Service area = Minnesota\n\nFlags :\nGreen Flag indicates any events falling near to the categories below , with a positive impact on match\nRed Flag indicates any events falling near to the categories below , with a negative impact on match\n\nGreen Flags to Detect:\n•Any positive Even

KeyboardInterrupt: 

In [78]:
# Snapshot df_sorted to an Excel workbook (the frame's index is not written).
df_sorted.to_excel(excel_writer='combined_notes.xlsx', index=False)

In [79]:
# Event names that become one output column each. These must match, character
# for character, the event names the model is instructed to return in the
# `output` prompt; a mismatch means the model's key never lands in its column.
# ('Volunteer: Time constraints' was previously misspelled without the final
# "s" and therefore could never match the prompt's event name.)
columns = [
    # Match-level events
    'Match closure Discussed',
    'Changing Match Type',
    'COVID impact',
    # Child/family-related events
    'Child/Family: Feels incompatible with volunteer',
    'Child/Family: Moved',
    'Child/Family: Lost contact with agency',
    'Child/Family: Lost contact with volunteer/agency',
    'Child/Family: Lost contact with volunteer',
    'Child/Family: Moved out of service',
    'Child/Family: Unrealistic expectations',
    'Child/Family: Time constraints',
    'Child/Family: Infraction of match rules/agency policies',
    'Child/Family: Moved within service area',
    'Child: Graduated',
    'Child: Transportation Issues',
    'Child: Changed school/site',
    'Child: Lost interest',
    'Child: Family structure changed',
    'Child: Severity of challenges',
    # Volunteer-related events
    'Volunteer: Transportation Issues',
    'Volunteer: Moved out of service area',
    'Volunteer: Moved within service area',
    'Volunteer: Lost contact with agency',
    'Volunteer: Lost contact with child/agency',
    'Volunteer: Feels incompatible with child/family',
    'Volunteer: Time constraints',
    'Volunteer: Deceased',
    'Volunteer: Lost contact with child/family',
    'Volunteer: Infraction of match rules/agency policies',
    'Volunteer: Unrealistic expectations',
    'Volunteer: Pregnancy',
    'Volunteer: Changed workplace/school partnership',
    # Agency-level events
    'Agency: Challenges with program/partnership',
    'Agency: Concern with Volunteer re: child safety'
]

# Add the events above as columns.
#
# Each "JSON Response" cell is expected to hold a list of result dicts (the
# shape shown in the prompt's example), a single dict, or None for rows that
# failed. The records are built explicitly and indexed like df_sorted so that
# the final concat lines up row-for-row even though df_sorted's index has gaps.
if "JSON Response" not in df_sorted.columns:
    raise KeyError(
        "'JSON Response' column is missing from df_sorted; run the model "
        "processing cell to completion before flattening."
    )


def _to_int(value) -> int:
    """Best-effort integer conversion; anything non-numeric counts as 0."""
    try:
        return int(float(value))
    except (TypeError, ValueError, OverflowError):
        return 0


def _response_to_record(response) -> dict:
    """Collapse one 'JSON Response' cell into a flat {column name: int} dict."""
    if isinstance(response, dict):
        items = [response]
    elif isinstance(response, list):
        items = [item for item in response if isinstance(item, dict)]
    else:
        items = []  # None / NaN: the row failed, leave everything at 0

    record = {"green_flag_count": 0, "red_flag_count": 0}
    for item in items:
        # Several result objects for one note: add up the flag counts and keep
        # the highest severity reported for each event.
        record["green_flag_count"] += _to_int(item.get("green_flag_count"))
        record["red_flag_count"] += _to_int(item.get("red_flag_count"))
        events = item.get("events")
        if isinstance(events, dict):
            for event_name, severity in events.items():
                record[event_name] = max(record.get(event_name, 0), _to_int(severity))
    return record


count_columns = ["green_flag_count", "red_flag_count"]
records = [_response_to_record(response) for response in df_sorted["JSON Response"]]
flat_df = pd.DataFrame(records, index=df_sorted.index)

# Event names the model returned that are not in `columns` are kept (after the
# known ones) instead of being silently discarded.
extra_events = [
    name for name in flat_df.columns
    if name not in count_columns and name not in columns
]
flat_df = (
    flat_df.reindex(columns=count_columns + columns + extra_events)
    .fillna(0)
    .astype(int)
)

# Merge the original DF with this; the helper text columns are not exported.
df_final = pd.concat([df_sorted, flat_df], axis=1).drop(
    columns=["JSON Response", "Short Notes", "Dated Short Notes"]
)

df_final.to_excel('output_df.xlsx', index=False)

KeyError: 'JSON Response'