# Process logic

## Setup env

In [1]:
%%sh
# current_path=$(pwd)
# parent_dir=$(dirname "$current_dir")
export PROJECT_HOME="/home/ivan/Uforse/university_crawl"
echo $PROJECT_HOME
if [ -f "${PROJECT_HOME}/build" ]; then
    echo "Found build in PROJECT_HOME. Sourcing now..."
    . "${PROJECT_HOME}/build"
    echo "build executed successfully."
else
    echo "Error: build not found in PROJECT_HOME."
fi

/home/ivan/Uforse/university_crawl
Found build in PROJECT_HOME. Sourcing now...
Obtaining file:///home/ivan/Uforse/university_crawl/university_info_generator
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Installing collected packages: university_info_generator
  Attempting uninstall: university_info_generator
    Found existing installation: university_info_generator 0.0.3
    Uninstalling university_info_generator-0.0.3:
      Successfully uninstalled university_info_generator-0.0.3
  Running setup.py develop for university_info_generator
Successfully installed university_info_generator-0.0.3
Directory already exists: /home/ivan/Uforse/university_crawl/cache_repo
Environment variables set:
PROJECT_HOME=/home/ivan/Uforse/university_crawl
PYTHONPATH=/home/ivan/Uforse/university_crawl:
API_KEY_PATH=/home/ivan/Uforse/university_crawl
CACHE_REPO_PATH=/home/ivan/Uforse/university_crawl/cache_repo
LANGCHAIN_API_KEY=ls__3780df60c0ef4fd2b

In [4]:
from g4f.client import Client

client = Client()
response = client.chat.completions.create(
    model="gpt-4-turbo",
    messages=[{"role": "user", "content": "Hello"}],
)
print(response.choices[0].message.content)

Hi there! How can I assist you today? 😊


In [2]:
import os
import sys
from pprint import pprint

#'/home/ivan/Uforse/university_crawl/fetch_logic'
current_path = os.path.abspath('./')

#'/home/ivan/Uforse/university_crawl'
parent_path = os.path.dirname(current_path)
sys.path.append(parent_path)
import sys
print(sys.executable)
sys.path.append('/home/ivan/Uforse/university_crawl')

/home/ivan/anaconda3/envs/university_crawl/bin/python


In [3]:
import university_info_generator

In [4]:
from university_info_generator import config
from university_info_generator import UniversityInfoGenerator
from university_info_generator import UniversityBasicInfoType, UniversitySavedDictType, GPTMethodType, HandlerType
from university_info_generator.utility.google_sheet_utility import *
from university_info_generator.utility.save_load_utility import *
from university_info_generator.configs import ALL_ATTRIBUTE_NAME

## useful methods

In [5]:
def filter_jsonl_file(input_filepath, output_filepath, keyword):
    """
    Reads a JSONL file and writes out only those lines that do not contain the specified keyword.

    Args:
    - input_filepath (str): Path to the input JSONL file.
    - output_filepath (str): Path to the output JSONL file where results should be saved.
    - keyword (str): Keyword to search for in each JSON object to decide if it should be excluded.
    """
    with open(input_filepath, 'r', encoding='utf-8') as file, open(output_filepath, 'w', encoding='utf-8') as outfile:
        for line in file:
            if keyword not in line:  # Check if the keyword is in the line
                outfile.write(line)  # Write the line to the output file if keyword is not found

def extract_university_data(worksheet):
    """
    Extracts university names and their corresponding IDs from a worksheet.

    Args:
        worksheet: A gspread worksheet object with university data.

    Returns:
        List[tuple]: A list of tuples, each containing the university name and its ID.
    """
    # Get all values in the first column for university names
    university_names = worksheet.col_values(2)[1:]  # Assuming the first row is a header
    # Get all values in the second column for IDs
    university_ids = worksheet.col_values(1)[1:]  # Assuming the first row is a header

    # Combine the names and IDs, stripping any extra whitespace and skipping any empty entries
    university_data = [(id_.strip(), name.strip()) for name, id_ in zip(university_names, university_ids) if name.strip() and id_.strip()]
    return university_data


# # threading template
# def fetch_university_info(uni_gen, university_name, results, index):
#     # Retrieve university info and store it in the results list at the correct index
#     results[index] = uni_gen.get_university_info(university_name)

# results = [None] * len(lst)
# threads = []
# for i, university in enumerate(lst[:3]):
#     thread = threading.Thread(target=fetch_university_info, args=(uni_gen, university, results, i))
#     threads.append(thread)
#     thread.start()

# for thread in threads:
#     thread.join()

## Get Attr list

In [6]:
attr_df = get_attribute_df()
# attr_df.head(25)

In [7]:
store_cache(config.UNIVERSITY_ATTRIBUTE_CACHE_FILE_PATH, get_attribute_dict())

## Main train logic

In [8]:
uni_gen = UniversityInfoGenerator()
uni_gen.load_from_file(UniversitySavedDictType.ATTRIBUTE, config.UNIVERSITY_ATTRIBUTE_CACHE_FILE_PATH)
uni_gen.load_from_file(UniversitySavedDictType.GPT_CACHE, "./gpt_cache.jsonl")
uni_gen.load_from_file(UniversitySavedDictType.UNIVERSITY_BASIC_INFO, "./university_basic_info.jsonl")
uni_gen.load_from_file(UniversitySavedDictType.UNIVERSITY_INFO, "./university_info.jsonl")


In [9]:
pprint(uni_gen.attribute_dict["location"].keys())

dict_keys(['attribute_format', 'attribute_reference', 'attribute_prompt', 'example', 'handler', 'k_value', 'mapping'])


In [10]:
pprint(uni_gen.attribute_dict["graduation_rate"])

{'attribute_format': '<graduate_rate>%',
 'attribute_prompt': 'Find the graduation rate. Keep the format in two digits, '
                     'like 89 represent 89%',
 'attribute_reference': '',
 'example': '89%',
 'handler': <HandlerType.LANGCHAIN_TAVILY: 16>,
 'k_value': '',
 'mapping': ''}


In [11]:
target_worksheet = get_worksheet()


In [12]:
# canada
target_university_ca = get_worksheet(
    spreadsheet_title= "working_extract_info_output",
    sheetname= "target_university_canada_stat",
    sheet_client= get_sheet_client(),
)

# states
target_university_usa = get_worksheet(
    spreadsheet_title= "working_extract_info_output",
    sheetname= "target_university_list_usa",
    sheet_client= get_sheet_client(),
)


In [13]:
lst = extract_university_data(target_university_ca)
print(lst[-20:])

[('92', 'University of Calgary - Including medical and dental'), ('93', 'Burman University'), ('94', 'Concordia University of Edmonton'), ('95', 'University of Lethbridge'), ('96', "The King's University College"), ('97', 'Ambrose University'), ('98', 'Grant MacEwan University'), ('99', 'Mount Royal University'), ('100', 'University of British Columbia - Including medical and dental'), ('101', 'University of Northern British Columbia'), ('102', 'Royal Roads University'), ('103', 'Simon Fraser University'), ('104', 'University of Victoria'), ('105', 'Thompson Rivers University'), ('106', 'Capilano University'), ('107', 'Vancouver Island University'), ('108', 'Emily Carr University of Art and Design'), ('109', 'Kwantlen Polytechnic University'), ('110', 'University of the Fraser Valley'), ('111', 'Yukon University')]


In [14]:
usa_lst = extract_university_data(target_university_usa)
df = pd.DataFrame.from_dict(dict(usa_lst), orient="index").reset_index()
df.columns = ["university_name", "id_"]
file_path = "all_universities_usa.csv"
df.to_csv(file_path, index=False, encoding="utf-8")

In [15]:
df = pd.DataFrame.from_dict(dict(lst), orient="index").reset_index()
df.columns = ["university_name", "id_"]
file_path = "all_universities_canada.csv"
df.to_csv(file_path, index=False, encoding="utf-8")

In [16]:
df["university_name"]

0        1
1        2
2        3
3        4
4        5
      ... 
106    107
107    108
108    109
109    110
110    111
Name: university_name, Length: 111, dtype: object

In [17]:
from university_info_generator.configs.enum_class import UniversityGeneralInfoType, UniversityAttributeColumnType

In [18]:
import threading


In [19]:
import time

In [20]:
lst = extract_university_data(target_university_ca)

# target_name = ["toronto", "calgary", "british colombia", "victory", "simon fraser", "guelph", "waterloo"]

# def my_filter(x):
#     return any(name in x[0].lower() for name in target_name)

# lst = list(filter(my_filter, lst))
print(lst)
print(len(lst))

[('1', 'Memorial University of Newfoundland - Including medical and dental'), ('2', 'University of Prince Edward Island'), ('3', 'Acadia University'), ('4', 'Acadia Divinity College'), ('5', 'Atlantic School of Theology'), ('6', 'Cape Breton University'), ('7', 'Dalhousie University - Including medical and dental'), ('8', "University of King's College"), ('9', 'Mount Saint Vincent University'), ('10', 'Nova Scotia College of Art and Design University (NSCAD)'), ('11', 'Université Sainte-Anne'), ('12', 'St. Francis Xavier University'), ('13', "Saint Mary's University"), ('14', 'Mount Allison University'), ('15', 'University of New Brunswick'), ('16', 'Université de Moncton (parent)'), ('17', 'St. Thomas University'), ('18', "Bishop's University"), ('19', 'McGill University - Including medical and dental'), ('20', 'Université de Montréal - Including medical and dental'), ('21', 'Polytechnique Montréal'), ('22', 'École des hautes études commerciales'), ('23', 'Université Laval - Including

In [21]:
print(list(uni_gen.university_basic_info_dict.values())[106])

{'id_': '58', 'university_name': 'University of Waterloo', 'abbreviation': 'UW', 'website': 'https://uwaterloo.ca', 'wikipedia': 'https://en.wikipedia.org/wiki/University_of_Waterloo'}


In [None]:
# # def fetch_university_info(uni_gen, university_name, results, index):
# #     # Retrieve university info and store it in the results list at the correct index
# #     results[index] = uni_gen.get_university_info(university_name)

# # results = [None] * len(lst)
# # threads = []
# # for i, university in enumerate(lst[:3]):
# #     thread = threading.Thread(target=fetch_university_info, args=(uni_gen, university, results, i))
# #     threads.append(thread)
# #     thread.start()
# #     time.sleep(20)

# # for thread in threads:
# #     time.sleep(10)
# #     thread.join()
# # result = uni_gen.get_university_info(lst[0])
# for uni_name in uni_gen.university_basic_info_dict:
#     id_ = uni_gen.university_basic_info_dict[uni_name]["id_"]
#     print(uni_name, id_)
#     uni_gen.get_university_info(uni_name, id_)
uni_gen.get_university_info("Academy of Art University", -1)
# uni_gen.get_info_by_attribute(university_name="UW", attribute_name="location")
# uni_gen.get_info_by_attribute(university_name="UW", attribute_name=GeneralInfoType.LOCATION.value, handler=HandlerType.LANGCHAIN_SERPER, params={"transformer":  "RecursiveURL"})
# uni_gen.get_info_by_attribute(university_name="Adler Graduate Professional School", attribute_name=GeneralInfoType.DOMESTIC_STUDENT_TUITION.value, handler=HandlerType.LANGCHAIN_SERPER, params={"transformer":  "RecursiveURL"})
# uni_gen.get_university_basic_info('University of Waterloo', 58, BasicInfoType.ABBREVIATION)

Failed to batch ingest runs: LangSmithRateLimitError('Rate limit exceeded for https://api.smith.langchain.com/runs/batch. HTTPError(\'429 Client Error: Too Many Requests for url: https://api.smith.langchain.com/runs/batch\', \'{"detail":"Monthly unique traces usage limit exceeded"}\')')
Failed to batch ingest runs: LangSmithRateLimitError('Rate limit exceeded for https://api.smith.langchain.com/runs/batch. HTTPError(\'429 Client Error: Too Many Requests for url: https://api.smith.langchain.com/runs/batch\', \'{"detail":"Monthly unique traces usage limit exceeded"}\')')


In [None]:
uni_gen.university_info_dict["Academy of Art University"].to_dict_en()

{'id_': -1,
 'university_name': 'Academy of Art University',
 'abbreviation': 'AAU',
 'website': 'https://www.academyart.edu',
 'wikipedia': 'https://en.wikipedia.org/wiki/Academy_of_Art_University',
 'university_type': 'Private University',
 'graduation_year': '4',
 'location': 'San Francisco, California, United States',
 'graduation_rate': '34%',
 'domestic_student_tuition': 'Attribute: domestic_student_tuition\nOutput format: CAD $3,378 / CAD $3,378 - $3,378',
 'international_student_tuition': '$1,272 per unit',
 'ranking_qs_news_2024': '',
 'description': 'The Academy of Art University, located in San Francisco, California, is a privately owned for-profit art school. It was founded in 1929 by Richard S. Stephens and offers undergraduate and graduate programs in a wide array of art and design fields. The university is known for its inclusive admission policy, extensive online courses, and a commitment to preparing students for professional careers in the arts. The Academy of Art Uni

## df result

In [None]:
# print(uni_gen.university_basic_info_dict["University of Waterloo"])

In [None]:
from typing import Any
result_dict:Dict[str, Dict[str, Any]] = {}
for k in uni_gen.university_info_dict:
    value = uni_gen.university_info_dict[k]
    if type(value) == dict:
        result_dict[k] = value
    elif type(value) == University:
        result_dict[k] = value.to_dict_en()

In [None]:
df = pd.DataFrame.from_dict(result_dict, orient="index").reset_index()
df.head()
df.to_csv("./fetch_data/all_universities_canada_info.csv", index=False)

Failed to batch ingest runs: LangSmithRateLimitError('Rate limit exceeded for https://api.smith.langchain.com/runs/batch. HTTPError(\'429 Client Error: Too Many Requests for url: https://api.smith.langchain.com/runs/batch\', \'{"detail":"Monthly unique traces usage limit exceeded"}\')')
Failed to batch ingest runs: LangSmithRateLimitError('Rate limit exceeded for https://api.smith.langchain.com/runs/batch. HTTPError(\'429 Client Error: Too Many Requests for url: https://api.smith.langchain.com/runs/batch\', \'{"detail":"Monthly unique traces usage limit exceeded"}\')')
Failed to batch ingest runs: LangSmithRateLimitError('Rate limit exceeded for https://api.smith.langchain.com/runs/batch. HTTPError(\'429 Client Error: Too Many Requests for url: https://api.smith.langchain.com/runs/batch\', \'{"detail":"Monthly unique traces usage limit exceeded"}\')')
Failed to batch ingest runs: LangSmithRateLimitError('Rate limit exceeded for https://api.smith.langchain.com/runs/batch. HTTPError(\'42

In [None]:
# df.columns = ["index", " id_", "university_name", "abbreviation", "website", "wikipedia"]

# df.drop(columns="index", inplace=True)
# df.drop(columns="abbreviation", inplace=True)

## Saving Logic

In [None]:
uni_gen.save_to_file(UniversitySavedDictType.UNIVERSITY_INFO, "./university_info.jsonl")
uni_gen.save_to_file(UniversitySavedDictType.UNIVERSITY_BASIC_INFO, "./university_basic_info.jsonl")
uni_gen.save_to_file(UniversitySavedDictType.GPT_CACHE, "./gpt_cache.jsonl")
uni_gen.save_to_file(UniversitySavedDictType.TROUBLE_PRODUCED, "./trouble_produced.jsonl")
clear_worksheet_content(target_worksheet)
write_cache_to_worksheet("university_info.jsonl", target_worksheet)

An error occurred: APIError: [429]: Quota exceeded for quota metric 'Write requests' and limit 'Write requests per minute per user' of service 'sheets.googleapis.com' for consumer 'project_number:239229069911'.


In [None]:
# uni_gen.save_to_file(UniversitySavedDictType.UNIVERSITY_BASIC_INFO, "./university_basic_info.jsonl")

In [None]:
# uni_gen.save_to_file(UniversitySavedDictType.GPT_CACHE, "./gpt_cache.jsonl")

In [None]:
# uni_gen.save_to_file(UniversitySavedDictType.TROUBLE_PRODUCED, "./trouble_produced.jsonl")

In [None]:
# clear_worksheet_content(target_worksheet)

In [None]:
# write_cache_to_worksheet("university_info.jsonl", target_worksheet)
