# Process logic

## Setup env

In [1]:
%%sh
# Run the project's `build` script to set up the environment for this notebook.
#
# NOTE: %%sh executes this cell in a separate shell subprocess, so variables
# exported here (or by the sourced script) are not visible to the Python
# kernel in later cells.
# NOTE(review): the build script's output includes the value of
# LANGCHAIN_API_KEY; clear this cell's output before sharing the notebook.

# PROJECT_HOME may be overridden from the environment; the default is the
# original hard-coded checkout location.
# Alternative (assumes the notebook runs one level below the project root):
#   PROJECT_HOME=$(dirname "$(pwd)")
export PROJECT_HOME="${PROJECT_HOME:-/home/ivan/Uforse/university_crawl}"
echo "$PROJECT_HOME"
if [ -f "${PROJECT_HOME}/build" ]; then
    echo "Found build in PROJECT_HOME. Sourcing now..."
    . "${PROJECT_HOME}/build"
    echo "build executed successfully."
else
    # Fail the cell instead of letting the notebook continue without setup.
    echo "Error: build not found in PROJECT_HOME." >&2
    exit 1
fi

/home/ivan/Uforse/university_crawl
Found build in PROJECT_HOME. Sourcing now...
Obtaining file:///home/ivan/Uforse/university_crawl/university_info_generator
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Installing collected packages: university-info-generator
  Attempting uninstall: university-info-generator
    Found existing installation: university-info-generator 0.0.3
    Uninstalling university-info-generator-0.0.3:
      Successfully uninstalled university-info-generator-0.0.3
  Running setup.py develop for university-info-generator
Successfully installed university-info-generator-0.0.3
Directory already exists: /home/ivan/Uforse/university_crawl/cache_repo
Environment variables set:
PROJECT_HOME=/home/ivan/Uforse/university_crawl
PYTHONPATH=/home/ivan/Uforse/university_crawl:
API_KEY_PATH=/home/ivan/Uforse/university_crawl
CACHE_REPO_PATH=/home/ivan/Uforse/university_crawl/cache_repo
LANGCHAIN_API_KEY=<redacted>

In [2]:
import os
import sys
from pprint import pprint

import pandas as pd

# The notebook is expected to run from <project>/fetch_logic, e.g.
# '/home/ivan/Uforse/university_crawl/fetch_logic'
current_path = os.path.abspath('./')

# Its parent is the project root, e.g. '/home/ivan/Uforse/university_crawl'
parent_path = os.path.dirname(current_path)

# Show which interpreter the kernel is using.
print(sys.executable)

# Make the project root importable. A non-empty PROJECT_HOME environment
# variable overrides the original hard-coded checkout path. Entries already on
# sys.path are skipped so re-running this cell does not pile up duplicates.
project_home = os.environ.get('PROJECT_HOME') or '/home/ivan/Uforse/university_crawl'
for extra_path in (parent_path, project_home):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

/home/ivan/anaconda3/envs/gpu/bin/python


In [3]:
import university_info_generator

In [4]:
from university_info_generator import config
from university_info_generator import UniversityInfoGenerator as uni
from university_info_generator import BasicInfoType, SavedDictType, GPTMethodType, HandlerType
from university_info_generator.utility.google_sheet_utility import *
from university_info_generator.utility.save_load_utility import *
from university_info_generator.configs import ALL_ATTRIBUTE_NAME

## Useful methods

In [5]:
def filter_jsonl_file(input_filepath, output_filepath, keyword):
    """
    Copies a JSONL file, dropping every line that contains the given keyword.

    The check is a plain substring test on the raw text of each line; the line
    is not parsed as JSON. An empty keyword is a substring of every line, so it
    drops all lines.

    Filtering in place is supported: when input and output refer to the same
    file, the surviving lines are read into memory before the file is
    rewritten. Opening it for writing first would truncate it before anything
    was read.

    Args:
    - input_filepath (str): Path to the input JSONL file.
    - output_filepath (str): Path to the output JSONL file where results should be saved.
    - keyword (str): Substring whose presence in a line causes that line to be excluded.

    Raises:
    - OSError: If the input file cannot be read or the output file cannot be written.
    """
    import os

    # samefile() needs both paths to exist, so only ask when the output is already there.
    in_place = os.path.exists(output_filepath) and os.path.samefile(input_filepath, output_filepath)

    if in_place:
        with open(input_filepath, 'r', encoding='utf-8') as infile:
            kept_lines = [line for line in infile if keyword not in line]
        with open(output_filepath, 'w', encoding='utf-8') as outfile:
            outfile.writelines(kept_lines)
        return

    # Distinct files: stream line by line so large inputs are never fully loaded.
    with open(input_filepath, 'r', encoding='utf-8') as infile, \
            open(output_filepath, 'w', encoding='utf-8') as outfile:
        outfile.writelines(line for line in infile if keyword not in line)

def extract_university_data(worksheet):
    """
    Build a list of (university name, ID) pairs from a worksheet.

    Column 1 is read as names and column 2 as IDs; the first row of each is
    treated as a header and skipped. Values are paired positionally, stripped
    of surrounding whitespace, and a pair is dropped when either side is empty
    after stripping.

    Args:
        worksheet: Object exposing ``col_values(n)`` (e.g. a gspread worksheet).

    Returns:
        List[tuple]: ``(name, id)`` tuples in sheet order.
    """
    names_column = worksheet.col_values(1)
    ids_column = worksheet.col_values(2)

    pairs = []
    # Skip the header row of both columns; zip stops at the shorter column.
    for raw_name, raw_id in zip(names_column[1:], ids_column[1:]):
        name = raw_name.strip()
        if not name:
            continue
        id_ = raw_id.strip()
        if not id_:
            continue
        pairs.append((name, id_))
    return pairs


# # threading template
# def fetch_university_info(uni_gen, university_name, results, index):
#     # Retrieve university info and store it in the results list at the correct index
#     results[index] = uni_gen.get_university_info(university_name)

# results = [None] * len(lst)
# threads = []
# for i, university in enumerate(lst[:3]):
#     thread = threading.Thread(target=fetch_university_info, args=(uni_gen, university, results, i))
#     threads.append(thread)
#     thread.start()

# for thread in threads:
#     thread.join()

## Get attribute list

In [6]:
# Load the attribute definitions as a DataFrame for inspection.
# get_attribute_df is not imported by name; presumably it comes from one of the
# star imports above — TODO confirm.
attr_df = get_attribute_df()
# attr_df.head(25)

In [7]:
# Write the attribute dictionary to the cache file; the generator loads it back
# from the same path (config.UNIVERSITY_ATTRIBUTE_CACHE_FILE_PATH) in the next section.
store_cache(config.UNIVERSITY_ATTRIBUTE_CACHE_FILE_PATH, get_attribute_dict())

## Main train logic

In [8]:
# Create the generator (`uni` is the alias for UniversityInfoGenerator) and
# restore each saved dictionary from its file, in the same order as before.
uni_gen = uni()
for dict_type, cache_path in (
    (SavedDictType.ATTRIBUTE, config.UNIVERSITY_ATTRIBUTE_CACHE_FILE_PATH),
    (SavedDictType.GPT_CACHE, "./gpt_cache.jsonl"),
    (SavedDictType.UNIVERSITY_BASIC_INFO, "./university_basic_info.jsonl"),
    # (SavedDictType.UNIVERSITY_INFO, "./university_info.jsonl"),
):
    uni_gen.load_from_file(dict_type, cache_path)


In [9]:
# pprint(uni_gen.attribute_dict)

In [10]:
# Show which fields an attribute definition carries, using "location" as the sample.
pprint(uni_gen.attribute_dict["location"].keys())

dict_keys(['attribute_format', 'attribute_reference', 'attribute_prompt', 'example', 'handler', 'k_value', 'mapping'])


In [11]:
# Print one complete attribute definition ("graduation_rate") to review its
# prompt, format, example and handler.
pprint(uni_gen.attribute_dict["graduation_rate"])

{'attribute_format': '<graduate_rate>%',
 'attribute_prompt': 'Find the graduation rate. Keep the format in two digits, '
                     'like 89 represent 89%',
 'attribute_reference': '',
 'example': '89%',
 'handler': <HandlerType.LANGCHAIN_TAVILY: 16>,
 'k_value': '',
 'mapping': ''}


In [12]:
# Open the worksheet returned by get_worksheet() with its default arguments.
# The saving section later clears this sheet and writes the results to it.
target_worksheet = get_worksheet()


In [28]:
# Open the per-country target lists from one spreadsheet.
# A single sheet client is created and reused for both lookups instead of
# calling get_sheet_client() once per worksheet.
# NOTE(review): this cell's execution count (28) is higher than that of cells
# below which consume its variables, so the recorded run was out of order.
TARGET_SPREADSHEET_TITLE = "working_extract_info_output"
shared_sheet_client = get_sheet_client()

# canada
target_university_ca = get_worksheet(
    spreadsheet_title=TARGET_SPREADSHEET_TITLE,
    sheetname="target_university_list_canada",
    sheet_client=shared_sheet_client,
)

# states
target_university_usa = get_worksheet(
    spreadsheet_title=TARGET_SPREADSHEET_TITLE,
    sheetname="target_university_list_usa",
    sheet_client=shared_sheet_client,
)


In [14]:
# Pull (name, id) pairs for the Canadian target list and spot-check the tail.
# NOTE(review): the recorded tail contains 'target_university_list' and
# 'university_name' as if they were universities. These look like stray
# title/header rows in the sheet — confirm and clean the source data.
lst = extract_university_data(target_university_ca)
print(lst[-20:])

[('Université de Sherbrooke', '191'), ('Université du Québec en Abitibi-Témiscamingue (UQAT)', '192'), ('Université du Québec en Outaouais (UQO)', '193'), ('Université du Québec à Chicoutimi (UQAC)', '194'), ('Université du Québec à Montréal (UQAM)', '195'), ('Université du Québec à Rimouski (UQAR)', '196'), ('Université du Québec à Trois-Rivières (UQTR)', '197'), ('Vancouver Community College (VCC)', '198'), ('Vancouver Island University', '199'), ('Vancouver Island University (VIU)', '200'), ('Western University (UWO)', '201'), ('Wilfrid Laurier University (WLU)', '202'), ('Windsor University', '203'), ('York University', '204'), ('Yorkville University', '205'), ('Yukon University', '206'), ('target_university_list', '207'), ('university_name', '208'), ('École de technologie supérieure', '209'), ('École de technologie supérieure (ETS)', '210')]


In [31]:
# Export the USA target list as a two-column CSV.
usa_lst = extract_university_data(target_university_usa)
# Build the frame straight from the (name, id) tuples. Going through dict()
# silently collapsed rows that share a university name, and it failed on an
# empty list because the two column labels no longer matched the frame.
df = pd.DataFrame(usa_lst, columns=["university_name", "id_"])
file_path = "all_universities_usa.csv"
df.to_csv(file_path, index=False, encoding="utf-8")

In [30]:
# Export the Canadian target list (`lst`, built in an earlier cell) as a
# two-column CSV.
# Build the frame straight from the (name, id) tuples. Going through dict()
# silently collapsed rows that share a university name, and it failed on an
# empty list because the two column labels no longer matched the frame.
df = pd.DataFrame(lst, columns=["university_name", "id_"])
file_path = "all_universities_canada.csv"
df.to_csv(file_path, index=False, encoding="utf-8")

In [16]:
# Preview the name column of whichever `df` was built most recently
# (the recorded output shows the Canadian list).
df["university_name"]

0                     Adler Graduate Professional School
1        Alberta Business School - University of Alberta
2                                      Algoma University
3                                      Algonquin College
4      Asper School of Business - University of Manitoba
                             ...                        
205                                     Yukon University
206                               target_university_list
207                                      university_name
208                      École de technologie supérieure
209                École de technologie supérieure (ETS)
Name: university_name, Length: 210, dtype: object

In [17]:
from university_info_generator.configs.enum_class import GeneralInfoType, AttributeColumnType

In [18]:
import threading


In [19]:
import time

In [20]:
# Reload the Canadian target list. `target_name` and `my_filter` allow it to be
# narrowed to a handful of schools while experimenting.
lst = extract_university_data(target_university_ca)

# Lower-case substrings to look for in university names.
# "british colombia" was corrected to "british columbia" (the sheet spells it
# "British Columbia", so the old value could never match). "victory" was
# corrected to "victoria", presumably the intended University of Victoria —
# confirm.
target_name = ["toronto", "calgary", "british columbia", "victoria", "simon fraser", "guelph", "waterloo"]

def my_filter(x):
    """Return True if the name in ``x[0]`` contains any ``target_name`` substring.

    Args:
        x: A (university_name, id) tuple as produced by extract_university_data.

    Returns:
        bool: Whether the lower-cased name contains at least one target substring.
    """
    lowered_name = x[0].lower()
    return any(name in lowered_name for name in target_name)

# Filtering is currently disabled, so the full list is printed below.
# lst = list(filter(my_filter, lst))
print(lst)
print(len(lst))

[('Adler Graduate Professional School', '1'), ('Alberta Business School - University of Alberta', '2'), ('Algoma University', '3'), ('Algonquin College', '4'), ('Asper School of Business - University of Manitoba', '5'), ('Assiniboine Community College', '6'), ('Athabasca University', '7'), ('Beedie School of Business - Simon Fraser University', '8'), ("Bishop's University", '9'), ('Bow Valley College', '10'), ('Brandon University', '11'), ('British Columbia Institute of Technology (BCIT)', '12'), ('Brock University', '13'), ('Cambrian College', '14'), ('Cambrian College (Hanson Campus)', '15'), ('Camosun College', '16'), ('Canada College/Collège Canada', '17'), ('Canadore College', '18'), ('Cape Breton University (CBU)', '19'), ('Capilano University', '20'), ('Carleton University', '21'), ('Carlton Trail College', '22'), ('Centennial College', '23'), ('Coast Mountain College', '24'), ('College of New Caledonia', '25'), ('College of the North Atlantic', '26'), ('College of the Rockies',

In [21]:
# # def fetch_university_info(uni_gen, university_name, results, index):
# #     # Retrieve university info and store it in the results list at the correct index
# #     results[index] = uni_gen.get_university_info(university_name)

# # results = [None] * len(lst)
# # threads = []
# # for i, university in enumerate(lst[:3]):
# #     thread = threading.Thread(target=fetch_university_info, args=(uni_gen, university, results, i))
# #     threads.append(thread)
# #     thread.start()
# #     time.sleep(20)

# # for thread in threads:
# #     time.sleep(10)
# #     thread.join()
# # result = uni_gen.get_university_info(lst[0])
# for uni, id_ in lst[:2]:
#     uni_gen.get_university_info(uni, id_)

# uni_gen.get_info_by_attribute(university_name="UW", attribute_name="location")
# uni_gen.get_info_by_attribute(university_name="UW", attribute_name=GeneralInfoType.LOCATION.value, handler=HandlerType.LANGCHAIN_SERPER, params={"transformer":  "RecursiveURL"})
# uni_gen.get_info_by_attribute(university_name="Adler Graduate Professional School", attribute_name=GeneralInfoType.DOMESTIC_STUDENT_TUITION.value, handler=HandlerType.LANGCHAIN_SERPER, params={"transformer":  "RecursiveURL"})


## Saving Logic

In [22]:
# Save the generator's university info to disk.
# NOTE(review): in the cells shown, loading UNIVERSITY_INFO and the fetch loop
# are both commented out, so this may write an empty file — confirm.
uni_gen.save_to_file(SavedDictType.UNIVERSITY_INFO, "./university_info.jsonl")

In [23]:
# Save the basic-info dictionary back to the same file it was loaded from.
uni_gen.save_to_file(SavedDictType.UNIVERSITY_BASIC_INFO, "./university_basic_info.jsonl")

In [24]:
# Save the GPT cache back to the same file it was loaded from.
uni_gen.save_to_file(SavedDictType.GPT_CACHE, "./gpt_cache.jsonl")

In [25]:
# Save the TROUBLE_PRODUCED dictionary (presumably the entries that failed
# during generation — TODO confirm) for later review.
uni_gen.save_to_file(SavedDictType.TROUBLE_PRODUCED, "./trouble_produced.jsonl")

In [26]:
# Clear the target worksheet ahead of the upload in the next cell.
# NOTE(review): the upload failed in the recorded run, which presumably left
# the sheet cleared with nothing written back. Consider validating the JSONL
# data before clearing.
clear_worksheet_content(target_worksheet)

In [27]:
# Upload the saved university info to the target worksheet.
# NOTE(review): the recorded run reported every expected column as missing,
# including 'university_name' and 'id_'. That suggests the data read from
# university_info.jsonl was empty — confirm.
write_cache_to_worksheet("university_info.jsonl", target_worksheet)


An error occurred: Missing columns in DataFrame that are expected in the worksheet: {'ranking_qs_news_2024', 'wikipedia', 'characteristics', 'faculty', 'ranking_times_rank_2024', 'popular_programs', 'abbreviation', 'website', 'important_calendar', 'graduation_rate', 'programs', 'others', 'description', 'graduation_year', 'ranking_us_news_2023', 'location', 'university_type', 'statistics', 'international_student_tuition', 'ranking_arwu_rank_2023', 'domestic_student_tuition', 'id_', 'university_name'}
