# Process logic

## Setup env

In [41]:
%%sh
# current_path=$(pwd)
# parent_dir=$(dirname "$current_dir")
export PROJECT_HOME="/home/ivan/Uforse/university_crawl"
echo $PROJECT_HOME
if [ -f "${PROJECT_HOME}/build" ]; then
    echo "Found build in PROJECT_HOME. Sourcing now..."
    . "${PROJECT_HOME}/build"
    echo "build executed successfully."
else
    echo "Error: build not found in PROJECT_HOME."
fi

/home/ivan/Uforse/university_crawl
Found build in PROJECT_HOME. Sourcing now...
Obtaining file:///home/ivan/Uforse/university_crawl/university_info_generator
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Installing collected packages: university-info-generator
  Attempting uninstall: university-info-generator
    Found existing installation: university-info-generator 0.0.3
    Uninstalling university-info-generator-0.0.3:
      Successfully uninstalled university-info-generator-0.0.3
  Running setup.py develop for university-info-generator
Successfully installed university-info-generator-0.0.3
Directory already exists: /home/ivan/Uforse/university_crawl/cache_repo
Environment variables set:
PROJECT_HOME=/home/ivan/Uforse/university_crawl
PYTHONPATH=/home/ivan/Uforse/university_crawl:
API_KEY_PATH=/home/ivan/Uforse/university_crawl
CACHE_REPO_PATH=/home/ivan/Uforse/university_crawl/cache_repo
LANGCHAIN_API_KEY=ls__3780df60c0ef4fd2b

In [42]:
import os
import sys
from pprint import pprint

#'/home/ivan/Uforse/university_crawl/fetch_logic'
current_path = os.path.abspath('./')

#'/home/ivan/Uforse/university_crawl'
parent_path = os.path.dirname(current_path)
sys.path.append(parent_path)
import sys
print(sys.executable)
sys.path.append('/home/ivan/Uforse/university_crawl')

/home/ivan/anaconda3/envs/gpu/bin/python


In [43]:
import university_info_generator

In [44]:
from university_info_generator import config
from university_info_generator import UniversityInfoGenerator as uni
from university_info_generator import BasicInfoType, SavedDictType, GPTMethodType, HandlerType
from university_info_generator.utility.google_sheet_utility import *
from university_info_generator.utility.save_load_utility import *
from university_info_generator.configs import ALL_ATTRIBUTE_NAME

## useful methods

In [45]:
def filter_jsonl_file(input_filepath, output_filepath, keyword):
    """
    Reads a JSONL file and writes out only those lines that do not contain the specified keyword.

    Args:
    - input_filepath (str): Path to the input JSONL file.
    - output_filepath (str): Path to the output JSONL file where results should be saved.
    - keyword (str): Keyword to search for in each JSON object to decide if it should be excluded.
    """
    with open(input_filepath, 'r', encoding='utf-8') as file, open(output_filepath, 'w', encoding='utf-8') as outfile:
        for line in file:
            if keyword not in line:  # Check if the keyword is in the line
                outfile.write(line)  # Write the line to the output file if keyword is not found

def extract_university_data(worksheet):
    """
    Extracts university names and their corresponding IDs from a worksheet.

    Args:
        worksheet: A gspread worksheet object with university data.

    Returns:
        List[tuple]: A list of tuples, each containing the university name and its ID.
    """
    # Get all values in the first column for university names
    university_names = worksheet.col_values(1)[1:]  # Assuming the first row is a header
    # Get all values in the second column for IDs
    university_ids = worksheet.col_values(2)[1:]  # Assuming the first row is a header

    # Combine the names and IDs, stripping any extra whitespace and skipping any empty entries
    university_data = [(name.strip(), id_.strip()) for name, id_ in zip(university_names, university_ids) if name.strip() and id_.strip()]
    return university_data


# # threading template
# def fetch_university_info(uni_gen, university_name, results, index):
#     # Retrieve university info and store it in the results list at the correct index
#     results[index] = uni_gen.get_university_info(university_name)

# results = [None] * len(lst)
# threads = []
# for i, university in enumerate(lst[:3]):
#     thread = threading.Thread(target=fetch_university_info, args=(uni_gen, university, results, i))
#     threads.append(thread)
#     thread.start()

# for thread in threads:
#     thread.join()

## Get Attr list

In [46]:
attr_df = get_attribute_df()
# attr_df.head(25)

In [47]:
store_cache(config.UNIVERSITY_ATTRIBUTE_CACHE_FILE_PATH, get_attribute_dict())

## Main train logic

In [48]:
uni_gen = uni()
uni_gen.load_from_file(SavedDictType.ATTRIBUTE, config.UNIVERSITY_ATTRIBUTE_CACHE_FILE_PATH)
uni_gen.load_from_file(SavedDictType.GPT_CACHE, "./gpt_cache.jsonl")
uni_gen.load_from_file(SavedDictType.UNIVERSITY_BASIC_INFO, "./university_basic_info.jsonl")
uni_gen.load_from_file(SavedDictType.UNIVERSITY_INFO, "./university_info.jsonl")


In [49]:
# pprint(uni_gen.attribute_dict)

In [50]:
pprint(uni_gen.attribute_dict["location"].keys())

dict_keys(['attribute_format', 'attribute_reference', 'attribute_prompt', 'example', 'handler', 'k_value', 'mapping'])


In [51]:
pprint(uni_gen.attribute_dict["ranking_arwu_rank_2023"])

{'attribute_format': 'Union[int, str]',
 'attribute_prompt': '',
 'attribute_reference': '["https://www.shanghairanking.com/rankings/arwu/2023.html", '
                        '"https://www.shanghairanking.com/institution"]',
 'example': '44 or 100-200 or 1400+',
 'handler': <HandlerType.LANGCHAIN_TAVILY: 16>,
 'k_value': 3.0,
 'mapping': '2023 Academic Ranking of World Universities, ShanghaiRanking'}


In [52]:
target_worksheet = get_worksheet()


In [53]:
target_university = get_worksheet(
    spreadsheet_title= "working_extract_info_output",
    sheetname= "target_university_list",
    sheet_client= get_sheet_client(),
)


In [54]:
lst = extract_university_data(target_university)
print(lst[-20:])

[('Concord University', '2121'), ('Eastern West Virgina Community and Technical College', '2122'), ('Fairmont State University', '2123'), ('Marshall University', '2124'), ('Ohio Valley University', '2125'), ('Pierpont Community and Technical College', '2126'), ('Shepherd University', '2127'), ('University of Charleston', '2128'), ('West Liberty University', '2129'), ('Wheeling Jesuit University', '2130'), ('West Virginia Northern Community College', '2131'), ('West Virginia State University', '2132'), ('West Virginia University', '2133'), ('West Virginia Wesleyan College', '2134'), ('Casper College', '2135'), ('Central Wyoming College', '2136'), ('National Outdoor Leadership School', '2137'), ('Northwest College', '2138'), ('Northern Wyoming Community College District', '2139'), ('University of Wyoming', '2140')]


In [55]:
df = pd.DataFrame.from_dict(dict(lst), orient="index").reset_index()
df.columns = ["university_name", "id_"]
file_path = "all_universities.csv"
df.to_csv(file_path, index=False, encoding="utf-8")

In [56]:
df["university_name"]

0                      Adler Graduate Professional School
1         Alberta Business School - University of Alberta
2                                       Algoma University
3                                       Algonquin College
4       Asper School of Business - University of Manitoba
                              ...                        
2114                              Central Wyoming College
2115                   National Outdoor Leadership School
2116                                    Northwest College
2117          Northern Wyoming Community College District
2118                                University of Wyoming
Name: university_name, Length: 2119, dtype: object

In [57]:
from university_info_generator.configs.enum_class import GeneralInfoType, AttributeColumnType

In [58]:
import threading


In [59]:
import time

In [60]:
lst = extract_university_data(target_university)

target_name = ["toronto", "calgary", "british colombia", "victory", "simon fraser", "guelph", "waterloo"]

def my_filter(x):
    return any(name in x[0].lower() for name in target_name)

lst = list(filter(my_filter, lst))
print(lst)
print(len(lst))

[('Beedie School of Business - Simon Fraser University', '8'), ('Haskayne School of Business - University of Calgary', '54'), ('Lambton College (Toronto Campus/Cestar College)', '67'), ('Rotman School of Management - University of Toronto', '113'), ('Simon Fraser University', '128'), ('Simon Fraser University (SFU)', '129'), ('St. Clair College (Toronto Campus/Ace Acumen College)', '135'), ('University of Calgary', '158'), ('University of Calgary (UofC)', '159'), ('University of Guelph', '162'), ('University of Toronto', '177'), ('University of Toronto (UofT)', '178'), ('University of Waterloo', '181')]
13


In [61]:
# # def fetch_university_info(uni_gen, university_name, results, index):
# #     # Retrieve university info and store it in the results list at the correct index
# #     results[index] = uni_gen.get_university_info(university_name)

# # results = [None] * len(lst)
# # threads = []
# # for i, university in enumerate(lst[:3]):
# #     thread = threading.Thread(target=fetch_university_info, args=(uni_gen, university, results, i))
# #     threads.append(thread)
# #     thread.start()
# #     time.sleep(20)

# # for thread in threads:
# #     time.sleep(10)
# #     thread.join()
# # result = uni_gen.get_university_info(lst[0])
for uni, id_ in lst[:20]:
    uni_gen.get_university_info(uni, id_)

# uni_gen.get_info_by_attribute(university_name="UW", attribute_name="location")
# uni_gen.get_info_by_attribute(university_name="UW", attribute_name=GeneralInfoType.LOCATION.value, handler=HandlerType.LANGCHAIN_SERPER, params={"transformer":  "RecursiveURL"})
# uni_gen.get_info_by_attribute(university_name="Adler Graduate Professional School", attribute_name=GeneralInfoType.DOMESTIC_STUDENT_TUITION.value, handler=HandlerType.LANGCHAIN_SERPER, params={"transformer":  "RecursiveURL"})


used langchain: get_retrieved_attr_with_format_tavily, university_name: Simon Fraser University,
                attribute: university_type
used langchain: get_retrieved_attr_with_format_tavily, university_name: Simon Fraser University,
                attribute: years to graduate
used langchain: get_retrieved_attr_with_format_tavily, university_name: Simon Fraser University,
                attribute: location
used langchain: get_retrieved_attr_with_format_tavily, university_name: Simon Fraser University,
                attribute: graduation_rate
used langchain: get_retrieved_attr_with_format_tavily, university_name: Simon Fraser University,
                attribute: domestic_student_tuition
used langchain: get_retrieved_attr_with_format_tavily, university_name: Simon Fraser University,
                attribute: international_student_tuition
used langchain: get_retrieved_attr_with_format_tavily, university_name: Simon Fraser University,
                attribute: description
used l

## Saving Logic

In [62]:
uni_gen.save_to_file(SavedDictType.UNIVERSITY_INFO, "./university_info.jsonl")

In [68]:
uni_gen.save_to_file(SavedDictType.UNIVERSITY_BASIC_INFO, "./university_basic_info.jsonl")

In [64]:
uni_gen.save_to_file(SavedDictType.GPT_CACHE, "./gpt_cache.jsonl")

In [65]:
uni_gen.save_to_file(SavedDictType.TROUBLE_PRODUCED, "./trouble_produced.jsonl")

In [66]:
clear_worksheet_content(target_worksheet)

Cleared 20 rows and 23 columns.


In [67]:
write_cache_to_worksheet("university_info.jsonl", target_worksheet)
