# Process logic

## Set up environment

In [4]:
%%sh
# Locate the project checkout and source its `build` script.
# NOTE: %%sh runs in a subprocess, so anything exported here is visible to the
# sourced script but does not persist into the Python kernel.
current_path=$(pwd)
# Was `dirname "$current_dir"`: that variable is never set, so parent_dir was always ".".
parent_dir=$(dirname "$current_path")
# Honour a PROJECT_HOME that is already set; otherwise fall back to the original location.
export PROJECT_HOME="${PROJECT_HOME:-/home/ivan/Uforse/university_crawl}"
echo "$PROJECT_HOME"
if [ -f "${PROJECT_HOME}/build" ]; then
    echo "Found build in PROJECT_HOME. Sourcing now..."
    # Sourced (not executed) so the script shares this shell's variables.
    . "${PROJECT_HOME}/build"
    echo "build executed successfully."
else
    echo "Error: build not found in PROJECT_HOME."
fi

/home/ivan/Uforse/university_crawl
Found .bashrc in PROJECT_HOME. Sourcing now...
Obtaining file:///home/ivan/Uforse/university_crawl/university_info_generator
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Installing collected packages: university-info-generator
  Attempting uninstall: university-info-generator
    Found existing installation: university-info-generator 0.0.3
    Uninstalling university-info-generator-0.0.3:
      Successfully uninstalled university-info-generator-0.0.3
  Running setup.py develop for university-info-generator
Successfully installed university-info-generator-0.0.3
Directory already exists: /home/ivan/Uforse/university_crawl/cache_repo
Environment variables set:
PROJECT_HOME=/home/ivan/Uforse/university_crawl
PYTHONPATH=/home/ivan/Uforse/university_crawl:
API_KEY_PATH=/home/ivan/Uforse/university_crawl
CACHE_REPO_PATH=/home/ivan/Uforse/university_crawl/cache_repo
LANGCHAIN_API_KEY=<redacted>

In [5]:
import os
import sys
from pprint import pprint

# Directory the notebook is executed from,
# e.g. '/home/ivan/Uforse/university_crawl/fetch_logic'.
current_path = os.path.abspath('./')

# Its parent directory, e.g. '/home/ivan/Uforse/university_crawl'.
parent_path = os.path.dirname(current_path)

# Show which interpreter the kernel is using.
print(sys.executable)

# Make the project importable. Each entry is appended only if it is missing, so
# re-running this cell does not keep growing sys.path with duplicates.
for project_path in (parent_path, '/home/ivan/Uforse/university_crawl'):
    if project_path not in sys.path:
        sys.path.append(project_path)

/home/ivan/anaconda3/envs/gpu/bin/python


In [6]:
import university_info_generator

In [7]:
from university_info_generator import config
from university_info_generator import UniversityInfoGenerator as uni
from university_info_generator import BasicInfoType, SavedDictType, GPTMethodType, HandlerType
from university_info_generator.utility.google_sheet_utility import *
from university_info_generator.utility.save_load_utility import *
from university_info_generator.configs import ALL_ATTRIBUTE_NAME

## Useful methods

In [8]:
def filter_jsonl_file(input_filepath, output_filepath, keyword):
    """
    Copy a JSONL file, dropping every line that contains ``keyword``.

    The match is a plain substring test on the raw line text; lines are not
    parsed as JSON. An empty ``keyword`` is a substring of every line, so it
    removes all lines.

    Filtering a file in place (``input_filepath`` and ``output_filepath``
    referring to the same file) is supported: the surviving lines are read
    into memory first so that opening the file for writing does not destroy
    the data before it has been read.

    Args:
    - input_filepath (str): Path to the input JSONL file.
    - output_filepath (str): Path to the output JSONL file where results should be saved.
    - keyword (str): Text to search for in each line to decide if it should be excluded.

    Raises:
    - FileNotFoundError: If ``input_filepath`` does not exist.
    """
    import os  # local import keeps this helper self-contained

    try:
        same_file = os.path.samefile(input_filepath, output_filepath)
    except OSError:
        # At least one path does not exist yet, so they cannot be the same file.
        same_file = False

    if same_file:
        # Opening with 'w' truncates immediately, so buffer the kept lines first.
        with open(input_filepath, 'r', encoding='utf-8') as infile:
            kept_lines = [line for line in infile if keyword not in line]
        with open(output_filepath, 'w', encoding='utf-8') as outfile:
            outfile.writelines(kept_lines)
        return

    # Distinct files: stream line by line so large inputs are never fully loaded.
    with open(input_filepath, 'r', encoding='utf-8') as infile, \
            open(output_filepath, 'w', encoding='utf-8') as outfile:
        outfile.writelines(line for line in infile if keyword not in line)

def extract_university_names(worksheet):
    """
    Return the non-blank entries of the worksheet's first column.

    Every value returned by ``worksheet.col_values(1)`` is stripped of
    surrounding whitespace; values that are empty after stripping are dropped.
    No row is treated specially, so a header cell (if the sheet has one) is
    still included and must be skipped by the caller.
    """
    cleaned_names = []
    for raw_value in worksheet.col_values(1):
        trimmed = raw_value.strip()
        if trimmed:
            cleaned_names.append(trimmed)
    return cleaned_names

# # threading template
# def fetch_university_info(uni_gen, university_name, results, index):
#     # Retrieve university info and store it in the results list at the correct index
#     results[index] = uni_gen.get_university_info(university_name)

# results = [None] * len(lst)
# threads = []
# for i, university in enumerate(lst[:3]):
#     thread = threading.Thread(target=fetch_university_info, args=(uni_gen, university, results, i))
#     threads.append(thread)
#     thread.start()

# for thread in threads:
#     thread.join()

## Get attribute list

In [9]:
# Fetch the attribute table through the helper pulled in by the star imports above.
# NOTE(review): presumably a DataFrame of attribute definitions (going by the name
# and the commented-out .head() call) — confirm. It is not used by any later
# visible cell.
attr_df = get_attribute_df()
# attr_df.head(25)

In [10]:
# Write the attribute dictionary to the attribute cache file; the generator below
# loads its attribute definitions from this same path.
store_cache(config.UNIVERSITY_ATTRIBUTE_CACHE_FILE_PATH, get_attribute_dict())

## Main train logic

In [11]:
# Create the generator (`uni` is the alias for UniversityInfoGenerator from the
# imports above) and load its state from local files.
uni_gen = uni()
# Attribute definitions come from the cache file written by the store_cache cell.
uni_gen.load_from_file(SavedDictType.ATTRIBUTE, config.UNIVERSITY_ATTRIBUTE_CACHE_FILE_PATH)
# Loading the GPT cache is currently disabled.
# uni_gen.load_from_file(SavedDictType.GPT_CACHE, "./gpt_cache.jsonl")
uni_gen.load_from_file(SavedDictType.UNIVERSITY_BASIC_INFO, "./university_basic_info.jsonl")
# Loading previously generated university info is currently disabled.
# uni_gen.load_from_file(SavedDictType.UNIVERSITY_INFO, "./university_info.jsonl")


In [12]:
# pprint(uni_gen.attribute_dict)

In [13]:
# Sanity check: list the fields stored for a single attribute ("location").
pprint(uni_gen.attribute_dict["location"].keys())

dict_keys(['attribute_format', 'attribute_reference', 'attribute_prompt', 'example', 'handler', 'k_value', 'mapping'])


In [14]:
# Inspect the full definition of one attribute (the ARWU 2023 ranking entry).
pprint(uni_gen.attribute_dict["ranking_arwu_rank_2023"])

{'attribute_format': 'Union[int, str]',
 'attribute_prompt': '',
 'attribute_reference': '["https://www.shanghairanking.com/rankings/arwu/2023.html", '
                        '"https://www.shanghairanking.com/institution"]',
 'example': '"44", "100-200"',
 'handler': <HandlerType.LANGCHAIN_TAVILY: 16>,
 'k_value': 3.0,
 'mapping': '2023 Academic Ranking of World Universities, ShanghaiRanking'}


In [15]:
# Worksheet handle obtained with get_worksheet()'s default arguments; the saving
# section later clears this sheet and writes the results into it.
target_worksheet = get_worksheet()


In [16]:
# Worksheet "target_university_list" in the spreadsheet "working_extract_info_output";
# its first column is read below to obtain the university names to process.
target_university = get_worksheet(
    spreadsheet_title= "working_extract_info_output",
    sheetname= "target_university_list",
    sheet_client= get_sheet_client(),
)


In [17]:
# Drop the first entry (presumably the column header — confirm against the sheet)
# and keep the next ten names as a small working sample.
lst = extract_university_names(target_university)[1:11]
# Add a few hand-picked names, including abbreviations, to the sample.
lst += ["UW", "UofT", "University Of British Columbia", "University of Calgary"]
pprint(lst)

['Adler Graduate Professional School',
 'Alberta Business School - University of Alberta',
 'Algoma University',
 'Algonquin College',
 'Asper School of Business - University of Manitoba',
 'Assiniboine Community College',
 'Athabasca University',
 'Beedie School of Business - Simon Fraser University',
 "Bishop's University",
 'Bow Valley College',
 'UW',
 'UofT',
 'University Of British Columbia',
 'University of Calgary']


In [18]:
# Show the reference URLs configured for the ARWU 2023 ranking attribute.
# Per the recorded output this is a single string holding a JSON-style list, not a list.
pprint(uni_gen.attribute_dict["ranking_arwu_rank_2023"]["attribute_reference"])

('["https://www.shanghairanking.com/rankings/arwu/2023.html", '
 '"https://www.shanghairanking.com/institution"]')


In [19]:
from university_info_generator.configs.enum_class import GeneralInfoType, AttributeColumnType

In [20]:
# Print the default value defined for the K_VALUE attribute column.
print(AttributeColumnType.K_VALUE.get_default_value())

10


In [21]:
import threading


In [22]:
import time

In [None]:
# def fetch_university_info(uni_gen, university_name, results, index):
#     # Retrieve university info and store it in the results list at the correct index
#     results[index] = uni_gen.get_university_info(university_name)

# results = [None] * len(lst)
# threads = []
# for i, university in enumerate(lst[:3]):
#     thread = threading.Thread(target=fetch_university_info, args=(uni_gen, university, results, i))
#     threads.append(thread)
#     time.sleep(30)
#     thread.start()

# for thread in threads:
#     thread.join()
# result = uni_gen.get_university_info(lst[0])

# for uni in lst:
#     uni_gen.get_university_info(uni)

# uni_gen.get_info_by_attribute(university_name="UW", attribute_name="location")
# Fetch one attribute (location) for one university, overriding the handler and
# passing a transformer option explicitly.
uni_gen.get_info_by_attribute(
    university_name="UW",
    attribute_name=GeneralInfoType.LOCATION.value,
    handler=HandlerType.LANGCHAIN_SERPER,
    params={"transformer": "RecursiveURL"},
)

('["https://www.shanghairanking.com/rankings/arwu/2023.html", '
 '"https://www.shanghairanking.com/institution"]')


## Saving Logic

In [None]:
# Save the generated university info; the worksheet export cell at the end of this
# section reads this same file.
uni_gen.save_to_file(SavedDictType.UNIVERSITY_INFO, "./university_info.jsonl")

In [None]:
# uni_gen.save_to_file(SavedDictType.UNIVERSITY_BASIC_INFO, "./university_basic_info.jsonl")

In [None]:
# Save the GPT cache to a local JSONL file.
# NOTE(review): the matching load in the setup cell is commented out, so this file
# is not read back by this notebook as it stands.
uni_gen.save_to_file(SavedDictType.GPT_CACHE, "./gpt_cache.jsonl")

In [None]:
# Save the TROUBLE_PRODUCED records to a local JSONL file.
# NOTE(review): presumably entries that failed during generation — confirm.
uni_gen.save_to_file(SavedDictType.TROUBLE_PRODUCED, "./trouble_produced.jsonl")

In [None]:
# Clear the target worksheet before the export cell that follows.
# NOTE(review): this runs before the write; if the write fails, the sheet may be
# left empty — consider validating the JSONL first. TODO confirm.
clear_worksheet_content(target_worksheet)

In [None]:
# Export the saved university info file to the target worksheet.
# NOTE(review): the recorded run reported "Missing columns in DataFrame that are
# expected in the worksheet"; presumably the JSONL lacked the expected attribute
# columns (e.g. it was empty) — verify the file contents before exporting.
write_cache_to_worksheet("university_info.jsonl", target_worksheet)


An error occurred: Missing columns in DataFrame that are expected in the worksheet: {'graduation_rate', 'ranking_times_rank_2024', 'characteristics', 'university_type', 'popular_programs', 'description', 'website', 'domestic_student_tuition', 'international_student_tuition', 'others', 'abbreviation', 'statistics', 'ranking_us_news_2023', 'ranking_qs_news_2024', 'location', 'important_calendar', 'programs', 'faculty', 'id_', 'ranking_arwu_rank_2023', 'graduation_year', 'university_name', 'wikipedia'}
