## Set up the environment

In [1]:
# Run the project's .bashrc from the project root, then echo CACHE_REPO_PATH as a check.
# NOTE(review): every `!` line is executed in its own subshell, so the `cd` calls and any
# variables exported by .bashrc presumably do not carry over to the kernel or to the
# following `!` lines — confirm, and use `%cd` / `%env` if persistence is intended.
! cd /home/ivan/Uforse/uni_info_generator/ && bash .bashrc
! echo $CACHE_REPO_PATH
! cd /home/ivan/Uforse/uni_info_generator/

Obtaining file:///home/ivan/Uforse/uni_info_generator/university_info_generator
  Preparing metadata (setup.py) ... [?25ldone
Installing collected packages: university-info-generator
  Attempting uninstall: university-info-generator
    Found existing installation: university-info-generator 0.1.2
    Uninstalling university-info-generator-0.1.2:
      Successfully uninstalled university-info-generator-0.1.2
  Running setup.py develop for university-info-generator
Successfully installed university-info-generator-0.1.2
Environment variables set:
PROJECT_HOME=/home/ivan/Uforse/uni_info_generator
PYTHONPATH=/home/ivan/Uforse/uni_info_generator:
API_KEY_PATH=/home/ivan/Uforse/uni_info_generator
CACHE_REPO_PATH=/home/ivan/Uforse/uni_info_generator/cache_repo
Environment set up for uni_info_generator



In [2]:
import os
import sys
from pprint import pprint

# Absolute path of the current working directory,
# e.g. '/home/ivan/Uforse/uni_info_generator/fetch_logic'
current_path = os.path.abspath('./')

# Parent of the working directory, e.g. '/home/ivan/Uforse/uni_info_generator'
parent_path = os.path.dirname(current_path)
# Put the parent directory on the import path so modules located there can be imported.
sys.path.append(parent_path)


In [3]:
import os
import sys

# Show which interpreter the kernel is running on.
print(sys.executable)

# Resolve the project root as the parent of the working directory instead of
# hard-coding a machine-specific absolute path, so the notebook also works on
# other checkouts. The membership test keeps re-runs of this cell from piling
# up duplicate sys.path entries.
project_root = os.path.dirname(os.path.abspath('./'))
if project_root not in sys.path:
    sys.path.append(project_root)


/home/ivan/anaconda3/envs/gpu/bin/python


In [4]:
from university_info_generator import config
from university_info_generator import UniversityInfoGenerator as uni
from university_info_generator import BasicInfoType, SavedDictType, GPTMethodType
from university_info_generator.utility.google_sheet_utility import *
from university_info_generator.utility.save_load_utility import *

## Helper functions

In [5]:
def filter_jsonl_file(input_filepath, output_filepath, keyword):
    """
    Copy a JSONL file, dropping every line that contains ``keyword``.

    The match is a plain substring test on the raw text of each line (the JSON
    is not parsed), so the keyword can hit keys, values or punctuation alike.
    An empty keyword is a substring of every line, so it removes all lines.

    Filtering in place is supported: when ``output_filepath`` refers to the
    same file as ``input_filepath``, the surviving lines are collected before
    the file is reopened for writing. Opening both handles at once would
    truncate the input before a single line had been read.

    Args:
    - input_filepath (str): Path to the input JSONL file.
    - output_filepath (str): Path to the output JSONL file where results should be saved.
    - keyword (str): Substring that marks a line for exclusion.

    Raises:
    - FileNotFoundError: If ``input_filepath`` does not exist.
    """
    import os  # local import keeps this helper cell self-contained

    # samefile() needs both paths to exist, hence the exists() guard on the output.
    in_place = os.path.exists(output_filepath) and os.path.samefile(input_filepath, output_filepath)

    if in_place:
        # Read everything first; only then is it safe to truncate the file.
        with open(input_filepath, 'r', encoding='utf-8') as infile:
            kept_lines = [line for line in infile if keyword not in line]
        with open(output_filepath, 'w', encoding='utf-8') as outfile:
            outfile.writelines(kept_lines)
        return

    # Distinct files: stream line by line so large inputs are never held in memory.
    with open(input_filepath, 'r', encoding='utf-8') as infile, \
            open(output_filepath, 'w', encoding='utf-8') as outfile:
        for line in infile:
            if keyword not in line:
                outfile.write(line)

def extract_university_names(worksheet):
    """
    Return the non-blank entries of the worksheet's first column.

    Each value is whitespace-trimmed; entries that are empty after trimming are
    dropped. Nothing else is removed, so a header cell (if the sheet has one)
    is still part of the result and must be skipped by the caller.

    Args:
    - worksheet: Object exposing ``col_values(index)``; column 1 is requested.

    Returns:
    - list[str]: Trimmed, non-empty values in their original order.
    """
    cleaned_names = []
    for raw_value in worksheet.col_values(1):
        trimmed = raw_value.strip()
        if trimmed != '':
            cleaned_names.append(trimmed)
    return cleaned_names

## Get the attribute list

In [6]:
# Fetch the attribute definitions; presumably a DataFrame, judging by the name and the
# `.head` preview below — confirm against get_attribute_df.
attr_df = get_attribute_df()
# attr_df.head(25)

In [7]:
# Store the result of get_attribute_dict() under the attribute cache path from `config`.
store_cache(config.UNIVERSITY_ATTRIBUTE_CACHE_FILE_PATH, get_attribute_dict())

In [8]:
# Read the attribute cache back; as the cell's last expression the loaded value is displayed.
load_cache(config.UNIVERSITY_ATTRIBUTE_CACHE_FILE_PATH)

{'university_type': {'attribute_format': 'string',
  'attribute_reference': nan,
  'attribute_prompt': 'Focus on whether the university is public or private',
  'example': 'Public University',
  'handler': 'GPT_BASIC'},
 'abbreviation': {'attribute_format': 'string',
  'attribute_reference': nan,
  'attribute_prompt': nan,
  'example': 'UBC',
  'handler': 'GPT_BASIC'},
 'graduation_year': {'attribute_format': 'int',
  'attribute_reference': nan,
  'attribute_prompt': 'The time, years, for general students, undergraduate, to graduate.',
  'example': 4,
  'handler': 'LANGCHAIN_TAVILY'},
 'location': {'attribute_format': '["<city>, <province>, <country>", "<city>, <province>, <country>"]\'',
  'attribute_reference': nan,
  'attribute_prompt': 'The different locations of the campus in different cities.\nYou can check location on wikipedia and official websites. Find all the campus that the university currently have, includes main campus and others',
  'example': 'Vancouver campus, British 

## Main logic: generate university info

In [9]:
# Create the generator (`uni` is the import alias of UniversityInfoGenerator) and
# preload previously saved state from disk.
uni_gen = uni()
# Attribute definitions come from the attribute cache file configured in `config`.
uni_gen.load_from_file(SavedDictType.ATTRIBUTE, config.UNIVERSITY_ATTRIBUTE_CACHE_FILE_PATH)
# Disabled: the GPT cache is not loaded here.
# uni_gen.load_from_file(SavedDictType.GPT_CACHE, "./gpt_cache.jsonl")
# Basic info is read from a file relative to the working directory.
uni_gen.load_from_file(SavedDictType.UNIVERSITY_BASIC_INFO, "./university_basic_info.jsonl")
# Disabled: previously generated university info is not loaded here.
# uni_gen.load_from_file(SavedDictType.UNIVERSITY_INFO, "./university_info.jsonl")


In [10]:
# Inspect the attribute definitions held by the generator after loading.
pprint(uni_gen.attribute_dict)

{'abbreviation': {'attribute_format': 'string',
                  'attribute_prompt': nan,
                  'attribute_reference': nan,
                  'example': 'UBC',
                  'handler': <HandlerType.GPT_BASIC: 1>},
 'characteristics': {'attribute_format': 'List[str]',
                     'attribute_prompt': 'Consider some overview and '
                                         'interesting facts about the '
                                         'university',
                     'attribute_reference': nan,
                     'example': '[ "The University of British Columbia (UBC) '
                                'is distinguished by a variety of unique '
                                'features and achievements that contribute to '
                                'its reputation as a leading global center for '
                                'teaching, learning, and research. Here are '
                                'some key highlights:  ",  "Sustainability 

In [11]:
# Inspect the definition of one attribute as a spot check.
pprint(uni_gen.attribute_dict["ranking_us_news_2023"])

{'attribute_format': 'int',
 'attribute_prompt': nan,
 'attribute_reference': '["https://www.usnews.com/education/best-global-universities/","https://www.usnews.com/education/best-global-universities/search","https://www.usnews.com/education/best-global-universities/canada",]',
 'example': 35,
 'handler': <HandlerType.LANGCHAIN_TAVILY: 2>}


In [12]:
# Worksheet obtained with get_worksheet's default arguments; it is cleared and rewritten
# at the end of the notebook.
target_worksheet = get_worksheet()


In [13]:
# Worksheet whose first column supplies the university names to process.
target_university = get_worksheet(
    spreadsheet_title= "working_extract_info_output",
    sheetname= "target_university_list",
    sheet_client= get_sheet_client(),
)

In [None]:
# Skip the first entry (the header cell) and keep the next ten names,
# then add a few hand-picked universities on top.
lst = extract_university_names(target_university)[1:11]
lst += ["UW", "UofT", "University Of British Columbia", "University of Calgary"]
pprint(lst)

In [None]:
# Request info for every selected university. The return value is discarded, so the
# results are presumably accumulated inside `uni_gen` — confirm against get_university_info.
for university in lst:
    uni_gen.get_university_info(university)
# Disabled: single-attribute lookup for one university.
# uni_gen.get_info_by_attribute(university_name="UW", attribute_name="statistics")

In [17]:
# Save the generator's university info to a JSONL file in the working directory.
uni_gen.save_to_file(SavedDictType.UNIVERSITY_INFO, "./university_info.jsonl")

In [18]:
# uni_gen.save_to_file(SavedDictType.UNIVERSITY_BASIC_INFO, "./university_basic_info.jsonl")

In [19]:
# Save the generator's GPT cache to a JSONL file in the working directory.
uni_gen.save_to_file(SavedDictType.GPT_CACHE, "./gpt_cache.jsonl")

In [24]:
# Destructive: clears the existing content of the target worksheet.
clear_worksheet_content(target_worksheet)

Cleared 28 rows and 23 columns.


In [25]:
# Write the contents of university_info.jsonl (path relative to the working directory)
# to the target worksheet.
write_cache_to_worksheet("university_info.jsonl", target_worksheet)
