In [1]:
import pandas as pd
import re, csv

In [2]:
# Synonym table: regular-expression pattern -> replacement text.
# BulletinBeautifier.replace_synonym_and_typo feeds every key to re.sub() in the
# insertion order below, so a later rule sees the output of the earlier ones
# (e.g. the 'year standing ... medicine students' rules depend on the earlier
# 'medical students' -> 'medicine students' rewrite). Do not reorder casually.
word_map = {
	'nan|see bulletin' : 'none',
	'(approval|approved) by the department|department approval|consent of department' : 'consent of the department',
	'(by )?consent of (the )?instructor' : 'consent of the instructor',
	' only|(none; |none ; )' : '',
	# Hyphens and the word 'for ' are both turned into a single space.
	'-|for ' : ' ',
	# 'year' not already followed by 'standing'/'program' gets 'standing' appended.
	'year (?!(standing|program))' : 'year standing ',
	'standing students' : 'standing',
	'medical students' : 'medicine students',
	'architectural students' : 'architecture students',
	'dental students' : 'dentistry students',
	'year standing(( ?; ?)| )medicine students' : 'year standing; and medicine students',
	'year standing(( ?; ?)| )optometry students' : 'year standing; and medicine students in optometry sub-major',
	'year standing(( ?; ?)| )veterinary medicine students' : 'year standing; and veterinary medicine students',
	'year standing(( ?; ?)| )architecture students' : 'year standing; and architecture students',
	'consent of (the )?(philosophy and religion|biology|english) department' : 'consent of the department',
}

# Typo table: literal substring -> replacement text.
# Keys are applied with str.replace() (not as regexes), in insertion order, so
# '????' really means four question marks and must stay ahead of '???', and
# 'divsion' -> 'division' is subsequently rewritten by 'division' -> 'department'.
typo_map = {
	'contsent' : 'consent',
	'instrucyor' : 'instructor',
	'intrutor' : 'instructor',
	'divsion' : 'division',
	'couse' : 'course',
	'????' : 'or',
	'???' : 'and',
	'foruth' : 'fourth',
	'englmajors' : 'engl. majors',
	'studentsor' : 'students or',
	'and458314' : 'and 458314',
	'; or;' : '; or',
	'division' : 'department',
	'majors' : 'major',
	' hc ' : ' home and community ',
	' me ' : ' mechanical engineering ',
	'concurrence' : 'concurrent',
	'consent ot the department' : 'consent of the department',
	'third year students of consent of the department' : 'third year students or consent of the department'
}

# Faculty names that BulletinBeautifier._longest_fac_match looks for in the words
# preceding 'students'.
# NOTE(review): word_map rewrites '-' to ' ' before add_prefix_for runs in the pipeline
# cell, so the hyphenated 'agro-industry' entry presumably never matches - confirm.
faculty = ['medicine', 'pharmacy', 'veterinary medicine', 'dentistry', 'architecture', 'science', 'associated medical sciences', 'economics', 'agro-industry', 'agriculture', 'humanities', 'engineering', 'arts, media and technology', 'business administration', 'education']

# Header row written above the bulletin rows by arr_to_csv in the pipeline cell.
# Presumably mirrors the column order of the input CSV (index 3 = 'pre_en', the
# condition text; index 4 = 'year_end'; index 5 = 'open_status') - verify against the data.
bulletin_header = ['courseno', 'title_long_en', 'degree_level', 'pre_en', 'year_end', 'open_status']

In [3]:
class BulletinBeautifier:
	"""Load bulletin rows from a CSV file and rewrite each row's condition text.

	Rows live in ``self.arr`` as plain Python lists and are addressed by
	position. The positions used below are expected to line up with the
	module-level ``bulletin_header`` that is written out together with the rows.
	"""

	# Positional indexes into a bulletin row.
	COURSE_NO = 0
	CONDITION = 3
	YEAR_END = 4
	OPEN_STATUS = 5
	# Course numbers are left-padded with '0' up to this many characters.
	COURSE_NO_WIDTH = 6

	def __init__(self, csv_path):
		"""Read ``csv_path`` and keep its rows (course numbers zero-padded) in ``self.arr``."""
		self.arr = self.csv_to_arr(csv_path, True)

	def csv_to_arr(self, csv_path, has_course_num = False):
		"""Read a comma-separated file and return its data rows as a list of lists.

		Args:
			csv_path: path handed to ``pandas.read_csv``.
			has_course_num: when true, column 0 is converted to ``str`` and
				left-padded with zeros to six characters.

		Returns:
			One plain ``list`` per data row. Plain lists are used instead of
			pandas Series because integer keys on a label-indexed Series are
			deprecated as positional lookups.
		"""
		data = pd.read_csv(csv_path, delimiter = ',')
		rows = data.values.tolist()
		if has_course_num:
			for row in rows:
				row[self.COURSE_NO] = str(row[self.COURSE_NO]).rjust(self.COURSE_NO_WIDTH, '0')
		return rows

	def arr_to_csv(self, header, des_path, arr = None):
		"""Write ``header`` followed by rows to ``des_path``.

		Args:
			header: sequence written as the first CSV row.
			des_path: output file path (overwritten).
			arr: rows to write; when omitted (``None``) ``self.arr`` is written.
				An explicitly passed empty list now yields a header-only file
				instead of silently falling back to ``self.arr``.
		"""
		rows = self.arr if arr is None else arr
		# newline='' is what the csv module asks for; without it some platforms
		# emit an extra blank line after every row.
		with open(des_path, 'w', newline = '') as f:
			writer = csv.writer(f)
			writer.writerow(header)
			writer.writerows(rows)

	def _map_conditions(self, transform):
		"""Replace every row's condition text with ``transform(old_text)``, in place."""
		for row in self.arr:
			row[self.CONDITION] = transform(row[self.CONDITION])

	def filter_corres_courses(self):
		"""Keep rows whose column 4 is 2599, whose course number modulo 1000 is
		below 700 and whose column 5 is not '0'; then drop the known duplicate."""
		# The `and` chain keeps the original short-circuit order, so int() is
		# only attempted on rows that already passed the 2599 check.
		self.arr = [
			row for row in self.arr
			if row[self.YEAR_END] == 2599
			and int(row[self.COURSE_NO]) % 1000 < 700
			and str(row[self.OPEN_STATUS]) != '0'
		]
		self.remove_duplicate_course()

	def convert_condition_to_lower(self):
		"""Stringify and lower-case every condition (a missing value becomes 'nan')."""
		self._map_conditions(lambda text: str(text).lower())

	def remove_unwanted_ws(self):
		"""Apply ``_remove_unwanted_ws`` to every condition."""
		self._map_conditions(self._remove_unwanted_ws)

	def _remove_unwanted_ws(self, s):
		"""Collapse runs of spaces, drop spaces just inside parentheses and trim the ends."""
		s = re.sub(' +', ' ', s)
		s = s.replace('( ', '(')
		s = s.replace(' )', ')')
		return s.strip()

	def remove_full_stop(self):
		"""Delete every '.' from every condition."""
		self._map_conditions(lambda text: text.replace('.', ''))

	def gen_condition_files(self, des_path):
		"""Write each condition to ``<des_path>condition<row index>.txt``.

		``des_path`` is used as a raw string prefix, so it must already end with
		a path separator if it names a directory.
		"""
		for index, row in enumerate(self.arr):
			with open(des_path + 'condition' + str(index) + '.txt', 'w') as f:
				f.write(row[self.CONDITION])

	def replace_synonym_and_typo(self):
		"""Apply ``typo_map`` (literal replacements) and then ``word_map`` (regex
		replacements) to every condition, both in dictionary order."""
		self._map_conditions(self._apply_typo_and_word_maps)

	def _apply_typo_and_word_maps(self, condition):
		"""Return ``condition`` after all typo and synonym rewrites."""
		for typo, correction in typo_map.items():
			condition = condition.replace(typo, correction)
		for pattern, replacement in word_map.items():
			condition = re.sub(pattern, replacement, condition)
			# Clean up after every rule: several rules leave double spaces behind.
			condition = self._remove_unwanted_ws(condition)
		return condition

	def remove_semicolon(self):
		"""Turn '; and ' / '; or ' separators into parenthesised and/or groups."""
		self._map_conditions(self._semicolons_to_parens)

	def _semicolons_to_parens(self, condition):
		"""Rewrite one condition; text without '; and ' or '; or ' is returned untouched."""
		if '; and ' not in condition and '; or ' not in condition:
			return condition
		condition = '(' + condition + ')'
		condition = condition.replace('; and ', ') and (')
		condition = condition.replace('; or ', ') or (')
		condition = self._remove_atomic_paren(condition)
		return self._remove_unwanted_ws(condition)

	def _remove_atomic_paren(self, s):
		"""Strip parentheses around groups that contain no ' and ', ' or ' or ','.

		Pairs are found left to right as "next '(' and the first ')' after it";
		nesting is not tracked. Scanning stops as soon as a '(' has no ')' after
		it, which leaves unbalanced input untouched instead of corrupting it.
		"""
		strt = s.find('(')
		end = s.find(')', strt) if strt != -1 else -1
		while strt != -1 and end != -1:
			in_paren = s[strt + 1 : end]
			if ' and ' not in in_paren and ' or ' not in in_paren and ',' not in in_paren:
				s = s[ : strt] + in_paren + s[end + 1 : ]
				# Two characters were removed, so resume slightly before the old
				# closing position; clamp so a tiny `end` cannot wrap to the tail.
				strt = s.find('(', max(end - 2, 0))
			else:
				strt = s.find('(', end)
			end = s.find(')', strt) if strt != -1 else -1
		return s

	def remove_optometry(self):
		"""Reduce 'medicine students in optometry sub-major' to 'medicine students'
		and turn a condition that is exactly 'for optometry students' into 'none'."""
		med_in_opt = 'medicine students in optometry sub-major'
		for_opt = 'for optometry students'
		for row in self.arr:
			if med_in_opt in row[self.CONDITION]:
				row[self.CONDITION] = row[self.CONDITION].replace(med_in_opt, 'medicine students')
			# NOTE(review): word_map rewrites 'for ' to ' ', so when this runs after
			# replace_synonym_and_typo the exact text below presumably never occurs - confirm.
			if row[self.CONDITION] == for_opt:
				row[self.CONDITION] = 'none'

	def normalize_condition_strings(self):
		"""Lower-case, remove full stops and tidy whitespace, in that order."""
		self.convert_condition_to_lower()
		self.remove_full_stop()
		self.remove_unwanted_ws()

	def merge_concurrence(self):
		"""Rewrite '<6 digits> or concurrent' as 'concurrent to <6 digits>' in every condition."""
		self._map_conditions(self._merge_concurrent_in)

	def _merge_concurrent_in(self, condition):
		"""Return ``condition`` with each eligible 'or concurrent' occurrence merged.

		An occurrence is merged when the two characters after the following
		separator are not 'to' and the six characters ending one position before
		the marker are all numeric. Anything else is copied through unchanged.
		"""
		marker = 'or concurrent'
		pieces = []
		cursor = 0
		found = condition.find(marker, cursor)
		while found != -1:
			after = found + len(marker)
			followed_by_to = condition[after + 1 : after + 3] == 'to'
			# Seven characters (six digits + separator) must exist before the marker;
			# otherwise the slice would wrap around to the end of the string.
			course = condition[found - 7 : found - 1] if found >= 7 else ''
			if not followed_by_to and course.isnumeric():
				pieces.append(condition[cursor : found - 7])
				pieces.append('concurrent to ' + course)
			else:
				pieces.append(condition[cursor : after])
			cursor = after
			found = condition.find(marker, cursor)
		pieces.append(condition[cursor : ])
		return ''.join(pieces)

	def replace_manually_fixed_conditions(self, csv_path):
		"""Overwrite conditions with column 2 of ``csv_path`` for matching course numbers.

		When the file lists a course number more than once, the last entry wins.
		"""
		fixed_by_course = {
			str(manual_row[0]) : manual_row[2]
			for manual_row in self.csv_to_arr(csv_path, True)
		}
		for arr_row in self.arr:
			course = str(arr_row[self.COURSE_NO])
			if course in fixed_by_course:
				arr_row[self.CONDITION] = fixed_by_course[course]

	def remove_duplicate_course(self):
		"""Drop the hard-coded row: course '112212' whose condition is '112106'."""
		self.arr = [
			row for row in self.arr
			if not (str(row[self.COURSE_NO]) == "112212" and str(row[self.CONDITION]) == "112106")
		]

	def _longest_fac_match(self, i, arr):
		"""Return the lowest token index before ``i`` at which the space-joined
		tokens from there up to ``i`` spell an entry of ``faculty``; ``i`` if none do."""
		cur = ''
		mn = i
		for ind in reversed(range(i)):
			if arr[ind] == ' ':
				continue
			cur = arr[ind] + ' ' + cur if cur != '' else arr[ind]
			if cur in faculty:
				mn = ind
		return mn

	def _is_non_fac(self, i, arr):
		"""Return True when any token before index ``i`` is exactly 'non'.

		NOTE(review): this inspects every earlier token, not only the one adjacent
		to the faculty name - confirm that is intended.
		"""
		return any(arr[ind] == 'non' for ind in range(i))

	def add_prefix_for(self):
		"""Insert 'for ' / 'not for ' before the faculty phrase preceding each
		'students' token, then delete every 'non ' from the condition."""
		self._map_conditions(self._prefix_students)

	def _prefix_students(self, condition):
		"""Rewrite one condition; text without 'students' is returned untouched."""
		if 'students' not in condition:
			return condition
		# The character class also contains a literal '|', so '|' is a delimiter too.
		tokens = re.split(r'([(|)| ])', condition)
		insertions = []
		for i, token in enumerate(tokens):
			if token != 'students':
				continue
			# Clamp: with 'students' as the very first token the helper yields -1,
			# and list.insert(-1, ...) would place the prefix before the LAST token.
			# NOTE(review): when no faculty matches, the prefix lands before the
			# delimiter preceding 'students' ("x students" -> "xfor  students") - confirm.
			start = max(self._longest_fac_match(i - 1, tokens), 0)
			prefix = 'not for ' if self._is_non_fac(start, tokens) else 'for '
			insertions.append((start, prefix))
		# Each earlier insertion shifts the following positions by one.
		for shift, (position, prefix) in enumerate(insertions):
			tokens.insert(position + shift, prefix)
		return ''.join(tokens).replace('non ', '')

	# temporary method
	def replace_err(self, csv_path):
		"""Set the condition to 'none' for every course number listed in column 0 of ``csv_path``."""
		err_courses = {err_row[0] for err_row in self.csv_to_arr(csv_path, True)}
		for arr_row in self.arr:
			if arr_row[self.COURSE_NO] in err_courses:
				arr_row[self.CONDITION] = 'none'

	# temporary method
	def filter_good(self, csv_path):
		"""Set the condition to 'none' for every course number NOT listed in column 0 of ``csv_path``."""
		manual_courses = {manual_row[0] for manual_row in self.csv_to_arr(csv_path, True)}
		for arr_row in self.arr:
			if arr_row[self.COURSE_NO] not in manual_courses:
				arr_row[self.CONDITION] = 'none'

In [4]:
# Load the raw bulletin export; the constructor zero-pads column 0 (course number) to six characters.
b = BulletinBeautifier('./csv/reg-condition.csv')
# Keep only rows with column 4 == 2599, course number % 1000 below 700 and column 5 != '0',
# then drop the hard-coded duplicate of course 112212.
b.filter_corres_courses()
# Snapshot the filtered (still un-normalised) rows.
b.arr_to_csv(bulletin_header, './csv/reg-condition-corres.csv')
# Lower-case the condition text, strip full stops, tidy whitespace.
b.normalize_condition_strings()
# Apply typo_map (literal) and then word_map (regex) rewrites.
b.replace_synonym_and_typo()
# Turn '; and ' / '; or ' separators into parenthesised groups.
b.remove_semicolon()
# Rewrite '<6 digits> or concurrent' as 'concurrent to <6 digits>'.
b.merge_concurrence()
# Collapse the optometry sub-major wording back to plain 'medicine students'.
b.remove_optometry()
# Insert 'for ' / 'not for ' in front of faculty phrases that precede 'students'.
b.add_prefix_for()
# Overwrite conditions with the hand-corrected text (column 2 of the file) for matching course numbers.
b.replace_manually_fixed_conditions('./manually-fixed-conditions.csv')
# Temporary: blank out ('none') every course listed in merged.csv.
# NOTE(review): merged.csv is written by merge_error_mapping() in a later cell, so that
# file must already exist on disk when this cell runs on a fresh kernel.
b.replace_err('./merged.csv')
# Temporary, currently disabled: blank out every course that is NOT in the manual-fix file.
# b.filter_good('./manually-fixed-conditions.csv')
# Write the fully rewritten rows.
b.arr_to_csv(bulletin_header, './csv/reg-condition-corres-refactored.csv')
# Emit one conditions/condition<row index>.txt file per row.
b.gen_condition_files('./conditions/')

In [19]:
def gen_err_lst():
	"""Turn the lines of ./grammar/out.txt into ./err_lst.csv.

	For every non-blank line, the digits between character 23 and the next '.'
	are used as an index into ``b.arr``; the row's course number (zero-padded to
	six characters), the text after the first space of the line, and the literal
	'none' are written as one CSV row.

	Raises:
		ValueError: if the text between character 23 and the next '.' is not an integer.
		IndexError: if that integer is not a valid index into ``b.arr``.
	"""
	# Presumably the length of a '../conditions/condition' path prefix (23 characters)
	# that starts every line of out.txt - confirm against the grammar tool's output.
	prefix_len = 23
	arrs = []
	# `with` closes the input file; the original left the handle open.
	with open('./grammar/out.txt', 'r') as file:
		for line in file:
			line = line.strip()
			if not line:
				# A blank line carries no index and would otherwise crash int('').
				continue
			idx = line[prefix_len : ].find('.')
			c_idx = int(line[prefix_len : prefix_len + idx])
			c_num = str(b.arr[c_idx][0]).rjust(6, '0')
			condition = line[line.find(' ') + 1 : ]
			arrs.append([c_num, condition, 'none'])
	# newline='' is what the csv module asks for, to avoid blank lines between rows.
	with open('./err_lst.csv', 'w', newline = '') as f:
		writer = csv.writer(f)
		writer.writerow(['courseNumber', 'condition', 'fixed_condition'])
		writer.writerows(arrs)

In [51]:
def gen_replace_file():
	"""Split ./err_lst.csv into ./replace.csv and ./error.csv.

	Rows whose third column is anything other than 'none' go to replace.csv;
	rows whose third column is 'none' go to error.csv. Both files get the same
	three-column header.

	NOTE(review): err_lst.csv is read without course-number padding, so leading
	zeros of the course number are presumably lost in both outputs - confirm
	whether that matters to the consumers of these files.
	"""
	header = ['courseNumber', 'old_condition', 'new_condition']
	replace = []
	same = []
	for row in b.csv_to_arr('./err_lst.csv'):
		(replace if row[2] != 'none' else same).append(row)
	for path, rows in (('./replace.csv', replace), ('./error.csv', same)):
		# newline='' is what the csv module asks for, to avoid blank lines between rows.
		with open(path, 'w', newline = '') as f:
			writer = csv.writer(f)
			writer.writerow(header)
			writer.writerows(rows)

In [None]:
# Course numbers collected by count_not_active(); module-level, so it keeps
# growing if the function is called more than once.
lst = []

def count_not_active():
	"""Print each course number from the manual-fix file that is also a row of
	the raw bulletin with column 5 == '0', column 4 == 2599 and number % 1000 < 700."""
	for bulletin_row in b.csv_to_arr('./csv/reg-condition.csv', True):
		if str(bulletin_row[5]) == '0' and bulletin_row[4] == 2599 and int(bulletin_row[0]) % 1000 < 700:
			lst.append(str(bulletin_row[0]))
	for fixed_row in b.csv_to_arr('./manually-fixed-conditions.csv', True):
		if any(course == fixed_row[0] for course in lst):
			print(fixed_row[0])

count_not_active()

In [17]:
def filter_mapping_exception():
	"""Write columns 0 and 8 of every ./mapping_exception.csv row whose column 6
	stringifies to '2599' into ./mapping_exception_filtered.csv."""
	source_rows = b.csv_to_arr('./mapping_exception.csv')
	kept = [[row[0], row[8]] for row in source_rows if str(row[6]) == '2599']
	b.arr_to_csv(['courseNumber', 'newCondition'], './mapping_exception_filtered.csv', kept)

In [20]:
def merge_error_mapping():
	"""Merge ./mapping_exception_filtered.csv into ./err_lst.csv and write ./merged.csv.

	Every err_lst row is kept; when its course number (column 0) also appears in
	the filtered mapping file, its third column is replaced by that file's second
	column. If a course number appears several times in the mapping file, the
	last entry wins.
	"""
	fixed = b.csv_to_arr('./mapping_exception_filtered.csv')
	err = b.csv_to_arr('./err_lst.csv')
	new_condition_by_course = {str(f[0]) : f[1] for f in fixed}
	merged = []
	for e in err:
		course = str(e[0])
		if course in new_condition_by_course:
			e[2] = new_condition_by_course[course]
		merged.append(e)
	# The rows carry three fields, so the header needs three names. With only two,
	# pandas.read_csv treats the first field (the course number) as the index and
	# any reader using column 0 as the course number gets the condition text instead.
	b.arr_to_csv(['courseNumber', 'condition', 'newCondition'], './merged.csv', merged)

In [7]:
def get_course_ids():
	"""Write every course number in ``b.arr`` to ./courseIds.txt as a
	double-quoted, zero-padded (six characters) value followed by a comma,
	all on one line."""
	rows = b.arr
	with open('./courseIds.txt', 'w') as out:
		for row in rows:
			course = str(row[0])
			out.write('"' + course.rjust(6, '0') + '",')

get_course_ids()