In [8]:
import pandas as pd
import re, pprint, csv

In [9]:
# Synonym table: each key is a phrase variant found in prerequisite conditions,
# each value is the canonical phrase it is rewritten to.
# BulletinBeautifier.replace_synonym_and_typo applies these with plain substring
# replacement (str.replace), in insertion order, AFTER typo_map has been applied.
# Conditions are lower-cased before this table is used, so all keys are lower-case.
word_map = {
	# department / division approval wording -> 'consent of the department'
	'approval by the department' : 'consent of the department',
	'approved by the department' : 'consent of the department',
	'department approval' : 'consent of the department',
	'consent of department' : 'consent of the department',
	'consent of the division' : 'consent of the department',
	'consent of division' : 'consent of the department',
	# instructor approval wording -> 'consent of the instructor'
	'by consent of the instructor' : 'consent of the instructor',
	'consent of instructor' : 'consent of the instructor',
	# 'for Nth year standing' -> 'Nth year standing'
	'for first year standing' : 'first year standing',
	'for second year standing' : 'second year standing',
	'for third year standing' : 'third year standing',
	'for fourth year standing' : 'fourth year standing',
	'for fifth year standing' : 'fifth year standing',
	'for sixth year standing' : 'sixth year standing',
	# 'Nth year students' -> 'Nth year standing'
	'first year students' : 'first year standing',
	'second year students' : 'second year standing',
	'third year students' : 'third year standing',
	'fourth year students' : 'fourth year standing',
	'fifth year students' : 'fifth year standing',
	'sixth year students' : 'sixth year standing',
	# convert_condition_to_lower runs str(...).lower() on the cell, so a missing
	# (NaN) condition presumably arrives here as the text 'nan'.
	# NOTE(review): replacement is by substring, so this also rewrites 'nan'
	# inside any longer word - confirm no real condition contains it.
	'nan' : 'none',
	'concurrence' : 'concurrent'
}

# Typo table: misspellings / garbled fragments -> corrected text.
# Applied before word_map, also by substring replacement in insertion order.
typo_map = {
	'contsent' : 'consent',
	'instrucyor' : 'instructor',
	'intrutor' : 'instructor',
	'divsion' : 'division',
	# Order matters: '???' is a substring of '????', so the four-character
	# entry has to come first or it would never match.
	# NOTE(review): these look like mis-encoded non-ASCII words - confirm
	# against the raw CSV.
	'????' : 'or',
	'???' : 'and',
	'consent ot the department' : 'consent of the department',
	# Fixes 'of' -> 'or' in this one specific sentence; word_map later turns
	# 'third year students' into 'third year standing'.
	'third year students of consent of the department' : 'third year students or consent of the department'
}

# Header row written at the top of the output CSV files by arr_to_csv.
# The positions match the row indices used by BulletinBeautifier:
# 0 = courseno, 3 = pre_en (the condition text), 4 = year_end.
bulletin_header = ['courseno', 'title_long_en', 'degree_level', 'pre_en', 'year_end', 'open_status']

In [10]:
class BulletinBeautifier :
	"""Clean up the prerequisite-condition column of a course bulletin.

	A "row" is any positional sequence (a list, or a pandas Series as produced
	by csv_to_arr). Column positions are assumed to follow the order of the
	bulletin header: 0 = course number, 3 = condition text, 4 = end year.

	The transforming methods (convert_condition_to_lower, remove_double_ws,
	replace_synonym_and_typo) return NEW rows and leave their input untouched,
	so an earlier stage of the pipeline can still be written out afterwards.
	"""

	# Positions of the columns this class reads or rewrites.
	COL_COURSENO = 0
	COL_CONDITION = 3
	COL_YEAR_END = 4

	@staticmethod
	def _get(row, pos) :
		"""Return the value at position `pos` of a list-like row or a pandas Series."""
		# An integer key on a label-indexed Series is a deprecated positional
		# fallback in pandas; .iloc states the positional intent explicitly.
		return row.iloc[pos] if hasattr(row, 'iloc') else row[pos]

	@staticmethod
	def _with_condition(row, value) :
		"""Return a copy of `row` whose condition column is replaced by `value`."""
		new_row = row.copy()
		if hasattr(new_row, 'iloc') :
			new_row.iloc[BulletinBeautifier.COL_CONDITION] = value
		else :
			new_row[BulletinBeautifier.COL_CONDITION] = value
		return new_row

	def csv_to_arr(self, csv_path) :
		"""Read a comma-separated file and return its rows as a list of pandas Series."""
		data = pd.read_csv(csv_path, delimiter = ',')
		return [row for _, row in data.iterrows()]

	def arr_to_csv(self, arr, header, des_path) :
		"""Write `header` followed by every row of `arr` to `des_path` as CSV."""
		# newline='' is what the csv module asks for; without it the writer's
		# own line endings get translated again on some platforms.
		with open(des_path, 'w', newline = '') as f :
			writer = csv.writer(f)
			writer.writerow(header)
			writer.writerows(arr)

	def filter_corres_courses(self, arr, year_end = 2599, courseno_limit = 700) :
		"""Keep the rows whose end year equals `year_end` and whose course
		number modulo 1000 is below `courseno_limit`.

		The defaults reproduce the original hard-coded filter.
		NOTE(review): 2599 presumably marks a course that is still offered and
		the 700+ range presumably graduate-level courses - confirm.
		"""
		return [
			row for row in arr
			if self._get(row, self.COL_YEAR_END) == year_end
			and int(self._get(row, self.COL_COURSENO)) % 1000 < courseno_limit
		]

	def pprint_to_file(self, data, des_path) :
		"""Pretty-print `data` into the text file `des_path`."""
		with open(des_path, 'w') as f :
			pprint.pprint(data, f)

	def convert_condition_to_lower(self, arr) :
		"""Return copies of the rows with the condition converted to a lower-case string.

		The value goes through str() first, so a non-string cell (for example
		a missing value) becomes its textual form.
		"""
		return [
			self._with_condition(row, str(self._get(row, self.COL_CONDITION)).lower())
			for row in arr
		]

	def remove_double_ws(self, arr) :
		"""Return copies of the rows with runs of spaces in the condition collapsed to one."""
		return [
			self._with_condition(row, re.sub(' +', ' ', self._get(row, self.COL_CONDITION)))
			for row in arr
		]

	def gen_condition_files(self, arr, des_path) :
		"""Write each row's condition to '<des_path>condition<index>.txt'.

		`des_path` is used as a plain string prefix, so a directory needs its
		trailing separator.
		"""
		for index, row in enumerate(arr) :
			condition = self._get(row, self.COL_CONDITION)
			with open(des_path + 'condition' + str(index) + '.txt', 'w') as f :
				f.write(condition)

	def replace_synonym_and_typo(self, arr, typos = None, synonyms = None) :
		"""Return copies of the rows with typos fixed and synonyms canonicalised.

		Replacement is by substring, typos first and synonyms second, each in
		the mapping's own iteration order.

		typos    -- mapping wrong -> right; defaults to the module-level typo_map
		synonyms -- mapping variant -> canonical; defaults to the module-level word_map
		"""
		if typos is None :
			typos = typo_map
		if synonyms is None :
			synonyms = word_map
		rows = []
		for row in arr :
			condition = self._get(row, self.COL_CONDITION)
			for mapping in (typos, synonyms) :
				for wrong, right in mapping.items() :
					condition = condition.replace(wrong, right)
			rows.append(self._with_condition(row, condition))
		return rows

	def gen_token_lookup(self, arr, keep_all = False) :
		"""Map every non-numeric condition token to the course(s) it appears in.

		Parenthesised groups are pulled out as whole tokens; the remaining text
		is split on ';', ' and ' and ' or '. Tokens that are purely numeric
		are left out of the result.

		keep_all -- False (default): each token maps to a one-element list
		            holding the LAST course number seen, which is what the
		            original implementation produced.
		            True: each token maps to every course number, in row order.
		"""
		lookup = dict()
		for row in arr :
			course = self._get(row, self.COL_COURSENO)
			paren = []
			outside = self.split_out_paren(self._get(row, self.COL_CONDITION), paren)
			pieces = [piece.strip() for piece in re.split(';| and | or ', outside)]
			# plain tokens first, then parenthesised groups (same order as before)
			for tok in [piece for piece in pieces if piece] + paren :
				if keep_all :
					lookup.setdefault(tok, []).append(course)
				else :
					lookup[tok] = [course]
		return {tok : courses for tok, courses in lookup.items() if not str(tok).isnumeric()}

	def split_out_paren(self, s, token) :
		"""Strip parenthesised groups out of `s`.

		Every '(...)' group, brackets included, is appended to the list `token`
		in order of appearance. The text outside the groups is returned,
		stripped and with runs of spaces collapsed; each removed group leaves
		one separating space behind.

		A ')' with no matching '(' is kept as ordinary text. Text following an
		unclosed '(' is dropped and not reported in `token`.

		Raises ValueError when a '(' appears inside an open group.
		"""
		outside = []
		inside = []
		in_group = False
		for ch in s :
			if ch == '(' :
				if in_group :
					raise ValueError("Nested parentheses are not supported: " + repr(s))
				in_group = True
				inside.append(ch)
				# keep the words on both sides of the group apart
				outside.append(' ')
			elif in_group :
				inside.append(ch)
				if ch == ')' :
					token.append(''.join(inside))
					inside = []
					in_group = False
			else :
				outside.append(ch)
		return re.sub(' +', ' ', ''.join(outside).strip())

	def split_out_paren_test(self) :
		"""Self-check for split_out_paren; raises AssertionError on a mismatch."""
		cases = [
			("261261 (261215, 245874) 236548", "261261 236548", ["(261215, 245874)"]),
			("(261215, 245874) 236548", "236548", ["(261215, 245874)"]),
			("(261215, 245874)", "", ["(261215, 245874)"]),
			("261254 (261215, 245874)", "261254", ["(261215, 245874)"]),
			("261254, 261215", "261254, 261215", []),
			("261216(261215)262262", "261216 262262", ["(261215)"]),
		]
		for text, expected_text, expected_token in cases :
			cur_token = []
			assert self.split_out_paren(text, cur_token) == expected_text
			assert cur_token == expected_token

In [11]:
b = BulletinBeautifier()
# read bulletin
bulletin = b.csv_to_arr('./csv/reg-condition.csv')
# filter only active courses
bulletin_corres = b.filter_corres_courses(bulletin)
# Write the filtered rows right away, while they still hold the original
# condition text: the clean-up steps below may modify the same row objects,
# and writing this file at the end would then just duplicate the refactored one.
b.arr_to_csv(bulletin_corres, bulletin_header, './csv/reg-condition-corres.csv')
# convert condition to lower case
bulletin_corres_lower = b.convert_condition_to_lower(bulletin_corres)
# remove double white spaces
bulletin_corres_lower = b.remove_double_ws(bulletin_corres_lower)
# map synonym and refactor
bulletin_corres_lower_refactored = b.replace_synonym_and_typo(bulletin_corres_lower)
# remove double white spaces again (replacements can introduce new runs of spaces)
bulletin_corres_lower_refactored_no_dws = b.remove_double_ws(bulletin_corres_lower_refactored)
# gen condition files
b.gen_condition_files(bulletin_corres_lower_refactored_no_dws, './conditions/')
# test parsing in antlr4

# test gen lookup
lookup = b.gen_token_lookup(bulletin_corres_lower_refactored_no_dws)
b.pprint_to_file(lookup, './txt/lookup.txt')
b.arr_to_csv(bulletin_corres_lower_refactored_no_dws, bulletin_header, './csv/reg-condition-corres-refactored.csv')

# unit-test
b.split_out_paren_test()