# In [52]:
import pandas as pd
import re, csv

# In [53]:
# Ordinals used to spell out the year-standing phrases in word_map.
_ORDINALS = ('first', 'second', 'third', 'fourth', 'fifth', 'sixth')

# Synonym table: each key is a phrase that is rewritten to the canonical
# wording stored as its value. BulletinBeautifier.replace_synonym_and_typo
# applies the entries as plain substring replacements, in insertion order.
word_map = {}
word_map.update(dict.fromkeys(
    (
        'approval by the department',
        'approved by the department',
        'department approval',
        'consent of department',
        'consent of the division',
        'consent of division',
    ),
    'consent of the department',
))
word_map.update(dict.fromkeys(
    (
        'by consent of the instructor',
        'consent of instructor',
    ),
    'consent of the instructor',
))
word_map.update({
    'for ' + ordinal + ' year standing': ordinal + ' year standing'
    for ordinal in _ORDINALS
})
word_map.update({
    ordinal + ' year students': ordinal + ' year standing'
    for ordinal in _ORDINALS
})
# NOTE(review): 'nan' is presumably the text produced by str() on a missing
# pandas value. Because entries are applied as substring replacements, this
# one also rewrites 'nan' inside longer words (e.g. 'finance') - confirm
# whether that can occur in the data.
word_map['nan'] = 'none'
word_map['concurrence'] = 'concurrent'

# Typo table, applied before word_map by replace_synonym_and_typo.
# Order matters: '????' has to be replaced before '???', which is a prefix
# of it.
typo_map = dict([
    ('contsent', 'consent'),
    ('instrucyor', 'instructor'),
    ('intrutor', 'instructor'),
    ('divsion', 'division'),
    ('????', 'or'),
    ('???', 'and'),
    ('; or;', '; or'),
    ('consent ot the department', 'consent of the department'),
    (
        'third year students of consent of the department',
        'third year students or consent of the department',
    ),
])

# Header row written at the top of the exported CSV files.
bulletin_header = [
    'courseno',
    'title_long_en',
    'degree_level',
    'pre_en',
    'year_end',
    'open_status',
]

# In [54]:
class BulletinBeautifier:
    """Load a course-bulletin CSV and normalise its condition column.

    Rows live in ``self.arr`` as plain Python lists in CSV column order and
    are addressed by position. The column layout is assumed to be
    ``courseno, title_long_en, degree_level, pre_en, year_end, open_status``
    (only columns 0, 3 and 4 are read by this class).

    The text passes (``remove_double_ws``, ``replace_synonym_and_typo``,
    ``remove_semicolon``, ``merge_concurrence``) expect the condition cell to
    be a string, so ``convert_condition_to_lower`` has to run before them.
    """

    # Positions of the columns this class reads or rewrites.
    _COURSE_NO = 0
    _CONDITION = 3
    _YEAR_END = 4

    # Values used by filter_corres_courses.
    # NOTE(review): 2599 looks like a sentinel end year for courses that are
    # still offered, and 700 a cut-off on the last three digits of the course
    # number - confirm the intended meaning with the data owner.
    _KEPT_YEAR_END = 2599
    _COURSE_SUFFIX_LIMIT = 700

    # Number of characters, immediately before " or concurrent", that
    # merge_concurrence treats as the course number.
    _COURSE_NO_WIDTH = 6

    _MULTI_SPACE = re.compile(r' +')

    def __init__(self, csv_path):
        """Read ``csv_path`` and keep its rows in ``self.arr``."""
        self.arr = self.csv_to_arr(csv_path)

    def csv_to_arr(self, csv_path):
        """Return the data rows of the CSV at ``csv_path`` as a list of lists.

        Rows are converted to plain lists so the rest of the class can index
        them by position; integer keys on a string-labelled pandas Series
        are a deprecated positional fallback.
        """
        data = pd.read_csv(csv_path, delimiter=',')
        return [list(record) for record in data.itertuples(index=False, name=None)]

    def arr_to_csv(self, header, des_path):
        """Write ``header`` followed by every row of ``self.arr`` to ``des_path``."""
        # newline='' is what the csv module asks for; without it Windows
        # output ends up with an extra carriage return on every line.
        with open(des_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(header)
            writer.writerows(self.arr)

    def filter_corres_courses(self):
        """Keep rows whose end year is 2599 and whose course number modulo
        1000 is below 700; every other row is dropped from ``self.arr``.
        """
        self.arr = [
            row for row in self.arr
            if row[self._YEAR_END] == self._KEPT_YEAR_END
            and int(row[self._COURSE_NO]) % 1000 < self._COURSE_SUFFIX_LIMIT
        ]

    def convert_condition_to_lower(self):
        """Replace each condition cell with its lower-cased string form.

        Non-string cells go through ``str()`` first, so a missing (NaN)
        value becomes the text ``'nan'``.
        """
        col = self._CONDITION
        for row in self.arr:
            row[col] = str(row[col]).lower()

    def remove_double_ws(self):
        """Collapse every run of spaces in the condition cell to one space."""
        col = self._CONDITION
        for row in self.arr:
            row[col] = self._MULTI_SPACE.sub(' ', row[col])

    def gen_condition_files(self, des_path):
        """Write each condition to ``<des_path>condition<index>.txt``.

        ``des_path`` is used as a raw prefix (it is concatenated, not
        joined), so a directory has to end with a path separator. The
        directory part of the prefix is created when it does not exist.
        """
        import os

        parent = os.path.dirname(des_path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        for index, row in enumerate(self.arr):
            target = des_path + 'condition' + str(index) + '.txt'
            with open(target, 'w', encoding='utf-8') as f:
                f.write(row[self._CONDITION])

    def replace_synonym_and_typo(self, typos=None, synonyms=None):
        """Apply the typo table, then the synonym table, to each condition.

        Args:
            typos: mapping of misspelling -> correction; defaults to the
                module-level ``typo_map``.
            synonyms: mapping of phrase -> canonical phrase; defaults to the
                module-level ``word_map``.

        Entries are applied as plain substring replacements in the mapping's
        iteration order, so a key that is a prefix of another key must come
        after it.

        NOTE(review): because matching is by substring, a short key such as
        ``'nan'`` also rewrites longer words that contain it.
        """
        typos = typo_map if typos is None else typos
        synonyms = word_map if synonyms is None else synonyms
        col = self._CONDITION
        for row in self.arr:
            text = row[col]
            for table in (typos, synonyms):
                for old, new in table.items():
                    text = text.replace(old, new)
            row[col] = text

    def remove_semicolon(self):
        """Turn ``'; and '`` / ``'; or '`` separators into parenthesised groups.

        ``'a; and b'`` becomes ``'(a) and (b)'``. Conditions without either
        separator are left untouched.
        """
        col = self._CONDITION
        for row in self.arr:
            text = row[col]
            if '; and ' in text or '; or ' in text:
                text = '(' + text + ')'
                text = text.replace('; and ', ') and (')
                text = text.replace('; or ', ') or (')
                row[col] = text
        # TODO (carried over from an existing note): remove parentheses on
        # atomic string.

    def merge_concurrence(self):
        """Rewrite ``'<course> or concurrent'`` as ``'concurrent to <course>'``.

        ``<course>`` is the six characters that precede the single character
        in front of ``'or concurrent'``. An occurrence that is already
        followed by ``' to'`` is kept as is, and so is one that does not
        have a full, not-yet-consumed course number in front of it.
        """
        marker = 'or concurrent'
        col = self._CONDITION
        for row in self.arr:
            text = row[col]
            pieces = []
            start = 0
            found = text.find(marker, start)
            while found != -1:
                after = found + len(marker)
                # One separator character, then "to". A slice that runs past
                # the end is simply shorter, so no length check is needed.
                followed_by_to = text[after + 1:after + 3] == 'to'
                num_start = found - 1 - self._COURSE_NO_WIDTH
                if followed_by_to or num_start < start:
                    # Nothing to merge, or no room for a course number:
                    # a negative or already-consumed slice start would
                    # otherwise wrap around or duplicate text.
                    pieces.append(text[start:after])
                else:
                    pieces.append(text[start:num_start])
                    pieces.append('concurrent to ' + text[num_start:found - 1])
                start = after
                found = text.find(marker, start)
            pieces.append(text[start:])
            row[col] = ''.join(pieces)

# In [55]:
# Load the raw bulletin export.
b = BulletinBeautifier('./csv/reg-condition.csv')

# Drop the rows rejected by filter_corres_courses (the original note calls
# the remaining ones the active courses) and save them before any clean-up
# of the condition text.
b.filter_corres_courses()
b.arr_to_csv(bulletin_header, './csv/reg-condition-corres.csv')

# Clean-up passes over the condition column. The order is significant:
# lower-casing also turns every cell into a string, whitespace is collapsed
# before phrases are matched, typos/synonyms are fixed before semicolons are
# rewritten, and the concurrency merge runs last.
for _clean_step in (
    b.convert_condition_to_lower,
    b.remove_double_ws,
    b.replace_synonym_and_typo,
    b.remove_semicolon,
    b.merge_concurrence,
):
    _clean_step()
del _clean_step

# Save the cleaned table, then one text file per condition.
b.arr_to_csv(bulletin_header, './csv/reg-condition-corres-refactored.csv')
b.gen_condition_files('./conditions/')