In [195]:
import pandas as pd
import re, csv

In [196]:
# Ordered regex -> replacement rules used to normalise synonyms in a lowercased
# condition string. They are applied one after another with re.sub, in insertion
# order, so later rules see the output of earlier ones (e.g. the
# 'year standing ... students' rules rely on the 'year ' rule having run first).
word_map = {
	# A missing condition is read as float NaN and becomes the exact string 'nan'
	# after str().lower(). Anchor the rule so that words merely containing "nan"
	# (e.g. "finance", "maintenance") are not rewritten.
	'^nan$' : 'none',
	'(approval|approved) by the department|department approval|consent of department' : 'consent of the department',
	'(by )?consent of (the )?instructor' : 'consent of the instructor',
	' only|(none; |none ; )' : '',
	# Drop hyphens and the filler word "for"; the word boundary keeps words that
	# merely end in "for" (e.g. "before ") intact.
	r'-|\bfor ' : ' ',
	'year (?!(standing|program))' : 'year standing ',
	'majors? (student(?!s)|(?!students))|majors?(?![ a-z])' : 'major students ',

	'; ?non major students' : '; and non major students',
	'standing students' : 'standing',

	# Make the implicit conjunction between a year requirement and a student
	# group explicit.
	'year standing(( ?; ?)| )medical students' : 'year standing; and medical students',
	'year standing(( ?; ?)| )major students' : 'year standing; and major students',
	'year standing(( ?; ?)| )optometry students' : 'year standing; and optometry sub-major students',
	'year standing(( ?; ?)| )veterinary medicine students' : 'year standing; and veterinary medicine students',
	'year standing(( ?; ?)| )architectural students' : 'year standing; and architectural students',

	'consent of (the )?(philosophy and religion|biology|english) department' : 'consent of the department',
}

# Literal (non-regex) substring fixes applied with str.replace before the regex
# rules in word_map. Insertion order is significant: e.g. '????' must be handled
# before '???', and 'divsion' is corrected to 'division' before 'division' is
# itself mapped to 'department'.
_TYPO_FIXES = (
	('contsent', 'consent'),
	('instrucyor', 'instructor'),
	('intrutor', 'instructor'),
	('divsion', 'division'),
	('couse', 'course'),
	('????', 'or'),
	('???', 'and'),
	('foruth', 'fourth'),
	('englmajors', 'engl. majors'),
	('studentsor', 'students or'),
	('and458314', 'and 458314'),
	('; or;', '; or'),
	('division', 'department'),
	('majors', 'major'),
	(' hc ', ' home and country '),
	(' me ', ' mechanical engineering '),
	('concurrence', 'concurrent'),
	('consent ot the department', 'consent of the department'),
	(
		'third year students of consent of the department',
		'third year students or consent of the department',
	),
)

typo_map = dict(_TYPO_FIXES)

# Header row passed to BulletinBeautifier.arr_to_csv when the pipeline saves
# its CSV snapshots.
bulletin_header = [
	'courseno',
	'title_long_en',
	'degree_level',
	'pre_en',
	'year_end',
	'open_status',
]

In [192]:
class BulletinBeautifier:
	"""Clean up the free-text registration condition of each bulletin row.

	The bulletin CSV is loaded into ``self.arr`` as a list of rows, each row a
	plain ``list`` of cell values. Every method addresses cells by position:
	column 0 is read as the course number, column 3 as the condition text and
	column 4 is compared against a year value.
	NOTE(review): these positions presumably match ``bulletin_header``
	(courseno, ..., pre_en, year_end, ...) -- confirm against the input file.

	The cleaning methods mutate the rows of ``self.arr`` in place and are meant
	to be called in sequence; ``convert_condition_to_lower`` must run first
	because it is the step that turns every condition (including missing ones)
	into a ``str``.
	"""

	_COURSE_NO_COL = 0
	_CONDITION_COL = 3
	_YEAR_END_COL = 4
	# Number of characters taken as the course number in front of "or concurrent".
	_COURSE_NO_LEN = 6

	def __init__(self, csv_path):
		"""Load the bulletin found at ``csv_path`` into ``self.arr``."""
		self.arr = self.csv_to_arr(csv_path)

	def csv_to_arr(self, csv_path):
		"""Read a comma separated file and return its data rows as lists.

		Rows are returned as plain lists rather than pandas Series so that the
		positional ``row[3]``-style access used throughout the class does not
		depend on pandas' deprecated integer-position fallback for
		label-indexed Series.
		"""
		data = pd.read_csv(csv_path, delimiter=',')
		return data.values.tolist()

	def arr_to_csv(self, header, des_path):
		"""Write ``header`` followed by every row of ``self.arr`` to ``des_path``."""
		# newline='' lets the csv module control line endings; without it extra
		# blank lines appear between rows on platforms that translate '\n'.
		with open(des_path, 'w', newline='') as f:
			writer = csv.writer(f)
			writer.writerow(header)
			writer.writerows(self.arr)

	def filter_corres_courses(self):
		"""Keep rows whose column 4 equals 2599 and whose course number
		modulo 1000 is below 700.

		NOTE(review): 2599 looks like a sentinel "no end year" value and the
		>= 700 range like a course series that is excluded -- confirm.
		"""
		self.arr = [
			row for row in self.arr
			if row[self._YEAR_END_COL] == 2599
			and int(row[self._COURSE_NO_COL]) % 1000 < 700
		]

	def _map_conditions(self, func):
		"""Replace the condition cell of every row with ``func(cell)``."""
		col = self._CONDITION_COL
		for row in self.arr:
			row[col] = func(row[col])

	def convert_condition_to_lower(self):
		"""Stringify and lowercase every condition (a missing value becomes 'nan')."""
		self._map_conditions(lambda cond: str(cond).lower())

	def remove_unwanted_ws(self):
		"""Normalise whitespace in every condition (see ``_remove_unwanted_ws``)."""
		self._map_conditions(self._remove_unwanted_ws)

	def _remove_unwanted_ws(self, s):
		"""Collapse runs of spaces, drop spaces just inside parentheses and
		strip leading/trailing whitespace from ``s``."""
		s = re.sub(' +', ' ', s)
		s = s.replace('( ', '(')
		s = s.replace(' )', ')')
		# str.strip returns a new string; its result must be returned.
		return s.strip()

	def remove_full_stop(self):
		"""Delete every '.' from every condition."""
		self._map_conditions(lambda cond: cond.replace('.', ''))

	def gen_condition_files(self, des_path):
		"""Write each condition to ``<des_path>condition<index>.txt``.

		``des_path`` is used as a plain string prefix, so it must end with a
		path separator when it names a directory.
		"""
		for index, row in enumerate(self.arr):
			with open(des_path + 'condition' + str(index) + '.txt', 'w') as f:
				f.write(row[self._CONDITION_COL])

	def replace_synonym_and_typo(self):
		"""Apply the module-level ``typo_map`` (literal replacements) and then
		``word_map`` (regex replacements) to every condition, in dict order.

		Whitespace is re-normalised after each regex rule because several
		replacements introduce leading, trailing or doubled spaces.
		"""
		col = self._CONDITION_COL
		for row in self.arr:
			text = row[col]
			for typo, fix in typo_map.items():
				text = text.replace(typo, fix)
			for pattern, replacement in word_map.items():
				text = self._remove_unwanted_ws(re.sub(pattern, replacement, text))
			row[col] = text

	def remove_semicolon(self):
		"""Turn '; and ' / '; or ' separators into parenthesised groups.

		``a; and b or c`` becomes ``(a) and (b or c)``; parentheses around a
		single term are then removed again by ``_remove_atomic_paren``.
		Conditions without either separator are left untouched.
		"""
		col = self._CONDITION_COL
		for row in self.arr:
			text = row[col]
			if '; and ' not in text and '; or ' not in text:
				continue
			text = '(' + text + ')'
			text = text.replace('; and ', ') and (')
			text = text.replace('; or ', ') or (')
			text = self._remove_unwanted_ws(text)
			row[col] = self._remove_atomic_paren(text)

	def _remove_atomic_paren(self, s):
		"""Remove parenthesis pairs whose content has no ' and ', ' or ' or ','.

		The scan pairs each '(' with the next ')' after it, left to right; it
		does not track nesting. Scanning stops at the first '(' that has no
		closing ')' after it, so malformed input is returned as far as it was
		processed instead of looping forever.
		"""
		strt = s.find('(')
		# Look for the closing paren after the opening one so that a stray ')'
		# earlier in the string is never paired with it.
		end = s.find(')', strt) if strt != -1 else -1
		while strt != -1 and end != -1:
			in_paren = s[strt + 1 : end]
			if ' and ' not in in_paren and ' or ' not in in_paren and ',' not in in_paren:
				s = s[ : strt] + in_paren + s[end + 1 : ]
				# Two characters were removed, so the unscanned tail now starts
				# at end - 1; clamp so a negative start is not read as an offset
				# from the end of the string.
				strt = s.find('(', max(end - 2, 0))
			else:
				strt = s.find('(', end)
			end = s.find(')', strt)
		return s

	def merge_concurrence(self):
		"""Rewrite ``<6 digits> or concurrent`` as ``concurrent to <6 digits>``.

		An occurrence is left unchanged when it is already followed by 'to',
		or when the six characters before it (skipping one separator
		character) are not all numeric.
		"""
		con = 'or concurrent'
		col = self._CONDITION_COL
		for row in self.arr:
			text = row[col]
			pieces = []
			s_ind = 0
			f_ind = text.find(con, s_ind)
			while f_ind != -1:
				after = f_ind + len(con)
				# A slice running past the end is simply shorter than 'to', so
				# this also covers a match at the very end of the text.
				followed_by_to = text[after + 1 : after + 3] == 'to'
				c_start = f_ind - (self._COURSE_NO_LEN + 1)
				# Too close to the start to have a course number in front;
				# never let the slice wrap around via negative indices.
				c_num = text[c_start : f_ind - 1] if c_start >= 0 else ''
				if not followed_by_to and c_num.isnumeric():
					pieces.append(text[s_ind : c_start])
					pieces.append('concurrent to ' + c_num)
				else:
					pieces.append(text[s_ind : after])
				s_ind = after
				f_ind = text.find(con, s_ind)
			pieces.append(text[s_ind : ])
			row[col] = ''.join(pieces)

In [193]:
# Load the raw bulletin export.
b = BulletinBeautifier('./csv/reg-condition.csv')

# Keep only the active courses and snapshot them before any text clean-up.
b.filter_corres_courses()
b.arr_to_csv(bulletin_header, './csv/reg-condition-corres.csv')

# Text normalisation passes. The order matters: lower-casing turns every
# condition into a string, and the later passes build on the earlier ones.
for _beautify_step in (
	b.convert_condition_to_lower,   # lower-case the condition text
	b.remove_full_stop,             # drop '.'
	b.remove_unwanted_ws,           # collapse unwanted white space
	b.replace_synonym_and_typo,     # map synonyms / fix typos
	b.remove_semicolon,             # '; and ' / '; or ' -> parenthesised groups
	b.merge_concurrence,            # '<course> or concurrent' -> 'concurrent to <course>'
):
	_beautify_step()

# Save the cleaned table, then write one text file per condition.
b.arr_to_csv(bulletin_header, './csv/reg-condition-corres-refactored.csv')
b.gen_condition_files('./conditions/')

In [187]:
# Scratch check: how a candidate "year" rule rewrites a sample condition.
s = '159361 and fourth year students'
year_rule = re.compile('year (?!(standing|program))|year students?')
print(year_rule.sub('year standing ', s))

159361 and fourth year standing students
