# In[41]:
import pandas as pd
import re, csv

# In[42]:
# Synonym table: every key found as a substring of a (lower-cased) condition
# string is replaced by its value, and the whole table is re-applied until no
# key matches any more. Consequences worth keeping in mind when editing:
#   * insertion order matters -- within one pass an earlier key fires first;
#   * a value must never contain its own key, or the re-apply loop never ends;
#   * matching is plain substring, so short keys (e.g. 'nan') also fire
#     inside longer words.
# TODO: use regex instead
word_map = {
	# --- consent wording ---
	'approval by the department': 'consent of the department',
	'approved by the department': 'consent of the department',
	'department approval': 'consent of the department',
	'consent of department': 'consent of the department',
	'consent of the division': 'consent of the department',
	'consent of division': 'consent of the department',
	'by consent of the instructor': 'consent of the instructor',
	'consent of instructor': 'consent of the instructor',
	# --- year standing ---
	'year students': 'year standing',
	'year medical students': 'year standing and medical students',
	'year standing medical students': 'year standing and medical students',
	'for first': 'first',
	'for second': 'second',
	'for third': 'third',
	'for fourth': 'fourth',
	'for fifth': 'fifth',
	'for sixth': 'sixth',
	# --- student groups ---
	'for veterinary medicine students': 'veterinary medicine students',
	'for dental students': 'dental students',
	'for majors only': 'major students',
	'for major only': 'major students',
	'for non-major only': 'non-major students',
	'for non majors only': 'non-major students',
	'for non-majors only': 'non-major students',
	'non-major only': 'non-major students',
	'for non-major students': 'non-major students',
	'for non-major': 'non-major students',
	# --- ';' used as an implicit "and" ---
	'; major students': ' and major students',
	'; non-major students': ' and non-major students',
	'; veterinary medicine students': ' and veterinary medicine students',
	'; for architectural students': ' and architectural students',
	'; for veterinary students': ' and veterinary students',
	# --- one-off phrasings ---
	'fourth year medical technology students or consent of the department':
		'fourth year standing and medical technology students or consent of the department',
	'second year standing pharmacy students': 'second year standing and pharmacy students',
	'major student or third year standing': 'major students or third year standing',
	# The key ends with a space, so the replacement must too; otherwise the
	# following word is glued on ("...veterinary studentsor 356210").
	'second year standing in vet ': 'second year standing and veterinary students ',
	'fourth year standing major students': 'fourth year standing and major students',
	'sixth year standing: for veterinary': 'sixth year standing and veterinary students',
	'psy 330 (013330)  and major students': '013330 and major students',
	'sixth year veterinary students': 'sixth year standing and veterinary students',
	'for agricultural students only': 'for agricultural students',
	'fourth year pharmacy students': 'fourth year standing and pharmacy students',
	'304251; dental students': '304251 and dental students',
	'year dental students': 'year standing and dental students',
	'year medical student': 'year standing and medical students',
	'for medical students': 'medical students',
	'the fourth year student': 'fourth year standing',
	'year major students': 'year standing and major students',
	'year standing veterinary medicine students': 'year standing and veterinary medicine students',
	'third year standing : for architectural students': 'third year standing and architectural students',
	'third year standing; for economics students only': 'third year standing and economics students',
	'third and fourth year major students only': '(third year standing or fourth year standing) and major students',
	'third and fourth year standing': '(third year standing or fourth year standing)',
	'year standing and veterinary students': 'year standing and veterinary medicine students',
	# Shadowed by the shorter 'second year standing in vet ' key above, which
	# is visited first; kept so the intended end result stays documented.
	'second year standing in vet or 356210': 'second year standing and veterinary medicine students or 356210',
	'for agricultural students': 'agricultural students',
	# --- missing values and leftovers ---
	'nan': 'none',
	'none; ': '',
	'none ; ': '',
	'concurrence': 'concurrent',
}

# Misspellings and encoding damage found in the raw condition text, mapped to
# the intended wording. Entries are applied once each, in insertion order, so
# '????' has to stay ahead of '???' (the longer run must be consumed first).
typo_map = {
	'contsent': 'consent',
	'instrucyor': 'instructor',
	'intrutor': 'instructor',
	'divsion': 'division',
	'????': 'or',
	'???': 'and',
	'foruth': 'fourth',
	'non - major': 'non-major',
	'englmajors': 'engl. majors',
	'studentsor': 'students or',
	'and458314': 'and 458314',
	'; or;': '; or',
	'consent ot the department': 'consent of the department',
	'third year students of consent of the department':
		'third year students or consent of the department',
	# division
}

# Header row written at the top of every CSV produced from the bulletin.
bulletin_header = [
	'courseno',
	'title_long_en',
	'degree_level',
	'pre_en',
	'year_end',
	'open_status',
]

# In[43]:
class BulletinBeautifier:
	"""Load a course-bulletin CSV and normalise its condition text.

	Rows live in ``self.arr`` as plain Python lists addressed by position.
	The methods read column 0 (course number), column 3 (condition text)
	and column 4 (end year); presumably this matches the layout of
	``bulletin_header`` -- confirm against the input file.

	Rows are stored as lists rather than pandas ``Series`` because every
	method indexes them with bare integers, which on a label-indexed
	``Series`` depends on a deprecated positional fallback.
	"""

	# Column positions used throughout the class.
	_COURSE_NO = 0
	_CONDITION = 3
	_YEAR_END = 4

	def __init__(self, csv_path):
		"""Read ``csv_path`` and keep its rows in ``self.arr``."""
		self.arr = self.csv_to_arr(csv_path)

	def csv_to_arr(self, csv_path):
		"""Return the rows of the CSV at ``csv_path`` as a list of lists.

		The header line is consumed by pandas and is not part of the result.
		"""
		data = pd.read_csv(csv_path, delimiter=',')
		return data.values.tolist()

	def arr_to_csv(self, header, des_path):
		"""Write ``header`` followed by every row of ``self.arr`` to ``des_path``."""
		# newline='' is what the csv module asks for: without it the writer's
		# '\r\n' terminator becomes '\r\r\n' on platforms that translate '\n'.
		with open(des_path, 'w', newline='', encoding='utf-8') as f:
			writer = csv.writer(f)
			writer.writerow(header)
			writer.writerows(self.arr)

	def filter_corres_courses(self):
		"""Keep rows whose end year is 2599 and whose course number ends below 700.

		NOTE(review): 2599 looks like an "open-ended" end-year sentinel and a
		last-three-digits value >= 700 like a numbering band to exclude --
		confirm both against the data owner.
		"""
		self.arr = [
			row for row in self.arr
			if row[self._YEAR_END] == 2599
			and int(row[self._COURSE_NO]) % 1000 < 700
		]

	def convert_condition_to_lower(self):
		"""Replace each condition with its lower-cased string form.

		Non-string values are passed through ``str`` first, so a missing
		value (float NaN) ends up as the text ``'nan'``.
		"""
		col = self._CONDITION
		for row in self.arr:
			row[col] = str(row[col]).lower()

	def remove_double_ws(self):
		"""Collapse every run of spaces in the condition to a single space."""
		col = self._CONDITION
		spaces = re.compile(' +')
		for row in self.arr:
			row[col] = spaces.sub(' ', row[col])

	def remove_full_stop(self):
		"""Delete every ``'.'`` from the condition text."""
		col = self._CONDITION
		for row in self.arr:
			row[col] = row[col].replace('.', '')

	def gen_condition_files(self, des_path):
		"""Write each condition to ``<des_path>condition<i>.txt``.

		``des_path`` is used as a raw prefix, so it must already end with a
		path separator when it names a directory; the directory is not
		created here.
		"""
		for index, row in enumerate(self.arr):
			target = des_path + 'condition' + str(index) + '.txt'
			with open(target, 'w', encoding='utf-8') as f:
				f.write(row[self._CONDITION])

	def replace_synonym_and_typo(self):
		"""Apply ``typo_map`` once, then ``word_map`` until nothing matches.

		Matching is plain substring replacement in dictionary order.
		NOTE(review): because of that, short ``word_map`` keys such as
		``'nan'`` also fire inside longer words; and a value that contains
		its own key would make the fixed-point loop spin forever.
		"""
		col = self._CONDITION
		for row in self.arr:
			text = row[col]
			# Typos are fixed in a single pass (str.replace is a no-op when
			# the key is absent).
			for wrong, right in typo_map.items():
				text = text.replace(wrong, right)
			# One synonym rewrite can expose another, so repeat whole passes
			# until a pass changes nothing.
			changed = True
			while changed:
				changed = False
				for phrase, canonical in word_map.items():
					if phrase in text:
						text = text.replace(phrase, canonical)
						changed = True
			row[col] = text

	def remove_semicolon(self):
		"""Turn ``'; and '`` / ``'; or '`` separators into parenthesised groups.

		``'a; and b; or c'`` becomes ``'(a) and (b) or (c)'``. Conditions
		without either separator are left untouched.
		"""
		col = self._CONDITION
		for row in self.arr:
			text = row[col]
			if '; and ' in text or '; or ' in text:
				text = '(' + text + ')'
				text = text.replace('; and ', ') and (')
				text = text.replace('; or ', ') or (')
				row[col] = text
		# TODO: remove parentheses on atomic string

	def merge_concurrence(self):
		"""Rewrite ``'<code> or concurrent'`` as ``'concurrent to <code>'``.

		``<code>`` is taken to be the six characters immediately before the
		space that precedes ``'or concurrent'``. Occurrences already followed
		by ``' to'`` are copied unchanged, and so are occurrences that do not
		have a full code in front of them (previously the negative slice
		index wrapped around and garbled the text).

		NOTE(review): the six characters are not checked to be a course
		number, so prose directly before ``'or concurrent'`` is still moved.
		"""
		marker = 'or concurrent'
		code_span = 7  # six-character code plus the separating space
		col = self._CONDITION
		for row in self.arr:
			text = row[col]
			pieces = []
			cursor = 0
			hit = text.find(marker, cursor)
			while hit != -1:
				end = hit + len(marker)
				# A slice that is too short simply compares unequal, which
				# covers the "marker sits at the end of the string" case.
				followed_by_to = text[end + 1:end + 3] == 'to'
				code_start = hit - code_span
				if followed_by_to or code_start < cursor:
					pieces.append(text[cursor:end])
				else:
					pieces.append(text[cursor:code_start])
					pieces.append('concurrent to ' + text[code_start:hit - 1])
				cursor = end
				hit = text.find(marker, cursor)
			pieces.append(text[cursor:])
			row[col] = ''.join(pieces)

# In[44]:
# Pipeline driver. The step order matters: text is lower-cased, stripped of
# full stops and de-duplicated of spaces BEFORE the synonym pass, and the
# semicolon / concurrency rewrites run on the already-normalised wording.

# Load the raw bulletin export into memory.
b = BulletinBeautifier('./csv/reg-condition.csv')
# Keep only rows whose end year (column 4) is 2599 and whose course number
# (column 0) has its last three digits below 700.
b.filter_corres_courses()
# Snapshot the filtered rows before any text normalisation.
b.arr_to_csv(bulletin_header, './csv/reg-condition-corres.csv')
# Lower-case the condition text (column 3); the keys of typo_map and word_map
# are all lower case, so this has to happen before they are applied.
b.convert_condition_to_lower()
# Strip every '.' from the condition text.
b.remove_full_stop()
# Collapse runs of spaces into a single space.
b.remove_double_ws()
# Fix known typos once, then apply word_map repeatedly until no key matches.
b.replace_synonym_and_typo()
# Turn '; and ' / '; or ' separators into parenthesised groups.
b.remove_semicolon()
# Rewrite '<six-character code> or concurrent' as 'concurrent to <code>'.
b.merge_concurrence()
# Save the fully normalised rows.
b.arr_to_csv(bulletin_header, './csv/reg-condition-corres-refactored.csv')
# Write one condition<i>.txt per row; the target directory is not created
# by gen_condition_files and must already exist.
b.gen_condition_files('./conditions/')