Skip to content

Commit

Permalink
archive: code cleanup and documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
mishaschwartz committed Nov 17, 2022
1 parent 4b81199 commit 4b59040
Showing 1 changed file with 141 additions and 70 deletions.
211 changes: 141 additions & 70 deletions lib/tasks/archive.rake
Original file line number Diff line number Diff line change
Expand Up @@ -110,12 +110,81 @@ namespace :markus do
end

# Return a mapping from table names to the ActiveRecord class corresponding to the table.
# If there are multiple classes stored in a given table (ie. single table inheritance), then
# If there are multiple classes stored in a given table (ie. single table inheritance) and +parent_only+ is true, then
# use the class that is the parent of all the others.
def base_classes
ApplicationRecord.descendants
.group_by(&:table_name)
.transform_values { |klasses| klasses.select { |k| (klasses - [k, *k.descendants]).empty? }.first }
# Build a hash from table name to the ApplicationRecord descendant(s) stored in that table.
#
# The application is eager loaded first so that ApplicationRecord.descendants is fully populated.
#
# - +parent_only+ true (default): each table name maps to a single class, namely the first class
#   using that table which, together with its descendants, accounts for every class using the
#   table (nil if no such class exists).
# - +parent_only+ false: each table name maps to the array of all classes using that table.
def table_classes(parent_only: true)
  Rails.application.eager_load!
  classes_by_table = ApplicationRecord.descendants.group_by(&:table_name)
  return classes_by_table unless parent_only

  classes_by_table.transform_values do |group|
    group.find do |candidate|
      family = [candidate, *candidate.descendants]
      group.all? { |member| family.include?(member) }
    end
  end
end

# Return an array of ids of all records with class == +klass+ that are associated with the +course+
# Records are associated with a given course if they have an association with the course either directly
# or through another association.
#
# If +klass+ can be joined to :course, the ids come straight from that join. Otherwise every association
# of +klass+ is inspected and the ids found through each one are combined (without duplicates):
# - polymorphic association: for each distinct, non-null value in the foreign type column, find the ids of
#   that class that are associated with the course, then select the +klass+ records whose foreign type AND
#   foreign key both match. Matching on the type as well as the key prevents an id that belongs to one
#   foreign class from selecting records that point at a different foreign class with the same id.
# - any other association: join through the association to its :course.
# Associations (or foreign classes) that cannot be joined to :course are skipped.
def ids_associated_to_course(course, klass)
  # classes that have a has_one, has_many, or belongs_to association with Course
  klass.joins(:course).where(courses: course).ids
rescue ActiveRecord::ConfigurationError
  class_ids = Set.new
  klass.reflect_on_all_associations.each do |assoc|
    if assoc.polymorphic?
      # compact: records with a null foreign type are not associated with anything
      klass.unscoped.distinct.pluck(assoc.foreign_type).compact.each do |foreign_type|
        foreign_ids = foreign_type.constantize.joins(:course).where(courses: course).ids
        next if foreign_ids.empty?

        class_ids |= klass.where(assoc.foreign_type => foreign_type, assoc.foreign_key => foreign_ids).ids
      rescue ActiveRecord::ConfigurationError
        # no association between the foreign class and course
      end
    else
      begin
        class_ids |= klass.joins(assoc.name => :course).where(assoc.name => { courses: course }).ids
      rescue ActiveRecord::ConfigurationError
        # no association between this class's associations and the course
      end
    end
  end
  class_ids.to_a
end

# Return :d, :i, or nil indicating:
# - :d => directly associated to the Course class (either directly or through another association or
#         has a .course instance method that returns the associated course),
# - :i => indirectly associated to the Course class (has an association that is directly associated to the
#         Course class),
# - nil => not associated with the Course class
#
# If +options[:direct_only]+ is truthy, only the direct check is performed on +klass+ itself: the result is
# true if +klass+ is directly associated and nil otherwise. Associations of +klass+ are never followed in
# this mode, so the check always terminates even when classes reference each other in a cycle.
#
# Note that indirect associations where the intermediate direct association is polymorphic can only be detected if
# there exists at least one record in the database that has that indirect association.
# For example, if class X is associated to the Course class indirectly through one of classes Y or Z (where Y and Z
# are polymorphic classes), then there needs to be at least one instance of X present that is associated with either
# Y or Z.
def association_type(klass, options = {})
  direct = klass.method_defined?(:course) || klass.reflect_on_all_associations.any? { |a| a.name == :course }
  # direct_only asks about +klass+ alone, so answer now instead of walking its associations
  return (direct ? true : nil) if options[:direct_only]
  return :d if direct

  indirect = klass.reflect_on_all_associations.any? do |assoc|
    if assoc.polymorphic?
      # compact: a null foreign type does not name a class
      foreign_types = klass.unscoped.distinct.pluck(assoc.foreign_type).compact
      foreign_types.any? { |k| association_type(k.constantize, direct_only: true) }
    else
      association_type(assoc.klass, direct_only: true)
    end
  end
  indirect ? :i : nil
end

# Return a hash describing every association of +klass+, keyed by the association's foreign key.
#
# Each value is a two element array [polymorphic, info]:
# - polymorphic associations map to [true, <foreign type of the association>]
# - all other associations map to [false, <table name of the associated class>]
# If several associations share a foreign key, the last one reflected wins.
def foreign_keys(klass)
  klass.reflect_on_all_associations.each_with_object({}) do |assoc, mapping|
    mapping[assoc.foreign_key] =
      assoc.polymorphic? ? [true, assoc.foreign_type] : [false, assoc.klass.table_name]
  end
end

namespace :archive do
Expand All @@ -135,50 +204,25 @@ namespace :markus do

task :course, [:course_name] => :environment do |_task, args|
course = Course.find_by(name: args[:course_name])
Rails.application.eager_load!

table_names = ApplicationRecord.descendants.reject(&:abstract_class).group_by(&:table_name)

# Create a temporary directory to write database csv files and data files to
archive_basename = "archive-#{course.name}"
archive_dir = Rails.root.join("tmp/#{archive_basename}")
FileUtils.rm_rf archive_dir
FileUtils.mkdir_p archive_dir + 'db'
FileUtils.mkdir_p archive_dir + 'data'
FileUtils.mkdir_p [archive_dir + 'db', archive_dir + 'data']
FileUtils.cp Rails.root.join('db/structure.sql'), archive_dir + 'db/structure.sql'
raw_connection = ActiveRecord::Base.connection.raw_connection

table_names.each do |table_name, classes|
table_classes(parent_only: false).each do |table_name, classes|
ids = Set.new
classes.each do |klass|
# classes that have a has_one, has_many, or belongs_to association with Course
class_ids = klass.joins(:course).where(courses: course).ids
class_ids = ids_associated_to_course(course, klass)
ids |= class_ids
rescue ActiveRecord::ConfigurationError
klass.reflect_on_all_associations.each do |assoc|
if assoc.polymorphic?
foreign_ids = klass.unscoped.distinct.pluck(assoc.foreign_type).map do |foreign_klass|
foreign_klass.constantize.joins(:course).where(courses: course).ids
rescue ActiveRecord::ConfigurationError
# no association between the foreign class and course
end.flatten
class_ids = klass.where(assoc.foreign_key => foreign_ids).ids
else
begin
class_ids = klass.joins(assoc.name => :course).where(assoc.name => { courses: course }).ids
rescue ActiveRecord::ConfigurationError
next
end
end
ids |= class_ids
end
ensure
archive_data_files(klass, class_ids, archive_dir + 'data') if defined?(class_ids)
archive_data_files(klass, class_ids, archive_dir + 'data')
end
next if ids.empty?
File.open(archive_dir + "db/#{table_name}.csv", 'w') do |f|
query = "COPY (SELECT * FROM #{table_name} WHERE id IN (#{ids.to_a.join(', ')})) TO STDOUT CSV HEADER"
raw_connection.copy_data(query) do
while (row = raw_connection.get_copy_data)
ActiveRecord::Base.connection.raw_connection.copy_data(query) do
while (row = ActiveRecord::Base.connection.raw_connection.get_copy_data)
f.write row
end
end
Expand All @@ -187,8 +231,9 @@ namespace :markus do

zip_cmd = ['tar', '-czvf', "#{archive_dir}.tar.gz", archive_basename]
Open3.capture2(*zip_cmd, chdir: Rails.root.join('tmp').to_s)
FileUtils.rm_rf archive_dir
puts "Course #{course.name} has been archived to #{archive_dir}.tar.gz"
ensure
FileUtils.rm_rf archive_dir
end
end

Expand Down Expand Up @@ -236,11 +281,17 @@ namespace :markus do
table_names << table_name
end
end

# migrate all the data up to the current migration. If migrations affect data files, access the files
# in the data_dir directory.
temporary_file_storage(data_dir) { archive_db.connection.migration_context.migrate }

# get table load order after loading into the temporary database because we need the records
# present in the database to resolve polymorphic associations.
table_load_order = load_order

# Copy files out of the temporary database back to csv files in the data dir now that all the data has
# been migrated to the current migration
table_names.each do |table_name|
File.open(File.join(db_dir, "#{table_name}.csv"), 'w') do |f|
query = "COPY (SELECT * FROM #{table_name}) TO STDOUT CSV HEADER"
Expand All @@ -253,56 +304,73 @@ namespace :markus do
end
end

table_classes = base_classes
table_class_mapping = table_classes
new_ids = Hash.new { |h, k| h[k] = {} }
errors_reported = false
new_file_locations = []
ActiveRecord.transaction do
table_load_order.each do |table_name|
db_file = File.join(db_dir, "#{table_name}.csv")
next unless File.exist?(db_file)

klass = table_classes[table_name]
klass = table_class_mapping[table_name]
reverse_enums = klass.defined_enums.transform_values { |h| h.map { |k, v| [v&.to_s, k] }.to_h }
foreign_keys = klass.reflect_on_all_associations.map do |association|
if association.polymorphic?
[association.foreign_key, [true, association.foreign_type]] # the association is polymorphic
else
[association.foreign_key, [false, association.klass.table_name]]
end
end.to_h

foreign_keys = klass.reflect_on_all_associations(:belongs_to).index_by(&:foreign_key)

CSV.parse(File.read(db_file), headers: true) do |row|
attributes = row.map do |k, v|
v = reverse_enums[k]&.[](v) || v # handle case where column is an enum value
polymorphic, assoc_info = foreign_keys[k]
unless polymorphic.nil?
# get the table of the class in the "foreign_type" column if polymorphic, otherwise use the
# (already calculated) table that the foreign key refers to
foreign_table = polymorphic ? v[assoc_info].constantize.table_name : assoc_info
v = new_ids[foreign_table][v]
end
[k, v]
end.to_h.except('id')
# transform columns with an enum value to the enum key so that ActiveRecord can use it
attributes = row.map { |k, v| [k, reverse_enums[k]&.[](v) || v] }.to_h
record = klass.new(attributes)
data_file = nil
if record.respond_to? :_file_location
data_file = temporary_file_storage(data_dir) { record._file_location }
end

# update foreign key references
record.attributes.each do |k, v|
association = foreign_keys[k]
next if association.nil?

if association.polymorphic?
associated_class = record.attributes[association.foreign_type].constantize
else
associated_class = association.klass
end
record.assign_attributes(k => new_ids[associated_class.table_name][v])
end
# nullify the id so that it can be assigned a new one on save
record.id = nil
if record.save
new_id = record.id
new_ids[table_name][row['id']] = new_id
data_file = File.join(data_dir, "#{table_name}.#{row['id']}")
if File.exist?(data_file)
if record.respond_to? :_file_location
FileUtils.cp_r data_file, record._file_location
# save the new id so that future records with associations to this record can refer to its new id
new_ids[table_name][row['id']] = record.id
if record.respond_to?(:_file_location)
# copy any associated files from the archived location to the new location on disk
if !data_file.nil? && File.exist?(data_file)
new_location = record._file_location
if File.exist?(new_location)
warn "Cannot copy archived data files associated with #{record.inspect} to #{new_location}. " \
'A file or directory already exists at that path.'
errors_reported = true
else
FileUtils.mkdir_p(File.dirname(new_location))
FileUtils.cp_r(data_file, new_location)
new_file_locations << new_location
end
else
warn "Unable to copy files associated with #{record.inspect}. " \
'This record does not have a _file_location method'
warn "Cannot find archived data files associated with #{record.inspect}."
errors_reported = true
end
end
elsif record.respond_to?(:course)
warn "Unable to create record #{record.inspect}\nError(s): #{record.errors.full_messages.join(', ')}"
errors_reported = true
else
u.attributes.slice(*u.errors.select { |e| e.type == :taken }.map { |e| e.attribute.to_s })
taken_attrs = record.errors.select { |e| e.type == :taken }.map { |e| e.attribute.to_s }
old_record = record.class.find_by(taken_attrs) # TODO: figure out if there can be multiple?
taken_attrs = record.errors
.select { |e| e.type == :taken }
.map { |e| [e.attribute, e.options[:value]] }
.to_h
old_record = record.class.find_by(taken_attrs)
if old_record.nil?
warn "Unable to create record #{record.inspect}\nError(s): #{record.errors.full_messages.join(', ')}"
errors_reported = true
Expand All @@ -315,7 +383,10 @@ namespace :markus do
ensure
if errors_reported
warn "Do you want to commit all changes even though there were some errors reported? Type 'yes' to confirm."
ActiveRecord::Rollback unless gets.chomp == 'yes'
unless gets.chomp == 'yes'
new_file_locations.each { |loc| FileUtils.rm_rf loc }
raise ActiveRecord::Rollback
end
end
end
ensure
Expand Down

0 comments on commit 4b59040

Please sign in to comment.