diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp index a42b2e1a0b75..e063c9a8adb2 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.cpp +++ b/rpcs3/Emu/Cell/SPURecompiler.cpp @@ -4,6 +4,8 @@ #include "Emu/Memory/vm.h" #include "Crypto/sha1.h" #include "Utilities/StrUtil.h" +#include "Utilities/JIT.h" +#include "Utilities/sysinfo.h" #include "SPUThread.h" #include "SPUAnalyser.h" @@ -24,39 +26,154 @@ const spu_decoder s_spu_iflag; extern u64 get_timebased_time(); +// Move 4 args for calling native function from a GHC calling convention function +static u8* move_args_ghc_to_native(u8* raw) +{ +#ifdef _WIN32 + // mov rcx, r13 + // mov rdx, rbp + // mov r8, r12 + // mov r9, rbx + std::memcpy(raw, "\x4C\x89\xE9\x48\x89\xEA\x4D\x89\xE0\x49\x89\xD9", 12); +#else + // mov rdi, r13 + // mov rsi, rbp + // mov rdx, r12 + // mov rcx, rbx + std::memcpy(raw, "\x4C\x89\xEF\x48\x89\xEE\x4C\x89\xE2\x48\x89\xD9", 12); +#endif + + return raw + 12; +} + DECLARE(spu_runtime::tr_dispatch) = [] { // Generate a special trampoline to spu_recompiler_base::dispatch with pause instruction - u8* const trptr = jit_runtime::alloc(16, 16); - trptr[0] = 0xf3; // pause - trptr[1] = 0x90; - trptr[2] = 0xff; // jmp [rip] - trptr[3] = 0x25; - std::memset(trptr + 4, 0, 4); + u8* const trptr = jit_runtime::alloc(32, 16); + u8* raw = move_args_ghc_to_native(trptr); + *raw++ = 0xf3; // pause + *raw++ = 0x90; + *raw++ = 0xff; // jmp [rip] + *raw++ = 0x25; + std::memset(raw, 0, 4); const u64 target = reinterpret_cast(&spu_recompiler_base::dispatch); - std::memcpy(trptr + 8, &target, 8); + std::memcpy(raw + 4, &target, 8); return reinterpret_cast(trptr); }(); DECLARE(spu_runtime::tr_branch) = [] { // Generate a trampoline to spu_recompiler_base::branch - u8* const trptr = jit_runtime::alloc(16, 16); - trptr[0] = 0xff; // jmp [rip] - trptr[1] = 0x25; - std::memset(trptr + 2, 0, 4); + u8* const trptr = jit_runtime::alloc(32, 16); + u8* raw = move_args_ghc_to_native(trptr); + *raw++ = 0xff; // jmp [rip] + *raw++ = 0x25; + std::memset(raw, 0, 4); const u64 target = reinterpret_cast(&spu_recompiler_base::branch); - std::memcpy(trptr + 6, &target, 8); + std::memcpy(raw + 4, &target, 8); return reinterpret_cast(trptr); }(); DECLARE(spu_runtime::g_dispatcher) = [] { const auto ptr = reinterpret_cast(jit_runtime::alloc(sizeof(spu_function_t), 8, false)); - ptr->raw() = &spu_recompiler_base::dispatch; + ptr->raw() = tr_dispatch; return ptr; }(); +DECLARE(spu_runtime::g_gateway) = build_function_asm([](asmjit::X86Assembler& c, auto& args) +{ + // Gateway for SPU dispatcher, converts from native to GHC calling convention, also saves RSP value for spu_escape + using namespace asmjit; + +#ifdef _WIN32 + c.push(x86::r15); + c.push(x86::r14); + c.push(x86::r13); + c.push(x86::r12); + c.push(x86::rsi); + c.push(x86::rdi); + c.push(x86::rbp); + c.push(x86::rbx); + c.sub(x86::rsp, 0xa8); + c.movaps(x86::oword_ptr(x86::rsp, 0x90), x86::xmm15); + c.movaps(x86::oword_ptr(x86::rsp, 0x80), x86::xmm14); + c.movaps(x86::oword_ptr(x86::rsp, 0x70), x86::xmm13); + c.movaps(x86::oword_ptr(x86::rsp, 0x60), x86::xmm12); + c.movaps(x86::oword_ptr(x86::rsp, 0x50), x86::xmm11); + c.movaps(x86::oword_ptr(x86::rsp, 0x40), x86::xmm10); + c.movaps(x86::oword_ptr(x86::rsp, 0x30), x86::xmm9); + c.movaps(x86::oword_ptr(x86::rsp, 0x20), x86::xmm8); + c.movaps(x86::oword_ptr(x86::rsp, 0x10), x86::xmm7); + c.movaps(x86::oword_ptr(x86::rsp, 0), x86::xmm6); +#else + c.push(x86::rbp); + c.push(x86::r15); + c.push(x86::r14); + c.push(x86::r13); + c.push(x86::r12); + c.push(x86::rbx); + c.push(x86::rax); +#endif + + // Load g_dispatcher pointer to call g_dispatcher[0] + c.mov(x86::rax, asmjit::imm_ptr(spu_runtime::g_dispatcher)); + c.mov(x86::rax, x86::qword_ptr(x86::rax)); + + // Save native stack pointer for longjmp emulation + c.mov(x86::qword_ptr(args[0], ::offset32(&spu_thread::saved_native_sp)), x86::rsp); + + // Move 4 args (despite spu_function_t def) + c.mov(x86::r13, args[0]); + c.mov(x86::rbp, args[1]); + c.mov(x86::r12, args[2]); + c.mov(x86::rbx, args[3]); + + if (utils::has_avx()) + { + c.vzeroupper(); + } + + c.call(x86::rax); + + if (utils::has_avx()) + { + c.vzeroupper(); + } + +#ifdef _WIN32 + c.movaps(x86::xmm6, x86::oword_ptr(x86::rsp, 0)); + c.movaps(x86::xmm7, x86::oword_ptr(x86::rsp, 0x10)); + c.movaps(x86::xmm8, x86::oword_ptr(x86::rsp, 0x20)); + c.movaps(x86::xmm9, x86::oword_ptr(x86::rsp, 0x30)); + c.movaps(x86::xmm10, x86::oword_ptr(x86::rsp, 0x40)); + c.movaps(x86::xmm11, x86::oword_ptr(x86::rsp, 0x50)); + c.movaps(x86::xmm12, x86::oword_ptr(x86::rsp, 0x60)); + c.movaps(x86::xmm13, x86::oword_ptr(x86::rsp, 0x70)); + c.movaps(x86::xmm14, x86::oword_ptr(x86::rsp, 0x80)); + c.movaps(x86::xmm15, x86::oword_ptr(x86::rsp, 0x90)); + c.add(x86::rsp, 0xa8); + c.pop(x86::rbx); + c.pop(x86::rbp); + c.pop(x86::rdi); + c.pop(x86::rsi); + c.pop(x86::r12); + c.pop(x86::r13); + c.pop(x86::r14); + c.pop(x86::r15); +#else + c.add(x86::rsp, +8); + c.pop(x86::rbx); + c.pop(x86::r12); + c.pop(x86::r13); + c.pop(x86::r14); + c.pop(x86::r15); + c.pop(x86::rbp); +#endif + + c.ret(); +}); + DECLARE(spu_runtime::g_interpreter) = nullptr; spu_cache::spu_cache(const std::string& loc) @@ -347,7 +464,7 @@ bool spu_runtime::func_compare::operator()(const std::vector& lhs, const st spu_runtime::spu_runtime() { // Initialize "empty" block - m_map[std::vector()] = &spu_recompiler_base::dispatch; + m_map[std::vector()] = tr_dispatch; // Clear LLVM output m_cache_path = Emu.PPUCache(); @@ -412,7 +529,7 @@ bool spu_runtime::add(u64 last_reset_count, void* _where, spu_function_t compile else { // Allocate some writable executable memory - u8* const wxptr = jit_runtime::alloc(size0 * 22 + 11, 16); + u8* const wxptr = jit_runtime::alloc(size0 * 22 + 14, 16); if (!wxptr) { @@ -425,7 +542,7 @@ bool spu_runtime::add(u64 last_reset_count, void* _where, spu_function_t compile // Write jump instruction with rel32 immediate auto make_jump = [&](u8 op, auto target) { - verify("Asm overflow" HERE), raw + 8 <= wxptr + size0 * 22; + verify("Asm overflow" HERE), raw + 8 <= wxptr + size0 * 22 + 16; // Fallback to dispatch if no target const u64 taddr = target ? reinterpret_cast(target) : reinterpret_cast(tr_dispatch); @@ -460,26 +577,18 @@ bool spu_runtime::add(u64 last_reset_count, void* _where, spu_function_t compile workload.back().beg = beg; workload.back().end = _end; - // mov eax, [spu_thread::pc] + // Load PC: mov eax, [r13 + spu_thread::pc] + *raw++ = 0x41; *raw++ = 0x8b; -#ifdef _WIN32 - *raw++ = 0x81; -#else - *raw++ = 0x87; -#endif - const u32 pc_off = ::offset32(&spu_thread::pc); - std::memcpy(raw, &pc_off, 4); - raw += 4; + *raw++ = 0x45; + *raw++ = ::narrow(::offset32(&spu_thread::pc)); - // lea r9, [ls + rax] - *raw++ = 0x4c; + // Get LS address starting from PC: lea rcx, [rbp + rax] + *raw++ = 0x48; *raw++ = 0x8d; - *raw++ = 0x0c; -#ifdef _WIN32 - *raw++ = 0x02; -#else - *raw++ = 0x06; -#endif + *raw++ = 0x4c; + *raw++ = 0x05; + *raw++ = 0x00; for (std::size_t i = 0; i < workload.size(); i++) { @@ -580,17 +689,26 @@ bool spu_runtime::add(u64 last_reset_count, void* _where, spu_function_t compile } // Emit 32-bit comparison - verify("Asm overflow" HERE), raw + 12 <= wxptr + size0 * 22; + verify("Asm overflow" HERE), raw + 12 <= wxptr + size0 * 22 + 16; if (w.from != w.level) { - // If necessary (level has advanced), emit load: mov eax, [r9 + addr] - *raw++ = 0x41; - *raw++ = 0x8b; - *raw++ = 0x81; + // If necessary (level has advanced), emit load: mov eax, [rcx + addr] const u32 cmp_lsa = w.level * 4u; - std::memcpy(raw, &cmp_lsa, 4); - raw += 4; + + if (cmp_lsa < 0x80) + { + *raw++ = 0x8b; + *raw++ = 0x41; + *raw++ = ::narrow(cmp_lsa); + } + else + { + *raw++ = 0x8b; + *raw++ = 0x81; + std::memcpy(raw, &cmp_lsa, 4); + raw += 4; + } } // Emit comparison: cmp eax, imm32 @@ -796,20 +914,15 @@ spu_function_t spu_runtime::make_branch_patchpoint(u32 target) const return nullptr; } - // Save address of the following jmp -#ifdef _WIN32 - raw[0] = 0x4c; // lea r8, [rip+1] + // Save address of the following jmp (GHC CC 3rd argument) + raw[0] = 0x4c; // lea r12, [rip+1] raw[1] = 0x8d; - raw[2] = 0x05; -#else - raw[0] = 0x48; // lea rdx, [rip+1] - raw[1] = 0x8d; - raw[2] = 0x15; -#endif + raw[2] = 0x25; raw[3] = 0x01; raw[4] = 0x00; raw[5] = 0x00; raw[6] = 0x00; + raw[7] = 0x90; // nop // Jump to spu_recompiler_base::branch @@ -3992,6 +4105,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator // Add entry function (contains only state/code check) const auto main_func = llvm::cast(m_module->getOrInsertFunction(hash, get_ftype()).getCallee()); const auto main_arg2 = &*(main_func->arg_begin() + 2); + main_func->setCallingConv(CallingConv::GHC); set_function(main_func); // Start compilation @@ -4119,17 +4233,10 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator const auto pbcount = spu_ptr(&spu_thread::block_counter); m_ir->CreateStore(m_ir->CreateAdd(m_ir->CreateLoad(pbcount), m_ir->getInt64(check_iterations)), pbcount); - const auto gateway = llvm::cast(m_module->getOrInsertFunction("spu_chunk_gateway", get_ftype()).getCallee()); - gateway->setLinkage(GlobalValue::InternalLinkage); - gateway->setCallingConv(CallingConv::GHC); - - // Save host thread's stack pointer - const auto native_sp = spu_ptr(&spu_thread::saved_native_sp); - const auto rsp_name = MetadataAsValue::get(m_context, MDNode::get(m_context, {MDString::get(m_context, "rsp")})); - m_ir->CreateStore(m_ir->CreateCall(get_intrinsic(Intrinsic::read_register), {rsp_name}), native_sp); + // Call the entry function chunk + const auto entry_chunk = add_function(m_pos); + tail_chunk(entry_chunk->chunk); - m_ir->CreateCall(gateway, {m_thread, m_lsptr, m_base_pc})->setCallingConv(gateway->getCallingConv()); - m_ir->CreateRetVoid(); m_ir->SetInsertPoint(label_stop); m_ir->CreateRetVoid(); @@ -4139,7 +4246,9 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator { const auto pbfail = spu_ptr(&spu_thread::block_failure); m_ir->CreateStore(m_ir->CreateAdd(m_ir->CreateLoad(pbfail), m_ir->getInt64(1)), pbfail); - call("spu_dispatch", &spu_recompiler_base::dispatch, m_thread, m_ir->getInt32(0), main_arg2)->setTailCall(); + const auto dispci = call("spu_dispatch", spu_runtime::tr_dispatch, m_thread, m_ir->getInt32(0), main_arg2); + dispci->setCallingConv(CallingConv::GHC); + dispci->setTailCall(); m_ir->CreateRetVoid(); } else @@ -4147,17 +4256,12 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator m_ir->CreateUnreachable(); } - set_function(gateway); - - // Call the entry function chunk - const auto entry_chunk = add_function(m_pos); - tail_chunk(entry_chunk->chunk); - // Longjmp analogue (restore saved host thread's stack pointer) const auto escape = llvm::cast(m_module->getOrInsertFunction("spu_escape", get_ftype()).getCallee()); escape->setLinkage(GlobalValue::InternalLinkage); m_ir->SetInsertPoint(BasicBlock::Create(m_context, "", escape)); const auto load_sp = m_ir->CreateLoad(_ptr(&*escape->arg_begin(), ::offset32(&spu_thread::saved_native_sp))); + const auto rsp_name = MetadataAsValue::get(m_context, MDNode::get(m_context, {MDString::get(m_context, "rsp")})); m_ir->CreateCall(get_intrinsic(Intrinsic::write_register), {rsp_name, m_ir->CreateSub(load_sp, m_ir->getInt64(8))}); m_ir->CreateRetVoid(); diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h index 814d84133bc2..944c1f71d831 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.h +++ b/rpcs3/Emu/Cell/SPURecompiler.h @@ -59,6 +59,8 @@ class spu_runtime // Debug module output location std::string m_cache_path; +public: + // Trampoline to spu_recompiler_base::dispatch static const spu_function_t tr_dispatch; @@ -100,6 +102,9 @@ class spu_runtime // All dispatchers (array allocated in jit memory) static atomic_t* const g_dispatcher; + // Recompiler entry point + static const spu_function_t g_gateway; + // Interpreter entry point static spu_function_t g_interpreter; diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp index b303badcb2a0..c2d29a887c5e 100644 --- a/rpcs3/Emu/Cell/SPUThread.cpp +++ b/rpcs3/Emu/Cell/SPUThread.cpp @@ -832,7 +832,7 @@ void spu_thread::cpu_task() } } - spu_runtime::g_dispatcher[0](*this, vm::_ptr(offset), nullptr); + spu_runtime::g_gateway(*this, vm::_ptr(offset), nullptr); } // Print some stats