From 8ef7e264cc96bff428cf23a9f1a1753b2f9b6e03 Mon Sep 17 00:00:00 2001 From: Nekotekina Date: Sat, 11 May 2019 20:48:47 +0300 Subject: [PATCH] SPU LLVM: regain some efficiency Avoid returns from the recompiler gateway, favoring tail calls. This may address performance degradation after #5923. --- rpcs3/Emu/Cell/SPURecompiler.cpp | 86 +++++++++++++++++++++++--------- 1 file changed, 62 insertions(+), 24 deletions(-) diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp index 93aaa9805523..32d0dc2fa119 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.cpp +++ b/rpcs3/Emu/Cell/SPURecompiler.cpp @@ -3153,6 +3153,9 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator // Function for check_state execution llvm::Function* m_test_state{}; + // Chunk for external tail call (dispatch) + llvm::Function* m_dispatch{}; + llvm::MDNode* m_md_unlikely; llvm::MDNode* m_md_likely; @@ -3219,11 +3222,15 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator } // Chunk function type - // 0. Result (void) + // 0. Result (tail call target) // 1. Thread context // 2. Local storage pointer // 3. - const auto chunk_type = get_ftype(); +#ifdef _WIN32 + const auto chunk_type = get_ftype(); +#else + const auto chunk_type = get_ftype(); +#endif // Get function chunk name const std::string name = fmt::format("spu-chunk-0x%05x", addr); @@ -3273,9 +3280,18 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator void tail_chunk(llvm::Value* chunk, llvm::Value* base_pc = nullptr) { auto call = m_ir->CreateCall(chunk, {m_thread, m_lsptr, base_pc ? base_pc : m_base_pc}); - call->setCallingConv(m_finfo ? m_finfo->chunk->getCallingConv() : llvm::cast(chunk)->getCallingConv()); + auto func = m_finfo ? m_finfo->chunk : llvm::cast(chunk); + call->setCallingConv(func->getCallingConv()); call->setTailCall(); - m_ir->CreateRetVoid(); + + if (func->getReturnType() == get_type()) + { + m_ir->CreateRetVoid(); + } + else + { + m_ir->CreateRet(call); + } } // Call the real function @@ -3451,7 +3467,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator const auto result = llvm::BasicBlock::Create(m_context, "", m_function); m_ir->SetInsertPoint(result); update_pc(target); - m_ir->CreateRetVoid(); + tail_chunk(m_dispatch); m_ir->SetInsertPoint(cblock); return result; } @@ -4103,7 +4119,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator m_fake_global1 = new llvm::GlobalVariable(*m_module, get_type(), false, llvm::GlobalValue::InternalLinkage, m_ir->getFalse()); // Add entry function (contains only state/code check) - const auto main_func = llvm::cast(m_module->getOrInsertFunction(hash, get_ftype()).getCallee()); + const auto main_func = llvm::cast(m_module->getOrInsertFunction(hash, get_ftype()).getCallee()); const auto main_arg2 = &*(main_func->arg_begin() + 2); main_func->setCallingConv(CallingConv::GHC); set_function(main_func); @@ -4235,7 +4251,23 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator // Call the entry function chunk const auto entry_chunk = add_function(m_pos); - tail_chunk(entry_chunk->chunk); + const auto entry_call = m_ir->CreateCall(entry_chunk->chunk, {m_thread, m_lsptr, m_base_pc}); + entry_call->setCallingConv(entry_chunk->chunk->getCallingConv()); + +#ifdef _WIN32 + // TODO: fix this mess + const auto dispatcher = m_ir->CreateIntToPtr(m_ir->getInt64((u64)+spu_runtime::g_dispatcher), get_type()); +#else + const auto dispatcher = new llvm::GlobalVariable(*m_module, get_type(), true, GlobalValue::ExternalLinkage, nullptr, "spu_dispatcher"); + m_engine->addGlobalMapping("spu_dispatcher", (u64)+spu_runtime::g_dispatcher); +#endif + + // Proceed to the next code + const auto next_func = entry_chunk->chunk->getReturnType() == get_type() ? m_ir->CreateLoad(dispatcher) : llvm::cast(entry_call); + const auto next_call = m_ir->CreateCall(m_ir->CreateBitCast(next_func, main_func->getType()), {m_thread, m_lsptr, m_ir->getInt64(0)}); + next_call->setCallingConv(main_func->getCallingConv()); + next_call->setTailCall(); + m_ir->CreateRetVoid(); m_ir->SetInsertPoint(label_stop); m_ir->CreateRetVoid(); @@ -4246,7 +4278,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator { const auto pbfail = spu_ptr(&spu_thread::block_failure); m_ir->CreateStore(m_ir->CreateAdd(m_ir->CreateLoad(pbfail), m_ir->getInt64(1)), pbfail); - const auto dispci = call("spu_dispatch", spu_runtime::tr_dispatch, m_thread, m_ir->getInt32(0), main_arg2); + const auto dispci = call("spu_dispatch", spu_runtime::tr_dispatch, m_thread, m_lsptr, main_arg2); dispci->setCallingConv(CallingConv::GHC); dispci->setTailCall(); m_ir->CreateRetVoid(); @@ -4256,6 +4288,20 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator m_ir->CreateUnreachable(); } + m_dispatch = cast(module->getOrInsertFunction("spu-null", entry_chunk->chunk->getFunctionType()).getCallee()); + m_dispatch->setLinkage(llvm::GlobalValue::InternalLinkage); + m_dispatch->setCallingConv(entry_chunk->chunk->getCallingConv()); + set_function(m_dispatch); + + if (entry_chunk->chunk->getReturnType() == get_type()) + { + m_ir->CreateRetVoid(); + } + else + { + m_ir->CreateRet(m_ir->CreateLoad(dispatcher)); + } + // Longjmp analogue (restore saved host thread's stack pointer) const auto escape = llvm::cast(m_module->getOrInsertFunction("spu_escape", get_ftype()).getCallee()); escape->setLinkage(GlobalValue::InternalLinkage); @@ -4475,19 +4521,13 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator std::vector chunks; chunks.reserve(m_size / 4); - const auto null = cast(module->getOrInsertFunction("spu-null", entry_chunk->chunk->getFunctionType()).getCallee()); - null->setLinkage(llvm::GlobalValue::InternalLinkage); - null->setCallingConv(entry_chunk->chunk->getCallingConv()); - set_function(null); - m_ir->CreateRetVoid(); - for (u32 i = start; i < end; i += 4) { const auto found = m_functions.find(i); if (found == m_functions.end()) { - chunks.push_back(null); + chunks.push_back(m_dispatch); continue; } @@ -5018,7 +5058,6 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator m_block->block_end = m_ir->GetInsertBlock(); update_pc(); call("spu_unknown", &exec_unk, m_thread, m_ir->getInt32(op_unk.opcode)); - m_ir->CreateRetVoid(); } static bool exec_stop(spu_thread* _spu, u32 code) @@ -5054,12 +5093,11 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator { m_block->block_end = m_ir->GetInsertBlock(); update_pc(m_pos + 4); - m_ir->CreateRetVoid(); - } - else - { - check_state(m_pos + 4); + tail_chunk(m_dispatch); + return; } + + check_state(m_pos + 4); } void STOPD(spu_opcode_t op) // @@ -5847,7 +5885,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator { m_block->block_end = m_ir->GetInsertBlock(); update_pc(m_pos + 4); - m_ir->CreateRetVoid(); + tail_chunk(m_dispatch); } } @@ -7675,7 +7713,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator m_ir->SetInsertPoint(fail); } - m_ir->CreateRetVoid(); + tail_chunk(m_dispatch); m_ir->SetInsertPoint(cblock); return result; } @@ -7814,7 +7852,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator } else { - m_ir->CreateRetVoid(); + tail_chunk(m_dispatch); } } else