Skip to content

Commit

Permalink
SPU: implement recompiler gateway function in assembly
Browse files Browse the repository at this point in the history
Use GHC calling convention directly for SPU object entry points.
This may address performance degradation after RPCS3#5923.
  • Loading branch information
Nekotekina committed May 11, 2019
1 parent e89f5ed commit 5c53ff6
Show file tree
Hide file tree
Showing 3 changed files with 176 additions and 67 deletions.
236 changes: 170 additions & 66 deletions rpcs3/Emu/Cell/SPURecompiler.cpp
Expand Up @@ -4,6 +4,8 @@
#include "Emu/Memory/vm.h"
#include "Crypto/sha1.h"
#include "Utilities/StrUtil.h"
#include "Utilities/JIT.h"
#include "Utilities/sysinfo.h"

#include "SPUThread.h"
#include "SPUAnalyser.h"
Expand All @@ -24,39 +26,154 @@ const spu_decoder<spu_iflag> s_spu_iflag;

extern u64 get_timebased_time();

// Move 4 args for calling native function from a GHC calling convention function
static u8* move_args_ghc_to_native(u8* raw)
{
#ifdef _WIN32
// mov rcx, r13
// mov rdx, rbp
// mov r8, r12
// mov r9, rbx
std::memcpy(raw, "\x4C\x89\xE9\x48\x89\xEA\x4D\x89\xE0\x49\x89\xD9", 12);
#else
// mov rdi, r13
// mov rsi, rbp
// mov rdx, r12
// mov rcx, rbx
std::memcpy(raw, "\x4C\x89\xEF\x48\x89\xEE\x4C\x89\xE2\x48\x89\xD9", 12);
#endif

return raw + 12;
}

DECLARE(spu_runtime::tr_dispatch) = []
{
// Generate a special trampoline to spu_recompiler_base::dispatch with pause instruction
u8* const trptr = jit_runtime::alloc(16, 16);
trptr[0] = 0xf3; // pause
trptr[1] = 0x90;
trptr[2] = 0xff; // jmp [rip]
trptr[3] = 0x25;
std::memset(trptr + 4, 0, 4);
u8* const trptr = jit_runtime::alloc(32, 16);
u8* raw = move_args_ghc_to_native(trptr);
*raw++ = 0xf3; // pause
*raw++ = 0x90;
*raw++ = 0xff; // jmp [rip]
*raw++ = 0x25;
std::memset(raw, 0, 4);
const u64 target = reinterpret_cast<u64>(&spu_recompiler_base::dispatch);
std::memcpy(trptr + 8, &target, 8);
std::memcpy(raw + 4, &target, 8);
return reinterpret_cast<spu_function_t>(trptr);
}();

DECLARE(spu_runtime::tr_branch) = []
{
// Generate a trampoline to spu_recompiler_base::branch
u8* const trptr = jit_runtime::alloc(16, 16);
trptr[0] = 0xff; // jmp [rip]
trptr[1] = 0x25;
std::memset(trptr + 2, 0, 4);
u8* const trptr = jit_runtime::alloc(32, 16);
u8* raw = move_args_ghc_to_native(trptr);
*raw++ = 0xff; // jmp [rip]
*raw++ = 0x25;
std::memset(raw, 0, 4);
const u64 target = reinterpret_cast<u64>(&spu_recompiler_base::branch);
std::memcpy(trptr + 6, &target, 8);
std::memcpy(raw + 4, &target, 8);
return reinterpret_cast<spu_function_t>(trptr);
}();

DECLARE(spu_runtime::g_dispatcher) = []
{
const auto ptr = reinterpret_cast<decltype(spu_runtime::g_dispatcher)>(jit_runtime::alloc(sizeof(spu_function_t), 8, false));
ptr->raw() = &spu_recompiler_base::dispatch;
ptr->raw() = tr_dispatch;
return ptr;
}();

DECLARE(spu_runtime::g_gateway) = build_function_asm<spu_function_t>([](asmjit::X86Assembler& c, auto& args)
{
// Gateway for SPU dispatcher, converts from native to GHC calling convention, also saves RSP value for spu_escape
using namespace asmjit;

#ifdef _WIN32
c.push(x86::r15);
c.push(x86::r14);
c.push(x86::r13);
c.push(x86::r12);
c.push(x86::rsi);
c.push(x86::rdi);
c.push(x86::rbp);
c.push(x86::rbx);
c.sub(x86::rsp, 0xa8);
c.movaps(x86::oword_ptr(x86::rsp, 0x90), x86::xmm15);
c.movaps(x86::oword_ptr(x86::rsp, 0x80), x86::xmm14);
c.movaps(x86::oword_ptr(x86::rsp, 0x70), x86::xmm13);
c.movaps(x86::oword_ptr(x86::rsp, 0x60), x86::xmm12);
c.movaps(x86::oword_ptr(x86::rsp, 0x50), x86::xmm11);
c.movaps(x86::oword_ptr(x86::rsp, 0x40), x86::xmm10);
c.movaps(x86::oword_ptr(x86::rsp, 0x30), x86::xmm9);
c.movaps(x86::oword_ptr(x86::rsp, 0x20), x86::xmm8);
c.movaps(x86::oword_ptr(x86::rsp, 0x10), x86::xmm7);
c.movaps(x86::oword_ptr(x86::rsp, 0), x86::xmm6);
#else
c.push(x86::rbp);
c.push(x86::r15);
c.push(x86::r14);
c.push(x86::r13);
c.push(x86::r12);
c.push(x86::rbx);
c.push(x86::rax);
#endif

// Load g_dispatcher pointer to call g_dispatcher[0]
c.mov(x86::rax, asmjit::imm_ptr(spu_runtime::g_dispatcher));
c.mov(x86::rax, x86::qword_ptr(x86::rax));

// Save native stack pointer for longjmp emulation
c.mov(x86::qword_ptr(args[0], ::offset32(&spu_thread::saved_native_sp)), x86::rsp);

// Move 4 args (despite spu_function_t def)
c.mov(x86::r13, args[0]);
c.mov(x86::rbp, args[1]);
c.mov(x86::r12, args[2]);
c.mov(x86::rbx, args[3]);

if (utils::has_avx())
{
c.vzeroupper();
}

c.call(x86::rax);

if (utils::has_avx())
{
c.vzeroupper();
}

#ifdef _WIN32
c.movaps(x86::xmm6, x86::oword_ptr(x86::rsp, 0));
c.movaps(x86::xmm7, x86::oword_ptr(x86::rsp, 0x10));
c.movaps(x86::xmm8, x86::oword_ptr(x86::rsp, 0x20));
c.movaps(x86::xmm9, x86::oword_ptr(x86::rsp, 0x30));
c.movaps(x86::xmm10, x86::oword_ptr(x86::rsp, 0x40));
c.movaps(x86::xmm11, x86::oword_ptr(x86::rsp, 0x50));
c.movaps(x86::xmm12, x86::oword_ptr(x86::rsp, 0x60));
c.movaps(x86::xmm13, x86::oword_ptr(x86::rsp, 0x70));
c.movaps(x86::xmm14, x86::oword_ptr(x86::rsp, 0x80));
c.movaps(x86::xmm15, x86::oword_ptr(x86::rsp, 0x90));
c.add(x86::rsp, 0xa8);
c.pop(x86::rbx);
c.pop(x86::rbp);
c.pop(x86::rdi);
c.pop(x86::rsi);
c.pop(x86::r12);
c.pop(x86::r13);
c.pop(x86::r14);
c.pop(x86::r15);
#else
c.add(x86::rsp, +8);
c.pop(x86::rbx);
c.pop(x86::r12);
c.pop(x86::r13);
c.pop(x86::r14);
c.pop(x86::r15);
c.pop(x86::rbp);
#endif

c.ret();
});

DECLARE(spu_runtime::g_interpreter) = nullptr;

spu_cache::spu_cache(const std::string& loc)
Expand Down Expand Up @@ -347,7 +464,7 @@ bool spu_runtime::func_compare::operator()(const std::vector<u32>& lhs, const st
spu_runtime::spu_runtime()
{
// Initialize "empty" block
m_map[std::vector<u32>()] = &spu_recompiler_base::dispatch;
m_map[std::vector<u32>()] = tr_dispatch;

// Clear LLVM output
m_cache_path = Emu.PPUCache();
Expand Down Expand Up @@ -412,7 +529,7 @@ bool spu_runtime::add(u64 last_reset_count, void* _where, spu_function_t compile
else
{
// Allocate some writable executable memory
u8* const wxptr = jit_runtime::alloc(size0 * 22 + 11, 16);
u8* const wxptr = jit_runtime::alloc(size0 * 22 + 14, 16);

if (!wxptr)
{
Expand All @@ -425,7 +542,7 @@ bool spu_runtime::add(u64 last_reset_count, void* _where, spu_function_t compile
// Write jump instruction with rel32 immediate
auto make_jump = [&](u8 op, auto target)
{
verify("Asm overflow" HERE), raw + 8 <= wxptr + size0 * 22;
verify("Asm overflow" HERE), raw + 8 <= wxptr + size0 * 22 + 16;

// Fallback to dispatch if no target
const u64 taddr = target ? reinterpret_cast<u64>(target) : reinterpret_cast<u64>(tr_dispatch);
Expand Down Expand Up @@ -460,26 +577,18 @@ bool spu_runtime::add(u64 last_reset_count, void* _where, spu_function_t compile
workload.back().beg = beg;
workload.back().end = _end;

// mov eax, [spu_thread::pc]
// Load PC: mov eax, [r13 + spu_thread::pc]
*raw++ = 0x41;
*raw++ = 0x8b;
#ifdef _WIN32
*raw++ = 0x81;
#else
*raw++ = 0x87;
#endif
const u32 pc_off = ::offset32(&spu_thread::pc);
std::memcpy(raw, &pc_off, 4);
raw += 4;
*raw++ = 0x45;
*raw++ = ::narrow<s8>(::offset32(&spu_thread::pc));

// lea r9, [ls + rax]
*raw++ = 0x4c;
// Get LS address starting from PC: lea rcx, [rbp + rax]
*raw++ = 0x48;
*raw++ = 0x8d;
*raw++ = 0x0c;
#ifdef _WIN32
*raw++ = 0x02;
#else
*raw++ = 0x06;
#endif
*raw++ = 0x4c;
*raw++ = 0x05;
*raw++ = 0x00;

for (std::size_t i = 0; i < workload.size(); i++)
{
Expand Down Expand Up @@ -580,17 +689,26 @@ bool spu_runtime::add(u64 last_reset_count, void* _where, spu_function_t compile
}

// Emit 32-bit comparison
verify("Asm overflow" HERE), raw + 12 <= wxptr + size0 * 22;
verify("Asm overflow" HERE), raw + 12 <= wxptr + size0 * 22 + 16;

if (w.from != w.level)
{
// If necessary (level has advanced), emit load: mov eax, [r9 + addr]
*raw++ = 0x41;
*raw++ = 0x8b;
*raw++ = 0x81;
// If necessary (level has advanced), emit load: mov eax, [rcx + addr]
const u32 cmp_lsa = w.level * 4u;
std::memcpy(raw, &cmp_lsa, 4);
raw += 4;

if (cmp_lsa < 0x80)
{
*raw++ = 0x8b;
*raw++ = 0x41;
*raw++ = ::narrow<s8>(cmp_lsa);
}
else
{
*raw++ = 0x8b;
*raw++ = 0x81;
std::memcpy(raw, &cmp_lsa, 4);
raw += 4;
}
}

// Emit comparison: cmp eax, imm32
Expand Down Expand Up @@ -796,20 +914,15 @@ spu_function_t spu_runtime::make_branch_patchpoint(u32 target) const
return nullptr;
}

// Save address of the following jmp
#ifdef _WIN32
raw[0] = 0x4c; // lea r8, [rip+1]
// Save address of the following jmp (GHC CC 3rd argument)
raw[0] = 0x4c; // lea r12, [rip+1]
raw[1] = 0x8d;
raw[2] = 0x05;
#else
raw[0] = 0x48; // lea rdx, [rip+1]
raw[1] = 0x8d;
raw[2] = 0x15;
#endif
raw[2] = 0x25;
raw[3] = 0x01;
raw[4] = 0x00;
raw[5] = 0x00;
raw[6] = 0x00;

raw[7] = 0x90; // nop

// Jump to spu_recompiler_base::branch
Expand Down Expand Up @@ -3992,6 +4105,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
// Add entry function (contains only state/code check)
const auto main_func = llvm::cast<llvm::Function>(m_module->getOrInsertFunction(hash, get_ftype<void, u8*, u8*, u8*>()).getCallee());
const auto main_arg2 = &*(main_func->arg_begin() + 2);
main_func->setCallingConv(CallingConv::GHC);
set_function(main_func);

// Start compilation
Expand Down Expand Up @@ -4119,17 +4233,10 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
const auto pbcount = spu_ptr<u64>(&spu_thread::block_counter);
m_ir->CreateStore(m_ir->CreateAdd(m_ir->CreateLoad(pbcount), m_ir->getInt64(check_iterations)), pbcount);

const auto gateway = llvm::cast<Function>(m_module->getOrInsertFunction("spu_chunk_gateway", get_ftype<void, u8*, u8*, u32>()).getCallee());
gateway->setLinkage(GlobalValue::InternalLinkage);
gateway->setCallingConv(CallingConv::GHC);

// Save host thread's stack pointer
const auto native_sp = spu_ptr<u64>(&spu_thread::saved_native_sp);
const auto rsp_name = MetadataAsValue::get(m_context, MDNode::get(m_context, {MDString::get(m_context, "rsp")}));
m_ir->CreateStore(m_ir->CreateCall(get_intrinsic<u64>(Intrinsic::read_register), {rsp_name}), native_sp);
// Call the entry function chunk
const auto entry_chunk = add_function(m_pos);
tail_chunk(entry_chunk->chunk);

m_ir->CreateCall(gateway, {m_thread, m_lsptr, m_base_pc})->setCallingConv(gateway->getCallingConv());
m_ir->CreateRetVoid();
m_ir->SetInsertPoint(label_stop);
m_ir->CreateRetVoid();

Expand All @@ -4139,25 +4246,22 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
{
const auto pbfail = spu_ptr<u64>(&spu_thread::block_failure);
m_ir->CreateStore(m_ir->CreateAdd(m_ir->CreateLoad(pbfail), m_ir->getInt64(1)), pbfail);
call("spu_dispatch", &spu_recompiler_base::dispatch, m_thread, m_ir->getInt32(0), main_arg2)->setTailCall();
const auto dispci = call("spu_dispatch", spu_runtime::tr_dispatch, m_thread, m_ir->getInt32(0), main_arg2);
dispci->setCallingConv(CallingConv::GHC);
dispci->setTailCall();
m_ir->CreateRetVoid();
}
else
{
m_ir->CreateUnreachable();
}

set_function(gateway);

// Call the entry function chunk
const auto entry_chunk = add_function(m_pos);
tail_chunk(entry_chunk->chunk);

// Longjmp analogue (restore saved host thread's stack pointer)
const auto escape = llvm::cast<llvm::Function>(m_module->getOrInsertFunction("spu_escape", get_ftype<void, u8*>()).getCallee());
escape->setLinkage(GlobalValue::InternalLinkage);
m_ir->SetInsertPoint(BasicBlock::Create(m_context, "", escape));
const auto load_sp = m_ir->CreateLoad(_ptr<u64>(&*escape->arg_begin(), ::offset32(&spu_thread::saved_native_sp)));
const auto rsp_name = MetadataAsValue::get(m_context, MDNode::get(m_context, {MDString::get(m_context, "rsp")}));
m_ir->CreateCall(get_intrinsic<u64>(Intrinsic::write_register), {rsp_name, m_ir->CreateSub(load_sp, m_ir->getInt64(8))});
m_ir->CreateRetVoid();

Expand Down
5 changes: 5 additions & 0 deletions rpcs3/Emu/Cell/SPURecompiler.h
Expand Up @@ -59,6 +59,8 @@ class spu_runtime
// Debug module output location
std::string m_cache_path;

public:

// Trampoline to spu_recompiler_base::dispatch
static const spu_function_t tr_dispatch;

Expand Down Expand Up @@ -100,6 +102,9 @@ class spu_runtime
// All dispatchers (array allocated in jit memory)
static atomic_t<spu_function_t>* const g_dispatcher;

// Recompiler entry point
static const spu_function_t g_gateway;

// Interpreter entry point
static spu_function_t g_interpreter;

Expand Down
2 changes: 1 addition & 1 deletion rpcs3/Emu/Cell/SPUThread.cpp
Expand Up @@ -832,7 +832,7 @@ void spu_thread::cpu_task()
}
}

spu_runtime::g_dispatcher[0](*this, vm::_ptr<u8>(offset), nullptr);
spu_runtime::g_gateway(*this, vm::_ptr<u8>(offset), nullptr);
}

// Print some stats
Expand Down

0 comments on commit 5c53ff6

Please sign in to comment.