rpcs3/Emu/RSX/Common/FragmentProgramDecompiler.cpp

﻿#include "stdafx.h"
#include "Emu/System.h"
#include "../rsx_methods.h"

#include "FragmentProgramDecompiler.h"

#include <algorithm>

// Sets up a decompiler for one fragment program.
//   prog - program description; bound to m_prog, and its ctrl field is copied into m_ctrl.
//   size - caller-provided counter bound to m_size
//          (NOTE(review): presumably m_size is a reference member, so the caller's
//          variable is the one being reset below - confirm against the header).
FragmentProgramDecompiler::FragmentProgramDecompiler(const RSXFragmentProgram &prog, u32& size)
	: m_size(size)
	, m_prog(prog)
	, m_ctrl(prog.ctrl)
{
	// Decompilation starts with a zero size
	m_size = 0;
}

// Emits the destination write for the instruction currently being decoded.
//   code  - shader expression for the result; may contain $-placeholders that
//           Format() expands when the line is added.
//   flags - OPFLAGS bits controlling casting and masking; stored into opflags.
// Steps, in order: bail out if the instruction can never execute, apply the
// src1.scale multiplier/divisor, apply exp_tex expansion / half cast / saturate /
// precision clamp, then emit either a condition-register-only write (no_dest) or
// a conditional register assignment, and finally tag the written temp register.
void FragmentProgramDecompiler::SetDst(std::string code, u32 flags)
{
	// No execution condition bit is set: the instruction never runs, emit nothing
	if (!src0.exec_if_eq && !src0.exec_if_gr && !src0.exec_if_lt) return;

	if (src1.scale)
	{
		std::string modifier;
		switch (src1.scale)
		{
		case 0: break;
		case 1: code = "(" + code + " * "; modifier = "2."; break;
		case 2: code = "(" + code + " * "; modifier = "4."; break;
		case 3: code = "(" + code + " * "; modifier = "8."; break;
		case 5: code = "(" + code + " / "; modifier = "2."; break;
		case 6: code = "(" + code + " / "; modifier = "4."; break;
		case 7: code = "(" + code + " / "; modifier = "8."; break;

		default:
			rsx_log.error("Bad scale: %d", u32{ src1.scale });
			break;
		}

		// Only touch the expression when a valid scale was selected. An unknown
		// scale leaves 'code' untouched; wrapping an empty modifier in a half cast
		// would otherwise append "half()" plus an unbalanced ')' to the expression.
		if (!modifier.empty())
		{
			if (flags & OPFLAGS::skip_type_cast && dst.fp16 && device_props.has_native_half_support)
			{
				// Result stays in native half; keep the scale constant in the same type
				modifier = getHalfTypeName(1) + "(" + modifier + ")";
			}

			code = code + modifier + ")";
		}
	}

	if (!dst.no_dest)
	{
		if (dst.exp_tex)
		{
			//Expand [0,1] to [-1, 1]. Confirmed by Castlevania: LOS
			AddCode("//exp tex flag is set");
			code = "((" + code + "- 0.5) * 2.)";
		}

		if (dst.fp16 && device_props.has_native_half_support && !(flags & OPFLAGS::skip_type_cast))
		{
			// Cast to native data type
			code = ClampValue(code, 1);
		}

		if (dst.saturate)
		{
			code = ClampValue(code, 4);
		}
		else if (dst.prec)
		{
			switch (dst.opcode)
			{
			// These opcodes are exempt from output precision clamping
			case RSX_FP_OPCODE_NRM:
			case RSX_FP_OPCODE_MAX:
			case RSX_FP_OPCODE_MIN:
			case RSX_FP_OPCODE_COS:
			case RSX_FP_OPCODE_SIN:
			case RSX_FP_OPCODE_REFL:
			case RSX_FP_OPCODE_FRC:
			case RSX_FP_OPCODE_LIT:
			case RSX_FP_OPCODE_LIF:
			case RSX_FP_OPCODE_LG2:
				break;
			case RSX_FP_OPCODE_MOV:
				// NOTE: Sometimes varying inputs from VS are out of range so do not exempt any input types, unless fp16 (Naruto UNS)
				if (dst.fp16 && src0.fp16 && src0.reg_type == RSX_FP_REGISTER_TYPE_TEMP)
					break;
				// Any other MOV is clamped like a regular opcode
				[[fallthrough]];
			default:
			{
				// fp16 precision flag on f32 register; ignore
				if (dst.prec == 1 && !dst.fp16)
					break;

				// Native type already has fp16 clamped (input must have been cast)
				if (dst.prec == 1 && dst.fp16 && device_props.has_native_half_support)
					break;

				// clamp value to allowed range
				code = ClampValue(code, dst.prec);
				break;
			}
			}
		}
	}

	// Remember the flags for the placeholder expansion done by Format()/GetSRC()
	opflags = flags;
	code += (flags & OPFLAGS::no_src_mask) ? "" : "$m";

	if (dst.no_dest)
	{
		// No register destination: either update the condition register or just evaluate the expression
		if (dst.set_cond)
		{
			AddCode("$ifcond " + m_parr.AddParam(PF_PARAM_NONE, getFloatTypeName(4), "cc" + std::to_string(src0.cond_mod_reg_index)) + "$m = " + code + ";");
		}
		else
		{
			AddCode("$ifcond " + code + ";");
		}

		return;
	}

	std::string dest = AddReg(dst.dest_reg, !!dst.fp16) + "$m";

	AddCodeCond(Format(dest), code);

	if (dst.set_cond)
	{
		// Mirror the freshly written destination into the condition register
		AddCode(m_parr.AddParam(PF_PARAM_NONE, getFloatTypeName(4), "cc" + std::to_string(src0.cond_mod_reg_index)) + "$m = " + dest + ";");
	}

	// Half registers are indexed at half the register number when locating the temp_registers entry
	u32 reg_index = dst.fp16 ? dst.dest_reg >> 1 : dst.dest_reg;

	verify(HERE), reg_index < temp_registers.size();
	temp_registers[reg_index].tag(dst.dest_reg, !!dst.fp16, dst.mask_x, dst.mask_y, dst.mask_z, dst.mask_w);
}

// Emits a flow-control statement. Flow operations have no destination, so only
// the execution condition bits of src0 decide how the statement is written:
// unconditionally, commented out, or guarded by the raw condition.
void FragmentProgramDecompiler::AddFlowOp(const std::string& code)
{
	const bool on_gr = !!src0.exec_if_gr;
	const bool on_lt = !!src0.exec_if_lt;
	const bool on_eq = !!src0.exec_if_eq;

	if (on_gr && on_lt && on_eq)
	{
		// Every comparison outcome passes: always executed
		AddCode(code + ";");
	}
	else if (!(on_gr || on_lt || on_eq))
	{
		// No outcome passes: keep the statement only as a comment
		AddCode("//" + code + ";");
	}
	else
	{
		// Genuine conditional: run when any component of the condition holds
		AddCode("if (any(" + GetRawCond() + ")) " + code + ";");
	}
}

// Appends one line to the shader body: the text is run through Format() to
// expand placeholders, indented by m_code_level tabs and newline-terminated.
void FragmentProgramDecompiler::AddCode(const std::string& code)
{
	const std::string line = Format(code) + "\n";

	main.append(m_code_level, '\t');
	main += line;
}

// Returns the destination write mask as a swizzle suffix (e.g. ".xz").
// An empty string is returned when no component or all four components are
// enabled, since no explicit mask is written in either case.
std::string FragmentProgramDecompiler::GetMask()
{
	std::string components;

	if (dst.mask_x) components += 'x';
	if (dst.mask_y) components += 'y';
	if (dst.mask_z) components += 'z';
	if (dst.mask_w) components += 'w';

	if (components.empty() || components == "xyzw")
	{
		return {};
	}

	return "." + components;
}

// Registers temp register "r<index>" (or "h<index>" when fp16) with a zero
// initializer and returns whatever m_parr.AddParam returns for it.
// The native half vector type is used only for fp16 registers on devices
// reporting native half support; otherwise the float vector type is used.
std::string FragmentProgramDecompiler::AddReg(u32 index, bool fp16)
{
	const bool use_native_half = fp16 && device_props.has_native_half_support;
	const std::string vec_type = use_native_half ? getHalfTypeName(4) : getFloatTypeName(4);
	const std::string name = (fp16 ? "h" : "r") + std::to_string(index);
	const std::string zero_init = vec_type + "(0., 0., 0., 0.)";

	return m_parr.AddParam(PF_PARAM_NONE, vec_type, name, zero_init);
}

// Reports whether temp register "r<index>" / "h<index>" has already been
// registered, using the same type selection as AddReg.
bool FragmentProgramDecompiler::HasReg(u32 index, bool fp16)
{
	const bool use_native_half = fp16 && device_props.has_native_half_support;
	const std::string vec_type = use_native_half ? getHalfTypeName(4) : getFloatTypeName(4);
	const std::string name = (fp16 ? "h" : "r") + std::to_string(index);

	return m_parr.HasParam(PF_PARAM_NONE, vec_type, name);
}

// Registers the condition register "cc<N>" selected by src0.cond_reg_index as
// a float vector and returns the result of m_parr.AddParam for it.
std::string FragmentProgramDecompiler::AddCond()
{
	const std::string cond_reg = "cc" + std::to_string(src0.cond_reg_index);

	return m_parr.AddParam(PF_PARAM_NONE, getFloatTypeName(4), cond_reg);
}

std::string FragmentProgramDecompiler::AddConst()
{
	const std::string name = std::string("fc") + std::to_string(m_size + 4 * 4);
	const std::string type = getFloatTypeName(4);

	if (m_parr.HasParam(PF_PARAM_UNIFORM, type, name))
	{
		return name;
	}

	auto data = reinterpret_cast<be_t<u32>*>(static_cast<char*>(m_prog.addr) + m_size + 4 * sizeof(u32));
	m_offset = 2 * 4 * sizeof(u32);
	u32 x = GetData(data[0]);
	u32 y = GetData(data[1]);
	u32 z = GetData(data[2]);
	u32 w = GetData(data[3]);

	const auto var = fmt::format("%s(%f, %f, %f, %f)", type, std::bit_cast<f32>(x), std::bit_cast<f32>(y), std::bit_cast<f32>(z), std::bit_cast<f32>(w));
	return m_parr.AddParam(PF_PARAM_UNIFORM, type, name, var);
}

std::string FragmentProgramDecompiler::AddTex()
{
	properties.has_tex_op = true;

	std::string sampler;
	switch (m_prog.get_texture_dimension(dst.tex_num))
	{
	case rsx::texture_dimension_extended::texture_dimension_1d:
		sampler = "sampler1D";
		break;
	case rsx::texture_dimension_extended::texture_dimension_cubemap:
		sampler = "samplerCube";
		break;
	case rsx::texture_dimension_extended::texture_dimension_2d:
		sampler = "sampler2D";
		break;
	case rsx::texture_dimension_extended::texture_dimension_3d:
		sampler = "sampler3D";
		break;
	}

	opflags |= OPFLAGS::texture_ref;
	return m_parr.AddParam(PF_PARAM_UNIFORM, sampler, std::string("tex") + std::to_string(dst.tex_num));
}

// Registers the placeholder variable "src3" (initialized to all ones) that
// stands in for the unknown source register type 3.
std::string FragmentProgramDecompiler::AddType3()
{
	const std::string vec_type = getFloatTypeName(4);
	const std::string ones = vec_type + "(1., 1., 1., 1.)";

	return m_parr.AddParam(PF_PARAM_NONE, vec_type, "src3", ones);
}

// Registers the zero-initialized temporary "x2d" used by the bump-environment
// texture ops.
std::string FragmentProgramDecompiler::AddX2d()
{
	const std::string vec_type = getFloatTypeName(4);
	const std::string zeros = vec_type + "(0., 0., 0., 0.)";

	return m_parr.AddParam(PF_PARAM_NONE, vec_type, "x2d", zeros);
}

// Wraps `code` in the clamp that corresponds to the given precision modifier
// and returns the resulting expression. Modifiers that require no clamping
// (and unexpected values, which are logged) return `code` unchanged.
//
// FP16 overflows much sooner (+-65504) than FP32 (+-3.4E38).
// See http://http.download.nvidia.com/developer/Papers/2005/FP_Specials/FP_Specials.pdf
std::string FragmentProgramDecompiler::ClampValue(const std::string& code, u32 precision)
{
	if (precision >= 2 && precision <= 4)
	{
		// These modifiers emit precision_clamp(); make sure it gets defined
		properties.has_clamp = true;
	}

	switch (precision)
	{
	case RSX_FP_PRECISION_HALF:
		return "clamp16(" + code + ")";
	case RSX_FP_PRECISION_FIXED12:
		return "precision_clamp(" + code + ", -2., 2.)";
	case RSX_FP_PRECISION_FIXED9:
		return "precision_clamp(" + code + ", -1., 1.)";
	case RSX_FP_PRECISION_SATURATE:
		return "precision_clamp(" + code + ", 0., 1.)";
	case RSX_FP_PRECISION_REAL:
		// Full 32-bit precision, nothing to do
	case RSX_FP_PRECISION_UNKNOWN:
		// Hw tests show no effect on the input, same as full precision
		return code;
	default:
		rsx_log.error("Unexpected precision modifier (%d)\n", precision);
		return code;
	}
}

// Returns true when exactly one destination component is enabled in the write
// mask, i.e. the destination takes a scalar.
bool FragmentProgramDecompiler::DstExpectsSca()
{
	const int enabled_components =
		(dst.mask_x ? 1 : 0) +
		(dst.mask_y ? 1 : 0) +
		(dst.mask_z ? 1 : 0) +
		(dst.mask_w ? 1 : 0);

	return enabled_components == 1;
}

std::string FragmentProgramDecompiler::Format(const std::string& code, bool ignore_redirects)
{
	const std::pair<std::string, std::function<std::string()>> repl_list[] =
	{
		{ "$$", []() -> std::string { return "$"; } },
		{ "$0", [this]() -> std::string {return GetSRC<SRC0>(src0);} },
		{ "$1", [this]() -> std::string {return GetSRC<SRC1>(src1);} },
		{ "$2", [this]() -> std::string {return GetSRC<SRC2>(src2);} },
		{ "$t", [this]() -> std::string { return "tex" + std::to_string(dst.tex_num);} },
		{ "$_i", [this]() -> std::string {return std::to_string(dst.tex_num);} },
		{ "$m", std::bind(std::mem_fn(&FragmentProgramDecompiler::GetMask), this) },
		{ "$ifcond ", [this]() -> std::string
			{
				const std::string& cond = GetCond();
				if (cond == "true") return "";
				return "if(" + cond + ") ";
			}
		},
		{ "$cond", std::bind(std::mem_fn(&FragmentProgramDecompiler::GetCond), this) },
		{ "$_c", std::bind(std::mem_fn(&FragmentProgramDecompiler::AddConst), this) },
		{ "$float4", [this]() -> std::string { return getFloatTypeName(4); } },
		{ "$float3", [this]() -> std::string { return getFloatTypeName(3); } },
		{ "$float2", [this]() -> std::string { return getFloatTypeName(2); } },
		{ "$float_t", [this]() -> std::string { return getFloatTypeName(1); } },
		{ "$half4", [this]() -> std::string { return getHalfTypeName(4); } },
		{ "$half3", [this]() -> std::string { return getHalfTypeName(3); } },
		{ "$half2", [this]() -> std::string { return getHalfTypeName(2); } },
		{ "$half_t", [this]() -> std::string { return getHalfTypeName(1); } },
		{ "$Ty", [this]() -> std::string { return (!device_props.has_native_half_support || !dst.fp16)? getFloatTypeName(4) : getHalfTypeName(4); } }
	};

	if (!ignore_redirects)
	{
		//Special processing redirects
		switch (dst.opcode)
		{
		case RSX_FP_OPCODE_TEXBEM:
		case RSX_FP_OPCODE_TXPBEM:
		{
			//Redirect parameter 0 to the x2d temp register for TEXBEM
			//TODO: Organize this a little better
			std::pair<std::string, std::string> repl[] = { { "$0", "x2d" } };
			std::string result = fmt::replace_all(code, repl);

			return fmt::replace_all(result, repl_list);
		}
		}
	}

	return fmt::replace_all(code, repl_list);
}

// Builds the per-component comparison of the (swizzled) condition register
// against zero that matches the exec_if_* bits of src0, and returns it as a
// shader expression. The comparison falls back to "equal" when neither the
// greater nor the less bit is set.
std::string FragmentProgramDecompiler::GetRawCond()
{
	static const char components[4] = { 'x', 'y', 'z', 'w' };

	std::string swizzle;
	swizzle += components[src0.cond_swizzle_x];
	swizzle += components[src0.cond_swizzle_y];
	swizzle += components[src0.cond_swizzle_z];
	swizzle += components[src0.cond_swizzle_w];

	// The identity swizzle is omitted
	const std::string suffix = (swizzle == "xyzw") ? std::string() : "." + swizzle;

	const bool on_gr = !!src0.exec_if_gr;
	const bool on_lt = !!src0.exec_if_lt;
	const bool on_eq = !!src0.exec_if_eq;

	auto func = COMPARE::FUNCTION_SEQ;
	if (on_gr && on_eq)
		func = COMPARE::FUNCTION_SGE;
	else if (on_lt && on_eq)
		func = COMPARE::FUNCTION_SLE;
	else if (on_gr && on_lt)
		func = COMPARE::FUNCTION_SNE;
	else if (on_gr)
		func = COMPARE::FUNCTION_SGT;
	else if (on_lt)
		func = COMPARE::FUNCTION_SLT;

	const std::string zero = getFloatTypeName(4) + "(0., 0., 0., 0.)";
	return compareFunction(func, AddCond() + suffix, zero);
}

// Returns the execution condition of the current instruction as a boolean
// shader expression: "true" when all exec_if_* bits are set, "false" when none
// are, otherwise any(<raw condition>).
std::string FragmentProgramDecompiler::GetCond()
{
	const bool on_gr = !!src0.exec_if_gr;
	const bool on_lt = !!src0.exec_if_lt;
	const bool on_eq = !!src0.exec_if_eq;

	if (on_gr && on_lt && on_eq)
	{
		return "true";
	}

	if (!(on_gr || on_lt || on_eq))
	{
		return "false";
	}

	return "any(" + GetRawCond() + ")";
}

// Emits "lhs = rhs" subject to the execution condition of the current
// instruction: a plain assignment when always executed, a commented-out line
// when never executed, otherwise a per-component _select() on the condition.
void FragmentProgramDecompiler::AddCodeCond(const std::string& lhs, const std::string& rhs)
{
	const bool on_gr = !!src0.exec_if_gr;
	const bool on_lt = !!src0.exec_if_lt;
	const bool on_eq = !!src0.exec_if_eq;

	if (on_gr && on_lt && on_eq)
	{
		AddCode(lhs + " = " + rhs + ";");
		return;
	}

	if (!(on_gr || on_lt || on_eq))
	{
		AddCode("//" + lhs + " = " + rhs + ";");
		return;
	}

	std::string cast_prefix;
	if (device_props.has_native_half_support && !this->dst.fp16)
	{
		// The destination is full-width but the sources may be native half.
		// Plain "vecX a = f16vecX b" is normally fine, but mixing widths inside a
		// mix/lerp style select, e.g. mix(f32, f16, bvec), trips operator overload
		// resolution in shader compilers.
		// NOTE: With an fp16 destination the sources have already been cast to match.

		const auto references = [&rhs](const char* token) { return rhs.find(token) != umax; };
		const auto is_half_temp = [](const auto& src)
		{
			return src.fp16 && src.reg_type == RSX_FP_REGISTER_TYPE_TEMP;
		};

		// Texture sample operations are full-width and are exempt, as are ops whose inputs get upcast anyway
		const bool eligible = (opflags & (OPFLAGS::texture_ref | OPFLAGS::src_cast_f32)) == 0;

		// Operand N+1 is only inspected when operand N is referenced and is a half temp
		const bool sources_are_half =
			eligible &&
			references("$0") && is_half_temp(src0) &&
			(!references("$1") || (is_half_temp(src1) &&
			(!references("$2") || is_half_temp(src2))));

		if (sources_are_half)
		{
			if (rhs[0] != '(')
			{
				// Function-style expression: upcast its inputs instead
				opflags |= OPFLAGS::src_cast_f32;
			}
			else
			{
				// Parenthesized expression: a single cast of the result is enough
				cast_prefix = "$Ty";
			}
		}
	}

	// NOTE: x = _select(x, y, cond) is equivalent to x = cond? y : x;
	const auto target = ShaderVariable(lhs);
	const auto masked_cond = target.add_mask(GetRawCond());
	const auto sized_cond = target.match_size(masked_cond);
	AddCode(lhs + " = _select(" + lhs + ", " + cast_prefix + rhs + ", " + sized_cond + ");");
}

// Builds the shader expression for one source operand of the current instruction.
//   src - decoded source word (SRC0/SRC1/SRC2); its reg_type selects between a
//         temp register, an input attribute, an inline constant or the unknown type 3.
// The register expression is resolved first, then the operand swizzle is
// appended (unless it is the identity "xyzw"), followed in this order by abs,
// the input precision clamp taken from src1.input_prec_mod (when still enabled)
// and negation. Side effects: declares the referenced params/registers and
// updates properties (gather op, w access, input register mask).
template<typename T> std::string FragmentProgramDecompiler::GetSRC(T src)
{
	std::string ret;
	// The clamp is requested by src1 for all operands; individual branches below may disable it
	bool apply_precision_modifier = !!src1.input_prec_mod;

	switch (src.reg_type)
	{
	case RSX_FP_REGISTER_TYPE_TEMP:

		if (!src.fp16)
		{
			// Unpack ops reading a full-width register may need its contents gathered from half writes
			if (dst.opcode == RSX_FP_OPCODE_UP16 ||
				dst.opcode == RSX_FP_OPCODE_UP2 ||
				dst.opcode == RSX_FP_OPCODE_UP4 ||
				dst.opcode == RSX_FP_OPCODE_UPB ||
				dst.opcode == RSX_FP_OPCODE_UPG)
			{
				auto &reg = temp_registers[src.tmp_reg_index];
				if (reg.requires_gather(src.swizzle_x))
				{
					properties.has_gather_op = true;
					// Make sure the register is declared even though the gather expression is used instead
					AddReg(src.tmp_reg_index, src.fp16);
					ret = getFloatTypeName(4) + reg.gather_r();
					break;
				}
			}
		}
		else if (src1.input_prec_mod == RSX_FP_PRECISION_HALF)
		{
			// clamp16() is not a cheap operation when emulated; avoid at all costs
			apply_precision_modifier = false;
		}

		ret += AddReg(src.tmp_reg_index, src.fp16);

		if (opflags & OPFLAGS::src_cast_f32 && src.fp16 && device_props.has_native_half_support)
		{
			// Upconvert if there is a chance for ambiguity
			ret = getFloatTypeName(4) + "(" + ret + ")";
		}

		break;

	case RSX_FP_REGISTER_TYPE_INPUT:
	{
		// Input attribute names indexed by dst.src_attr_reg_num
		static const std::string reg_table[] =
		{
			"wpos",
			"diff_color", "spec_color",
			"fogc",
			"tc0", "tc1", "tc2", "tc3", "tc4", "tc5", "tc6", "tc7", "tc8", "tc9",
			"ssa"
		};

		// NOTE: Hw testing showed the following:
		// 1. Reading from registers 1 and 2 (COL0 and COL1) is clamped to (0, 1)
		// 2. Reading from registers 4-12 (inclusive) is not clamped, but..
		// 3. If the texcoord control mask is enabled, the last 2 values are always 0 and hpos.w!
		const std::string reg_var = (dst.src_attr_reg_num < std::size(reg_table))? reg_table[dst.src_attr_reg_num] : "unk";
		// Whether reg_var has to be declared as a shader input after the switch
		bool insert = true;

		switch (dst.src_attr_reg_num)
		{
		case 0x00:
		{
			// WPOS
			ret += reg_table[0];
			insert = false;
			break;
		}
		case 0x01:
		case 0x02:
		{
			// COL0, COL1
			if (!src2.use_index_reg)
			{
				// Already saturated, no further precision clamp is applied
				ret += "_saturate(" + reg_var + ")";
				apply_precision_modifier = false;
			}
			else
			{
				// Raw access
				ret += reg_var;
			}
			break;
		}
		case 0x03:
		{
			// FOGC
			if (!src2.use_index_reg)
			{
				ret += reg_var;
			}
			else
			{
				// Raw access
				ret += "fog_c";
			}
			break;
		}
		case 0x4:
		case 0x5:
		case 0x6:
		case 0x7:
		case 0x8:
		case 0x9:
		case 0xA:
		case 0xB:
		case 0xC:
		case 0xD:
		{
			// TEX0 - TEX9
			// Texcoord 2d mask seems to reset the last 2 arguments to 0 and w if set
			const u8 texcoord = u8(dst.src_attr_reg_num) - 4;
			if (m_prog.texcoord_is_point_coord(texcoord))
			{
				// Point sprite coord generation. Stacks with the 2D override mask.
				if (m_prog.texcoord_is_2d(texcoord))
				{
					ret += getFloatTypeName(4) + "(gl_PointCoord, 0., in_w)";
					properties.has_w_access = true;
				}
				else
				{
					ret += getFloatTypeName(4) + "(gl_PointCoord, 1., 0.)";
				}
			}
			else if (m_prog.texcoord_is_2d(texcoord))
			{
				ret += getFloatTypeName(4) + "(" + reg_var + ".xy, 0., in_w)";
				properties.has_w_access = true;
			}
			else
			{
				ret += reg_var;
			}
			break;
		}
		default:
		{
			// SSA (winding direction register)
			// UNK
			if (reg_var == "unk")
			{
				rsx_log.error("Bad src reg num: %d", u32{ dst.src_attr_reg_num });
			}

			ret += reg_var;
			apply_precision_modifier = false;
			break;
		}
		}

		if (insert)
		{
			m_parr.AddParam(PF_PARAM_IN, getFloatTypeName(4), reg_var);
		}

		// Record which input register this shader reads
		properties.in_register_mask |= (1 << dst.src_attr_reg_num);
	}
	break;

	case RSX_FP_REGISTER_TYPE_CONSTANT:
		// Inline constant; never precision-clamped
		ret += AddConst();
		apply_precision_modifier = false;
		break;

	case RSX_FP_REGISTER_TYPE_UNKNOWN: // ??? Used by a few games, what is it?
		rsx_log.error("Src type 3 used, opcode=0x%X, dst=0x%X s0=0x%X s1=0x%X s2=0x%X",
				dst.opcode, dst.HEX, src0.HEX, src1.HEX, src2.HEX);

		ret += AddType3();
		apply_precision_modifier = false;
		break;

	default:
		rsx_log.error("Bad src type %d", u32{ src.reg_type });
		Emu.Pause();
		break;
	}

	static const char f[4] = { 'x', 'y', 'z', 'w' };

	std::string swizzle;
	swizzle += f[src.swizzle_x];
	swizzle += f[src.swizzle_y];
	swizzle += f[src.swizzle_z];
	swizzle += f[src.swizzle_w];

	// Only append the swizzle when it is not the identity "xyzw"
	if (strncmp(swizzle.c_str(), f, 4) != 0) ret += "." + swizzle;

	// Warning: Modifier order matters. e.g neg should be applied after precision clamping (tested with Naruto UNS)
	if (src.abs) ret = "abs(" + ret + ")";
	if (apply_precision_modifier) ret = ClampValue(ret, src1.input_prec_mod);
	if (src.neg) ret = "-" + ret;

	return ret;
}

std::string FragmentProgramDecompiler::BuildCode()
{
	// Shader validation
	// Shader must at least write to one output for the body to be considered valid

	const bool fp16_out = !(m_ctrl & CELL_GCM_SHADER_CONTROL_32_BITS_EXPORTS);
	const std::string float4_type = (fp16_out && device_props.has_native_half_support)? getHalfTypeName(4) : getFloatTypeName(4);
	const std::string init_value = float4_type + "(0., 0., 0., 0.)";
	std::array<std::string, 4> output_register_names;
	std::array<u32, 4> ouput_register_indices = { 0, 2, 3, 4 };
	bool shader_is_valid = false;

	// Check depth export
	if (m_ctrl & CELL_GCM_SHADER_CONTROL_DEPTH_EXPORT)
	{
		// Hw tests show that the depth export register is default-initialized to 0 and not wpos.z!!
		m_parr.AddParam(PF_PARAM_NONE, getFloatTypeName(4), "r1", init_value);
		shader_is_valid = (!!temp_registers[1].h1_writes);
	}

	// Add the color output registers. They are statically written to and have guaranteed initialization (except r1.z which == wpos.z)
	// This can be used instead of an explicit clear pass in some games (Motorstorm)
	if (!fp16_out)
	{
		output_register_names = { "r0", "r2", "r3", "r4" };
	}
	else
	{
		output_register_names = { "h0", "h4", "h6", "h8" };
	}

	for (int n = 0; n < 4; ++n)
	{
		if (!m_parr.HasParam(PF_PARAM_NONE, float4_type, output_register_names[n]))
		{
			m_parr.AddParam(PF_PARAM_NONE, float4_type, output_register_names[n], init_value);
			continue;
		}

		const auto block_index = ouput_register_indices[n];
		shader_is_valid |= (!!temp_registers[block_index].h0_writes);
	}

	if (!shader_is_valid)
	{
		properties.has_no_output = true;

		if (!properties.has_discard_op)
		{
			// NOTE: Discard operation overrides output
			rsx_log.warning("Shader does not write to any output register and will be NOPed");
			main = "/*" + main + "*/";
		}
	}

	std::stringstream OS;
	insertHeader(OS);
	OS << "\n";
	insertConstants(OS);
	OS << "\n";
	insertInputs(OS);
	OS << "\n";
	insertOutputs(OS);
	OS << "\n";

	// Insert global function definitions
	insertGlobalFunctions(OS);

	std::string float4 = getFloatTypeName(4);
	const bool glsl = float4 == "vec4";

	if (properties.has_clamp)
	{
		std::string precision_func =
		"$float4 precision_clamp($float4 x, float _min, float _max)\n"
		"{\n"
		"	// Treat NaNs as 0\n"
		"	bvec4 nans = isnan(x);\n"
		"	x = _select(x, $float4(0., 0., 0., 0.), nans);\n"
		"	return clamp(x, _min, _max);\n"
		"}\n\n";

		if (device_props.has_native_half_support)
		{
			precision_func +=
			"$half4 precision_clamp($half4 x, float _min, float _max)\n"
			"{\n"
			"	// Treat NaNs as 0\n"
			"	bvec4 nans = isnan(x);\n"
			"	x = _select(x, $half4(0., 0., 0., 0.), nans);\n"
			"	return clamp(x, $half_t(_min), $half_t(_max));\n"
			"}\n\n";
		}

		OS << Format(precision_func);
	}

	if (!device_props.has_native_half_support)
	{
		// Accurate float to half clamping (preserves IEEE-754 NaN)
		std::string clamp_func =
		"$float4 clamp16($float4 x)\n"
		"{\n";

		if (glsl)
		{
			clamp_func +=
			"	uvec4 bits = floatBitsToUint(x);\n"
			"	uvec4 extend = uvec4(0x7f800000);\n"
			"	bvec4 test = equal(bits & extend, extend);\n"
			"	vec4 clamped = clamp(x, -65504., +65504.);\n"
			"	return _select(clamped, x, test);\n";
		}
		else
		{
			clamp_func +=
			"	if (!isnan(x.x) && !isinf(x.x)) x.x = clamp(x.x, -65504., +65504.);\n"
			"	if (!isnan(x.x) && !isinf(x.x)) x.x = clamp(x.x, -65504., +65504.);\n"
			"	if (!isnan(x.x) && !isinf(x.x)) x.x = clamp(x.x, -65504., +65504.);\n"
			"	if (!isnan(x.x) && !isinf(x.x)) x.x = clamp(x.x, -65504., +65504.);\n"
			"	return x;\n";
		}

		clamp_func +=
		"}\n\n";

		OS << Format(clamp_func);
	}
	else
	{
		// Define raw casts from f32->f16
		OS <<
		"#define clamp16(x) " << getHalfTypeName(4) << "(x)\n";
	}

	OS <<
	"#define _builtin_lit lit_legacy\n"
	"#define _builtin_log2 log2\n"
	"#define _builtin_normalize(x) (length(x) > 0? normalize(x) : x)\n" // HACK!! Workaround for some games that generate NaNs unless texture filtering exactly matches PS3 (BFBC)
	"#define _builtin_sqrt(x) sqrt(abs(x))\n"
	"#define _builtin_rcp(x) (1. / x)\n"
	"#define _builtin_rsq(x) (1. / _builtin_sqrt(x))\n"
	"#define _builtin_div(x, y) (x / y)\n\n";

	if (properties.has_divsq)
	{
		// Define RSX-compliant DIVSQ
		// If the numerator is 0, the result is always 0 even if the denominator is 0
		// NOTE: This operation is component-wise and cannot be accelerated with lerp/mix because these always return NaN if any of the choices is NaN
		std::string divsq_func =
			"$float4 _builtin_divsq($float4 a, float b)\n"
			"{\n"
			"	$float4 tmp = a / _builtin_sqrt(b);\n"
			"	$float4 choice = abs(a);\n";

		if (glsl)
		{
			divsq_func +=
				"	return _select(a, tmp, greaterThan(choice, vec4(0.)));\n";
		}
		else
		{
			divsq_func +=
				"	if (choice.x > 0.) a.x = tmp.x;\n"
				"	if (choice.y > 0.) a.y = tmp.y;\n"
				"	if (choice.z > 0.) a.z = tmp.z;\n"
				"	if (choice.w > 0.) a.w = tmp.w;\n"
				"	return a;\n";
		}

		divsq_func +=
			"}\n\n";

		OS << Format(divsq_func);
	}

	// Declare register gather/merge if needed
	if (properties.has_gather_op)
	{
		std::string float2 = getFloatTypeName(2);

		OS << float4 << " gather(" << float4 << " _h0, " << float4 << " _h1)\n";
		OS << "{\n";
		OS << "	float x = uintBitsToFloat(packHalf2x16(_h0.xy));\n";
		OS << "	float y = uintBitsToFloat(packHalf2x16(_h0.zw));\n";
		OS << "	float z = uintBitsToFloat(packHalf2x16(_h1.xy));\n";
		OS << "	float w = uintBitsToFloat(packHalf2x16(_h1.zw));\n";
		OS << "	return " << float4 << "(x, y, z, w);\n";
		OS << "}\n\n";

		OS << float2 << " gather(" << float4 << " _h)\n";
		OS << "{\n";
		OS << "	float x = uintBitsToFloat(packHalf2x16(_h.xy));\n";
		OS << "	float y = uintBitsToFloat(packHalf2x16(_h.zw));\n";
		OS << "	return " << float2 << "(x, y);\n";
		OS << "}\n\n";
	}

	insertMainStart(OS);
	OS << main << std::endl;
	insertMainEnd(OS);

	return OS.str();
}

// Handles the arithmetic/scalar opcode group: emits the expression for `opcode`
// through SetDst() and returns true, or returns false when the opcode is not
// one of the cases below. Some cases additionally record a property
// (has_divsq, has_lit_op) after emitting.
bool FragmentProgramDecompiler::handle_sct_scb(u32 opcode)
{
	// Compliance notes based on HW tests:
	// DIV is IEEE compliant as is MUL, LG2, EX2. LG2 with negative input returns NaN as expected.
	// DIVSQ is not compliant. Result is 0 if numerator is 0 regardless of denominator
	// RSQ(0) and RCP(0) return INF as expected
	// RSQ ignores the sign of the inputs (Metro Last Light, GTA4)
	// SAT modifier flushes NaNs to 0
	// Some games that rely on broken DIVSQ behaviour include Dark Souls II and Super Puzzle Fighter II Turbo HD Remix

	switch (opcode)
	{
	case RSX_FP_OPCODE_ADD: SetDst("($0 + $1)"); return true;
	case RSX_FP_OPCODE_DIV: SetDst("_builtin_div($0, $1.x)"); return true;
	case RSX_FP_OPCODE_DIVSQ:
		SetDst("_builtin_divsq($0, $1.x)");
		// Tells BuildCode() to emit the _builtin_divsq helper
		properties.has_divsq = true;
		return true;
	case RSX_FP_OPCODE_DP2: SetDst(getFunction(FUNCTION::FUNCTION_DP2), OPFLAGS::op_extern); return true;
	case RSX_FP_OPCODE_DP3: SetDst(getFunction(FUNCTION::FUNCTION_DP3), OPFLAGS::op_extern); return true;
	case RSX_FP_OPCODE_DP4: SetDst(getFunction(FUNCTION::FUNCTION_DP4), OPFLAGS::op_extern); return true;
	case RSX_FP_OPCODE_DP2A: SetDst(getFunction(FUNCTION::FUNCTION_DP2A), OPFLAGS::op_extern); return true;
	case RSX_FP_OPCODE_MAD: SetDst("fma($0, $1, $2)", OPFLAGS::src_cast_f32); return true;
	case RSX_FP_OPCODE_MAX: SetDst("max($0, $1)", OPFLAGS::src_cast_f32); return true;
	case RSX_FP_OPCODE_MIN: SetDst("min($0, $1)", OPFLAGS::src_cast_f32); return true;
	case RSX_FP_OPCODE_MOV: SetDst("$0"); return true;
	case RSX_FP_OPCODE_MUL: SetDst("($0 * $1)"); return true;
	case RSX_FP_OPCODE_RCP: SetDst("_builtin_rcp($0.x).xxxx"); return true;
	case RSX_FP_OPCODE_RSQ: SetDst("_builtin_rsq($0.x).xxxx"); return true;
	// Set-on-compare ops: the comparison result is converted to the destination vector type ($Ty)
	case RSX_FP_OPCODE_SEQ: SetDst("$Ty(" + compareFunction(COMPARE::FUNCTION_SEQ, "$0", "$1") + ")", OPFLAGS::op_extern); return true;
	case RSX_FP_OPCODE_SFL: SetDst(getFunction(FUNCTION::FUNCTION_SFL), OPFLAGS::skip_type_cast); return true;
	case RSX_FP_OPCODE_SGE: SetDst("$Ty(" + compareFunction(COMPARE::FUNCTION_SGE, "$0", "$1") + ")", OPFLAGS::op_extern); return true;
	case RSX_FP_OPCODE_SGT: SetDst("$Ty(" + compareFunction(COMPARE::FUNCTION_SGT, "$0", "$1") + ")", OPFLAGS::op_extern); return true;
	case RSX_FP_OPCODE_SLE: SetDst("$Ty(" + compareFunction(COMPARE::FUNCTION_SLE, "$0", "$1") + ")", OPFLAGS::op_extern); return true;
	case RSX_FP_OPCODE_SLT: SetDst("$Ty(" + compareFunction(COMPARE::FUNCTION_SLT, "$0", "$1") + ")", OPFLAGS::op_extern); return true;
	case RSX_FP_OPCODE_SNE: SetDst("$Ty(" + compareFunction(COMPARE::FUNCTION_SNE, "$0", "$1") + ")", OPFLAGS::op_extern); return true;
	case RSX_FP_OPCODE_STR: SetDst(getFunction(FUNCTION::FUNCTION_STR), OPFLAGS::skip_type_cast); return true;

	// SCB-only ops
	case RSX_FP_OPCODE_COS: SetDst("cos($0.xxxx)"); return true;
	case RSX_FP_OPCODE_DST: SetDst("distance($0, $1).xxxx", OPFLAGS::src_cast_f32); return true;
	case RSX_FP_OPCODE_REFL: SetDst(getFunction(FUNCTION::FUNCTION_REFL), OPFLAGS::op_extern); return true;
	case RSX_FP_OPCODE_EX2: SetDst("exp2($0.xxxx)"); return true;
	case RSX_FP_OPCODE_FLR: SetDst("floor($0)"); return true;
	case RSX_FP_OPCODE_FRC: SetDst(getFunction(FUNCTION::FUNCTION_FRACT)); return true;
	case RSX_FP_OPCODE_LIT:
		SetDst("_builtin_lit($0)");
		properties.has_lit_op = true;
		return true;
	case RSX_FP_OPCODE_LIF: SetDst("$Ty(1.0, $0.y, ($0.y > 0 ? pow(2.0, $0.w) : 0.0), 1.0)", OPFLAGS::op_extern); return true;
	case RSX_FP_OPCODE_LRP: SetDst("$Ty($2 * (1 - $0) + $1 * $0)", OPFLAGS::skip_type_cast); return true;
	case RSX_FP_OPCODE_LG2: SetDst("_builtin_log2($0.x).xxxx"); return true;
		// Pack operations. See https://www.khronos.org/registry/OpenGL/extensions/NV/NV_fragment_program.txt
	case RSX_FP_OPCODE_PK2: SetDst(getFloatTypeName(4) + "(uintBitsToFloat(packHalf2x16($0.xy)))"); return true;
	case RSX_FP_OPCODE_PK4: SetDst(getFloatTypeName(4) + "(uintBitsToFloat(packSnorm4x8($0)))"); return true;
	case RSX_FP_OPCODE_PK16: SetDst(getFloatTypeName(4) + "(uintBitsToFloat(packSnorm2x16($0.xy)))"); return true;
	case RSX_FP_OPCODE_PKG:
		// Should be similar to PKB but with gamma correction, see description of PK4UBG in khronos page
		// (currently shares the PKB implementation below, without gamma correction)
	case RSX_FP_OPCODE_PKB: SetDst(getFloatTypeName(4) + "(uintBitsToFloat(packUnorm4x8($0)))"); return true;
	case RSX_FP_OPCODE_SIN: SetDst("sin($0.xxxx)"); return true;
	}
	// Not an opcode of this group
	return false;
}

// Decompiles the instructions handled by this unit: screen-space derivatives, normalize,
// bump-environment mapping, the texture sampling variants and the unpack operations.
//
// opcode: combined opcode value as computed by the caller.
// Returns true when the opcode was recognised and a destination write was emitted.
// Returns false when the opcode is not handled here, or when it is a texture op and the
// dimension reported for the texture unit matches none of the known cases.
bool FragmentProgramDecompiler::handle_tex_srb(u32 opcode)
{
	using dimension = rsx::texture_dimension_extended;
	using function_id = decltype(FUNCTION::FUNCTION_TEXTURE_SAMPLE1D);

	// Shared prologue of the *BEM sampling ops: registers x2d and emits the coordinate transform
	const auto emit_bem_coords = [this]()
	{
		AddX2d();
		AddCode(Format("x2d = $0.xyxy + $1.xxxx * $2.xzxz + $1.yyyy * $2.ywyw;", true));
	};

	// Sampling ops without any shadow/depth special-casing (TXD, TXB, TXL) only differ in the
	// function selected per texture dimension, so they share this dispatcher.
	const auto emit_plain_sample = [this](function_id fn_1d, function_id fn_2d, function_id fn_cube, function_id fn_3d) -> bool
	{
		AddTex();
		switch (m_prog.get_texture_dimension(dst.tex_num))
		{
		case dimension::texture_dimension_1d:
			SetDst(getFunction(fn_1d));
			return true;
		case dimension::texture_dimension_2d:
			SetDst(getFunction(fn_2d));
			m_2d_sampled_textures |= (1 << dst.tex_num);
			return true;
		case dimension::texture_dimension_cubemap:
			SetDst(getFunction(fn_cube));
			return true;
		case dimension::texture_dimension_3d:
			SetDst(getFunction(fn_3d));
			return true;
		}
		return false;
	};

	switch (opcode)
	{
	case RSX_FP_OPCODE_DDX: SetDst(getFunction(FUNCTION::FUNCTION_DFDX)); return true;
	case RSX_FP_OPCODE_DDY: SetDst(getFunction(FUNCTION::FUNCTION_DFDY)); return true;
	case RSX_FP_OPCODE_NRM: SetDst("_builtin_normalize($0.xyz).xyzz", OPFLAGS::src_cast_f32); return true;
	case RSX_FP_OPCODE_BEM: SetDst("$0.xyxy + $1.xxxx * $2.xzxz + $1.yyyy * $2.ywyw"); return true;

	case RSX_FP_OPCODE_TEXBEM:
		//Untested, should be x2d followed by TEX
		emit_bem_coords();
		[[fallthrough]];
	case RSX_FP_OPCODE_TEX:
	{
		AddTex();
		const auto unit_mask = (1 << dst.tex_num);

		switch (m_prog.get_texture_dimension(dst.tex_num))
		{
		case dimension::texture_dimension_1d:
			SetDst(getFunction(FUNCTION::FUNCTION_TEXTURE_SAMPLE1D));
			return true;
		case dimension::texture_dimension_2d:
			// Shadow lookups take priority and are tracked separately from regular 2D lookups
			if (m_prog.shadow_textures & unit_mask)
			{
				m_shadow_sampled_textures |= unit_mask;
				SetDst(getFunction(FUNCTION::FUNCTION_TEXTURE_SHADOW2D) + ".xxxx");
				return true;
			}

			if (m_prog.redirected_textures & unit_mask)
				SetDst(getFunction(FUNCTION::FUNCTION_TEXTURE_SAMPLE2D_DEPTH_RGBA));
			else
				SetDst(getFunction(FUNCTION::FUNCTION_TEXTURE_SAMPLE2D));

			m_2d_sampled_textures |= unit_mask;
			return true;
		case dimension::texture_dimension_cubemap:
			SetDst(getFunction(FUNCTION::FUNCTION_TEXTURE_SAMPLECUBE));
			return true;
		case dimension::texture_dimension_3d:
			SetDst(getFunction(FUNCTION::FUNCTION_TEXTURE_SAMPLE3D));
			return true;
		}
		return false;
	}

	case RSX_FP_OPCODE_TXPBEM:
		//Untested, should be x2d followed by TXP
		emit_bem_coords();
		[[fallthrough]];
	case RSX_FP_OPCODE_TXP:
	{
		AddTex();
		const auto unit_mask = (1 << dst.tex_num);

		switch (m_prog.get_texture_dimension(dst.tex_num))
		{
		case dimension::texture_dimension_1d:
			SetDst(getFunction(FUNCTION::FUNCTION_TEXTURE_SAMPLE1D_PROJ));
			return true;
		case dimension::texture_dimension_2d:
			//Note shadow comparison only returns a true/false result!
			// NOTE(review): unlike TEX, this path neither records the unit in m_2d_sampled_textures
			// nor checks redirected_textures - confirm whether that asymmetry is intended.
			if (m_prog.shadow_textures & unit_mask)
			{
				m_shadow_sampled_textures |= unit_mask;
				SetDst(getFunction(FUNCTION::FUNCTION_TEXTURE_SHADOW2D_PROJ) + ".xxxx");
			}
			else
			{
				SetDst(getFunction(FUNCTION::FUNCTION_TEXTURE_SAMPLE2D_PROJ));
			}
			return true;
		case dimension::texture_dimension_cubemap:
			SetDst(getFunction(FUNCTION::FUNCTION_TEXTURE_SAMPLECUBE_PROJ));
			return true;
		case dimension::texture_dimension_3d:
			SetDst(getFunction(FUNCTION::FUNCTION_TEXTURE_SAMPLE3D_PROJ));
			return true;
		}
		return false;
	}

	case RSX_FP_OPCODE_TXD:
		return emit_plain_sample(
			FUNCTION::FUNCTION_TEXTURE_SAMPLE1D_GRAD,
			FUNCTION::FUNCTION_TEXTURE_SAMPLE2D_GRAD,
			FUNCTION::FUNCTION_TEXTURE_SAMPLECUBE_GRAD,
			FUNCTION::FUNCTION_TEXTURE_SAMPLE3D_GRAD);

	case RSX_FP_OPCODE_TXB:
		return emit_plain_sample(
			FUNCTION::FUNCTION_TEXTURE_SAMPLE1D_BIAS,
			FUNCTION::FUNCTION_TEXTURE_SAMPLE2D_BIAS,
			FUNCTION::FUNCTION_TEXTURE_SAMPLECUBE_BIAS,
			FUNCTION::FUNCTION_TEXTURE_SAMPLE3D_BIAS);

	case RSX_FP_OPCODE_TXL:
		return emit_plain_sample(
			FUNCTION::FUNCTION_TEXTURE_SAMPLE1D_LOD,
			FUNCTION::FUNCTION_TEXTURE_SAMPLE2D_LOD,
			FUNCTION::FUNCTION_TEXTURE_SAMPLECUBE_LOD,
			FUNCTION::FUNCTION_TEXTURE_SAMPLE3D_LOD);

	// Unpack operations. See https://www.khronos.org/registry/OpenGL/extensions/NV/NV_fragment_program.txt
	case RSX_FP_OPCODE_UP2: SetDst("unpackHalf2x16(floatBitsToUint($0.x)).xyxy"); return true;
	case RSX_FP_OPCODE_UP4: SetDst("unpackSnorm4x8(floatBitsToUint($0.x))"); return true;
	case RSX_FP_OPCODE_UP16: SetDst("unpackSnorm2x16(floatBitsToUint($0.x)).xyxy"); return true;
	case RSX_FP_OPCODE_UPG:
	// Same as UPB with gamma correction (the gamma step is not emitted here)
	case RSX_FP_OPCODE_UPB: SetDst("(unpackUnorm4x8(floatBitsToUint($0.x)))"); return true;
	}

	return false;
}

std::string FragmentProgramDecompiler::Decompile()
{
	auto data = static_cast<be_t<u32>*>(m_prog.addr);
	m_size = 0;
	m_location = 0;
	m_loop_count = 0;
	m_code_level = 1;

	enum
	{
		FORCE_NONE,
		FORCE_SCT,
		FORCE_SCB,
	};

	int forced_unit = FORCE_NONE;

	while (true)
	{
		for (auto found = std::find(m_end_offsets.begin(), m_end_offsets.end(), m_size);
			found != m_end_offsets.end();
			found = std::find(m_end_offsets.begin(), m_end_offsets.end(), m_size))
		{
			m_end_offsets.erase(found);
			m_code_level--;
			AddCode("}");
			m_loop_count--;
		}

		for (auto found = std::find(m_else_offsets.begin(), m_else_offsets.end(), m_size);
			found != m_else_offsets.end();
			found = std::find(m_else_offsets.begin(), m_else_offsets.end(), m_size))
		{
			m_else_offsets.erase(found);
			m_code_level--;
			AddCode("}");
			AddCode("else");
			AddCode("{");
			m_code_level++;
		}

		dst.HEX = GetData(data[0]);
		src0.HEX = GetData(data[1]);
		src1.HEX = GetData(data[2]);
		src2.HEX = GetData(data[3]);

		m_offset = 4 * sizeof(u32);
		opflags = 0;

		const u32 opcode = dst.opcode | (src1.opcode_is_branch << 6);

		auto SIP = [&]()
		{
			switch (opcode)
			{
			case RSX_FP_OPCODE_BRK:
				if (m_loop_count) AddFlowOp("break");
				else rsx_log.error("BRK opcode found outside of a loop");
				break;
			case RSX_FP_OPCODE_CAL:
				rsx_log.error("Unimplemented SIP instruction: CAL");
				break;
			case RSX_FP_OPCODE_FENCT:
				AddCode("//FENCT");
				forced_unit = FORCE_SCT;
				break;
			case RSX_FP_OPCODE_FENCB:
				AddCode("//FENCB");
				forced_unit = FORCE_SCB;
				break;
			case RSX_FP_OPCODE_IFE:
				AddCode("if($cond)");
				if (src2.end_offset != src1.else_offset)
					m_else_offsets.push_back(src1.else_offset << 2);
				m_end_offsets.push_back(src2.end_offset << 2);
				AddCode("{");
				m_code_level++;
				break;
			case RSX_FP_OPCODE_LOOP:
				if (!src0.exec_if_eq && !src0.exec_if_gr && !src0.exec_if_lt)
				{
					AddCode(fmt::format("//$ifcond for(int i%u = %u; i%u < %u; i%u += %u) {} //-> %u //LOOP",
						m_loop_count, src1.init_counter, m_loop_count, src1.end_counter, m_loop_count, src1.increment, src2.end_offset));
				}
				else
				{
					AddCode(fmt::format("$ifcond for(int i%u = %u; i%u < %u; i%u += %u) //LOOP",
						m_loop_count, src1.init_counter, m_loop_count, src1.end_counter, m_loop_count, src1.increment));
					m_loop_count++;
					m_end_offsets.push_back(src2.end_offset << 2);
					AddCode("{");
					m_code_level++;
				}
				break;
			case RSX_FP_OPCODE_REP:
				if (!src0.exec_if_eq && !src0.exec_if_gr && !src0.exec_if_lt)
				{
					AddCode(fmt::format("//$ifcond for(int i%u = %u; i%u < %u; i%u += %u) {} //-> %u //REP",
						m_loop_count, src1.init_counter, m_loop_count, src1.end_counter, m_loop_count, src1.increment, src2.end_offset));
				}
				else
				{
					AddCode(fmt::format("if($cond) for(int i%u = %u; i%u < %u; i%u += %u) //REP",
						m_loop_count, src1.init_counter, m_loop_count, src1.end_counter, m_loop_count, src1.increment));
					m_loop_count++;
					m_end_offsets.push_back(src2.end_offset << 2);
					AddCode("{");
					m_code_level++;
				}
				break;
			case RSX_FP_OPCODE_RET:
				AddFlowOp("return");
				break;

			default:
				return false;
			}

			return true;
		};

		switch (opcode)
		{
		case RSX_FP_OPCODE_NOP: break;
		case RSX_FP_OPCODE_KIL:
			properties.has_discard_op = true;
			AddFlowOp("_kill()");
			break;

		default:
			int prev_force_unit = forced_unit;

			// Some instructions do not respect forced unit
			// Tested with Tales of Vesperia
			if (SIP()) break;
			if (handle_tex_srb(opcode)) break;

			// FENCT/FENCB do not actually reject instructions if they dont match the forced unit
			// Looks like they are optimization hints and not hard-coded forced paths
			if (handle_sct_scb(opcode)) break;
			forced_unit = FORCE_NONE;

			rsx_log.error("Unknown/illegal instruction: 0x%x (forced unit %d)", opcode, prev_force_unit);
			break;
		}

		m_size += m_offset;

		if (dst.end) break;

		verify(HERE), m_offset % sizeof(u32) == 0;
		data += m_offset / sizeof(u32);
	}

	while (m_code_level > 1)
	{
		rsx_log.error("Hanging block found at end of shader. Malformed shader?");

		m_code_level--;
		AddCode("}");
	}

	// flush m_code_level
	m_code_level = 1;
	std::string m_shader = BuildCode();
	main.clear();
	//	m_parr.params.clear();
	return m_shader;
}