From c3d75daad4e5ab4aa962bd8e66cdf3abbcdb4dea Mon Sep 17 00:00:00 2001 From: Yichao Yu Date: Sun, 28 May 2017 11:34:13 -0400 Subject: [PATCH] Add AArch32 and AArch64 CPU name and feature detection code Also implement internal runtime API. The detection code avoids using `/proc/cpuinfo` whenever possible and should be much more reliable than the one in LLVM. It also contains a much larger CPUID table to decode CPU names. Compared to X86, the feature encoding/decoding is more complex due to the way LLVM takes attributes. Certain information (arch version) also needs to be moved between name and feature list. --- src/features_aarch32.h | 28 + src/features_aarch64.h | 25 + src/processor.cpp | 36 +- src/processor.h | 6 + src/processor_arm.cpp | 1443 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 1504 insertions(+), 34 deletions(-) create mode 100644 src/features_aarch32.h create mode 100644 src/features_aarch64.h create mode 100644 src/processor_arm.cpp diff --git a/src/features_aarch32.h b/src/features_aarch32.h new file mode 100644 index 0000000000000..803d576c61548 --- /dev/null +++ b/src/features_aarch32.h @@ -0,0 +1,28 @@ +// This file is a part of Julia. 
License is MIT: https://julialang.org/license + +// AArch32 features definition +// hwcap +JL_FEATURE_DEF(neon, 12, 0) +JL_FEATURE_DEF(vfp3, 13, 0) +// JL_FEATURE_DEF(vfpv3d16, 14, 0) // d16 +JL_FEATURE_DEF(vfp4, 16, 0) +JL_FEATURE_DEF_NAME(hwdiv_arm, 17, 0, "hwdiv-arm") +JL_FEATURE_DEF(hwdiv, 18, 0) +JL_FEATURE_DEF(d32, 19, 0) // -d16 + +// hwcap2 +JL_FEATURE_DEF(crypto, 32 + 0, 0) +JL_FEATURE_DEF(crc, 32 + 4, 0) +// JL_FEATURE_DEF(ras, 32 + ???, 0) +// JL_FEATURE_DEF(fullfp16, 32 + ???, 0) + +// custom bits to match llvm model +JL_FEATURE_DEF(aclass, 32 * 2 + 0, 0) +JL_FEATURE_DEF(rclass, 32 * 2 + 1, 0) +JL_FEATURE_DEF(mclass, 32 * 2 + 2, 0) +JL_FEATURE_DEF(v7, 32 * 2 + 3, 0) +JL_FEATURE_DEF(v8, 32 * 2 + 4, 0) +JL_FEATURE_DEF(v8_1a, 32 * 2 + 5, 0) +JL_FEATURE_DEF(v8_2a, 32 * 2 + 6, 0) +JL_FEATURE_DEF(v8_3a, 32 * 2 + 7, 60000) +JL_FEATURE_DEF(v8_m_main, 32 * 2 + 8, 0) diff --git a/src/features_aarch64.h b/src/features_aarch64.h new file mode 100644 index 0000000000000..1cb869f06c4f0 --- /dev/null +++ b/src/features_aarch64.h @@ -0,0 +1,25 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +// AArch64 features definition +// hwcap +JL_FEATURE_DEF(crypto, 3, 0) +JL_FEATURE_DEF(crc, 7, 0) +JL_FEATURE_DEF(lse, 8, 40000) // ARMv8.1-Atomics +JL_FEATURE_DEF(fullfp16, 9, 0) +JL_FEATURE_DEF(rdm, 12, 50000) // ARMv8.1-SIMD +JL_FEATURE_DEF(jscvt, 13, UINT32_MAX) // Linux Kernel HWCAP name +JL_FEATURE_DEF(fcma, 14, UINT32_MAX) // Linux Kernel HWCAP name +JL_FEATURE_DEF(rcpc, 15, 60000) +JL_FEATURE_DEF(dcpop, 16, UINT32_MAX) // Linux Kernel HWCAP name +// JL_FEATURE_DEF(dotprod, ???, 60000) // ARMv8.2-DotProd +// JL_FEATURE_DEF(ras, ???, 0) +// JL_FEATURE_DEF(sve, ???, UINT32_MAX) + +// hwcap2 +// JL_FEATURE_DEF(?, 32 + ?, 0) + +// custom bits to match llvm model +JL_FEATURE_DEF(v8_1a, 32 * 2 + 0, 0) +JL_FEATURE_DEF(v8_2a, 32 * 2 + 1, 0) +JL_FEATURE_DEF(v8_3a, 32 * 2 + 2, 60000) +// JL_FEATURE_DEF(v8_4a, 32 * 2 + 3, ???) 
diff --git a/src/processor.cpp b/src/processor.cpp index f08fef5db4c79..ba5072efadf6a 100644 --- a/src/processor.cpp +++ b/src/processor.cpp @@ -797,41 +797,9 @@ static inline void dump_cpu_spec(uint32_t cpu, const FeatureList &features, #include "processor_x86.cpp" -#elif defined(_CPU_AARCH64_) +#elif defined(_CPU_AARCH64_) || defined(_CPU_ARM_) -// TODO -JL_DLLEXPORT jl_value_t *jl_get_cpu_name(void) -{ - return jl_cstr_to_string(jl_get_cpu_name_llvm().c_str()); -} - -// FZ, bit [24] -static const uint32_t fpcr_fz_mask = 1 << 24; - -static inline uint32_t get_fpcr_aarch64(void) -{ - uint32_t fpcr; - asm volatile("mrs %0, fpcr" : "=r"(fpcr)); - return fpcr; -} - -static inline void set_fpcr_aarch64(uint32_t fpcr) -{ - asm volatile("msr fpcr, %0" :: "r"(fpcr)); -} - -extern "C" JL_DLLEXPORT int32_t jl_get_zero_subnormals(void) -{ - return (get_fpcr_aarch64() & fpcr_fz_mask) != 0; -} - -extern "C" JL_DLLEXPORT int32_t jl_set_zero_subnormals(int8_t isZero) -{ - uint32_t fpcr = get_fpcr_aarch64(); - fpcr = isZero ? (fpcr | fpcr_fz_mask) : (fpcr & ~fpcr_fz_mask); - set_fpcr_aarch64(fpcr); - return 0; -} +#include "processor_arm.cpp" #else diff --git a/src/processor.h b/src/processor.h index 7b43aaca8a750..66d2b135b3291 100644 --- a/src/processor.h +++ b/src/processor.h @@ -110,6 +110,12 @@ typedef enum { #define JL_FEATURE_DEF(name, bit, llvmver) JL_X86_##name = bit, #include "features_x86.h" #undef JL_FEATURE_DEF +#define JL_FEATURE_DEF(name, bit, llvmver) JL_AArch32_##name = bit, +#include "features_aarch32.h" +#undef JL_FEATURE_DEF +#define JL_FEATURE_DEF(name, bit, llvmver) JL_AArch64_##name = bit, +#include "features_aarch64.h" +#undef JL_FEATURE_DEF } jl_cpu_feature_t; #undef JL_FEATURE_DEF_NAME diff --git a/src/processor_arm.cpp b/src/processor_arm.cpp new file mode 100644 index 0000000000000..5f8f3fae1a3da --- /dev/null +++ b/src/processor_arm.cpp @@ -0,0 +1,1443 @@ +// This file is a part of Julia. 
License is MIT: https://julialang.org/license + +// ARM (AArch32/AArch64) specific processor detection and dispatch + +#include +#include +#include +#include +#include +#include +#include + +#if defined(_CPU_AARCH64_) || __GLIBC_PREREQ(2, 16) +# include +#else +# define DYN_GETAUXVAL +#endif + +namespace ARM { +enum class CPU : uint32_t { + generic = 0, + + // Architecture targets + armv7_a, + armv7_m, + armv7e_m, + armv7_r, + armv8_a, + armv8_m_base, + armv8_m_main, + armv8_r, + armv8_1_a, + armv8_2_a, + armv8_3_a, + // armv8_4_a, + + // ARM + // armv6l + arm_mpcore, + arm_1136jf_s, + arm_1156t2f_s, + arm_1176jzf_s, + arm_cortex_m0, + arm_cortex_m1, + // armv7ml + arm_cortex_m3, + arm_cortex_m4, + arm_cortex_m7, + // armv7l + arm_cortex_a5, + arm_cortex_a7, + arm_cortex_a8, + arm_cortex_a9, + arm_cortex_a12, + arm_cortex_a15, + arm_cortex_a17, + arm_cortex_r4, + arm_cortex_r5, + arm_cortex_r7, + arm_cortex_r8, + // armv8ml + arm_cortex_m23, + arm_cortex_m33, + // armv8l + arm_cortex_a32, + arm_cortex_r52, + // aarch64 + arm_cortex_a35, + arm_cortex_a53, + arm_cortex_a55, + arm_cortex_a57, + arm_cortex_a72, + arm_cortex_a73, + arm_cortex_a75, + + // Cavium + // aarch64 + cavium_thunderx, + cavium_thunderx88, + cavium_thunderx88p1, + cavium_thunderx81, + cavium_thunderx83, + cavium_thunderx2t99, + cavium_thunderx2t99p1, + + // NVIDIA + // aarch64 + nvidia_denver1, + nvidia_denver2, + + // AppliedMicro + // aarch64 + apm_xgene1, + apm_xgene2, + apm_xgene3, + + // Qualcomm + // armv7l + qualcomm_scorpion, + qualcomm_krait, + // aarch64 + qualcomm_kyro, + qualcomm_falkor, + qualcomm_saphira, + + // Samsung + // aarch64 + samsung_exynos_m1, + samsung_exynos_m2, + samsung_exynos_m3, + + // Apple + // armv7l + apple_swift, + // aarch64 + apple_cyclone, + apple_typhoon, + apple_twister, + apple_hurricane, + + // Marvell + // armv7l + marvell_pj4, + + // Intel + // armv7l + intel_3735d, +}; + +#ifdef _CPU_AARCH64_ +static constexpr size_t feature_sz = 3; +static constexpr 
FeatureName feature_names[] = { +#define JL_FEATURE_DEF(name, bit, llvmver) {#name, bit, llvmver}, +#define JL_FEATURE_DEF_NAME(name, bit, llvmver, str) {str, bit, llvmver}, +#include "features_aarch64.h" +#undef JL_FEATURE_DEF +#undef JL_FEATURE_DEF_NAME +}; +static constexpr uint32_t nfeature_names = sizeof(feature_names) / sizeof(FeatureName); + +template +static inline constexpr FeatureList get_feature_masks(Args... args) +{ + return ::get_feature_masks(args...); +} + +#define JL_FEATURE_DEF_NAME(name, bit, llvmver, str) JL_FEATURE_DEF(name, bit, llvmver) +static constexpr auto feature_masks = get_feature_masks( +#define JL_FEATURE_DEF(name, bit, llvmver) bit, +#include "features_aarch64.h" +#undef JL_FEATURE_DEF + -1); +static const auto real_feature_masks = + feature_masks & FeatureList{{(uint32_t)-1, (uint32_t)-1, 0}}; + +namespace Feature { +enum : uint32_t { +#define JL_FEATURE_DEF(name, bit, llvmver) name = bit, +#include "features_aarch64.h" +#undef JL_FEATURE_DEF +}; +#undef JL_FEATURE_DEF_NAME +// This does not cover all dependencies (e.g. the ones that depends on arm versions) +static constexpr FeatureDep deps[] = { + {0, 0} // dummy +}; + +constexpr auto generic = get_feature_masks(); +constexpr auto armv8a_crc = get_feature_masks(crc); +constexpr auto armv8a_crc_crypto = armv8a_crc | get_feature_masks(crypto); +constexpr auto armv8_1a = armv8a_crc | get_feature_masks(v8_1a, lse, rdm); // lor, hpd +constexpr auto armv8_2a = armv8_1a | get_feature_masks(v8_2a); // ras +constexpr auto armv8_2a_crypto = armv8_2a | get_feature_masks(crypto); +constexpr auto armv8_3a = armv8_2a | get_feature_masks(v8_3a, rcpc); +constexpr auto armv8_3a_crypto = armv8_3a | get_feature_masks(crypto); + +constexpr auto arm_cortex_a32 = generic; // TODO? (crc, crypto) +constexpr auto arm_cortex_a35 = generic; // TODO? 
(crc, crypto) +constexpr auto arm_cortex_a53 = armv8a_crc; +constexpr auto arm_cortex_a55 = armv8_2a_crypto | get_feature_masks(rcpc); // dotprod; +constexpr auto arm_cortex_a57 = armv8a_crc; +constexpr auto arm_cortex_a72 = armv8a_crc; +constexpr auto arm_cortex_a73 = armv8a_crc; +constexpr auto arm_cortex_a75 = armv8_2a_crypto | get_feature_masks(rcpc); // dotprod; +constexpr auto cavium_thunderx = armv8a_crc_crypto; +constexpr auto cavium_thunderx88 = armv8a_crc_crypto; +constexpr auto cavium_thunderx88p1 = armv8a_crc_crypto; +constexpr auto cavium_thunderx81 = armv8a_crc_crypto; +constexpr auto cavium_thunderx83 = armv8a_crc_crypto; +constexpr auto cavium_thunderx2t99 = armv8a_crc_crypto | get_feature_masks(v8_1a); +constexpr auto cavium_thunderx2t99p1 = armv8a_crc_crypto | get_feature_masks(v8_1a); +constexpr auto nvidia_denver1 = generic; // TODO? (crc, crypto) +constexpr auto nvidia_denver2 = armv8a_crc_crypto; +constexpr auto apm_xgene1 = generic; +constexpr auto apm_xgene2 = generic; // TODO? +constexpr auto apm_xgene3 = generic; // TODO? 
+constexpr auto qualcomm_kyro = armv8a_crc_crypto; +constexpr auto qualcomm_falkor = armv8a_crc_crypto; +constexpr auto qualcomm_saphira = armv8_3a_crypto; +constexpr auto samsung_exynos_m1 = armv8a_crc_crypto; +constexpr auto samsung_exynos_m2 = armv8a_crc_crypto; +constexpr auto samsung_exynos_m3 = armv8a_crc_crypto; +constexpr auto apple_cyclone = armv8a_crc_crypto; +constexpr auto apple_typhoon = armv8a_crc_crypto; +constexpr auto apple_twister = armv8a_crc_crypto; +constexpr auto apple_hurricane = armv8a_crc_crypto; + +} + +static constexpr CPUSpec cpus[] = { + {"generic", CPU::generic, CPU::generic, 0, Feature::generic}, + {"armv8.1-a", CPU::armv8_1_a, CPU::generic, 0, Feature::armv8_1a}, + {"armv8.2-a", CPU::armv8_2_a, CPU::generic, 0, Feature::armv8_2a}, + {"armv8.3_a", CPU::armv8_3_a, CPU::generic, 0, Feature::armv8_3a}, + {"cortex-a35", CPU::arm_cortex_a35, CPU::generic, 0, Feature::arm_cortex_a35}, + {"cortex-a53", CPU::arm_cortex_a53, CPU::generic, 0, Feature::arm_cortex_a53}, + {"cortex-a55", CPU::arm_cortex_a55, CPU::arm_cortex_a53, UINT32_MAX, Feature::arm_cortex_a55}, + {"cortex-a57", CPU::arm_cortex_a57, CPU::generic, 0, Feature::arm_cortex_a57}, + {"cortex-a72", CPU::arm_cortex_a72, CPU::generic, 0, Feature::arm_cortex_a72}, + {"cortex-a73", CPU::arm_cortex_a73, CPU::generic, 0, Feature::arm_cortex_a73}, + {"cortex-a75", CPU::arm_cortex_a75, CPU::arm_cortex_a73, UINT32_MAX, Feature::arm_cortex_a75}, + {"thunderx", CPU::cavium_thunderx, CPU::generic, 50000, Feature::cavium_thunderx}, + {"thunderxt88", CPU::cavium_thunderx88, CPU::generic, 50000, Feature::cavium_thunderx88}, + {"thunderxt88p1", CPU::cavium_thunderx88p1, CPU::cavium_thunderx88, UINT32_MAX, + Feature::cavium_thunderx88p1}, + {"thunderxt81", CPU::cavium_thunderx81, CPU::generic, 50000, Feature::cavium_thunderx81}, + {"thunderxt83", CPU::cavium_thunderx83, CPU::generic, 50000, Feature::cavium_thunderx83}, + {"thunderx2t99", CPU::cavium_thunderx2t99, CPU::generic, 50000, + 
Feature::cavium_thunderx2t99}, + {"thunderx2t99p1", CPU::cavium_thunderx2t99p1, CPU::cavium_thunderx2t99, UINT32_MAX, + Feature::cavium_thunderx2t99p1}, + {"denver1", CPU::nvidia_denver1, CPU::generic, UINT32_MAX, Feature::nvidia_denver1}, + {"denver2", CPU::nvidia_denver2, CPU::generic, UINT32_MAX, Feature::nvidia_denver2}, + {"xgene1", CPU::apm_xgene1, CPU::generic, UINT32_MAX, Feature::apm_xgene1}, + {"xgene2", CPU::apm_xgene2, CPU::generic, UINT32_MAX, Feature::apm_xgene2}, + {"xgene3", CPU::apm_xgene3, CPU::generic, UINT32_MAX, Feature::apm_xgene3}, + {"kyro", CPU::qualcomm_kyro, CPU::generic, 0, Feature::qualcomm_kyro}, + {"falkor", CPU::qualcomm_falkor, CPU::generic, 40000, Feature::qualcomm_falkor}, + {"saphira", CPU::qualcomm_saphira, CPU::qualcomm_falkor, 60000, Feature::qualcomm_saphira}, + {"exynos-m1", CPU::samsung_exynos_m1, CPU::generic, 0, Feature::samsung_exynos_m1}, + {"exynos-m2", CPU::samsung_exynos_m2, CPU::samsung_exynos_m1, 40000, + Feature::samsung_exynos_m2}, + {"exynos-m3", CPU::samsung_exynos_m3, CPU::samsung_exynos_m2, 40000, + Feature::samsung_exynos_m3}, + {"cyclone", CPU::apple_cyclone, CPU::generic, 0, Feature::apple_cyclone}, + {"typhoon", CPU::apple_typhoon, CPU::apple_cyclone, UINT32_MAX, Feature::apple_typhoon}, + {"twister", CPU::apple_twister, CPU::apple_typhoon, UINT32_MAX, Feature::apple_twister}, + {"hurricane", CPU::apple_hurricane, CPU::apple_twister, UINT32_MAX, Feature::apple_hurricane}, +}; +#else +static constexpr size_t feature_sz = 3; +static constexpr FeatureName feature_names[] = { +#define JL_FEATURE_DEF(name, bit, llvmver) {#name, bit, llvmver}, +#define JL_FEATURE_DEF_NAME(name, bit, llvmver, str) {str, bit, llvmver}, +#include "features_aarch32.h" +#undef JL_FEATURE_DEF +#undef JL_FEATURE_DEF_NAME +}; +static constexpr uint32_t nfeature_names = sizeof(feature_names) / sizeof(FeatureName); + +template +static inline constexpr FeatureList get_feature_masks(Args... 
args) +{ + return ::get_feature_masks(args...); +} + +#define JL_FEATURE_DEF_NAME(name, bit, llvmver, str) JL_FEATURE_DEF(name, bit, llvmver) +static constexpr auto feature_masks = get_feature_masks( +#define JL_FEATURE_DEF(name, bit, llvmver) bit, +#include "features_aarch32.h" +#undef JL_FEATURE_DEF + -1); +static const auto real_feature_masks = + feature_masks & FeatureList{{(uint32_t)-1, (uint32_t)-1, 0}}; + +namespace Feature { +enum : uint32_t { +#define JL_FEATURE_DEF(name, bit, llvmver) name = bit, +#include "features_aarch32.h" +#undef JL_FEATURE_DEF +}; +#undef JL_FEATURE_DEF_NAME +// This does not cover all dependencies (e.g. the ones that depends on arm versions) +static constexpr FeatureDep deps[] = { + {neon, vfp3}, + {vfp4, vfp3}, + {crypto, neon}, +}; + +// These are the real base requirements of the specific architectures +constexpr auto _armv7m = get_feature_masks(v7, mclass, hwdiv); +constexpr auto _armv7a = get_feature_masks(v7, aclass); +constexpr auto _armv7r = get_feature_masks(v7, rclass); +constexpr auto _armv8m = get_feature_masks(v7, v8, mclass, hwdiv); +constexpr auto _armv8a = get_feature_masks(v7, v8, aclass, neon, vfp3, vfp4, d32, + hwdiv, hwdiv_arm); +constexpr auto _armv8r = get_feature_masks(v7, v8, rclass, neon, vfp3, vfp4, d32, + hwdiv, hwdiv_arm); + +// Set `generic` to match the feature requirement of the `C` code. +// we'll require at least these when compiling the sysimg. 
+#if __ARM_ARCH >= 8 +# if !defined(__ARM_ARCH_PROFILE) +constexpr auto generic = get_feature_masks(v7, v8, hwdiv); +# elif __ARM_ARCH_PROFILE == 'A' +constexpr auto generic = _armv8a; +# elif __ARM_ARCH_PROFILE == 'R' +constexpr auto generic = _armv8r; +# elif __ARM_ARCH_PROFILE == 'M' +constexpr auto generic = _armv8m; +# else +constexpr auto generic = get_feature_masks(v7, v8, hwdiv); +# endif +#elif __ARM_ARCH == 7 +# if !defined(__ARM_ARCH_PROFILE) +constexpr auto generic = get_feature_masks(v7); +# elif __ARM_ARCH_PROFILE == 'A' +constexpr auto generic = _armv7a; +# elif __ARM_ARCH_PROFILE == 'R' +constexpr auto generic = _armv7r; +# elif __ARM_ARCH_PROFILE == 'M' +constexpr auto generic = _armv7m; +# else +constexpr auto generic = get_feature_masks(v7); +# endif +#else +constexpr auto generic = get_feature_masks(); +#endif + +// All feature sets below should use or be or'ed with one of these (or generic). +// This makes sure that, for example, the `generic` target on `armv7-a` binary is equivalent +// to the `armv7-a` target. 
+constexpr auto armv7m = generic | _armv7m; +constexpr auto armv7a = generic | _armv7a; +constexpr auto armv7r = generic | _armv7r; +constexpr auto armv8m = generic | _armv8m; +constexpr auto armv8a = generic | _armv8a; +constexpr auto armv8r = generic | _armv8r; + +// armv7l +constexpr auto arm_cortex_a5 = armv7a; +constexpr auto arm_cortex_a7 = armv7a | get_feature_masks(vfp3, vfp4, neon); +constexpr auto arm_cortex_a8 = armv7a | get_feature_masks(d32, vfp3, neon); +constexpr auto arm_cortex_a9 = armv7a; +constexpr auto arm_cortex_a12 = armv7a | get_feature_masks(d32, vfp3, vfp4, neon); +constexpr auto arm_cortex_a15 = armv7a | get_feature_masks(d32, vfp3, vfp4, neon); +constexpr auto arm_cortex_a17 = armv7a | get_feature_masks(d32, vfp3, vfp4, neon); +constexpr auto arm_cortex_r4 = armv7r | get_feature_masks(vfp3, hwdiv); +constexpr auto arm_cortex_r5 = armv7r | get_feature_masks(vfp3, hwdiv, hwdiv_arm); +constexpr auto arm_cortex_r7 = armv7r | get_feature_masks(vfp3, hwdiv, hwdiv_arm); +constexpr auto arm_cortex_r8 = armv7r | get_feature_masks(vfp3, hwdiv, hwdiv_arm); +constexpr auto qualcomm_scorpion = armv7a | get_feature_masks(v7, aclass, vfp3, neon); +constexpr auto qualcomm_krait = armv7a | get_feature_masks(vfp3, vfp4, neon, hwdiv, hwdiv_arm); +constexpr auto apple_swift = armv7a | get_feature_masks(d32, vfp3, vfp4, neon, hwdiv, hwdiv_arm); +constexpr auto marvell_pj4 = armv7a | get_feature_masks(vfp3); +constexpr auto intel_3735d = armv7a | get_feature_masks(vfp3, neon); +// armv8ml +constexpr auto arm_cortex_m23 = armv8m; // unsupported +constexpr auto arm_cortex_m33 = armv8m | get_feature_masks(v8_m_main); // unsupported +// armv8l +constexpr auto armv8a_crc = armv8a | get_feature_masks(crc); +constexpr auto armv8_1a = armv8a_crc | get_feature_masks(v8_1a); +constexpr auto armv8_2a = armv8_1a | get_feature_masks(v8_2a); +constexpr auto armv8a_crc_crypto = armv8a_crc | get_feature_masks(crypto); +constexpr auto armv8_2a_crypto = armv8_2a | 
get_feature_masks(crypto); +constexpr auto armv8_3a = armv8_2a | get_feature_masks(v8_3a); +constexpr auto armv8_3a_crypto = armv8_3a | get_feature_masks(crypto); + +constexpr auto arm_cortex_a32 = armv8a; // TODO? (crc, crypto) +constexpr auto arm_cortex_r52 = armv8r; // TODO? (crc, crypto) +constexpr auto arm_cortex_a35 = armv8a; // TODO? (crc, crypto) +constexpr auto arm_cortex_a53 = armv8a_crc; +constexpr auto arm_cortex_a55 = armv8_2a_crypto; +constexpr auto arm_cortex_a57 = armv8a_crc; +constexpr auto arm_cortex_a72 = armv8a_crc; +constexpr auto arm_cortex_a73 = armv8a_crc; +constexpr auto arm_cortex_a75 = armv8_2a_crypto; +constexpr auto cavium_thunderx = armv8a_crc_crypto; +constexpr auto cavium_thunderx88 = armv8a_crc_crypto; +constexpr auto cavium_thunderx88p1 = armv8a_crc_crypto; +constexpr auto cavium_thunderx81 = armv8a_crc_crypto; +constexpr auto cavium_thunderx83 = armv8a_crc_crypto; +constexpr auto cavium_thunderx2t99 = armv8a_crc_crypto | get_feature_masks(v8_1a); +constexpr auto cavium_thunderx2t99p1 = armv8a_crc_crypto | get_feature_masks(v8_1a); +constexpr auto nvidia_denver1 = armv8a; // TODO? (crc, crypto) +constexpr auto nvidia_denver2 = armv8a_crc_crypto; +constexpr auto apm_xgene1 = armv8a; +constexpr auto apm_xgene2 = armv8a; // TODO? +constexpr auto apm_xgene3 = armv8a; // TODO? 
+constexpr auto qualcomm_kyro = armv8a_crc_crypto; +constexpr auto qualcomm_falkor = armv8a_crc_crypto; +constexpr auto qualcomm_saphira = armv8_3a_crypto; +constexpr auto samsung_exynos_m1 = armv8a_crc_crypto; +constexpr auto samsung_exynos_m2 = armv8a_crc_crypto; +constexpr auto samsung_exynos_m3 = armv8a_crc_crypto; +constexpr auto apple_cyclone = armv8a_crc_crypto; +constexpr auto apple_typhoon = armv8a_crc_crypto; +constexpr auto apple_twister = armv8a_crc_crypto; +constexpr auto apple_hurricane = armv8a_crc_crypto; + +} + +static constexpr CPUSpec cpus[] = { + {"generic", CPU::generic, CPU::generic, 0, Feature::generic}, + // armv6 + {"mpcore", CPU::arm_mpcore, CPU::generic, 0, Feature::generic}, + {"arm1136jf-s", CPU::arm_1136jf_s, CPU::generic, 0, Feature::generic}, + {"arm1156t2f-s", CPU::arm_1156t2f_s, CPU::generic, 0, Feature::generic}, + {"arm1176jzf-s", CPU::arm_1176jzf_s, CPU::generic, 0, Feature::generic}, + {"cortex-m0", CPU::arm_cortex_m0, CPU::generic, 0, Feature::generic}, + {"cortex-m1", CPU::arm_cortex_m1, CPU::generic, 0, Feature::generic}, + // armv7ml + {"armv7-m", CPU::armv7_m, CPU::generic, 0, Feature::armv7m}, + {"armv7e-m", CPU::armv7e_m, CPU::generic, 0, Feature::armv7m}, + {"cortex-m3", CPU::arm_cortex_m3, CPU::generic, 0, Feature::armv7m}, + {"cortex-m4", CPU::arm_cortex_m4, CPU::generic, 0, Feature::armv7m}, + {"cortex-m7", CPU::arm_cortex_m7, CPU::generic, 0, Feature::armv7m}, + // armv7l + {"armv7-a", CPU::armv7_a, CPU::generic, 0, Feature::armv7a}, + {"armv7-r", CPU::armv7_r, CPU::generic, 0, Feature::armv7r}, + {"cortex-a5", CPU::arm_cortex_a5, CPU::generic, 0, Feature::arm_cortex_a5}, + {"cortex-a7", CPU::arm_cortex_a7, CPU::generic, 0, Feature::arm_cortex_a7}, + {"cortex-a8", CPU::arm_cortex_a8, CPU::generic, 0, Feature::arm_cortex_a8}, + {"cortex-a9", CPU::arm_cortex_a9, CPU::generic, 0, Feature::arm_cortex_a9}, + {"cortex-a12", CPU::arm_cortex_a12, CPU::generic, 0, Feature::arm_cortex_a12}, + {"cortex-a15", 
CPU::arm_cortex_a15, CPU::generic, 0, Feature::arm_cortex_a15}, + {"cortex-a17", CPU::arm_cortex_a17, CPU::generic, 0, Feature::arm_cortex_a17}, + {"cortex-r4", CPU::arm_cortex_r4, CPU::generic, 0, Feature::arm_cortex_r4}, + {"cortex-r5", CPU::arm_cortex_r5, CPU::generic, 0, Feature::arm_cortex_r5}, + {"cortex-r7", CPU::arm_cortex_r7, CPU::generic, 0, Feature::arm_cortex_r7}, + {"cortex-r8", CPU::arm_cortex_r8, CPU::generic, 0, Feature::arm_cortex_r8}, + {"scorpion", CPU::qualcomm_scorpion, CPU::armv7_a, UINT32_MAX, Feature::qualcomm_scorpion}, + {"krait", CPU::qualcomm_krait, CPU::generic, 0, Feature::qualcomm_krait}, + {"swift", CPU::apple_swift, CPU::generic, 0, Feature::apple_swift}, + {"pj4", CPU::marvell_pj4, CPU::armv7_a, UINT32_MAX, Feature::marvell_pj4}, + {"3735d", CPU::intel_3735d, CPU::armv7_a, UINT32_MAX, Feature::intel_3735d}, + + // armv8ml + {"armv8-m.base", CPU::armv8_m_base, CPU::generic, 0, Feature::armv8m}, + {"armv8-m.main", CPU::armv8_m_main, CPU::generic, 0, Feature::armv8m}, + {"cortex-m23", CPU::arm_cortex_m23, CPU::armv8_m_base, 50000, Feature::arm_cortex_m23}, + {"cortex-m33", CPU::arm_cortex_m33, CPU::armv8_m_main, 50000, Feature::arm_cortex_m33}, + + // armv8l + {"armv8-a", CPU::armv8_a, CPU::generic, 0, Feature::armv8a}, + {"armv8-r", CPU::armv8_r, CPU::generic, 0, Feature::armv8r}, + {"armv8.1-a", CPU::armv8_1_a, CPU::generic, 0, Feature::armv8_1a}, + {"armv8.2-a", CPU::armv8_2_a, CPU::generic, 0, Feature::armv8_2a}, + {"armv8.3-a", CPU::armv8_3_a, CPU::generic, 0, Feature::armv8_3a}, + {"cortex-a32", CPU::arm_cortex_a32, CPU::generic, 0, Feature::arm_cortex_a32}, + {"cortex-r52", CPU::arm_cortex_r52, CPU::armv8_r, 40000, Feature::arm_cortex_r52}, + {"cortex-a35", CPU::arm_cortex_a35, CPU::generic, 0, Feature::arm_cortex_a35}, + {"cortex-a53", CPU::arm_cortex_a53, CPU::generic, 0, Feature::arm_cortex_a53}, + {"cortex-a55", CPU::arm_cortex_a55, CPU::arm_cortex_a53, 60000, Feature::arm_cortex_a55}, + {"cortex-a57", CPU::arm_cortex_a57, 
CPU::generic, 0, Feature::arm_cortex_a57},
+    {"cortex-a72", CPU::arm_cortex_a72, CPU::generic, 0, Feature::arm_cortex_a72},
+    {"cortex-a73", CPU::arm_cortex_a73, CPU::generic, 0, Feature::arm_cortex_a73},
+    {"cortex-a75", CPU::arm_cortex_a75, CPU::arm_cortex_a73, 60000, Feature::arm_cortex_a75},
+    {"thunderx", CPU::cavium_thunderx, CPU::armv8_a, UINT32_MAX, Feature::cavium_thunderx},
+    {"thunderx88", CPU::cavium_thunderx88, CPU::armv8_a, UINT32_MAX, Feature::cavium_thunderx88},
+    {"thunderx88p1", CPU::cavium_thunderx88p1, CPU::armv8_a, UINT32_MAX,
+     Feature::cavium_thunderx88p1},
+    {"thunderx81", CPU::cavium_thunderx81, CPU::armv8_a, UINT32_MAX,
+     Feature::cavium_thunderx81},
+    {"thunderx83", CPU::cavium_thunderx83, CPU::armv8_a, UINT32_MAX,
+     Feature::cavium_thunderx83},
+    {"thunderx2t99", CPU::cavium_thunderx2t99, CPU::armv8_a, UINT32_MAX,
+     Feature::cavium_thunderx2t99},
+    {"thunderx2t99p1", CPU::cavium_thunderx2t99p1, CPU::armv8_a, UINT32_MAX,
+     Feature::cavium_thunderx2t99p1},
+    {"denver1", CPU::nvidia_denver1, CPU::arm_cortex_a53, UINT32_MAX, Feature::nvidia_denver1},
+    {"denver2", CPU::nvidia_denver2, CPU::arm_cortex_a57, UINT32_MAX, Feature::nvidia_denver2},
+    {"xgene1", CPU::apm_xgene1, CPU::armv8_a, UINT32_MAX, Feature::apm_xgene1},
+    {"xgene2", CPU::apm_xgene2, CPU::armv8_a, UINT32_MAX, Feature::apm_xgene2},
+    {"xgene3", CPU::apm_xgene3, CPU::armv8_a, UINT32_MAX, Feature::apm_xgene3},
+    {"kryo", CPU::qualcomm_kyro, CPU::armv8_a, UINT32_MAX, Feature::qualcomm_kyro}, // spelled as in LLVM ("kryo")
+    {"falkor", CPU::qualcomm_falkor, CPU::armv8_a, UINT32_MAX, Feature::qualcomm_falkor},
+    {"saphira", CPU::qualcomm_saphira, CPU::armv8_a, UINT32_MAX, Feature::qualcomm_saphira},
+    {"exynos-m1", CPU::samsung_exynos_m1, CPU::generic, 0, Feature::samsung_exynos_m1},
+    {"exynos-m2", CPU::samsung_exynos_m2, CPU::samsung_exynos_m1, 40000,
+     Feature::samsung_exynos_m2},
+    {"exynos-m3", CPU::samsung_exynos_m3, CPU::samsung_exynos_m2, 40000,
+     Feature::samsung_exynos_m3},
+    {"cyclone", 
CPU::apple_cyclone, CPU::generic, 0, Feature::apple_cyclone}, + {"typhoon", CPU::apple_typhoon, CPU::apple_cyclone, UINT32_MAX, Feature::apple_typhoon}, + {"twister", CPU::apple_twister, CPU::apple_typhoon, UINT32_MAX, Feature::apple_twister}, + {"hurricane", CPU::apple_hurricane, CPU::apple_twister, UINT32_MAX, Feature::apple_hurricane}, +}; +#endif +static constexpr size_t ncpu_names = sizeof(cpus) / sizeof(cpus[0]); + +// auxval reader + +#ifndef AT_HWCAP +# define AT_HWCAP 16 +#endif +#ifndef AT_HWCAP2 +# define AT_HWCAP2 26 +#endif + +#if defined(DYN_GETAUXVAL) +static bool getauxval_dlsym(unsigned long type, unsigned long *val) +{ + static auto getauxval_p = (unsigned long (*)(unsigned long)) + jl_dlsym_e(jl_dlopen(nullptr, JL_RTLD_LOCAL), "getauxval"); + if (getauxval_p) { + *val = getauxval_p(type); + return true; + } + return false; +} + +static unsigned long getauxval_procfs(unsigned long type) +{ + int fd = open("/proc/self/auxv", O_RDONLY); + if (fd == -1) + return 0; + unsigned long val = 0; + unsigned long buff[2]; + while (read(fd, buff, sizeof(buff)) == sizeof(buff)) { + if (buff[0] == 0) + break; + if (buff[0] == type) { + val = buff[1]; + break; + } + } + close(fd); + return val; +} + +static inline unsigned long jl_getauxval(unsigned long type) +{ + unsigned long val; + if (getauxval_dlsym(type, &val)) + return val; + return getauxval_procfs(type); +} +#else +static inline unsigned long jl_getauxval(unsigned long type) +{ + return getauxval(type); +} +#endif + +struct CPUID { + uint8_t implementer; + uint8_t variant; + uint16_t part; + bool operator<(const CPUID &right) const + { + if (implementer < right.implementer) + return true; + if (implementer > right.implementer) + return false; + if (part < right.part) + return true; + if (part > right.part) + return false; + return variant < right.variant; + } +}; + +// /sys/devices/system/cpu/cpu/regs/identification/midr_el1 reader +static inline void get_cpuinfo_sysfs(std::set &res) +{ + // This only 
works on a 64bit 4.7+ kernel + auto dir = opendir("/sys/devices/system/cpu"); + if (!dir) + return; + while (auto entry = readdir(dir)) { + if (entry->d_type != DT_DIR) + continue; + if (strncmp(entry->d_name, "cpu", 3) != 0) + continue; + std::stringstream stm; + stm << "/sys/devices/system/cpu/" << entry->d_name << "/regs/identification/midr_el1"; + std::ifstream file(stm.str()); + if (!file) + continue; + uint64_t val = 0; + file >> std::hex >> val; + if (!file) + continue; + CPUID cpuid = { + uint8_t(val >> 24), + uint8_t((val >> 20) & 0xf), + uint16_t((val >> 4) & 0xfff) + }; + res.insert(cpuid); + } + closedir(dir); +} + +// Use an external template since lambda's can't be templated in C++11 +template +static inline bool try_read_procfs_line(llvm::StringRef line, const char *prefix, T &out, + bool &flag, F &&reset) +{ + if (!line.startswith(prefix)) + return false; + if (flag) + reset(); + flag = line.substr(strlen(prefix)).ltrim("\t :").getAsInteger(0, out); + return true; +} + +// /proc/cpuinfo reader +static inline void get_cpuinfo_procfs(std::set &res) +{ + std::ifstream file("/proc/cpuinfo"); + CPUID cpuid = {0, 0, 0}; + bool impl = false; + bool part = false; + bool var = false; + auto reset = [&] () { + if (impl && part) + res.insert(cpuid); + impl = false; + part = false; + var = false; + memset(&cpuid, 0, sizeof(cpuid)); + }; + for (std::string line; std::getline(file, line);) { + if (line.empty()) { + reset(); + continue; + } + try_read_procfs_line(line, "CPU implementer", cpuid.implementer, impl, reset) || + try_read_procfs_line(line, "CPU variant", cpuid.variant, var, reset) || + try_read_procfs_line(line, "CPU part", cpuid.part, part, reset); + } + reset(); +} + +static std::set get_cpuinfo(void) +{ + std::set res; + get_cpuinfo_sysfs(res); + if (res.empty()) + get_cpuinfo_procfs(res); + return res; +} + +static CPU get_cpu_name(CPUID cpuid) +{ + switch (cpuid.implementer) { + case 0x41: // ARM + switch (cpuid.part) { + case 0xb02: return 
CPU::arm_mpcore; + case 0xb36: return CPU::arm_1136jf_s; + case 0xb56: return CPU::arm_1156t2f_s; + case 0xb76: return CPU::arm_1176jzf_s; + case 0xc20: return CPU::arm_cortex_m0; + case 0xc21: return CPU::arm_cortex_m1; + case 0xc23: return CPU::arm_cortex_m3; + case 0xc24: return CPU::arm_cortex_m4; + case 0xc27: return CPU::arm_cortex_m7; + case 0xd20: return CPU::arm_cortex_m23; + case 0xd21: return CPU::arm_cortex_m33; + case 0xc05: return CPU::arm_cortex_a5; + case 0xc07: return CPU::arm_cortex_a7; + case 0xc08: return CPU::arm_cortex_a8; + case 0xc09: return CPU::arm_cortex_a9; + case 0xc0d: return CPU::arm_cortex_a12; + case 0xc0f: return CPU::arm_cortex_a15; + case 0xc0e: return CPU::arm_cortex_a17; + case 0xc14: return CPU::arm_cortex_r4; + case 0xc15: return CPU::arm_cortex_r5; + case 0xc17: return CPU::arm_cortex_r7; + case 0xc18: return CPU::arm_cortex_r8; + case 0xd13: return CPU::arm_cortex_r52; + case 0xd01: return CPU::arm_cortex_a32; + case 0xd04: return CPU::arm_cortex_a35; + case 0xd03: return CPU::arm_cortex_a53; + case 0xd05: return CPU::arm_cortex_a55; + case 0xd07: return CPU::arm_cortex_a57; + case 0xd08: return CPU::arm_cortex_a72; + case 0xd09: return CPU::arm_cortex_a73; + case 0xd0a: return CPU::arm_cortex_a75; + default: return CPU::generic; + } + case 0x42: // Broadcom (Cavium) + switch (cpuid.part) { + case 0x516: return CPU::cavium_thunderx2t99p1; + default: return CPU::generic; + } + case 0x43: // Cavium + switch (cpuid.part) { + case 0xa0: return CPU::cavium_thunderx; + case 0xa1: + if (cpuid.variant == 0) + return CPU::cavium_thunderx88p1; + return CPU::cavium_thunderx88; + case 0xa2: return CPU::cavium_thunderx81; + case 0xa3: return CPU::cavium_thunderx83; + case 0xaf: return CPU::cavium_thunderx2t99; + default: return CPU::generic; + } + case 0x4e: // NVIDIA + switch (cpuid.part) { + case 0x000: return CPU::nvidia_denver1; + case 0x003: return CPU::nvidia_denver2; + default: return CPU::generic; + } + case 0x50: // 
AppliedMicro + // x-gene 2 + // x-gene 3 + switch (cpuid.part) { + case 0x000: return CPU::apm_xgene1; + default: return CPU::generic; + } + case 0x51: // Qualcomm + switch (cpuid.part) { + case 0x00f: + case 0x02d: + return CPU::qualcomm_scorpion; + case 0x04d: + case 0x06f: + return CPU::qualcomm_krait; + case 0x201: + case 0x205: + case 0x211: + return CPU::qualcomm_kyro; + case 0x800: + case 0x801: + return CPU::arm_cortex_a73; // second-generation Kryo + case 0xc00: + return CPU::qualcomm_falkor; + case 0xc01: + return CPU::qualcomm_saphira; + default: return CPU::generic; + } + case 0x53: // Samsung + // exynos-m2 + // exynos-m3 + switch (cpuid.part) { + case 0x001: return CPU::samsung_exynos_m1; + default: return CPU::generic; + } + case 0x56: // Marvell + switch (cpuid.part) { + case 0x581: + case 0x584: + return CPU::marvell_pj4; + default: return CPU::generic; + } + case 0x67: // Apple + // swift + // cyclone + // twister + // hurricane + switch (cpuid.part) { + case 0x072: return CPU::apple_typhoon; + default: return CPU::generic; + } + case 0x69: // Intel + switch (cpuid.part) { + case 0x001: return CPU::intel_3735d; + default: return CPU::generic; + } + default: + return CPU::generic; + } +} + +static std::pair get_elf_arch(void) +{ +#ifdef _CPU_AARCH64_ + return std::make_pair(8, 'A'); +#else + int ver = 0; + char profile = 0; + struct utsname name; + if (uname(&name) >= 0) { + // name.machine is the elf_platform in the kernel. 
+ if (strcmp(name.machine, "armv6l") == 0) { + ver = 6; + } + else if (strcmp(name.machine, "armv7l") == 0) { + ver = 7; + } + else if (strcmp(name.machine, "armv7ml") == 0) { + ver = 7; + profile = 'M'; + } + else if (strcmp(name.machine, "armv8l") == 0 || strcmp(name.machine, "aarch64") == 0) { + ver = 8; + } + } + if (__ARM_ARCH > ver) + ver = __ARM_ARCH; +# if __ARM_ARCH > 6 && defined(__ARM_ARCH_PROFILE) + profile = __ARM_ARCH_PROFILE; +# endif + return std::make_pair(ver, profile); +#endif +} + +static inline const CPUSpec *find_cpu(uint32_t cpu) +{ + return ::find_cpu(cpu, cpus, ncpu_names); +} + +static inline const CPUSpec *find_cpu(llvm::StringRef name) +{ + return ::find_cpu(name, cpus, ncpu_names); +} + +static inline const char *find_cpu_name(uint32_t cpu) +{ + return ::find_cpu_name(cpu, cpus, ncpu_names); +} + +static std::pair feature_arch_version(const FeatureList &feature) +{ +#ifdef _CPU_AARCH64_ + return std::make_pair(8, false); +#else + if (test_nbit(feature, Feature::v8)) + return std::make_pair(8, test_nbit(feature, Feature::mclass)); + if (test_nbit(feature, Feature::v7)) + return std::make_pair(7, test_nbit(feature, Feature::mclass)); + return std::make_pair(6, false); +#endif +} + +static CPU generic_for_arch(std::pair arch) +{ +#ifdef _CPU_AARCH64_ + return CPU::generic; +#else +# if defined(__ARM_ARCH_PROFILE) + char klass = __ARM_ARCH_PROFILE; +# else + char klass = arch.second ? 
'M' : 'A'; +# endif + if (arch.first >= 8) { + if (klass == 'M') { + return CPU::armv8_m_base; + } + else if (klass == 'R') { + return CPU::armv8_r; + } + else { + return CPU::armv8_a; + } + } + else if (arch.first == 7) { + if (klass == 'M') { + return CPU::armv7_m; + } + else if (klass == 'R') { + return CPU::armv7_r; + } + else { + return CPU::armv7_a; + } + } + return CPU::generic; +#endif +} + +static bool check_cpu_arch_ver(uint32_t cpu, std::pair arch) +{ + auto spec = find_cpu(cpu); + // This happens on AArch64 and indicates that the cpu name isn't a valid aarch64 CPU + if (!spec) + return false; + auto cpu_arch = feature_arch_version(spec->features); + if (arch.second != cpu_arch.second) + return false; + if (arch.first > cpu_arch.first) + return false; + return true; +} + +static void shrink_big_little(std::vector> &list, + const CPU *cpus, uint32_t ncpu) +{ + auto find = [&] (uint32_t name) { + for (uint32_t i = 0; i < ncpu; i++) { + if (cpus[i] == CPU(name)) { + return (int)i; + } + } + return -1; + }; + int maxidx = -1; + for (auto &ele: list) { + int idx = find(ele.first); + if (idx > maxidx) { + maxidx = idx; + } + } + if (maxidx >= 0) { + list.erase(std::remove_if(list.begin(), list.end(), [&] (std::pair &ele) { + int idx = find(ele.first); + return idx != -1 && idx < maxidx; + }), list.end()); + } +} + +static inline const std::pair> &get_host_cpu() +{ + static const auto host_cpu = [] { + FeatureList features = {}; + // Here we assume that only the lower 32bit are used on aarch64 + // Change the cast here when that's not the case anymore (and when there's features in the + // high bits that we want to detect). 
+ features[0] = (uint32_t)jl_getauxval(AT_HWCAP); + features[1] = (uint32_t)jl_getauxval(AT_HWCAP2); + auto cpuinfo = get_cpuinfo(); + auto arch = get_elf_arch(); +#ifdef _CPU_ARM_ + if (arch.first >= 7) { + if (arch.second == 'M') { + set_bit(features, Feature::mclass, true); + } + else if (arch.second == 'R') { + set_bit(features, Feature::rclass, true); + } + else if (arch.second == 'A') { + set_bit(features, Feature::aclass, true); + } + } + switch (arch.first) { + case 8: + set_bit(features, Feature::v8, true); + JL_FALLTHROUGH; + case 7: + set_bit(features, Feature::v7, true); + break; + default: + break; + } +#endif + + std::set cpus; + std::vector> list; + for (auto info: cpuinfo) { + auto name = (uint32_t)get_cpu_name(info); + if (name == 0) + continue; + if (!check_cpu_arch_ver(name, arch)) + continue; + if (cpus.insert(name).second) { + features = features | find_cpu(name)->features; + list.emplace_back(name, info); + } + } + // Not all elements/pairs are valid + static constexpr CPU v8order[] = { + CPU::arm_cortex_a32, + CPU::arm_cortex_a35, + CPU::arm_cortex_a53, + CPU::arm_cortex_a55, + CPU::arm_cortex_a57, + CPU::arm_cortex_a72, + CPU::arm_cortex_a73, + CPU::arm_cortex_a75, + CPU::nvidia_denver2, + CPU::samsung_exynos_m1 + }; + shrink_big_little(list, v8order, sizeof(v8order) / sizeof(CPU)); +#ifdef _CPU_ARM_ + // Not all elements/pairs are valid + static constexpr CPU v7order[] = { + CPU::arm_cortex_a5, + CPU::arm_cortex_a7, + CPU::arm_cortex_a8, + CPU::arm_cortex_a9, + CPU::arm_cortex_a12, + CPU::arm_cortex_a15, + CPU::arm_cortex_a17 + }; + shrink_big_little(list, v7order, sizeof(v7order) / sizeof(CPU)); +#endif + uint32_t cpu = 0; + if (list.empty()) { + cpu = (uint32_t)generic_for_arch(arch); + } + else { + // This also covers `list.size() > 1` case which means there's a unknown combination + // consists of CPU's we know. Unclear what else we could try so just randomly return + // one... 
+ cpu = list[0].first; + } + // Ignore feature bits that we are not interested in. + mask_features(feature_masks, &features[0]); + + return std::make_pair(cpu, features); + }(); + return host_cpu; +} + +static bool is_generic_cpu_name(uint32_t cpu) +{ + switch ((CPU)cpu) { + case CPU::generic: + case CPU::armv7_a: + case CPU::armv7_m: + case CPU::armv7e_m: + case CPU::armv7_r: + case CPU::armv8_a: + case CPU::armv8_m_base: + case CPU::armv8_m_main: + case CPU::armv8_r: + case CPU::armv8_1_a: + case CPU::armv8_2_a: + case CPU::armv8_3_a: + return true; + default: + return false; + } +} + +static inline const std::string &host_cpu_name() +{ + static std::string name = [] { + if (is_generic_cpu_name(get_host_cpu().first)) { + auto llvm_name = jl_get_cpu_name_llvm(); + if (llvm_name != "generic") { + return llvm_name; + } + } + return std::string(find_cpu_name(get_host_cpu().first)); + }(); + return name; +} + +template +static inline void enable_depends(FeatureList &features) +{ + if (test_nbit(features, Feature::v8_3a)) + set_bit(features, Feature::v8_2a, true); + if (test_nbit(features, Feature::v8_2a)) + set_bit(features, Feature::v8_1a, true); + if (test_nbit(features, Feature::v8_1a)) + set_bit(features, Feature::crc, true); +#ifdef _CPU_ARM_ + if (test_nbit(features, Feature::v8_1a)) { + set_bit(features, Feature::v8, true); + set_bit(features, Feature::aclass, true); + } + if (test_nbit(features, Feature::v8_m_main)) { + set_bit(features, Feature::v8, true); + set_bit(features, Feature::mclass, true); + } + if (test_nbit(features, Feature::v8)) { + set_bit(features, Feature::v7, true); + if (test_nbit(features, Feature::aclass)) { + set_bit(features, Feature::neon, true); + set_bit(features, Feature::vfp3, true); + set_bit(features, Feature::vfp4, true); + set_bit(features, Feature::hwdiv_arm, true); + set_bit(features, Feature::hwdiv, true); + set_bit(features, Feature::d32, true); + } + } + ::enable_depends(features, Feature::deps, sizeof(Feature::deps) / 
sizeof(FeatureDep)); +#else + if (test_nbit(features, Feature::v8_1a)) { + set_bit(features, Feature::lse, true); + set_bit(features, Feature::rdm, true); + } +#endif +} + +template +static inline void disable_depends(FeatureList &features) +{ +#ifdef _CPU_ARM_ + ::disable_depends(features, Feature::deps, sizeof(Feature::deps) / sizeof(FeatureDep)); +#endif +} + +static const std::vector> &get_cmdline_targets(void) +{ + auto feature_cb = [] (const char *str, size_t len, FeatureList &list) { + auto fbit = find_feature_bit(feature_names, nfeature_names, str, len); + if (fbit == (uint32_t)-1) + return false; + set_bit(list, fbit, true); + return true; + }; + return ::get_cmdline_targets(feature_cb); +} + +static std::vector> jit_targets; + +static TargetData arg_target_data(const TargetData &arg, bool require_host) +{ + TargetData res = arg; + const FeatureList *cpu_features = nullptr; + if (res.name == "native") { + res.name = host_cpu_name(); + cpu_features = &get_host_cpu().second; + } + else if (auto spec = find_cpu(res.name)) { + cpu_features = &spec->features; + } + else { + res.en.flags |= JL_TARGET_UNKNOWN_NAME; + } + if (cpu_features) { + for (size_t i = 0; i < feature_sz; i++) { + res.en.features[i] |= (*cpu_features)[i]; + } + } + enable_depends(res.en.features); + for (size_t i = 0; i < feature_sz; i++) + res.en.features[i] &= ~res.dis.features[i]; + if (require_host) { + for (size_t i = 0; i < feature_sz; i++) { + res.en.features[i] &= get_host_cpu().second[i]; + } + } + disable_depends(res.en.features); + if (cpu_features) { + // If the base feature if known, fill in the disable features + for (size_t i = 0; i < feature_sz; i++) { + res.dis.features[i] = feature_masks[i] & ~res.en.features[i]; + } + } + return res; +} + +static int max_vector_size(const FeatureList &features) +{ +#ifdef _CPU_ARM_ + if (test_nbit(features, Feature::neon)) + return 16; + return 8; +#else + // TODO SVE + return 16; +#endif +} + +static uint32_t sysimg_init_cb(const void 
*id) +{ + // First see what target is requested for the JIT. + auto &cmdline = get_cmdline_targets(); + TargetData target = arg_target_data(cmdline[0], true); + // Then find the best match in the sysimg + auto sysimg = deserialize_target_data((const uint8_t*)id); + auto match = match_sysimg_targets(sysimg, target, max_vector_size); + // Now we've decided on which sysimg version to use. + // Make sure the JIT target is compatible with it and save the JIT target. + if (match.vreg_size != max_vector_size(target.en.features) && + (sysimg[match.best_idx].en.flags & JL_TARGET_VEC_CALL)) { +#ifdef _CPU_ARM_ + unset_bits(target.en.features, Feature::neon); +#endif + } + jit_targets.push_back(std::move(target)); + return match.best_idx; +} + +static void ensure_jit_target(bool imaging) +{ + auto &cmdline = get_cmdline_targets(); + check_cmdline(cmdline, imaging); + if (!jit_targets.empty()) + return; + for (auto &arg: cmdline) { + auto data = arg_target_data(arg, jit_targets.empty()); + jit_targets.push_back(std::move(data)); + } + auto ntargets = jit_targets.size(); + // Now decide the clone condition. + for (size_t i = 1; i < ntargets; i++) { + auto &t = jit_targets[i]; + if (t.en.flags & JL_TARGET_CLONE_ALL) + continue; + // The most useful one in general... 
+ t.en.flags |= JL_TARGET_CLONE_LOOP; +#ifdef _CPU_ARM_ + auto &features0 = jit_targets[t.base].en.features; + static constexpr uint32_t clone_math[] = {Feature::vfp3, Feature::vfp4, Feature::neon}; + for (auto fe: clone_math) { + if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) { + t.en.flags |= JL_TARGET_CLONE_MATH; + break; + } + } + static constexpr uint32_t clone_simd[] = {Feature::neon}; + for (auto fe: clone_simd) { + if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) { + t.en.flags |= JL_TARGET_CLONE_SIMD; + break; + } + } +#endif + } +} + +static std::pair> +get_llvm_target_noext(const TargetData &data) +{ + std::string name = data.name; + auto *spec = find_cpu(name); + while (spec) { + if (spec->llvmver <= JL_LLVM_VERSION) + break; + spec = find_cpu((uint32_t)spec->fallback); + name = spec->name; + } + auto features = data.en.features; + if (spec) { + if (is_generic_cpu_name((uint32_t)spec->cpu)) { + features = features | spec->features; + name = "generic"; + } + } + std::vector feature_strs; + for (auto &fename: feature_names) { + if (fename.llvmver > JL_LLVM_VERSION) + continue; + if (fename.bit >= 32 * 2) + break; + const char *fename_str = fename.name; + bool enable = test_nbit(features, fename.bit); + bool disable = test_nbit(data.dis.features, fename.bit); +#ifdef _CPU_ARM_ + if (fename.bit == Feature::d32) { + if (enable) { + feature_strs.push_back("-d16"); + } + else if (disable) { + feature_strs.push_back("+d16"); + } + continue; + } +#endif + if (enable) { + feature_strs.insert(feature_strs.begin(), std::string("+") + fename_str); + } + else if (disable) { + feature_strs.push_back(std::string("-") + fename_str); + } + } + if (test_nbit(features, Feature::v8_2a)) + feature_strs.push_back("+v8.2a"); + if (test_nbit(features, Feature::v8_1a)) + feature_strs.push_back("+v8.1a"); +#ifdef _CPU_ARM_ + if (test_nbit(features, Feature::v8_m_main)) { + feature_strs.push_back("+v8m.main"); + 
feature_strs.push_back("+armv8-m.main"); + } + if (test_nbit(features, Feature::aclass)) + feature_strs.push_back("+aclass"); + if (test_nbit(features, Feature::rclass)) + feature_strs.push_back("+rclass"); + if (test_nbit(features, Feature::mclass)) + feature_strs.push_back("+mclass"); + if (test_nbit(features, Feature::v8)) { + feature_strs.push_back("+v8"); + if (test_nbit(features, Feature::aclass)) + feature_strs.push_back("+armv8-a"); + if (test_nbit(features, Feature::rclass)) + feature_strs.push_back("+armv8-r"); + if (test_nbit(features, Feature::mclass)) { + feature_strs.push_back("+v8m"); + feature_strs.push_back("+armv8-m.base"); + } + } + if (test_nbit(features, Feature::v7)) { + feature_strs.push_back("+v7"); + if (test_nbit(features, Feature::aclass)) + feature_strs.push_back("+armv7-a"); + if (test_nbit(features, Feature::rclass)) + feature_strs.push_back("+armv7-r"); + if (test_nbit(features, Feature::mclass)) + feature_strs.push_back("+armv7-m"); + } + feature_strs.push_back("+v6"); + feature_strs.push_back("+vfp2"); +#else + feature_strs.push_back("+neon"); + feature_strs.push_back("+fp-armv8"); +#endif + return std::make_pair(std::move(name), std::move(feature_strs)); +} + +static std::pair> +get_llvm_target_vec(const TargetData &data) +{ + auto res0 = get_llvm_target_noext(data); + append_ext_features(res0.second, data.ext_features); + return res0; +} + +static std::pair +get_llvm_target_str(const TargetData &data) +{ + auto res0 = get_llvm_target_noext(data); + auto features = join_feature_strs(res0.second); + append_ext_features(features, data.ext_features); + return std::make_pair(std::move(res0.first), std::move(features)); +} + +static FeatureList get_max_feature(void) +{ +#ifdef _CPU_ARM_ + auto arch = get_elf_arch(); + auto features = real_feature_masks; + if (arch.second == 0) + arch.second = 'A'; + set_bit(features, Feature::v7, true); + set_bit(features, Feature::v8, true); + if (arch.second == 'M') { + set_bit(features, 
Feature::mclass, true); + set_bit(features, Feature::v8_m_main, true); + } + else if (arch.second == 'R') { + set_bit(features, Feature::rclass, true); + } + else if (arch.second == 'A') { + set_bit(features, Feature::aclass, true); + set_bit(features, Feature::v8_1a, true); + set_bit(features, Feature::v8_2a, true); + } + return features; +#else + // There isn't currently any conflicting features on AArch64 + return feature_masks; +#endif +} + +} + +using namespace ARM; + +JL_DLLEXPORT void jl_dump_host_cpu(void) +{ + dump_cpu_spec(get_host_cpu().first, get_host_cpu().second, feature_names, nfeature_names, + cpus, ncpu_names); +} + +JL_DLLEXPORT jl_value_t *jl_get_cpu_name(void) +{ + return jl_cstr_to_string(host_cpu_name().c_str()); +} + +jl_sysimg_fptrs_t jl_init_processor_sysimg(void *hdl) +{ + if (!jit_targets.empty()) + jl_error("JIT targets already initialized"); + return parse_sysimg(hdl, sysimg_init_cb); +} + +std::pair> jl_get_llvm_target(bool imaging, uint32_t &flags) +{ + ensure_jit_target(imaging); + flags = jit_targets[0].en.flags; + return get_llvm_target_vec(jit_targets[0]); +} + +const std::pair &jl_get_llvm_disasm_target(void) +{ + // RAS is not currently detectable AFAICT + auto max_feature = get_max_feature(); + static const auto res = get_llvm_target_str(TargetData{host_cpu_name(), + JL_LLVM_VERSION >= 60000 ? 
"+dotprod,+ras" : "+ras", + {max_feature, 0}, {feature_masks & ~max_feature, 0}, 0}); + return res; +} + +std::vector jl_get_llvm_clone_targets(void) +{ + if (jit_targets.empty()) + jl_error("JIT targets not initialized"); + std::vector res; + for (auto &target: jit_targets) { + auto features_en = target.en.features; + auto features_dis = target.dis.features; + for (auto &fename: feature_names) { + if (fename.llvmver > JL_LLVM_VERSION) { + unset_bits(features_en, fename.bit); + unset_bits(features_dis, fename.bit); + } + } + ARM::disable_depends(features_en); + jl_target_spec_t ele; + std::tie(ele.cpu_name, ele.cpu_features) = get_llvm_target_str(target); + ele.data = serialize_target_data(target.name, features_en, features_dis, + target.ext_features); + ele.flags = target.en.flags; + ele.base = target.base; + res.push_back(ele); + } + return res; +} + +extern "C" int jl_test_cpu_feature(jl_cpu_feature_t feature) +{ + if (feature >= 32 * feature_sz) + return 0; + return test_nbit(&get_host_cpu().second[0], feature); +} + +#ifdef _CPU_AARCH64_ +// FZ, bit [24] +static constexpr uint32_t fpcr_fz_mask = 1 << 24; + +static inline uint32_t get_fpcr_aarch64(void) +{ + uint32_t fpcr; + asm volatile("mrs %0, fpcr" : "=r"(fpcr)); + return fpcr; +} + +static inline void set_fpcr_aarch64(uint32_t fpcr) +{ + asm volatile("msr fpcr, %0" :: "r"(fpcr)); +} + +extern "C" JL_DLLEXPORT int32_t jl_get_zero_subnormals(void) +{ + return (get_fpcr_aarch64() & fpcr_fz_mask) != 0; +} + +extern "C" JL_DLLEXPORT int32_t jl_set_zero_subnormals(int8_t isZero) +{ + uint32_t fpcr = get_fpcr_aarch64(); + fpcr = isZero ? (fpcr | fpcr_fz_mask) : (fpcr & ~fpcr_fz_mask); + set_fpcr_aarch64(fpcr); + return 0; +} +#else +extern "C" JL_DLLEXPORT int32_t jl_get_zero_subnormals(void) +{ + return 0; +} + +extern "C" JL_DLLEXPORT int32_t jl_set_zero_subnormals(int8_t isZero) +{ + return isZero; +} +#endif