-
Notifications
You must be signed in to change notification settings - Fork 56
/
SmartStart64.S
481 lines (428 loc) · 18.9 KB
/
SmartStart64.S
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
;/*"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
;@" "
;@" Filename: smartstart64.s "
;@" Copyright(c): Leon de Boer(LdB) 2017 "
;@" Version: 2.03 "
;@" "
;@"***************[ THIS CODE IS FREEWARE UNDER CC Attribution]*************"
;@" "
;@" This sourcecode is released for the purpose to promote programming "
;@" on the Raspberry Pi. You may redistribute it and/or modify with the "
;@" following disclaimer and condition. "
;@" "
;@" The SOURCE CODE is distributed "AS IS" WITHOUT WARRANTIES AS TO "
;@" PERFORMANCE OF MERCHANTABILITY WHETHER EXPRESSED OR IMPLIED. "
;@" Redistributions of source code must retain the copyright notices to "
;@" maintain the author credit (attribution) . "
;@" "
;@"*************************************************************************"
;@" "
;@" This code expands on my earlier SmartStart bootstub assembler for "
;@" the Pi3. It directly supports multicore operation in C/C++. To do that "
;@" it provides stack space to each core and provides a modified bootloader "
;@" spinlock that protects against registers X0-X7 trashed. As any C/C++ "
;@" 64 bit compiler will trash those registers, to use C/C++ in multicore "
;@" programming this protection must be done. "
;@" This is a matching paired AARCH64 stub for the 64bit linker file "
;@" and carries the same arrangement as AARCH32 to ease porting. "
;@" "
;@"+++++++++++++++++++++++[ REVISIONS ]+++++++++++++++++++++++++++++++++++++"
;@" 1.01 Initial release .. Pi autodetection main aim "
;@" 1.02 Many functions moved out to C to aid 32/64 bit compatibility "
;@" 2.01 Further reductions to bare minimum assembler code "
;@" 2.02 Multicore functionality added "
;@" 2.03 Timer Irq support added "
;@"+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"*/
.section ".text.startup", "ax", %progbits
.balign 4
.globl _start
/*
 * _start -- image entry point (AArch64, GNU as syntax).
 *
 * Records a handful of boot-time facts in the RPi_* variables and then
 * opens the generic timer registers to the lower exception levels.
 * There is no return: execution falls straight through into the code
 * that follows this block.
 *
 * The code touches *_el2 system registers, so it expects to be entered
 * at EL2.
 *
 * Scratch: x0, x1.  x12 keeps the run-time address of _start.
 */
_start:
    /* RPi_BootAddr = low 32 bits of the address we were started at */
    adr     x12, _start
    ldr     x1, =RPi_BootAddr
    str     w12, [x1]

    /* RPi_IO_Base_Addr = 0x3F000000 (fixed value, no detection in 64bit mode) */
    mov     w0, #0x3F000000
    ldr     x1, =RPi_IO_Base_Addr
    str     w0, [x1]

    /* RPi_CoresReady = 1, only the boot core is running so far */
    mov     w0, #1
    ldr     x1, =RPi_CoresReady
    str     w0, [x1]

    /* RPi_CompileMode = 0x98 (ARM8 CPU, AARCH64, supports 4 cores) */
    mov     w0, #0x98
    ldr     x1, =RPi_CompileMode
    str     w0, [x1]

    /* RPi_CpuId = low 32 bits of MIDR_EL1 */
    mrs     x0, midr_el1
    ldr     x1, =RPi_CpuId
    str     w0, [x1]

//"================================================================"
// Initialize Generic Timers for Core0
//"================================================================"
    /* CNTHCTL_EL2 bits [1:0] set: enable EL1 access to timers */
    mrs     x0, cnthctl_el2
    orr     x0, x0, #0x3
    msr     cnthctl_el2, x0
    /* Virtual counter offset = 0 */
    msr     cntvoff_el2, xzr

    /* CNTKCTL_EL1 bits [1:0] set: enable EL0 access to timers
       (the original author was undecided about keeping this) */
    mrs     x0, cntkctl_el1
    orr     x0, x0, #0x3
    msr     cntkctl_el1, x0
//"================================================================"
// Setup stack pointers for each core and each CPU operation mode
//"================================================================"
/*
 * multicore_start -- per-core EL2 setup, executed by every core.
 *
 * Core0 falls into here from _start; Core0 later stores this address
 * into the spin slots of cores 1..3 so that they run it as well.
 *
 * Steps: pick the three stack tops for this core, program the EL2
 * virtualisation/trap registers, configure SCTLR_EL1, then ERET down
 * to EL1 (using SP_EL1) at exit_el1.
 *
 * Scratch: x0-x4, x6, flags.  x6 = core id (0..3) on the way out.
 */
multicore_start:
    mrs     x6, mpidr_el1               // Read core id on ARM8
    ands    x6, x6, #0x3                // Keep the low two bits = core number
    b.eq    .Lstacks_core0
    cmp     x6, #1
    b.eq    .Lstacks_core1
    cmp     x6, #2
    b.eq    .Lstacks_core2

    /* core 3 */
    ldr     x2, =__EL2_stack_core3      // EL2 stack top for core 3
    ldr     x3, =__EL1_stack_core3      // EL1 stack top for core 3
    ldr     x4, =__EL0_stack_core3      // EL0 stack top for core 3
    b       set_stacks
.Lstacks_core2:
    ldr     x2, =__EL2_stack_core2      // EL2 stack top for core 2
    ldr     x3, =__EL1_stack_core2      // EL1 stack top for core 2
    ldr     x4, =__EL0_stack_core2      // EL0 stack top for core 2
    b       set_stacks
.Lstacks_core1:
    ldr     x2, =__EL2_stack_core1      // EL2 stack top for core 1
    ldr     x3, =__EL1_stack_core1      // EL1 stack top for core 1
    ldr     x4, =__EL0_stack_core1      // EL0 stack top for core 1
    b       set_stacks
.Lstacks_core0:
    ldr     x2, =__EL2_stack_core0      // EL2 stack top for core 0
    ldr     x3, =__EL1_stack_core0      // EL1 stack top for core 0
    ldr     x4, =__EL0_stack_core0      // EL0 stack top for core 0
set_stacks:
    mov     sp, x2                      /* EL2 stack set */
    msr     sp_el1, x3                  /* EL1 stack set */
    msr     sp_el0, x4                  /* EL0 stack set */

//"================================================================"
// Initialize VPIDR/VMPIDR registers for all Cores
//"================================================================"
    /* Mirror the real id registers into their virtualised copies */
    mrs     x0, midr_el1
    mrs     x1, mpidr_el1
    msr     vpidr_el2, x0
    msr     vmpidr_el2, x1

//"================================================================"
// Disable coprocessor traps for all Cores
//"================================================================"
    mov     x0, #0x33ff
    msr     cptr_el2, x0                // Disable coprocessor traps to EL2
    msr     hstr_el2, xzr               // Hypervisor system trap register cleared
    mov     x0, #(3 << 20)
    msr     cpacr_el1, x0               // Enable FP/SIMD at EL1

//"================================================================"
// Initialize HCR_EL2 so EL1 is 64 bits for all Cores
//"================================================================"
    mov     x0, #(1 << 31)              // 64bit EL1
    msr     hcr_el2, x0

//"================================================================"
// Initialize SCTLR_EL1 for all Cores
//"================================================================"
    /* Value written is 0x30D01804:
     *   0x30D00800  RES1 bits (29,28,23,22,20,11)
     *   | (1 << 2)  C bit on (data cache)
     *   | (1 << 12) I bit on (instruction cache)
     * Every other bit, including M (MMU enable), is left at 0. */
    mov     x0, #0x1804
    movk    x0, #0x30d0, lsl #16
    msr     sctlr_el1, x0

//"================================================================"
// Return to the EL1_SP1 mode from EL2 for all Cores
//"================================================================"
    mov     x0, #0x3c5                  // EL1_SP1 | D | A | I | F
    msr     spsr_el2, x0                // State to assume after the ERET
    adr     x0, exit_el1                // Where the ERET lands
    msr     elr_el2, x0
    eret                                // Drop from EL2 to EL1
/*
 * exit_el1 -- first code run at EL1 after the ERET from EL2.
 * Core0 carries on to cpu0_exit_multicore_park; any other core bumps
 * RPi_CoresReady by one and parks itself in StartSecondarySpin.
 * Scratch: x0, x1, x6, flags.
 */
exit_el1:
    mrs     x6, mpidr_el1               // Read core id on ARM8
    ands    x6, x6, #0x3                // Low two bits = core number
    b.eq    cpu0_exit_multicore_park    // Core0 continues on
//"================================================================"
// Now park Core 1,2,3 into secondary spinloop on BCM2837
//"================================================================"
    ldr     x1, =RPi_CoresReady         // x1 -> ready-core counter
    ldr     w0, [x1]
    add     w0, w0, #1                  // This core is about to become ready
    str     w0, [x1]                    // Core0 polls this value while it waits
    b       StartSecondarySpin          // Go and wait for work
/*
 * cpu0_exit_multicore_park -- remainder of the boot path, Core0 only.
 *
 *   1. Install VectorTable as the EL1 vector base.
 *   2. Zero the BSS (__bss_start__ .. __bss_end__), one word at a time.
 *   3. Release cores 1, 2 and 3 one after another: write the address of
 *      multicore_start into the spin slot of the core, wake it with SEV
 *      and poll RPi_CoresReady until the core has counted itself in.
 *   4. Call kernel_main; if it ever returns, spin forever in hang.
 *
 * Scratch: x0-x3, x30, flags.
 */
cpu0_exit_multicore_park:
//"================================================================"
// Set vector table for EL1 for Core0 (All others cores parked)
//"================================================================"
    ldr     x0, =VectorTable
    msr     vbar_el1, x0
//"================================================================"
// About to go to into C kernel clear BSS (Core0 only)
//"================================================================"
    ldr     x3, =__bss_end__
    ldr     x0, =__bss_start__
    cmp     x0, x3
    bcs     .bss_cleared                // start >= end: nothing to clear
.bss_zero_loop:
    str     wzr, [x0], 4                // Store one zero word, advance pointer
    cmp     x3, x0
    bhi     .bss_zero_loop              // Loop while end > pointer
.bss_cleared:
//"================================================================"
// Core0 will bring Core 1,2,3 to secondary spin
//"================================================================"
#define spin_cpu1 0xe0
    mov     x1, #spin_cpu1              // Spin core1 jump address
    ldr     x2, =multicore_start        // Function we are going to call
    str     x2, [x1]                    // Store the function address to core1
    dsb     sy                          // Complete the store before signalling (same as CoreExecute)
    sev                                 // Wake core1 up
    ldr     x3, =RPi_CoresReady         // Set CoresReady count address
.WaitCore1ACK:
    ldr     w1, [x3]                    // Read CoresReady count
    cmp     w1, #2                      // Wait for setting of second core ready
    bne     .WaitCore1ACK               // Core1 not ready so read again
#define spin_cpu2 0xe8
    mov     x1, #spin_cpu2              // Spin core2 jump address
    ldr     x2, =multicore_start        // Function we are going to call
    str     x2, [x1]                    // Store the function address to core2
    dsb     sy                          // Complete the store before signalling
    sev                                 // Wake core2 up
    ldr     x3, =RPi_CoresReady         // Set CoresReady count address
.WaitCore2ACK:
    ldr     w1, [x3]                    // Read CoresReady count
    cmp     w1, #3                      // Wait for setting of third core ready
    bne     .WaitCore2ACK               // Core2 not ready so read again
#define spin_cpu3 0xf0
    mov     x1, #spin_cpu3              // Spin core3 jump address
    ldr     x2, =multicore_start        // Function we are going to call
    str     x2, [x1]                    // Store the function address to core3
    dsb     sy                          // Complete the store before signalling
    sev                                 // Wake core3 up
    ldr     x3, =RPi_CoresReady         // Set CoresReady count address
.WaitCore3ACK:
    ldr     w1, [x3]                    // Read CoresReady count
    cmp     w1, #4                      // Wait for setting of fourth core ready
    bne     .WaitCore3ACK               // Core3 not ready so read again
//"================================================================"
// Finally that all done Core0 jumps to the C compiler entry point
//"================================================================"
    /* Branch WITH link: a plain branch leaves x30 unset, so a return
       from kernel_main would jump to an arbitrary address instead of
       reaching the hang loop below. */
    bl      kernel_main                 // Call the C kernel
/*================================================================"
 Just safety incase C code ever returns back
"================================================================*/
hang:
    b       hang
.balign 4
.ltorg /* Tell assembler ltorg data for code above can go here */
/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++}
{ Modified bootloader Spin loop but tolerant on registers X0-X7 for C }
{++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
#define spin_cpu0 0xd8
.balign 4
/*
 * StartSecondarySpin -- parking loop for a secondary core.
 *
 * Each core owns one 8-byte slot at spin_cpu0 + 8 * core_id.  The core
 * clears its slot, sleeps in WFE, and whenever it finds a non-zero
 * address in the slot it clears the slot again and calls that address
 * with x0-x3 all zero.  When the callee returns the whole sequence
 * restarts from the top.
 *
 * Registers: x5 = slot base, x6 = core id, x4 = address fetched.
 *
 * NOTE(review): restarting from the top re-zeroes the slot, so an
 * address written to the slot while the callee was still running is
 * overwritten without being called -- confirm callers never queue work
 * for a busy core.
 */
StartSecondarySpin:
    mrs     x6, mpidr_el1               // Fetch core Id
    and     x6, x6, #0x3                // Low two bits = core number
    mov     x5, #spin_cpu0              // Base address of the spin slots
    str     xzr, [x5, x6, lsl #3]       // Make sure our slot starts out empty
secondary_spin:
    wfe                                 // Sleep until an event is signalled
    ldr     x4, [x5, x6, lsl #3]        // Anything posted for us?
    cbz     x4, secondary_spin          // No: back to sleep
    str     xzr, [x5, x6, lsl #3]       // Yes: consume the request
    mov     x0, xzr                     // Callee is entered with x0..x3 = 0
    mov     x1, xzr
    mov     x2, xzr
    mov     x3, xzr
    blr     x4                          // Call the function set
    b       StartSecondarySpin          // Loop back to spinlock
.globl EnableInterrupts
/*
 * EnableInterrupts -- unmask IRQ on the calling core.
 * Clears the I bit (mask value 2) of PSTATE.DAIF; the other mask bits
 * are not touched.  No arguments, no return value, no registers used.
 */
EnableInterrupts:
    msr     DAIFClr, #0x2               // I = 0: IRQs may now be taken
    ret
.globl DisableInterrupts
/*
 * DisableInterrupts -- mask IRQ on the calling core.
 * Sets the I bit (mask value 2) of PSTATE.DAIF; the other mask bits
 * are not touched.  No arguments, no return value, no registers used.
 */
DisableInterrupts:
    msr     DAIFSet, #0x2               // I = 1: IRQs are held off
    ret
.globl CoreExecute
/*
 * CoreExecute -- hand an address to a parked secondary core.
 *
 * In:   x0 = core number (only the low 8 bits are looked at)
 *       x1 = address to store into the spin slot of that core
 * Out:  x0 = 1 when the request was posted,
 *            0 when the core number is 0 or is >= RPi_CoresReady
 * Scratch: x2, x3, x5, x6, flags.
 *
 * The slot is written unconditionally; no check is made that the
 * target core has consumed a previous request.
 */
CoreExecute:
    and     x6, x0, #0xFF               // x6 = requested core number
    cbz     x6, CoreExecuteFail         // Core 0 is the caller side, reject
    ldr     x3, =RPi_CoresReady
    ldr     w2, [x3]                    // w2 = number of cores that are ready
    cmp     w6, w2
    b.hs    CoreExecuteFail             // Unsigned: core number out of range
    mov     x5, #spin_cpu0              // Base address of the spin slots
    str     x1, [x5, x6, lsl #3]        // Post the address in that core slot
    dsb     sy                          // Store must complete before the wake-up
    sev                                 // Wake the sleeping cores
    mov     x0, #1                      // Report success
    ret
CoreExecuteFail:
    mov     x0, xzr                     // Report failure
    ret
.balign 4
.ltorg // Tell assembler ltorg data for this code can go here
/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++}
{ IRQ HELPER ROUTINES PROVIDE BY RPi-SmartStart API }
{++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
/* "PROVIDE C FUNCTION: TimerIrqHandler setTimerIrqAddress ( TimerIrqHandler* ARMaddress);" */
.section .text.setTimerIrqAddress, "ax", %progbits
.balign 4
.globl setTimerIrqAddress;
.type setTimerIrqAddress, %function
//"================================================================"
// setTimerIrqAddress -- AARCH64 Pi3 code
// C Function: TimerIrqHandler setTimerIrqAddress (TimerIrqHandler* ARMaddress);
// Entry: x0 = new handler address (stored as given, zero included)
// Return: x0 = handler address that was stored before the call
// Scratch: x9, x10
// Note: IRQs are masked on entry and are still masked on return;
// nothing in this routine unmasks them again.
//"================================================================"
setTimerIrqAddress:
    msr     DAIFSet, #2                 // Mask IRQ while the pointer is swapped
    ldr     x9, =RPi_TimerIrqAddr       // x9 -> handler pointer variable
    ldr     x10, [x9]                   // x10 = previous handler
    str     x0, [x9]                    // Install the new handler
    mov     x0, x10                     // Hand the previous one back
    ret
.balign 4
.ltorg // Tell assembler ltorg data for this code can go here
.size setTimerIrqAddress, .-setTimerIrqAddress
/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++}
{ VC4 ADDRESS HELPER ROUTINES PROVIDE BY RPi-SmartStart API }
{++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
/* "PROVIDE C FUNCTION: uint32_t ARMaddrToGPUaddr (void* ARMaddress);" */
.section .text.ARMaddrToGPUaddr, "ax", %progbits
.balign 4
.globl ARMaddrToGPUaddr;
.type ARMaddrToGPUaddr, %function
//"================================================================"
// ARMaddrToGPUaddr -- AARCH64 Pi3 code
// C Function: uint32_t ARMaddrToGPUaddr (void* ARMaddress);
// Entry: x0 = ARM address
// Return: x0 = same value with bits 31 and 30 forced to 1
// No other register is modified.
//"================================================================"
ARMaddrToGPUaddr:
    orr     x0, x0, #0xC0000000         // OR in the ARM to VC conversion value
    ret
.balign 4
.ltorg // Tell assembler ltorg data for this code can go here
.size ARMaddrToGPUaddr, .-ARMaddrToGPUaddr
/* "PROVIDE C FUNCTION: uint32_t GPUaddrToARMaddr (uint32_t BUSaddress);" */
.section .text.GPUaddrToARMaddr, "ax", %progbits
.balign 4
.globl GPUaddrToARMaddr;
.type GPUaddrToARMaddr, %function
//"================================================================"
// GPUaddrToARMaddr -- AARCH64 Pi3 code
// C Function: uint32_t GPUaddrToARMaddr (uint32_t BUSaddress);
// Entry: x0 = GPU (bus) address
// Return: x0 = same value with bits 31 and 30 forced to 0
// No other register is modified.
//"================================================================"
GPUaddrToARMaddr:
    and     x0, x0, #0xFFFFFFFF3FFFFFFF // Strip the 0xC0000000 conversion bits
    ret
.balign 4
.ltorg // Tell assembler ltorg data for this code can go here
.size GPUaddrToARMaddr, .-GPUaddrToARMaddr
/* Re-entrant interrupt handler stub */
/* http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.den0024a/ch10s05.html */
.globl irq_handler
/*
 * irq_handler -- EL1 IRQ entry, reached through VectorTable.
 *
 *   1. Push x0-x30, then SPSR_EL1 and ELR_EL1, onto the current stack
 *      (every push moves sp by 16 so it stays 16-byte aligned).
 *   2. If bit 0 of the word at IO base + 0xB200 (IRQ basic pending) is
 *      set, write to IO base + 0xB40C (ARM timer clear register) and
 *      clear bit 0 of the word at IO base + 0xB204.
 *   3. Unmask IRQ so the handler may be interrupted (re-entrant), and
 *      call the function stored in RPi_TimerIrqAddr if it is non-zero.
 *   4. Mask IRQ again, restore everything and ERET.
 *
 * IRQ is re-masked on BOTH paths before ELR_EL1/SPSR_EL1 are reloaded.
 * Previously the no-handler path skipped the re-mask, so a nested IRQ
 * taken between the restore and the ERET could overwrite the restored
 * ELR_EL1/SPSR_EL1 and make the ERET resume at the wrong place.
 *
 * NOTE(review): only the integer registers are preserved; FP/SIMD
 * registers are not saved here -- confirm installed handlers do not
 * use them.
 */
irq_handler:
    stp     x29, x30, [sp, #-16]!
    stp     x27, x28, [sp, #-16]!
    stp     x25, x26, [sp, #-16]!
    stp     x23, x24, [sp, #-16]!
    stp     x21, x22, [sp, #-16]!
    stp     x19, x20, [sp, #-16]!
    stp     x17, x18, [sp, #-16]!
    stp     x15, x16, [sp, #-16]!
    stp     x13, x14, [sp, #-16]!
    stp     x11, x12, [sp, #-16]!
    stp     x9, x10, [sp, #-16]!
    stp     x7, x8, [sp, #-16]!
    stp     x5, x6, [sp, #-16]!
    stp     x3, x4, [sp, #-16]!
    stp     x1, x2, [sp, #-16]!
    str     x0, [sp, #-16]!             // x0 alone, still a 16-byte slot
    mrs     x1, SPSR_EL1
    mrs     x2, ELR_EL1
    stp     x1, x2, [sp, #-16]!         // Saved so a nested IRQ cannot lose them

// if (IRQ->IRQBasicPending.Timer_IRQ_pending) {  // Check irq on timer is triggered
//     ARMTIMER->Clear = 1;                       // Write any value to register to clear irq ... PAGE 198
//     IRQ->IRQPending1 &= ~0x1;                  // Clear timer pending irq bit 0
// }
    ldr     x0, =RPi_IO_Base_Addr
    ldr     w0, [x0]                    // w0 = Pi IO base address
    mov     w3, #0xB200
    add     w1, w0, w3                  // x1 = IO base + 0xB200
    ldr     w2, [x1]                    // w2 = IRQ->IRQBasicPending
    tbz     x2, 0, .TimerIrqNotPending  // Bit 0 clear: timer irq not pending
    mov     w2, #0xB400
    add     w0, w0, w2                  // x0 = IO base + 0xB400
    str     w2, [x0, 12]                // Write (any value) to ARMTIMER->Clear at +0xB40C
    ldr     w0, [x1, 4]                 // w0 = IRQPending1
    and     w0, w0, -2                  // Clear timer pending irq bit 0
    str     w0, [x1, 4]                 // Write IRQPending1 back
.TimerIrqNotPending:
    msr     daifclr, #2                 // Enable irq interrupts (nesting allowed from here)
    ldr     x0, =RPi_TimerIrqAddr       // Address of TimerIrqAddr
    ldr     x0, [x0]                    // Load TimerIrqAddr value
    cbz     x0, no_irqset               // If zero no irq handler is set
    blr     x0                          // Call irq handler that has been set
no_irqset:
    msr     daifset, #2                 // Disable irq interrupts on both paths before restoring
    ldp     x1, x2, [sp], #16
    msr     ELR_EL1, x2
    msr     SPSR_EL1, x1
    ldr     x0, [sp], #16
    ldp     x1, x2, [sp], #16
    ldp     x3, x4, [sp], #16
    ldp     x5, x6, [sp], #16
    ldp     x7, x8, [sp], #16
    ldp     x9, x10, [sp], #16
    ldp     x11, x12, [sp], #16
    ldp     x13, x14, [sp], #16
    ldp     x15, x16, [sp], #16
    ldp     x17, x18, [sp], #16
    ldp     x19, x20, [sp], #16
    ldp     x21, x22, [sp], #16
    ldp     x23, x24, [sp], #16
    ldp     x25, x26, [sp], #16
    ldp     x27, x28, [sp], #16
    ldp     x29, x30, [sp], #16
    eret
/* macro to align handlers every 0x80 bytes */
/* Each use pads to the next 0x80-byte boundary and then emits a single
   branch to the handler named in the argument. */
.macro vector handler
.balign 0x80
b \handler
.endm
/* EL1 exception vector table: 16 entries in 4 groups of 4, one entry
   every 0x80 bytes, the table itself aligned to 0x800.  The Core0
   startup code loads the address of this table into VBAR_EL1.
   Only one entry does real work: IRQ from the current EL using SP_ELx
   goes to irq_handler.  The very first entry branches to _start and
   every remaining entry branches to the endless hang loop. */
.balign 0x800
.globl VectorTable
VectorTable:
/* from current EL with sp_el0 */
vector _start /* Synchronous */
vector hang /* IRQ */
vector hang /* FIQ */
vector hang /* SErrorStub */
/* from current EL with sp_elx, x != 0 */
vector hang /* Synchronous */
vector irq_handler /* IRQ */
vector hang /* FIQ */
vector hang /* SErrorStub */
/* from lower EL, target EL minus 1 is AArch64 */
vector hang /* Synchronous */
vector hang /* IRQ */
vector hang /* FIQ */
vector hang /* SErrorStub */
/* from lower EL, target EL minus 1 is AArch32 */
vector hang /* Synchronous */
vector hang /* IRQ */
vector hang /* FIQ */
vector hang /* SErrorStub */
/****************************************************************
DATA FOR SMARTSTART64 EXPOSED TO INTERFACE
****************************************************************/
/* Variables written by the startup code in this file.  Each exposed
   variable is one 32-bit word.  RPi_CPUBootMode is declared here but
   is not written anywhere in this file. */
.section ".data.smartstart64", "aw"
.balign 4
.globl RPi_IO_Base_Addr; // Make sure Pi_IO_Base_Addr label is global
RPi_IO_Base_Addr : .4byte 0; // Peripheral Base addr is 4 byte variable in 64bit mode
.globl RPi_BootAddr; // Make sure RPi_BootAddr label is global
RPi_BootAddr : .4byte 0; // CPU boot address is 4 byte variable in 64bit mode
.globl RPi_CoresReady; // Make sure RPi_CoresReady label is global
RPi_CoresReady : .4byte 0; // CPU cores ready for use is 4 byte variable in 64bit mode
.globl RPi_CPUBootMode; // Make sure RPi_CPUBootMode label is global
RPi_CPUBootMode : .4byte 0; // CPU Boot Mode is 4 byte variable in 64bit mode
.globl RPi_CpuId; // Make sure RPi_CpuId label is global
RPi_CpuId : .4byte 0; // CPU Id is 4 byte variable in 64bit mode
.globl RPi_CompileMode; // Make sure RPi_CompileMode label is global
RPi_CompileMode : .4byte 0; // Compile mode is 4 byte variable in 64bit mode
/****************************************************************
 DATA FOR SMARTSTART64 NOT EXPOSED TO INTERFACE
****************************************************************/
/* This variable is read and written with 64-bit loads and stores.
   The startup code in this file leaves the MMU off (SCTLR_EL1.M = 0),
   where a misaligned 64-bit access faults, so force 8-byte alignment
   instead of relying on where the linker happens to place the section. */
.balign 8
RPi_TimerIrqAddr : .8byte 0; // Timer Irq Address