| CS 359, COMPUTER ARCHITECTURE, YANYAN SHEN, SPRING 2017       |
|---------------------------------------------------------------|
| Project2: Optimizing the Performance of a Pipelined Processor |
|                                                               |

GROUP LEADER: LI MINCHAO 515030910361

Group Member: Wang Chenyang 515030910383

Group Member: Qiang Zhiwen 515030910367

Contact Email: marshallee413lmc@gmail.com

# 1 Preknowledge

I have read the CSAPP book and concluded several basic points which are essential to solving the problems in Part A, B, and C.

• There are 8 registers in the Y86 system

Each of these registers stores a word. Among then register %esp is used as a stack pointer by the push, pop, call, and return instructions.

- There are three single-bit condition codes, ZF, SF, and OF, storing information about the effect of the most recent arithmetic or logical instruction.
- The Y86 instruction set is largely a subset of the IA32 instruction set but it still have some differences. The picture below shows the Y86 instruction set.



• In the Y86 system, the stack starts at a certain address and grows toward lower addresses, which prevents space conflict.

# 2 Part A

## 2.1 Description

- The program **sum.ys** was used to sum linked list elements iteratively. We are supposed to add the sum of a list from head to tail using the Y86 codeing rules.
- The program **rsum.ys** is similar to the **sum.ys**, except it sums linked list elements recursively . We are supposed to add the sum of a list from head to tail using the Y86 codeing rules.
- The program **copy.ys** is used for two purposes. First it copies a block of words from one part of memory to another area of memory. Second it computes the checksum (Xor) of all the words copied.

## 2.2 Solution

sum.ys

```
# Initial code
  irmovl Stack, %esp
   rrmovl %esp,%ebp
   irmovl ele1, %edx
   #pushl %edx
   call sum_list
   halt
   # Sample linked list
   .align 4
   ele1:
   .\log 0x00a
12
   .long ele2
13
   ele2:
14
   .\log 0x0b0
   .long ele3
16
   ele3:
17
   .long 0xc00
   .long 0
19
20
   sum_list:
21
   pushl %ebp
                               # Save %ebp
22
                               \# val = 0
   xorl %eax, %eax
   rrmovl %esp,%ebp
                               # Set frame ptr
   #pushl %edx
                               # Get ls
   \#mrmovl 8(\%ebp),\%edx
   andl %edx,%edx
                               \# 1s == 0?
27
   je L4
                               # Yes, goto done
28
29
   L5:
                               # Loop:
30
   mrmovl (%edx),% esi
                               \# t = ls \rightarrow val
   addl %esi,%eax
                               # val += t
                               \# ls = ls \rightarrow next
   mrmovl 4(\% \text{edx}), \% \text{edx}
   andl %edx,%edx
                               \# 1s == 0?
```

```
jne L5
                              # No, goto done
36
                              # Done:
  L4:
37
  rrmovl %ebp,%esp
                              # Restore %esp
  popl %ebp
                              # Restore %ebp
39
  ret
                              # Return
40
41
  . pos 0x100
42
  Stack:
```

#### rsum.ys

```
# Execution begins at address 0
   . pos 0
   init:
                     irmovl Stack, %esp
   irmovl Stack, %ebp
  jmp Main
  # Sample linked list
   . align 4
   ele1:
                     .long 0x00a
   .long ele2
   ele2:
                     .\log 0x0b0
11
   .long ele3
12
   ele3:
                     .\log 0xc00
13
   .long 0
14
   Main:
                     irmovl ele1, %edx
16
   pushl %ebp
17
   rrmovl %esp, %ebp
18
   pushl %edx
19
   call rsum_list
20
   rrmovl %ebp, %esp
   popl %ebp
   halt
23
  # rsum_list - Recursive version of sum_list
25
  # int rsum_list(list_ptr ls)
26
   rsum_list:
                     pushl
                                       %ebp
27
   rrmovl
                     %esp,%ebp
                     0x8(\%ebp),\%edx
                                       # ls
   mrmovl
                     %eax,%eax
                                       # val=0
   xorl
                     %ebx
                                       # save %ebx
   pushl
31
                     %edx,%edx
                                       \# ls == 0?
   andl
32
                     End
                                       # if so, gotoEnd
   jе
33
                     (\%edx),\%ebx
  mrmovl
                                       # ls -> val
34
                     0x4(\%edx),\%ecx
  mrmovl
                                       \# ls \rightarrow next
35
   pushl
                     %ecx
  # push ls->next as the first parameter
   call
                     rsum_list
  # call rsum_list by recursion
```

```
%ebx,%eax
   addl
                                        \# val += ls -> val
                     mrmovl 0 x fffffffc (%ebp), %ebx
  End:
41
  # restore %ebx
42
                     %ebp, %esp
   rrmovl
   popl
                     %ebp
44
   ret
45
46
            0x100
   . pos
47
   Stack:
```

#### copy.ys

```
. pos 0
           irmovl Stack, %esp
   init:
  irmovl Stack, %ebp
  jmp Main
   align 4
  # Source block
   src:
   .long 0x00a
   .long 0x0b0
   .long 0xc00
11
  # Destination block
12
   dest:
13
   .long 0x111
14
   .\log 0x222
   .long 0x333
16
17
           irmovl $3,%eax
   Main:
18
   pushl %eax
19
   irmovl dest, % edx
20
   pushl %edx
^{21}
   irmovl src, %ecx
   pushl %ecx
   call Copy
   halt
25
26
           pushl %ebp
   Copy:
27
   rrmovl %esp,%ebp
   mrmovl 8(%ebp),%ecx
                             \#ecx = src
   mrmovl 12(\%ebp),\%ebx
                             \#ebx = dest
   mrmovl 16(\%ebp),\%edx
                             \#edx = len
31
   irmovl $0,%eax
                             \#result = 0
   andl %edx,%edx
33
   je End
           mrmovl (%ecx),%esi
   Loop:
                                      #get *src
   rmmovl %esi,(%ebx)
                             \#scr = dest
                             #result ^= src
   xorl %esi,%eax
   irmovl $4,% edi
                             #set %edi to 4
   addl %edi,%ecx
                             \#+4
```

```
addl %edi,%ebx
                                  \#+4
   irmovl $-1,\%edi
                                  \#set %edi to -1
41
   addl %edi,%edx
                                  \#len - 1
42
                                  #Stop when 0
            Loop
   jne
44
45
             popl %ebp
   \operatorname{End}:
46
   rrmovl %ebp, %esp
47
   popl %ebp
48
   ret
49
   .pos 0x100
51
   Stack:
```

## 2.3 Analysis

### 2.3.1 sum.ys

- For the program **sum.ys**, the line  $1 \sim 19$  and line  $42 \sim 43$  are the preparatory work and do not need further explanation. For the main part, which is the sum.ys, first we store the %ebp part to stack, then we set %eax to zero as the inital sum value, %edx as the inital list position, then if the list has reached its end, we jump to state L4, which restore the value. If not, we goto the LOOP L5 part.
- In the L5 part, first %eax + = %esi to add to the sum, then (%edx) + 4 to goto the list's next elements. After doing that, again we judge whether the list has reached its end and the condition is exactly the same.
- Based the outcome of this program, %eax stores the overall sum value, which is 0xcba, %ebp get popped and get its inital value, which is the 0x100, %esi stores the last elements of the list, which is %0xc00, which proves that our program is correct.

#### 2.3.2 rsum.ys

- For the program **rsum.ys**, the line  $1 \sim 14$  and line  $47 \sim 48$  are the preparatory work and do not need further explanation. For the main part, first we let %edx stores the elel, which is the beginning, then we save %edx and call rsum.list.
- For the  $rsum\_list$  part(line27 ~ 45),which is the recursive version of the  $sum\_list$ , first we use %edx to get starting adddress, %eax to get the inital result which is 0, then we save %ebx and compare whether %edx is zero or not. If so, goto end, else, goto the next part, which is the recursive part. It is worth noting that in line 39 the constant number 0xfffffffc is -4 and we use this operation to restore %ebx, and in line 17, we push %ebp, in line 44, we pop %ebp, with these two operations we can get the old version of %ebp which is key to our recursive part.
- Based the outcome of this program, %ebp pop out and its inital value is 0x100, %eax stores the overall sum value 0xcba, which proves that our program is correct.

#### 2.3.3 copy.ys

- For the program **copy.ys**, the line  $1 \sim 16$  and line  $51 \sim 52$  are the preparatory work and do not need further explanation. For the main part(line  $18 \sim 25$ ), first we change %eax to 3, %edx to the dest, %ecx to the src and store the value to the stack, then we call Copy function and halt.
- For the Copy function(line  $28 \sim 43$ ), first we let %ebp store %esp, %ecx store src, %ebx store dest, %edx store the length, %eax stores the inital value of the result which is 0, then we compare whether the copy function has reached its end, is so, goto the End part, else, goto the Loop part.
- For the Loop part, first we get the address of the scr, then we begin the copy process, while we add the src and dest by 4 to move to the next address, in the meantime we let len 1 to serve as flag to decide when to stop.
- Based the outcome of this program, register %eax do the xorl instruction with each value and then add to %eax, so the last value of %eax is 0xcba, also the address 0x0020 value is changed from 0x111 to 0xa, the address 0x0024 value is changed from 0x222 to 0xb0, the address 0x0028 value is changed from 0x333 to 0xc00, which proves that our program is correct.

#### 2.4 Outcome

```
👦 📵 marshallee@ubuntu: ~/Documents/computer_architecture/project2/Project2/project2-h
marshallee@ubuntu:~/Documents/computer_architecture/project2/Project2/project2-h
andout/sim/sim/misc$ ./yis rsum.yo
Stopped in 70 steps at PC = 0x41. Status 'HLT', CC Z=0 S=0 0=0
Changes to registers:
%eax:
          0x00000000
                                0x00000cba
          0x00000000
%esp:
                                0x00000100
%ebp:
          0x00000000
                                0x00000100
Changes to memory:
0x00bc: 0x00000000
                                0x00000c00
0x00c0: 0x00000000
                                0x000000d0
0x00c4: 0x00000000
                                0x0000006a
0x00cc: 0x00000000
                                0х000000ь0
0x00d0: 0x00000000
                                0x000000e0
0x00d4: 0x00000000
                                0x0000006a
0x00d8: 0x00000000
                                0x00000024
0x00dc: 0x00000000
                                0x0000000a
0x00e0: 0x000000000
                                0x000000f0
0x00e4: 0x00000000
                                0x0000006a
0x00e8: 0x00000000
                                0x0000001c
0x00f0: 0x00000000
0x00f4: 0x00000000
                                0x000000fc
                                0x0000003d
0x00f8: 0x00000000
                                0x00000014
0x00fc: 0x00000000 0x00000140
0x00fc: 0x00000000 0x00000100
marshallee@ubuntu:~/Documents/computer_architecture/project2/Project2/project2-h
andout/sim/sim/misc$
```

```
🔞 🖨 📵 marshallee@ubuntu: ~/Documents/computer_architecture/project2/Project2/project2-ha
marshallee@ubuntu:~/Documents/computer_architecture/project2/Project2/project2-h
andout/sim/misc$ ./yis copy.yo
Stopped in 10000 steps at PC = 0x32. Status 'AOK', CC Z=1 S=0 0=0
Changes to registers:
%eax: 0x00000000 0x00000003
          0x00000000
                                0x00000020
%ecx:
%ebx:
          0x00000000
                                0x0000002c
%esp:
          0x00000000
                                0x00000100
%ebp:
%esi:
          0x00000000
                               0x00000100
          0x00000000
                                0x00000c00
                                0xffffffff
%edi:
          0x00000000
Changes to memory:
0x0020: 0x00000111
0x0024: 0x00000222
                                0x0000000a
                                0x000000b0
0x0028: 0x00000333
                                0x00000c00
0x00ec: 0x00000000
                                0x00000100
0x00f0: 0x00000000
0x00f4: 0x00000000
0x00f8: 0x00000000
                                0x00000049
                                0x00000014
                                0x00000020
0x00fc: 0x00000000 0x00000003
marshallee@ubuntu:~/Documents/computer_architecture/project2/Project2/project2-h
andout/sim/sim/miscS
```

# 3 Part B

## 3.1 Description

In this part, we are asked to add a new instruction iaddl to the SEQ processor, this instruction is meant to add a constant to a register.

## 3.2 Solution

| Stage      | iaddl V, rB                     |
|------------|---------------------------------|
| Fetch      | icode:ifun $\leftarrow M_1[PC]$ |
|            | $rA:rB \leftarrow M_1[PC+1]$    |
|            | $valC \leftarrow M_4[PC+2]$     |
|            | $valP \leftarrow PC + 6$        |
| Decode     | valB←R[rB]                      |
| Execute    | valE←valB <i>ADD</i> valC       |
| Memory     |                                 |
| Write back | R[rB]←valE                      |
| PC update  | PC← valP                        |
|            |                                 |

Since the code in seq-full.hcl is quite long, I only list the part where we have made changes.

## Fetch Stage

```
bool instr_valid = icode in
{ INOP, IHALT, IRRMOVL, IIRMOVL, IMRMOVL,
IOPL, IJXX, ICALL, IRET, IPUSHL, IPOPL, IIADDL };
#show that it is valid

bool need_regids =
icode in { IRRMOVL, IOPL, IPUSHL, IPOPL,
IIRMOVL, IRMMOVL, IMRMOVL, IIADDL };
#show that we need the register

bool need_valC =
icode in { IIRMOVL, IRMMOVL, IMRMOVL, IJXX, ICALL, IIADDL };
#show that we need a constant number
```

#### Decode Stage

```
12 | 1 : RNONE;
13 | ];
14 | #show that we store the value in the destination E
```

### Execute Stage

```
int aluA = [
  icode in { IRRMOVL, IOPL } : valA;
  icode in { IIRMOVL, IRMMOVL, IMRMOVL, IIADDL } : valC;
  icode in \{ ICALL, IPUSHL \} : -4;
  icode in { IRET, IPOPL } : 4;
6
  #show that the ALU operation need the valC
  int aluB = [
9
  icode in { IRMMOVL, IMRMOVL, IOPL, ICALL,
  IPUSHL, IRET, IPOPL, IIADDL \ : valB;
11
  icode in { IRRMOVL, IIRMOVL } : 0;
12
13
  #show that the ALU operation need the valB
14
15
  bool set_cc = icode in { IOPL, IIADDL };
16
  #show that this instruction may lead the flag register to change.
```

## 3.3 Analysis

It can be implemented by first using *irmovl* instruction to let the register contains the constant number, then we can use *addl* instruction to add the constant number to the destination register. Since there are roughly four stages in the Y86 instruction set, we will fully discuss this part reagarding each stages.

- For the Fetch Stage, first we should add the *IIADDL* instruction to the *instr\_valid*, then we should add the *IIADDL* instruction to the *need\_regids* set, indicating that we should register to do this operation. Finally since we need a constant to do the addition, we need to add the *IIADDL* instruction to the *need\_valc* set.
- For the Decode Stage, first we need to add the IIADDL instruction to the srcB set, indicating that we put the register value of rB in this part, then we need to add the IIADDL instruction to the dstE set, indicating that we store the value in the destination E, which is to the same register of rB.
- For the Execute Stage, first we should add the IIADDL instruction to the aluA part, indicating that the ALU operation need the valC, which is the constant value, then we should add the IIADDL instruction to the aluB part, indicating that the ALU operation need the valB, finally we should add the IIADDL instruction to the  $set\_cc$  part, indicating that this instruction may lead the flag register to change.
- For the Memory Stage, since this instruction is to add a constant number to a register, so the Memory Stage don't have to change.

## 3.4 Outcome



# 4 Part C

## 4.1 Description

- The program **ncopy.ys** copies a len-element integer array src to a non-overlapping dst, returning the count number of positive integers contained in src.
- In this part, we need to minimize the running time of **ncopy.ys** as less as possible. We use 3 ways to decrease running time: decrease jump instruction, use iaddl instruction and decrease hazards.
- As we have learned in the class, the *jump* instructions (including *jmp*, *jle*, *jl*, *je*, *jne*, *jge* and *jg* in Y86 instructions). Of course we can write one loop to solve this problem, but it costs to much time. So we use some other methods to solve this problem.
- In traditional add instruction, we can only store the instant number in registers and add the values in registers, which increase the running time. The iaddl instruction can do the add instruction between instant number and register, which decrease the running time.
- If we store a value in a register and the next instruction is to use the value in this register, there must be staff. So we use 2 registers to do it so to decrease staffs, which also decrease our running time.

## 4.2 Solution

#### ncopy.ys

```
\#/* $begin ncopy-vs */
  # ncopy.ys - Copy a src block of len ints to dst.
  # Return the number of positive ints (>0) contained in src.
5
   Include your name and ID here.
   Describe how and why you modified the baseline code.
  10
  # Do not modify this portion
11
  # Function prologue.
12
  ncopy: pushl %ebp
                            # Save old frame pointer
13
  rrmovl %esp,%ebp
                     # Set up new frame pointer
14
  pushl %esi
                     # Save callee-save regs
  pushl %ebx
  pushl %edi
17
  mrmovl 8(\%ebp),\%ebx
                      # src
18
  mrmovl 16(\%ebp),\%edx
                      # len
19
  mrmovl 12(\%ebp),\%ecx
                      # dst
20
  21
  xorl %eax, %eax
                      # initialize the count to 0
22
  Loop8:
```

```
iaddl \$-8, \%edx
                             \# len = len - 8
  andl %edx, %edx
                             # to see if the len is less than 0
27
  jl Loop4
28
  mrmovl (%ebx), %esi
30
  mrmovl 4(%ebx), %edi
31
  rmmovl %esi, (%ecx)
  andl %esi, %esi
33
  jle StageOneOf8
  iaddl $1, %eax
35
  StageOneOf8:
37
  rmmovl %edi, 4(%ecx)
38
  andl %edi, %edi
  jle StageTwoOf8
40
  iaddl $1, %eax
41
42
  StageTwoOf8:
  mrmovl 8(%ebx), %esi
  mrmovl 12(%ebx), %edi
45
  rmmovl %esi, 8(%ecx)
46
  andl %esi, %esi
47
  jle StageThreeOf8
  iaddl $1, %eax
  StageThreeOf8:
51
  rmmovl %edi, 12(%ecx)
52
  andl %edi, %edi
  jle StageFourOf8
54
  iaddl $1, %eax
  StageFourOf8:
57
  mrmovl 16(%ebx), %esi
58
  mrmovl 20(%ebx), %edi
59
  rmmovl %esi, 16(\%ecx)
  andl %esi, %esi
61
  jle StageFiveOf8
  iaddl $1, %eax
  StageFiveOf8:
65
  rmmovl %edi, 20(%ecx)
66
  andl %edi, %edi
  jle StageSixOf8
68
  iaddl $1, %eax
  StageSixOf8:
71
  mrmovl 24(%ebx), %esi
72
  mrmovl 28(%ebx), %edi
73
  rmmovl %esi, 24(%ecx)
  andl %esi, %esi
```

```
jle StageSevenOf8
   iaddl $1, %eax
77
78
   StageSevenOf8:
   rmmovl %edi, 28(%ecx)
80
   andl %edi, %edi
81
   jle Forward8
82
   iaddl $1, %eax
83
   Forward8:
85
   iaddl $32, %ebx
   iaddl $32, %ecx
   jmp Loop8
88
89
90
   Loop4:
91
   iaddl $8, %edx
92
   iaddl $-4, \%edx
   andl
          %edx, %edx
94
   jl Loop2
95
96
   mrmovl (%ebx), %esi
97
   mrmovl 4(%ebx), %edi
   rmmovl %esi, (%ecx)
   andl %esi, %esi
100
   jle StageOneOf4
101
   iaddl $1, %eax
102
103
   StageOneOf4:
104
   rmmovl %edi, 4(%ecx)
105
   andl %edi, %edi
106
   jle StageTwoOf4
107
   iaddl $1, %eax
108
109
   StageTwoOf4:
110
   mrmovl 8(%ebx), %esi
111
   mrmovl 12(%ebx), %edi
112
   rmmovl %esi, 8(%ecx)
   andl %esi, %esi
114
   jle StageThreeOf4
115
   iaddl $1, %eax
116
117
   Stage Three Of 4:\\
118
   rmmovl %edi, 12(%ecx)
   andl %edi, %edi
   jle Forward4
121
   iaddl $1, %eax
122
123
   Forward4:
   iaddl $16, %ebx
```

```
iaddl $16, %ecx
126
   iaddl $-4, \%edx
127
   jmp Loop2
128
   130
   Loop2:
131
   iaddl $4, %edx
132
   iaddl \$-2. \%edx
133
   andl %edx, %edx
134
   jl Loop1
135
   mrmovl (%ebx), %esi
137
   mrmovl 4(%ebx), %edi
138
   rmmovl %esi, (%ecx)
139
   andl %esi, %esi
140
   jle StageOneOf2
141
   iaddl $1, %eax
142
143
   StageOneOf2:
144
   rmmovl %edi, 4(%ecx)
145
   andl %edi, %edi
146
   ile Forward2
147
   iaddl $1, %eax
148
   Forward2:
150
   iaddl $8, %ebx
151
   iaddl $8, %ecx
152
   iaddl \$-2, \%edx
153
   jmp Loop1
154
155
   156
   Loop1:
157
   iaddl $2, %edx
158
   iaddl \$-1, \%edx
159
   andl %edx, %edx
160
   il Done
161
162
   mrmovl (%ebx), %esi
   rmmovl %esi, (%ecx)
164
   andl %esi, %esi
165
   ile Done
166
   iaddl $1, %eax
167
   jmp Done
168
   # Do not modify the following section of code
   # Function epilogue.
171
   Done:
172
   popl %edi
                           # Restore callee-save registers
173
   popl %ebx
   popl %esi
```

## Fetch Stage

```
# Is instruction valid?
bool instr_valid = f_icode in
{ INOP, IHALT, IRRMOVL, IIRMOVL, IMRMOVL,
IOPL, IJXX, ICALL, IRET, IPUSHL, IPOPL, IIADDL};

# Does fetched instruction require a regid byte?
bool need_regids =
f_icode in { IRRMOVL, IOPL, IPUSHL, IPOPL,
IIRMOVL, IRMMOVL, IMRMOVL, IIADDL};

# Does fetched instruction require a constant word?
bool need_valC =
f_icode in { IIRMOVL, IRMMOVL, IMRMOVL, IJXX, ICALL, IIADDL};
```

### Decode Stage

```
## What register should be used as the B source?
  int d_srcB = [
  D_icode in { IOPL, IRMMOVL, IMRMOVL, IIADDL } : D_rB;
  D_icode in { IPUSHL, IPOPL, ICALL, IRET } : RESP;
  1 : RNONE;
            # Don't_need_register
  ];
6
  ##_What_register_should_be_used_as_the_E_destination?
  int_ddstE_=
  10
  D_icode_in_{LIPUSHL, _IPOPL, _ICALL, _IRET_}: _RESP;
11
  1_: _RNONE; __#_Don't write any register
12
13
```

#### Execute Stage

```
## Select input A to ALU
int aluA = [
E_icode in { IRRMOVL, IOPL } : E_valA;
E_icode in { IIRMOVL, IRMMOVL, IMRMOVL, IIADDL} : E_valC;
E_icode in { ICALL, IPUSHL } : -4;
E_icode in { IRET, IPOPL } : 4;
# Other instructions don't_need_ALU
];

## Select input B to ALU
```

```
int aluB = [
    E_icode in { IRMMOVL, IMRMOVL, IOPL, ICALL,
    IPUSHL, IRET, IPOPL, IIADDL} : E_valB;
    E_icode in { IRRMOVL, IIRMOVL } : 0;
    # Other instructions don't_need_ALU
    ];
    ## Should the condition codes be updated?
    bool set_cc = E_icode in {IOPL , IIADDL} ;
```

## 4.3 Analysis

## 4.3.1 ncopy.ys



- We use 3 ways to decrease running time: decrease jump instruction, use iaddl instruction and decrease hazards. To explain our solution clearly, we provide the above picture to express the overview of our solution.
- Our main idea is: In each copy instruction, we copy 4 bytes. Firstly, we copy each 8 \* 4 bytes in a loop until the left data needed to copy is less than 8 \* 4 bytes. Secondly, we check if the left data is more or equal to 4 \* 4 bytes. If so, we copy 4 \* 4 bytes in this step. Thirdly, we check if the left data is more or equal to 2 \* 4 bytes. If so, we copy 2 \* 4 bytes in this step. Finally, we check if there are still 1 \* 4 bytes left. If so, we copy these 4 bytes and end the problem. Since we need to return the count number of positive integers contained in src, we will do a judge after each copy instruction to decided if we need to increase the count by 1.
- For the instruction iaddl, we will explain why we use it. In traditional instructions, if we want to add an instant number and a value in register, we need to store this number in a register and add values in 2 registers, which need 2 instruction to do this. However, the iaddl instruction allow us to add an instant number and a value in register in just 1 instruction. This will decrease the running time.
- For the registers, we define *esi* and *edi* to store the value of contiguous 2 words(4 bytes each word). We define *ebx* to store the address of current *src* and *dst* to store the address of current *dst*. We define *edx* to store the *len* of left words needed to copy. We define *eax* to store the count of positive integers in our copyed data.
- For register esi and edi, we will explain why we use 2 register to store the copy integers. Assume we just use one register, such as esi to do this job. Since the next instruction is to copy the value of esi to ecx, there must be staff. However, if we use 2 registers, there are 1 instruction using edi between these 2 instructions using esi, there will be no staff. This method can decrease the running time.

• For the Loops, we define 4 main blocks: Loop8, Loop4, Loop2 and Loop1. However, only the Loop8 is the "real" Loop which means only Loop8 can be excuted more than 1 time. In the above picture, blue means Loop8, oringe means Loop4, green means Loop2 and yellow means Loop1. We provide following picture to represent the relations among these Loop.



- We take *Loop*4 for example. First we plus 4 to *edx* and check if it is less than 0. If it is, the program will jump to *Loop*2. Else, we do 4 copy instructions in *Loop*4. The register *esi* and *edi* stores the current data we need to copy. After each copy instruction, we will check if the data is a positive integer. If it is, the register *eax* will be added by 1. Else, the problem will ignore the add instruction and jump to next copy instruction.
- Assume we need to copy n words and n is a very large number. Assume the count number of positive integers contained in src is m. In our progress, we need  $O(log_8^n + m)$  jumps. That explains why we use this loop method to decrease the number of jump. Since if we just use 1 loop to do this job, we need O(n) jumps.

## 4.3.2 pipe-full.hcl

The analysis in "pipe-full.hcl" is very similar to part B.

| Stage      | iaddl V, rB                     |
|------------|---------------------------------|
| Fetch      | icode:ifun $\leftarrow M_1[PC]$ |
|            | $rA:rB \leftarrow M_1[PC+1]$    |
|            | $valC \leftarrow M_4[PC+2]$     |
|            | $valP \leftarrow PC + 6$        |
| Decode     | valB←R[rB]                      |
| Execute    | valE←valB <i>ADD</i> valC       |
| Memory     |                                 |
| Write back | R[rB]←valE                      |
| PC update  | PC← valP                        |

- For the Fetch Stage, first we should add the *IIADDL* instruction to the *instr\_valid*, then we should add the *IIADDL* instruction to the *need\_regids* set, indicating that we need register to do this operation. Finally since we need a constant to do the addition, we need to add the *IIADDL* instruction to the *need\_valC* set.
- For the Decode Stage, first we need to add the IIADDL instruction to the  $d\_srcB$  set, indicating that we put the register value of rB in this part, then we need to add the IIADDL instruction to the  $d\_dstE$  set, indicating that we store the value in the destination E, which is to the same register of rB.
- For the Execute Stage, first we should add the IIADDL instruction to the aluA part, indicating that the ALU operation need the valC, which is the constant value, then we should add the IIADDL instruction to the aluB part, indicating that the ALU operation need the

valB, finally we should add the IIADDL instruction to the  $set\_cc$  part, indicating that this instruction may lead the flag register to change.

• For the Memory Stage, since this instruction is to add a constant number to a register, so the Memory Stage don't have to change.

# 4.4 Outcome









# 5 Task Assignments

Li Minchao write codes for partA, partB and partC.

Wang Chenyang write description and report for partC and integrate the report.

Qiang Zhiwen write description and report for partA and partB and write preknowledge of report.