In [13]:
import torch
import torch.nn as nn

import bitsandbytes as bnb
from bitsandbytes.nn import Linear8bitLt


In [14]:
fp16_model = nn.Sequential(
    nn.Linear(64, 64),
    nn.Linear(64, 64)
)
torch.save(fp16_model.state_dict(), "model.pt")



In [15]:
fp16_model[0].weight

Parameter containing:
tensor([[ 0.0562,  0.0760, -0.0051,  ...,  0.1029, -0.0352, -0.1026],
        [ 0.0066,  0.0971, -0.1236,  ...,  0.0787, -0.1059, -0.0588],
        [-0.0472, -0.0559, -0.0660,  ..., -0.1105, -0.0135,  0.0303],
        ...,
        [ 0.0575, -0.0539, -0.0719,  ...,  0.0357, -0.1239,  0.0758],
        [-0.0542,  0.0384,  0.1216,  ...,  0.0133, -0.0622,  0.1168],
        [ 0.0424,  0.0323,  0.0753,  ..., -0.0846, -0.0855,  0.1147]],
       requires_grad=True)

## Define int8 model 

In [16]:
int8_model = nn.Sequential(
    Linear8bitLt(64, 64, has_fp16_weights=False),
    Linear8bitLt(64, 64, has_fp16_weights=False)
)

## load the model with int8 

In [17]:
int8_model.load_state_dict(torch.load("model.pt"))
int8_model = int8_model.to(0) # Quantization happens here

In [18]:
int8_model[0].weight

Parameter containing:
Parameter(Int8Params([[  61,   82,   -6,  ...,  111,  -38, -111],
            [   7,  100, -127,  ...,   81, -109,  -60],
            [ -49,  -58,  -68,  ..., -114,  -14,   31],
            ...,
            [  59,  -55,  -74,  ...,   37, -127,   78],
            [ -57,   40,  127,  ...,   14,  -65,  122],
            [  43,   33,   76,  ...,  -86,  -87,  117]], device='cuda:0',
           dtype=torch.int8))

In [19]:
(int8_model[0].weight.CB * int8_model[0].weight.SCB) / 127


tensor([[ 0.0566,  0.0798, -0.0058,  ...,  0.1082, -0.0364, -0.1093],
        [ 0.0065,  0.0973, -0.1226,  ...,  0.0790, -0.1043, -0.0591],
        [-0.0454, -0.0564, -0.0657,  ..., -0.1112, -0.0134,  0.0305],
        ...,
        [ 0.0547, -0.0535, -0.0714,  ...,  0.0361, -0.1216,  0.0768],
        [-0.0529,  0.0389,  0.1226,  ...,  0.0137, -0.0622,  0.1201],
        [ 0.0399,  0.0321,  0.0734,  ..., -0.0839, -0.0833,  0.1152]],
       device='cuda:0')

In [20]:
input_ = torch.randn((1, 64), dtype=torch.float16)
hidden_states = int8_model(input_.to(torch.device('cuda', 0)))


In [21]:
import re

raw_request = '''POST /test HTTP/1.1
Host: 127.0.0.1
Content-Length: 44
Accept: application/json, */*
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.69
Content-type: application/json
Origin: http://127.0.0.1
Referer: http://127.0.0.1
Accept-Encoding: gzip, deflate
Accept-Language: en-GB,en;q=0.9,ja;q=0.8,en-US;q=0.7
Connection: close

param1=value&param2=value2%26param3=value3'''

regex_pattern = r'param2=([^%]+).+param3=([^&]+)'
match = re.search(regex_pattern, raw_request)

if match:
    value2 = match.group(1)
    value3 = match.group(2)
    print("Value 2:", value2)
    print("Value 3:", value3)

Value 2: value2
Value 3: value3
