# Teach an LLM to do additions

The goal of this project is to teach an LLM to do additions, playing only with two parts:
* the tokenizer
* the positional embedding

Both the model and the dataset are fixed.

You are allowed to tune the hyperparameters, but this is not the main goal. Depending on the quality of your tokenizer and positional embedding, you may change the number of bits. The initial value of 3 is very small.

In [27]:
import torch
from torch import nn
from torch.nn import functional as F

import random
import math
import re
import time

In [28]:
number_bits = 9

dataset_size = 6400_000
train_proportion = 0.9

log_interval = 200
batch_size = 256
epochs = 1
learning_rate = 8e-4

## Step 1: Construct a tokenizer

In [29]:
pad_token="[PAD]"
eos_token="[EOS]"

### Baseline: character-level tokenizer

In [30]:
class character_level_tokenizer:
    """
    character-level
    """
    def __init__(self):
        pad_token="[PAD]"
        eos_token="[EOS]"
        self.vocab = [str(x) for x in range(10)] + ["+", "="] + [pad_token, eos_token]
        self.token_to_id = {v : k for k, v in enumerate(self.vocab)}
        self.id_to_token = {k : v for k, v in enumerate(self.vocab)}
        self.ntokens = len(self.vocab)
        self.pattern = f"[^{re.escape(''.join(self.vocab))}]"
    
    def clean(self, text):
        """
        removes all characters not in the vocabulary
        """
        out = re.sub(self.pattern, "", text)
        return out

    def pre_tokenization(self, text):
        """
        character-level
        """
        return [c for c in text]

    def encode(self, text):
        text_list = self.pre_tokenization(self.clean(text))
        return [self.token_to_id[c] for c in text_list]

    def decode(self, token_list):
        return "".join([self.id_to_token[x] for x in token_list])



In [31]:
tokenizer = character_level_tokenizer()
ntokens = tokenizer.ntokens
ntokens

14

In [32]:
prompt = "129564 + 42456 ="
inputs = tokenizer.encode(prompt)
inputs, tokenizer.decode(inputs)

([1, 2, 9, 5, 6, 4, 10, 4, 2, 4, 5, 6, 11], '129564+42456=')

# Implement your tokenizer here!

You can do anything (as long as you do not compute the addition!).
Some ideas:
* reversing numbers left to right
* arranging by groups (of, 2, 3,...)
* aligning numbers

In [33]:

class ColumnWiseAdditionTokenizer:
    def __init__(self):
        self.vocab = [str(i) for i in range(11)] + ["+", "=", pad_token, eos_token]
        self.token_to_id = {v: k for k, v in enumerate(self.vocab)}
        self.id_to_token = {k: v for v, k in self.token_to_id.items()}
        self.ntokens = len(self.vocab)
        self.pattern = f"[^{re.escape(''.join(self.vocab))}]"

    def clean(self, text):
        """
        Removes all characters not in the vocabulary
        """
        return re.sub(self.pattern, "", text)

    def pre_tokenization(self, num1, num2=None):
        """
        Returns a list of characters.
        Tokenizes an addition into a column-wise addition sequence
        If only num1 is provided, it returns a simple tokenized number.
        """
        num1 = str(num1)
        if num2 is None:
            return list(num1)
        
        num2 = str(num2)
        max_len = max(len(num1), len(num2))
        num1, num2 = num1.zfill(max_len), num2.zfill(max_len)  # Padding with zeros
        
        tokens = []
        for i in range(max_len - 1, -1, -1):  # Start from least significant digit
            tokens.append(num1[i])
            tokens.append("+")
            tokens.append(num2[i])
            tokens.append("=")
        return tokens
    
    def encode(self, text):
        """
        Extract num1 and num2 from the string input and tokenize accordingly
        """
        match = re.match(r"(\d+)\+(\d+)=", text)
        if match:
            num1, num2 = match.groups()
            token_list = self.pre_tokenization(num1, num2)
        else:
            # If the input is just a number, encode it as digits
            token_list = self.pre_tokenization(text)
        
        return [self.token_to_id[t] for t in token_list if t in self.token_to_id]
    
    def decode(self, token_list):
        return "".join([self.id_to_token[x] for x in token_list if x in self.id_to_token])

# Exemple d'utilisation
tokenizer = ColumnWiseAdditionTokenizer()
ntokens = tokenizer.ntokens
encoded = tokenizer.encode('703+499=')
print("Encoded:", encoded)
print("Decoded:", tokenizer.decode(encoded))

Encoded: [3, 11, 9, 12, 0, 11, 9, 12, 7, 11, 4, 12]
Decoded: 3+9=0+9=7+4=


I designed a **character-level tokenizer** for column-wise addition by structuring input in a way that aligns digits like manual addition. It removes unsupported characters, then **formats numbers column-wise** by reversing operands (if `reverse=True`) and padding with zeros where needed. This ensures that each digit pair aligns properly, making it easier for models to learn digit-wise dependencies. The tokenizer maps characters to token IDs for encoding and reconstructs expressions from IDs for decoding, preserving the structured representation of addition problems.

In [34]:
class ColumnWiseAdditionTokenizer:
    """
    character-level
    """
    def __init__(self, reverse=True):
        self.reverse = reverse
        pad_token="[PAD]"
        eos_token="[EOS]"
        # self.cols = ["a", "b", "c"]
        self.vocab = [str(x) for x in range(10)] + ["+", "="] + [pad_token, eos_token]
        # self.vocab = [str(x) for x in range(10)] + ["+", "=", '#'] + [pad_token, eos_token]
        self.token_to_id = {v : k for k, v in enumerate(self.vocab)}
        self.id_to_token = {k : v for k, v in enumerate(self.vocab)}
        self.ntokens = len(self.vocab)
        self.pattern = f"[^{re.escape(''.join(self.vocab))}]"

    def clean(self, text):
        """
        removes all characters not in the vocabulary
        """
        out = re.sub(self.pattern, "", text)
        return out

    def pre_tokenization(self, text):
        """
        character-level
        """

        if self.reverse:
          if '+' in text:
            a, b = text.split('+')
            c = ''
            if '=' in text:
              b, c = b.split('=')
            #a, b, c = a[::-1], b[::-1], c[::-1]
            a, b, c = a[::-1], b[::-1], c
            ret = ''
            for i in range(number_bits):
                #ret += '+'
                if i < len(a):
                   ret += a[i]
                else: ret += '0'
                ret += '+'
                if i < len(b):
                   ret += b[i]
                else: ret += '0' 
                ret += '='
            text = ret[:-1] + '=' + c
          else:
            text = text
        return [c for c in text]

    def encode(self, text):
        text_list = self.pre_tokenization(self.clean(text))
        return [self.token_to_id[c] for c in text_list]

    def decode(self, token_list):
        return "".join([self.id_to_token[x] for x in token_list if x in self.id_to_token])

# Exemple d'utilisation
tokenizer = ColumnWiseAdditionTokenizer()
ntokens = tokenizer.ntokens
encoded = tokenizer.encode('3+99=')
print("Encoded:", encoded)
print("Decoded:", tokenizer.decode(encoded))

Encoded: [3, 10, 9, 11, 0, 10, 9, 11, 0, 10, 0, 11, 0, 10, 0, 11, 0, 10, 0, 11, 0, 10, 0, 11, 0, 10, 0, 11, 0, 10, 0, 11, 0, 10, 0, 11]
Decoded: 3+9=0+9=0+0=0+0=0+0=0+0=0+0=0+0=0+0=


## Step 2: Create a dataset for arithmetic operations

In [35]:
def sample_datapoint(number_bits = 9):
    """
    returns a string containing two random numbers on `number_bits` many bits and their sum.
    """
    a_list = [random.randint(0, 9) for _ in range(number_bits)]
    b_list = [random.randint(0, 9) for _ in range(number_bits)]
    a_int = int("".join([str(x) for x in a_list]))
    b_int = int("".join([str(x) for x in b_list]))
    sum_int = a_int + b_int
    return (str(a_int) + "+" + str(b_int) + "=", str(sum_int))

sample_datapoint(3)

('956+68=', '1024')

In [36]:
data = []
for _ in range(dataset_size):
    data.append(sample_datapoint(number_bits))
data[:4]

[('763440039+143692342=', '907132381'),
 ('68201147+490557048=', '558758195'),
 ('468870653+397800144=', '866670797'),
 ('290244901+712442342=', '1002687243')]

In [37]:
data_train = data[: int(train_proportion * dataset_size)]
data_test = data[int(train_proportion * dataset_size):]

len(data_train),len(data_test)

(5760000, 640000)

## Step 3: Construct a model

### Basline: the classical Positional Embedding

In [38]:
class PositionalEmbedding(nn.Module):
    r"""Inject some information about the relative or absolute position of the tokens in the sequence.
        The positional encodings have the same dimension as the embeddings, so that the two can be summed.
        Here, we use sine and cosine functions of different frequencies.
    .. math:
        \text{PosEmbedder}(pos, 2i) = sin(pos/10000^(2i/d_model))
        \text{PosEmbedder}(pos, 2i+1) = cos(pos/10000^(2i/d_model))
        \text{where pos is the word position and i is the embed idx)
    Args:
        d_model: the embed dim (required).
        dropout: the dropout value (default=0.1).
        max_len: the max. length of the incoming sequence (default=5000).
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEmbedding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        r"""Inputs of forward function
        Args:
            x: the sequence fed to the positional encoder model (required).
        Shape:
            x: [sequence length, batch size, embed dim]
            output: [sequence length, batch size, embed dim]
        """

        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

# Implement your positional embedding here!

You can do anything. Some ideas:
* RoPE
* (randomised) FIRE
* Abacus

**!!! IMPORTANT !!!** This model of Transformers is "input first", meaning that an input is a tensor with shape
(length_prompts, batch_size)

I implemented a **learnable positional embedding** module that replaces traditional sinusoidal encodings with trainable position-dependent vectors. Each position in the sequence is assigned a **learnable embedding**, initialized with a normal distribution, allowing the model to **adapt positional information dynamically**. The embeddings are added to the input token embeddings, ensuring the model retains order information while learning optimal representations.

In [39]:
class PositionalEmbedding(nn.Module):
    r"""Inject learnable positional embeddings instead of using fixed sinusoidal functions.
    
    Args:
        d_model: the embedding dimension (required).
        dropout: the dropout value (default=0.1).
        max_len: the max. length of the incoming sequence (default=5000).
    """
    
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEmbedding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        
        # Learnable positional embeddings
        self.pos_embedding = nn.Parameter(torch.zeros(max_len, 1, d_model))
        nn.init.normal_(self.pos_embedding, mean=0, std=0.02)  # Initialize embeddings

    def forward(self, x):
        r"""Inputs of forward function
        Args:
            x: the sequence fed to the positional encoder model (required).
        Shape:
            x: [sequence length, batch size, embed dim]
            output: [sequence length, batch size, embed dim]
        """
        x = x + self.pos_embedding[:x.size(0), :]
        return self.dropout(x)


In [40]:
class TransformerModel(nn.Transformer):
    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__(d_model=ninp,
                                               nhead=nhead,
                                               dim_feedforward=nhid,
                                               num_encoder_layers=nlayers)
        self.input_emb = nn.Embedding(ntoken, ninp)
        self.pos_encoder = PositionalEmbedding(ninp, dropout)
        self.decoder = nn.Linear(ninp, ntoken)

        self.ninp = ninp
        self.init_weights()

    def init_weights(self):
        initrange = 0.1
        nn.init.uniform_(self.input_emb.weight, -initrange, initrange)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def _generate_square_subsequent_mask(self, sz):
        return torch.log(torch.tril(torch.ones(sz,sz)))

    def forward(self, src):
        mask = self._generate_square_subsequent_mask(len(src)).to(device)
        self.src_mask = mask

        src = self.input_emb(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output_enc = self.encoder(src, mask=self.src_mask)
        output_dec = self.decoder(output_enc)
        return F.log_softmax(output_dec, dim=-1), output_enc

In [41]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


Please do not change these parameters!

In [42]:
model = TransformerModel(ntoken = ntokens,
                         ninp = 128,
                         nhead = 16,
                         nhid = 64,
                         nlayers = 8)
model.to(device)



TransformerModel(
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-7): 8 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=64, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=64, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
    (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  )
  (decoder): Linear(in_features=128, out_features=14, bias=True)
  (input_emb): Embedding(14, 128)
  (pos_encoder): PositionalEmbedding(
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [43]:
def generate(model, prompts, new_tokens = 11):
    input_tensor = prompts # (length_prompts, batch_size)
    input_tensor = input_tensor.to(device)
    for _ in range(new_tokens):
        output, _ = model(input_tensor) # (length_prompts, batch_size, ntokens)
        last_output = output[-1,:,:] # (batch_size, ntokens)
        token = torch.argmax(last_output, -1).view((1,-1)) # (1, batch_size)
        input_tensor = torch.cat((input_tensor, token), 0)
    return input_tensor

In [44]:
model.eval()

prompt = "2+3="
prompt_tensor = torch.tensor(tokenizer.encode(prompt)).view((-1,1))
output = generate(model, prompt_tensor).view((1,-1))
output, tokenizer.decode(output.tolist()[0])

(tensor([[ 2, 10,  3, 11,  0, 10,  0, 11,  0, 10,  0, 11,  0, 10,  0, 11,  0, 10,
           0, 11,  0, 10,  0, 11,  0, 10,  0, 11,  0, 10,  0, 11,  0, 10,  0, 11,
           7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7]], device='cuda:0'),
 '2+3=0+0=0+0=0+0=0+0=0+0=0+0=0+0=0+0=77777777777')

In [45]:
def pad(token_list, type_list = "prompts"):
    max_length = max([len(x) for x in token_list])
    out = []
    for x in token_list:
        if type_list == "prompts":
            out.append([tokenizer.token_to_id[pad_token]] * (max_length - len(x)) + x)
        if type_list == "answers":
            out.append(x + [tokenizer.token_to_id[eos_token]] + [tokenizer.token_to_id[pad_token]] * (max_length - len(x)))
    return out, max_length

In [46]:
prompts = [tokenizer.encode("1+1="), tokenizer.encode("21+35=")]
answers = [tokenizer.encode("2"), tokenizer.encode("56")]
padded_prompts, _ = pad(prompts, "prompts")
padded_answers, _ = pad(answers, "answers")
padded_prompts, padded_answers
[tokenizer.decode(p) for p in padded_prompts], [tokenizer.decode(p) for p in padded_answers]

(['1+1=0+0=0+0=0+0=0+0=0+0=0+0=0+0=0+0=',
  '1+5=2+3=0+0=0+0=0+0=0+0=0+0=0+0=0+0='],
 ['2[EOS][PAD]', '56[EOS]'])

In [47]:
def get_batch(split, i):
    data = data_train if split == 'train' else data_test
    prompts = [tokenizer.encode(data[i][0]) for i in range(i, i + batch_size)]
    padded_prompts, length_prompts = pad(prompts, "prompts")
    answers = [tokenizer.encode(data[i][1]) for i in range(i, i + batch_size)]
    padded_answers, length_answers = pad(answers, "answers")
    X = torch.stack([torch.tensor(x) for x in padded_prompts], 1)
    Y = torch.stack([torch.tensor(x) for x in padded_answers], 1)
    return X, Y, length_prompts, length_answers

In [48]:
X, Y, length_prompts, length_answers = get_batch("train", 243)
X.shape, Y.shape, length_prompts, length_answers

(torch.Size([36, 256]), torch.Size([11, 256]), 36, 10)

## Step 4: Evaluate

In [49]:
def evaluate():
    # Turn on evaluation mode disables dropout.
    model.eval()
    correct = 0.
    with torch.no_grad():
        for batch, i in enumerate(range(0, len(data_test) - 1, batch_size)):
            prompts, target_answers, length_prompts, length_answers = get_batch("test", i)
            prompts = prompts.to(device) # (length_prompts, batch_size)
            target_answers = target_answers.to(device) # (length_answers + 1, batch_size)
            output = generate(model, prompts, length_answers + 1) # (length_prompts + length_answers + 1, batch_size)
            answers_tokens = output[length_prompts:, :] # (length_answers + 1, batch_size), contains tokens
            equality_test = answers_tokens == target_answers # (length_answers + 1, batch_size), contains boolean values
            correct += torch.all(equality_test, axis=0).float().sum()
        accuracy = correct / len(data_test)
    return accuracy.item()

In [5]:
evaluate()

0.9998890161514282

## Step 4: Train the model

In [51]:
def train_epoch():
    model.train()
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    total_loss = 0.
    start_time = time.time()
    for batch, i in enumerate(range(0, len(data_train) - 1, batch_size)):
        prompts, target_answers, length_prompts, length_answers = get_batch("train", i)
        prompts = prompts.to(device) # (length_prompts, batch_size)
        target_answers = target_answers.to(device) # (length_answers, batch_size)
        input_tensor = torch.cat((prompts, target_answers), 0) # (length_prompts + length_answers, batch_size)
        model.zero_grad()
        output, _ = model(input_tensor) # (length_prompts + length_answers, batch_size, ntokens)
        output_answers = output[length_prompts-1:-1,:,:].reshape(-1, ntokens) # (length_answers * batch_size, ntokens)
        target_answers = target_answers.view(-1)
        loss = F.cross_entropy(output_answers, target_answers)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| {:5d}/{:5d} batches | ms/batch {:5.2f} | loss {:5.2f} | perplexity {:8.2f}'.format(batch, len(data_train) // batch_size,
                                                                                                        elapsed * 1000 / log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

def train():
    best_test_accuracy = None
    test_accuracy = evaluate()
    print('-' * 89)
    print('| initialisation | test accuracy {:5.2f}'.format(test_accuracy))
    print('-' * 89)
    for epoch in range(1, epochs+1):
        epoch_start_time = time.time()
        train_epoch()
        test_accuracy = evaluate()
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | test accuracy {:5.2f}'.format(epoch, (time.time() - epoch_start_time), test_accuracy))
        print('-' * 89)
        # Save the model if the test accuracy is the best we've seen so far.
        if not best_test_accuracy or test_accuracy < best_test_accuracy:
            with open("arithmetic.pt", 'wb') as f:
                torch.save(model, f)
            best_test_accuracy = test_accuracy

In [None]:
train()

-----------------------------------------------------------------------------------------
| initialisation | test accuracy  0.00
-----------------------------------------------------------------------------------------
|   200/22500 batches | ms/batch 86.26 | loss  2.20 | perplexity     8.98
|   400/22500 batches | ms/batch 87.93 | loss  1.97 | perplexity     7.18
|   600/22500 batches | ms/batch 85.82 | loss  1.89 | perplexity     6.65
|   800/22500 batches | ms/batch 85.62 | loss  1.79 | perplexity     6.01
|  1000/22500 batches | ms/batch 85.65 | loss  1.75 | perplexity     5.74
|  1200/22500 batches | ms/batch 85.63 | loss  1.69 | perplexity     5.44
|  1400/22500 batches | ms/batch 85.49 | loss  1.31 | perplexity     3.72
|  1600/22500 batches | ms/batch 87.55 | loss  0.77 | perplexity     2.16
|  1800/22500 batches | ms/batch 85.52 | loss  0.48 | perplexity     1.61
|  2000/22500 batches | ms/batch 85.62 | loss  0.33 | perplexity     1.38
|  2200/22500 batches | ms/batch 85.76 | 

In [1]:
model.eval()

for i in range(20):
    prompt, answers = data_test[i]
    prompt_tensor = torch.tensor(tokenizer.encode(prompt)).view((-1,1))
    output = generate(model, prompt_tensor, len(answers)).view((1,-1))
    print(tokenizer.decode(output.tolist()[0]) + "\t actual result: " + answers)

1+0=6+4=3+0=7+0=5+8=6+8=3+0=5+9=3+5=944537401	 actual result: 944537401
1+7=9+2=6+6=1+7=3+5=2+7=2+7=0+4=2+2=449989318	 actual result: 449989318
8+8=7+5=9+4=1+8=3+4=8+4=5+6=5+4=7+6=1402280436	 actual result: 1402280436
0+1=9+4=1+5=4+8=9+9=7+7=7+5=3+1=7+1=853592731	 actual result: 853592731
8+6=9+9=0+9=4+8=1+9=7+3=0+9=4+6=5+1=710113094	 actual result: 710113094
1+7=4+0=1+3=1+4=9+3=6+2=3+8=9+9=6+4=1191925448	 actual result: 1191925448
6+9=2+5=4+5=1+7=7+9=4+8=1+1=6+3=2+7=993368985	 actual result: 993368985
7+8=1+9=4+2=9+3=7+8=8+0=9+4=5+0=7+8=1563962715	 actual result: 1563962715
9+2=8+6=1+6=3+6=8+0=4+8=0+0=2+0=4+3=721289851	 actual result: 721289851
4+0=4+7=4+3=5+4=6+3=9+9=5+6=8+4=4+9=1432899814	 actual result: 1432899814
1+6=8+6=1+4=1+6=6+6=0+6=7+8=1+8=7+4=1205727647	 actual result: 1205727647
6+9=5+5=5+4=6+7=7+9=7+3=2+4=4+0=3+0=347174015	 actual result: 347174015
6+8=5+6=0+7=1+3=6+7=5+4=1+3=6+1=9+5=1475034824	 actual result: 1475034824
3+5=0+8=0+3=5+4=9+6=1+4=7+1=1+1=1+2=328659388	 actua

## Probing

This is just for fun...

In [3]:
import numpy as np

train_size = 1000
test_size = 100

model.eval()

def data_probing(size):
    X = []
    y = np.zeros(size)
    for i in range(size):
        input = torch.tensor(tokenizer.encode(data[i][0])).view((-1, 1)).to(device)
        _, output = model(input)
        output = output[-1,:,:].flatten()
        # determine whether there was a carry in the result:
        carry = len(data[i][1]) > len(data[i][0]) / 2
        X.append(output.cpu().detach().numpy())
        y[i] = carry
    return np.array(X), y

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

X_train, y_train = data_probing(train_size)
X_test, y_test = data_probing(test_size)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

reg = LogisticRegression()
reg.fit(X_train,y_train)
reg.score(X_test, y_test)

1.0