# Data Preprocessing for Model Pretraining

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import os
from typing import Tuple, Dict
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
import json
import shutil
# Select the compute device: CUDA when torch reports it as available,
# otherwise the CPU. The chosen device is printed for a quick sanity check.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(DEVICE)

cuda


# Split and Filter Data

In [3]:
def split_data(source_directory, train_dir, val_dir, test_dir, train_ratio=0.7, val_ratio=0.15, seed=None):
    """Randomly move the entries of ``source_directory`` into train/val/test folders.

    The entries are shuffled, then the first ``int(n * train_ratio)`` go to
    ``train_dir``, the next ``int(n * val_ratio)`` to ``val_dir`` and the
    remainder to ``test_dir``. Files are *moved*, so the source directory is
    emptied by a successful call.

    Args:
        source_directory: Directory whose entries are split.
        train_dir: Destination directory for the training split.
        val_dir: Destination directory for the validation split.
        test_dir: Destination directory for the test split.
        train_ratio: Fraction of entries assigned to the training split.
        val_ratio: Fraction of entries assigned to the validation split.
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random`` state is used.

    Returns:
        A dict with keys ``"train"``, ``"val"`` and ``"test"`` mapping to the
        lists of entry names (not full paths) moved into each split.

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more than 1.
    """
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            "train_ratio and val_ratio must be non-negative and sum to at most 1"
        )

    # Sort before shuffling so that a given seed always yields the same split,
    # independent of the order in which the file system lists the entries.
    files = sorted(os.listdir(source_directory))

    # shutil.move() renames the source *onto* the destination path when the
    # destination directory does not exist, so every later move would
    # overwrite the previous file. Make sure the targets are real directories.
    for target in (train_dir, val_dir, test_dir):
        os.makedirs(target, exist_ok=True)

    if seed is None:
        random.shuffle(files)
    else:
        random.Random(seed).shuffle(files)

    total_files = len(files)
    train_count = int(total_files * train_ratio)
    val_count = int(total_files * val_ratio)

    split_record = {"train": [], "val": [], "test": []}

    for position, name in enumerate(files):
        if position < train_count:
            split, destination = "train", train_dir
        elif position < train_count + val_count:
            split, destination = "val", val_dir
        else:
            split, destination = "test", test_dir
        shutil.move(os.path.join(source_directory, name), destination)
        split_record[split].append(name)

    return split_record

# Hard-coded locations: the folder holding the files to split and the three
# destination folders handed to split_data().
source_directory = '/home/luqiao/project/data/subset'
train_dir = '/home/luqiao/project/data/train'
val_dir = '/home/luqiao/project/data/val'
test_dir = '/home/luqiao/project/data/test'

# Run the split. split_data() moves the files out of source_directory and
# returns the file names assigned to each split.
split_info = split_data(source_directory, train_dir, val_dir, test_dir)

# Save the split information to a JSON file
with open('/home/luqiao/project/data/split_info.json', 'w') as f:
    json.dump(split_info, f, indent=4)

In [6]:
def collect_file_paths(directory):
    """Return the path of every ``.txt`` file found anywhere under ``directory``.

    The tree is traversed with ``os.walk`` and the paths are returned in the
    order the walk produces them.
    """
    return [
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(directory)
        for name in names
        if name.endswith('.txt')
    ]

def create_dataset_dictionary(train_dir, test_dir, val_dir):
    """Map each split name to the ``.txt`` file paths found in its directory.

    The returned dict has the keys ``'train'``, ``'test'`` and ``'val'`` (in
    that order); each value is the list produced by ``collect_file_paths``.
    """
    split_roots = (('train', train_dir), ('test', test_dir), ('val', val_dir))
    return {split: collect_file_paths(root) for split, root in split_roots}

# Directories that hold the already-split .txt files
train_directory = '/home/luqiao/project/data/train/'
test_directory = '/home/luqiao/project/data/test/'
val_directory = '/home/luqiao/project/data/val/'


# Collecting file paths and creating the dictionary
# (values are full paths, one list per split)
split_info = create_dataset_dictionary(train_directory, test_directory, val_directory)

# Save the dictionary as a JSON file.
# NOTE: this is the same path the split step writes to, so the earlier record
# of bare file names is replaced by this record of full paths.
json_filename = '/home/luqiao/project/data/split_info.json'
with open(json_filename, 'w') as json_file:
    json.dump(split_info, json_file, indent=4)

### Read split information back from the JSON file

In [None]:
# Reload the saved split record and show the first ten training entries as a
# quick sanity check.
with open('/home/luqiao/project/data/split_info.json', 'r') as f:
    split_info = json.load(f)
print(split_info["train"][:10])

### Load Dataset

In [None]:
# Load only the first 1000 training entries with the 'text' loader for a
# quick inspection.
dataset = load_dataset('text', data_files=split_info["train"][:1000])

### Check dataset

In [11]:
# Peek at the first ten rows of the loaded sample.
dataset["train"][:10]

{'text': ['WASHINGTON, D.C.—Republican presidential candidate Donald Trump did three rallies on Sunday, October 30. He said 27 false things: Las Vegas, Nevada',
  '',
  "Republican presidential nominee Donald Trump told a rally in Albuquerque that he's tied with Hillary Clinton in New Mexico. In fact, Clinton leads by an average of nine points in New Mexico polls. ( CARLO ALLEGRI / REUTERS )",
  '',
  '1. Falsely said, “We’re winning many national polls.” (Repeated at both rallies later in the day. At the time he spoke in Las Vegas, Trump was only leading in the Los Angeles Times tracking poll that has been consistently most favourable to him. He was tied in one other poll, by Rasmussen, and trailing in all the others.) 2. Falsely said, “We’re ahead in many states, including your great state and North Carolina.” (The North Carolina claim was repeated at a rally later in the day. Trump is trailing in both Nevada and North Carolina.) 3. Falsely said of Clinton’s email deletion, “Did anyb

### Load the complete dataset (train / val / test)

In [None]:
base_path = '/home/luqiao/project/data'
base_paths = {
    'train': '/home/luqiao/project/data/train/',
    'val': '/home/luqiao/project/data/val/',
    'test': '/home/luqiao/project/data/test/'
}
# Resolve every recorded entry to a full path. os.path.join() returns an
# absolute entry unchanged and prefixes a bare file name with its split
# folder, so both layouts of split_info.json (full paths or file names) work.
data_files = {split: [os.path.join(base_paths[split], filename) for filename in filenames]
              for split, filenames in split_info.items()}
# Load all three splits from the resolved paths.
complete_dataset = load_dataset('text', data_files=data_files)

### Filter empty files

In [None]:
def is_not_empty(example):
    """Return True when the row's ``'text'`` field has non-whitespace content."""
    stripped_text = example['text'].strip()
    return stripped_text != ''
# Drop every row for which is_not_empty() returns False (blank lines).
filtered_complete_dataset = complete_dataset.filter(is_not_empty)

### Save filtered dataset to disk

In [None]:
# Persist the filtered dataset under the project data folder.
filtered_complete_dataset.save_to_disk('/home/luqiao/project/data/filtered_dataset')

# Tokenizer (VOCAB 40000)

In [None]:
import re
from collections import Counter

class SimpleTokenizer:
    """Whitespace tokenizer with a frequency-capped vocabulary.

    Ids 0-4 are reserved for the special tokens ``[PAD]``, ``[UNK]``,
    ``[CLS]``, ``[SEP]`` and ``[MASK]``. Every other word receives the next
    free id in order of decreasing corpus frequency.
    """

    def __init__(self, max_vocab_size):
        """Create a tokenizer whose vocabulary is capped at ``max_vocab_size``.

        The cap counts the five special tokens, which are always present.
        """
        self.vocab = {'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[SEP]': 3, '[MASK]': 4}
        self.mask_token = '[MASK]'
        self.max_vocab_size = max_vocab_size

    def build_vocab(self, corpus):
        """Add the most frequent whitespace-separated words of ``corpus``.

        Args:
            corpus: Iterable of strings.

        Words that already have an id (special tokens, or words added by an
        earlier call) keep it; only new words fill the remaining capacity.
        """
        # Tokenize and count word frequencies
        word_counts = Counter(word for sentence in corpus for word in sentence.split())

        # Re-assigning an existing entry would change its id without growing
        # the dict, so the next new word would receive the very same id.
        for known in self.vocab:
            word_counts.pop(known, None)

        budget = self.max_vocab_size - len(self.vocab)
        if budget <= 0:
            return

        # Select the most common new words up to max_vocab_size
        for word, _ in word_counts.most_common(budget):
            self.vocab[word] = len(self.vocab)

    def tokenize(self, text):
        """Return the ids of the words in ``text``; unknown words map to ``[UNK]``."""
        unknown_id = self.vocab['[UNK]']
        return [self.vocab.get(word, unknown_id) for word in text.split()]

    def convert_tokens_to_string(self, tokens):
        """Join the words behind ``tokens`` with single spaces.

        Raises:
            ValueError: If an id is not present in the vocabulary.
        """
        # Build the id -> word map once per call instead of rebuilding two
        # vocabulary-sized lists for every token. setdefault keeps the first
        # word when several share an id.
        id_to_word = {}
        for word, token_id in self.vocab.items():
            id_to_word.setdefault(token_id, word)
        return ' '.join(self._word_for(id_to_word, token) for token in tokens)

    def _word_for(self, id_to_word, token):
        """Return the word for ``token`` using ``id_to_word``, else a linear scan."""
        try:
            return id_to_word[token]
        except (KeyError, TypeError):
            # Ids that do not hash like a plain int still resolve through the
            # equality-based scan, which raises ValueError for unknown ids.
            return list(self.vocab.keys())[list(self.vocab.values()).index(token)]

# Example usage: build a 40000-entry vocabulary from every training line of
# the filtered dataset.
corpus = filtered_complete_dataset["train"][:]["text"]

tokenizer = SimpleTokenizer(max_vocab_size=40000)
tokenizer.build_vocab(corpus)

# Tokenize a sentence
tokens = tokenizer.tokenize("this is a test sentence for the tokenizer")
print(tokens) 

# Convert tokens back to string
sentence = tokenizer.convert_tokens_to_string(tokens)
print(sentence)  # round-trips when every word is in the vocabulary; other words print as [UNK]


### Check VOCAB

In [36]:
# Print the first 101 vocabulary entries. Iterating a dict yields its keys,
# so `key` is the running index and `value` is the vocabulary word.
for key, value in enumerate(tokenizer.vocab):
    print(key, value)
    if key ==100:
        break


0 [PAD]
1 [UNK]
2 [CLS]
3 [SEP]
4 [MASK]
5 the
6 to
7 of
8 and
9 a
10 in
11 that
12 is
13 for
14 on
15 with
16 The
17 was
18 as
19 it
20 be
21 are
22 I
23 have
24 at
25 by
26 from
27 this
28 not
29 you
30 an
31 he
32 has
33 his
34 or
35 but
36 will
37 they
38 their
39 we
40 more
41 who
42 about
43 can
44 were
45 had
46 which
47 been
48 would
49 one
50 all
51 said
52 out
53 up
54 also
55 In
56 when
57 than
58 its
59 like
60 your
61 what
62 if
63 into
64 so
65 just
66 other
67 some
68 people
69 our
70 her
71 my
72 do
73 no
74 only
75 new
76 It
77 there
78 after
79 first
80 could
81 This
82 over
83 But
84 –
85 get
86 two
87 she
88 how
89 He
90 time
91 —
92 A
93 because
94 most
95 any
96 them
97 even
98 these
99 make
100 -


### Save VOCAB to json

In [37]:
# Write the word -> id mapping of `tokenizer` to disk as indented JSON.
vocab_file = '/home/luqiao/project/data/vocab40000.json'
with open(vocab_file, 'w') as json_file:
    json.dump(tokenizer.vocab, json_file, indent=4)

### Check Tokenizer

In [39]:
# Tokenize one line of the filtered training data (row 567)
tokens = tokenizer.tokenize(filtered_complete_dataset["train"][567]["text"])
print(tokens) 

# Convert tokens back to string
sentence = tokenizer.convert_tokens_to_string(tokens)
print(sentence)  # words missing from the vocabulary are printed as [UNK]

[55, 1, 5053, 5, 3901, 20411, 26, 98, 75, 3870, 8, 5, 505, 3210, 19, 80, 248, 14, 243, 1207, 114, 501, 6, 2157, 18, 67, 68, 36, 20, 238, 7064, 26, 5, 627, 130, 514, 36, 85, 244, 9064]
In [UNK] view, the worker displacement from these new technologies and the economic stress it could place on political systems may lead to conflict as some people will be big winners from the changes while others will get left behind.


# Tokenizer (VOCAB 40000, with punctuation tokens)

In [None]:
class UpgradeTokenizer:
    """Case-preserving tokenizer that keeps selected punctuation marks as tokens.

    Text is split into runs of word characters and the configured punctuation
    marks; every other character is discarded. Ids 0-4 are reserved for the
    special tokens ``[PAD]``, ``[UNK]``, ``[CLS]``, ``[SEP]`` and ``[MASK]``.
    """

    # Punctuation marks used when the caller does not supply any.
    DEFAULT_PUNCTUATIONS = ('.', ',', '!', '?', ':', ';', '-', '(', ')')

    def __init__(self, max_vocab_size, punctuations=None):
        """Create a tokenizer.

        Args:
            max_vocab_size: Cap on the vocabulary size, special tokens included.
            punctuations: Punctuation strings kept as separate tokens.
                Defaults to a fresh copy of ``DEFAULT_PUNCTUATIONS``.
        """
        self.vocab = {'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[SEP]': 3, '[MASK]': 4}
        self.mask_token = '[MASK]'
        self.max_vocab_size = max_vocab_size
        # A list literal as the default argument would be one object shared
        # by every instance; give each instance its own list instead.
        if punctuations is None:
            punctuations = list(self.DEFAULT_PUNCTUATIONS)
        self.punctuations = punctuations

    def custom_tokenize(self, text):
        """Split ``text`` into word tokens and configured punctuation tokens."""
        # Pattern for words or punctuation. Empty punctuation entries are
        # skipped because an empty alternative would match everywhere and
        # produce empty-string tokens.
        alternatives = [r'\w+']
        alternatives.extend(re.escape(p) for p in self.punctuations if p)
        return re.findall('|'.join(alternatives), text)

    def build_vocab(self, corpus):
        """Add the most frequent tokens of ``corpus`` (an iterable of strings).

        Tokens that already have an id keep it; only new tokens fill the
        remaining capacity.
        """
        # Tokenize and count word frequencies
        word_counts = Counter(word for sentence in corpus for word in self.custom_tokenize(sentence))

        # Re-assigning an existing entry would change its id without growing
        # the dict, so the next new token would receive the very same id.
        for known in self.vocab:
            word_counts.pop(known, None)

        budget = self.max_vocab_size - len(self.vocab)
        if budget <= 0:
            return

        # Select the most common new tokens up to max_vocab_size
        for word, _ in word_counts.most_common(budget):
            self.vocab[word] = len(self.vocab)

    def tokenize(self, text):
        """Return the ids of the tokens in ``text``; unknown tokens map to ``[UNK]``."""
        unknown_id = self.vocab['[UNK]']
        return [self.vocab.get(word, unknown_id) for word in self.custom_tokenize(text)]

    def convert_tokens_to_string(self, tokens):
        """Rebuild a string from ``tokens``.

        Punctuation is attached directly to the preceding text; every other
        token is preceded by a single space unless it starts the string.

        Raises:
            ValueError: If an id is not present in the vocabulary.
        """
        # Build the id -> word map once per call instead of rebuilding two
        # vocabulary-sized lists for every token. setdefault keeps the first
        # word when several share an id.
        id_to_word = {}
        for word, token_id in self.vocab.items():
            id_to_word.setdefault(token_id, word)
        words = [self._word_for(id_to_word, token) for token in tokens]

        sentence = ''
        for word in words:
            if word in self.punctuations:
                sentence += word  # Add punctuation without space
            else:
                if sentence and not sentence.endswith(' '):
                    sentence += ' '  # Add space before word if it's not the start of the sentence
                sentence += word
        return sentence

    def _word_for(self, id_to_word, token):
        """Return the word for ``token`` using ``id_to_word``, else a linear scan."""
        try:
            return id_to_word[token]
        except (KeyError, TypeError):
            # Ids that do not hash like a plain int still resolve through the
            # equality-based scan, which raises ValueError for unknown ids.
            return list(self.vocab.keys())[list(self.vocab.values()).index(token)]

# Example usage
# NOTE(review): `OpenwebDataset` is not defined in the visible cells -
# presumably it is a dataset loaded elsewhere with a "train" split; confirm.
corpus = OpenwebDataset["train"][:]["text"]
tokenizer2 = UpgradeTokenizer(max_vocab_size=40000)
tokenizer2.build_vocab(corpus)

# Tokenize a sentence
tokens = tokenizer2.tokenize("this is a test sentence for the tokenizer.")
print(tokens)

# Convert tokens back to string
sentence = tokenizer2.convert_tokens_to_string(tokens)
print(sentence)


### Save to json

In [None]:
# Write the word -> id mapping of `tokenizer2` to disk as indented JSON.
vocab_file = '/home/luqiao/project/data/vocab40000-p.json'
with open(vocab_file, 'w') as json_file:
    json.dump(tokenizer2.vocab, json_file, indent=4)

# Tokenizer (VOCAB 60000, lowercased; listed punctuation marks are dropped)

In [None]:
class UpgradeTokenizer2:
    """Lower-casing tokenizer that keeps apostrophes inside words.

    Text is lower-cased before matching. A token is either a word (optionally
    containing one apostrophe, e.g. ``it's``) or a single character that is
    neither a word character, whitespace, nor one of the configured
    punctuation marks. The configured punctuation marks are therefore dropped
    from the token stream. Ids 0-4 are reserved for the special tokens.
    """

    # Punctuation marks used when the caller does not supply any.
    DEFAULT_PUNCTUATIONS = ('.', ',', '!', '?', ':', ';', '-', '(', ')')

    def __init__(self, max_vocab_size, punctuations=None):
        """Create a tokenizer.

        Args:
            max_vocab_size: Cap on the vocabulary size, special tokens included.
            punctuations: Punctuation strings excluded by ``custom_tokenize``.
                Defaults to a fresh copy of ``DEFAULT_PUNCTUATIONS``.
        """
        self.vocab = {'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[SEP]': 3, '[MASK]': 4}
        self.mask_token = '[MASK]'
        self.max_vocab_size = max_vocab_size
        # A list literal as the default argument would be one object shared
        # by every instance; give each instance its own list instead.
        if punctuations is None:
            punctuations = list(self.DEFAULT_PUNCTUATIONS)
        self.punctuations = punctuations

    def custom_tokenize(self, text):
        """Return the lower-cased tokens of ``text``."""
        # Generate a regex pattern that excludes specified punctuations: they
        # are added to the negated character class of the second alternative.
        excluded_punctuations = ''.join(re.escape(p) for p in self.punctuations)
        pattern = r"\b\w+'?\w*|[^\w\s" + excluded_punctuations + "]"
        return re.findall(pattern, text.lower())

    def build_vocab(self, corpus):
        """Add the most frequent tokens of ``corpus`` (an iterable of strings).

        Tokens that already have an id keep it; only new tokens fill the
        remaining capacity.
        """
        word_counts = Counter(word for sentence in corpus for word in self.custom_tokenize(sentence))

        # Re-assigning an existing entry would change its id without growing
        # the dict, so the next new token would receive the very same id.
        for known in self.vocab:
            word_counts.pop(known, None)

        budget = self.max_vocab_size - len(self.vocab)
        if budget <= 0:
            return

        for word, _ in word_counts.most_common(budget):
            self.vocab[word] = len(self.vocab)

    def tokenize(self, text):
        """Return the ids of the tokens in ``text``; unknown tokens map to ``[UNK]``."""
        unknown_id = self.vocab['[UNK]']
        return [self.vocab.get(word, unknown_id) for word in self.custom_tokenize(text)]

    def convert_tokens_to_string(self, tokens):
        """Rebuild a string from ``tokens``.

        Words found in ``self.punctuations`` are attached directly to the
        preceding text; every other word is preceded by a single space unless
        it starts the string.

        Raises:
            ValueError: If an id is not present in the vocabulary.
        """
        # Build the id -> word map once per call instead of rebuilding two
        # vocabulary-sized lists for every token. setdefault keeps the first
        # word when several share an id.
        id_to_word = {}
        for word, token_id in self.vocab.items():
            id_to_word.setdefault(token_id, word)
        words = [self._word_for(id_to_word, token) for token in tokens]

        sentence = ''
        for word in words:
            if word in self.punctuations:
                sentence += word
            else:
                if sentence and not sentence.endswith(' '):
                    sentence += ' '
                sentence += word
        return sentence

    def _word_for(self, id_to_word, token):
        """Return the word for ``token`` using ``id_to_word``, else a linear scan."""
        try:
            return id_to_word[token]
        except (KeyError, TypeError):
            # Ids that do not hash like a plain int still resolve through the
            # equality-based scan, which raises ValueError for unknown ids.
            return list(self.vocab.keys())[list(self.vocab.values()).index(token)]

# Initialize your tokenizer.
# NOTE: this rebinds the global `tokenizer`, which an earlier cell assigned
# to a SimpleTokenizer instance.
tokenizer = UpgradeTokenizer2(max_vocab_size=60000)  # Adjust max_vocab_size as needed

def read_corpus(folder_path):
    """Read every ``.txt`` file directly inside ``folder_path`` as UTF-8 text.

    Files are read in sorted name order so the returned corpus, and anything
    derived from it such as the tie-breaking of equally frequent vocabulary
    entries, is the same on every run and every file system.

    Args:
        folder_path: Directory to read; sub-directories are not traversed.

    Returns:
        A list with the full contents of each ``.txt`` file, one string per file.
    """
    corpus = []
    # os.listdir() returns entries in arbitrary order, hence the sort.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.txt'):  # Ensure it's a text file
            continue
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            corpus.append(file.read())
    return corpus

# Specify the folder path containing your text files
folder_path = '/root/autodl-tmp/train_data'

# Read the corpus from text files (one string per .txt file)
corpus = read_corpus(folder_path)

# Build vocabulary using the corpus
tokenizer.build_vocab(corpus)

# Tokenize a sentence. The sample contains apostrophes and punctuation to
# exercise the tokenizer's pattern; "waht" is presumably a deliberate or
# accidental misspelling.
tokens = tokenizer.tokenize("this is a test sentence for the tokenizer. it's waht you're! what?")
print(tokens)

# Convert tokens back to string
sentence = tokenizer.convert_tokens_to_string(tokens)
print(sentence)


In [None]:
def custom_tokenize(text):
    """Split ``text`` into word tokens and a fixed set of punctuation tokens.

    Case is preserved. Characters that are neither word characters nor one of
    the listed punctuation marks are discarded.
    """
    punctuations = ['.', ',', '!', '?', ':', ';', '-', '(', ')']
    # One regex alternative per escaped punctuation mark, after the word rule.
    punctuation_alternatives = '|'.join(map(re.escape, punctuations))
    return re.findall(r'\w+|' + punctuation_alternatives, text)

def convert_tokens_to_string(tokens, vocab=None):
    """Rebuild a string from token ids.

    Punctuation marks are attached directly to the preceding text; every
    other word is preceded by a single space unless it starts the string.

    Args:
        tokens: Iterable of token ids.
        vocab: Word -> id mapping used for the lookup. Defaults to
            ``tokenizer2.vocab``.

    Returns:
        The reconstructed string.

    Raises:
        ValueError: If an id is not present in the vocabulary.
    """
    if vocab is None:
        vocab = tokenizer2.vocab
    punctuations = ['.', ',', '!', '?', ':', ';', '-', '(', ')']

    # Build the id -> word map once instead of rebuilding two vocabulary-sized
    # lists for every token. setdefault keeps the first word when several
    # share an id.
    id_to_word = {}
    for word, token_id in vocab.items():
        id_to_word.setdefault(token_id, word)

    sentence = ''
    for token in tokens:
        try:
            word = id_to_word[token]
        except (KeyError, TypeError):
            # Ids that do not hash like a plain int still resolve through the
            # equality-based scan, which raises ValueError for unknown ids.
            word = list(vocab.keys())[list(vocab.values()).index(token)]
        if word in punctuations:
            sentence += word  # Add punctuation without space
        else:
            if sentence and not sentence.endswith(' '):
                sentence += ' '  # Add space before word if it's not the start of the sentence
            sentence += word
    return sentence

### Save to json

In [None]:
# Write the word -> id mapping of `tokenizer` to disk as indented JSON.
vocab_file = '/root/project/vocab60000-latest.json'
with open(vocab_file, 'w') as json_file:
    json.dump(tokenizer.vocab, json_file, indent=4)

### Check Tokenizer

In [None]:
# Run the module-level custom_tokenize() on a sample sentence; the result is
# a list of string tokens, not ids.
tokens = custom_tokenize("this is a test sentence for the tokenizer.")
print(tokens)

In [None]:
# Decode a hand-picked id sequence with the module-level
# convert_tokens_to_string(), which looks ids up in tokenizer2.vocab.
convert_tokens_to_string([35, 15, 11, 699, 2723, 17, 7, 1, 5])