In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra

import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file available under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
pip install PyPDF2 

# Import libraries

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling

# Step 1: Extract text from the PDFs
import os
from PyPDF2 import PdfReader

# Step 2: Clean and preprocess the text
import re
from nltk.tokenize import word_tokenize
import torch
import warnings 
warnings.filterwarnings('ignore')

# Data extraction

In [None]:
def extract_text_from_pdfs(pdf_dir):
    """Extract the text of every PDF located directly inside ``pdf_dir``.

    Args:
        pdf_dir: Path of the directory to scan (sub-directories are not visited).

    Returns:
        A list with one string per PDF, ordered by file name. Each string is
        the text of the document's pages joined with newlines.

    Raises:
        OSError: If ``pdf_dir`` cannot be listed (propagated from ``os.listdir``).
    """
    texts = []
    # os.listdir returns entries in arbitrary order; sort so the resulting
    # corpus is identical from one run to the next.
    for filename in sorted(os.listdir(pdf_dir)):
        # Accept '.PDF' / '.Pdf' as well as '.pdf'.
        if not filename.lower().endswith('.pdf'):
            continue
        reader = PdfReader(os.path.join(pdf_dir, filename))
        # Guard against a page yielding no text (None or ''), and keep a
        # separator between pages so the last word of one page is not fused
        # with the first word of the next.
        page_texts = [page.extract_text() or '' for page in reader.pages]
        texts.append('\n'.join(page_texts))
    return texts

# Directory under the Kaggle input mount that is scanned for PDF files.
pdf_dir = '/kaggle/input/llm-books-africa/book'
# Raw extracted text, one entry per PDF found in pdf_dir.
texts = extract_text_from_pdfs(pdf_dir)


# Data Preparation

In [None]:
def preprocess_text(text):
    """Normalise a raw text string for language-model training.

    Every run of non-word characters is collapsed to a single space, the
    result is lower-cased, tokenised with ``word_tokenize`` and the tokens
    are re-joined with single spaces.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text as one space-separated string.
    """
    normalised = re.sub(r'\W+', ' ', text).lower()
    return ' '.join(word_tokenize(normalised))

In [None]:
# Clean every extracted document.
cleaned_texts = [preprocess_text(text) for text in texts]

# Step 3: save the cleaned data for training, one document per line.
train_file = 'train.txt'
# Write explicitly as UTF-8: the corpus contains accented characters and the
# file must not depend on the platform's default locale encoding.
with open(train_file, 'w', encoding='utf-8') as f:
    f.writelines(text + '\n' for text in cleaned_texts)

# Load model

In [None]:
# Step 4: load the pre-trained model and its matching tokenizer
model_name = "gpt2"  # A larger model can be used if your hardware allows it
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)


In [None]:
# Step 5: prepare the data for training
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=train_file,
    block_size=128,  # Size of the text blocks
    # TextDataset stores its tokenised features in a cache file next to
    # train_file and reuses it on later runs. Force a rebuild so a regenerated
    # train.txt is never shadowed by stale cached features.
    overwrite_cache=True,
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # No masked language modeling (causal LM objective)
)

In [None]:
# Step 6: fine-tuning configuration
training_args = TrainingArguments(
    # Training schedule
    num_train_epochs=3,
    per_device_train_batch_size=4,
    # Checkpoint location and retention
    output_dir='./results',
    overwrite_output_dir=True,
    save_steps=10_000,
    save_total_limit=2,
)

In [None]:
# Bundle the model, training arguments, dataset and collator into a Trainer,
# then run the fine-tuning loop.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    data_collator=data_collator,
)
trainer.train()

In [None]:
# Step 7: text generation
input_text = "raconte l'histoire d'un enfant qui lave un pagne"

# Resolve the device in this cell: it is needed right here, and relying on a
# definition from a later cell raises NameError on a fresh Restart & Run All.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)

In [None]:
# NOTE(review): the two calls below were left disabled by the original author;
# presumably a workaround for attention-kernel issues — confirm before enabling.
#torch.backends.cuda.enable_mem_efficient_sdp(False)
#torch.backends.cuda.enable_flash_sdp(False)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move model to the correct device
model = model.to(device)

# Training leaves the module in train mode; switch to eval mode so dropout is
# disabled while generating text.
model.eval()

# Ensure input tensors are on the same device as the model
input_ids = input_ids.to(device)

In [None]:
# Tune the hyperparameters below to influence generation.
# Fix the RNG so the sampled text is reproducible across runs.
torch.manual_seed(42)
output = model.generate(
    input_ids,
    max_length=100,
    # Sampling must be enabled explicitly; without it decoding is greedy and
    # temperature / top_k / top_p are ignored.
    do_sample=True,
    temperature=0.7,  # Controls how creative the model is
    top_k=50,  # Restricts the model's choices to the top_k options
    top_p=0.95,  # Nucleus sampling for more diversity
    repetition_penalty=1.2,  # Reduces repetition
    # GPT-2 defines no padding token; use EOS explicitly as the pad id.
    pad_token_id=tokenizer.eos_token_id,
)


In [None]:
# Decode the first returned sequence back into text, skipping special tokens, and print it.
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)