In [10]:
import pandas as pd
from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader, LLMPredictor, ServiceContext, StorageContext, load_index_from_storage

In [7]:
from langchain import OpenAI
import gradio as gr
import os

In [3]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [4]:
api_key = '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'

In [8]:
os.environ["OPENAI_API_KEY"] = api_key

def construct_index(directory_path):
    max_input_size = 4096
    num_outputs = 512
    max_chunk_overlap = 20
    chunk_size_limit = 600

    prompt_helper = PromptHelper(max_input_size, num_outputs, max_chunk_overlap, chunk_size_limit=chunk_size_limit)

    llm_predictor = LLMPredictor(llm=OpenAI(temperature=0.7, model_name="text-davinci-003", max_tokens=num_outputs))

    documents = SimpleDirectoryReader(directory_path).load_data()

    index = GPTSimpleVectorIndex(documents, llm_predictor=llm_predictor, prompt_helper=prompt_helper)

    index.save_to_disk('index.json')

    return index

def chatbot(input_text):
    index = GPTSimpleVectorIndex.load_from_disk('index.json')
    response = index.query(input_text, response_mode="compact")
    return response.response

iface = gr.Interface(fn=chatbot,
                     inputs=gr.inputs.Textbox(lines=7, label="Enter your text"),
                     outputs="text",
                     title="My AI Chatbot")

index = construct_index("docs")
iface.launch(share=True)

AttributeError: module 'gradio' has no attribute 'inputs'

In [11]:
df_sentiment = pd.read_csv('/Users/merlesteffen/Documents/Education/WBS_Coding_School/Bootcamp/Sentiment_In_Session/Final_DF/sentiment_analysis_all.csv')

In [12]:
df = df_sentiment.copy()

In [13]:
df.head()

Unnamed: 0.1,Unnamed: 0,Sitzung,Date,Start,Schluss,Speaker,Text_Spoken,Reactions,Name,Fraktion_x,Position,Wahlperiode,positive,negative,neutral
0,3600000,143,2020-01-30,0 days 09:00:00,0 days 19:01:00,Vizepräsident Wolfgang Kubicki,Der Antrag ist damit abgelehnt.,,Wolfgang Kubicki,FDP,Abgeordnete*r,20.0,0.069543,0.737363,0.193094
1,3600001,143,2020-01-30,0 days 09:00:00,0 days 19:01:00,Vizepräsident Wolfgang Kubicki,Endgültiges Ergebnis Abgegebene Stimmen: 633;d...,,Wolfgang Kubicki,FDP,Abgeordnete*r,20.0,0.493157,0.350404,0.15644
2,3600002,143,2020-01-30,0 days 09:00:00,0 days 19:01:00,Vizepräsident Wolfgang Kubicki,Sybille Benning Dr André Berghegger Melanie Be...,,Wolfgang Kubicki,FDP,Abgeordnete*r,20.0,0.58236,0.243855,0.173785
3,3600003,143,2020-01-30,0 days 09:00:00,0 days 19:01:00,Vizepräsident Wolfgang Kubicki,Dr Reinhard Brandl Sebastian Brehm Heike Brehm...,,Wolfgang Kubicki,FDP,Abgeordnete*r,20.0,0.531848,0.315269,0.152883
4,3600004,143,2020-01-30,0 days 09:00:00,0 days 19:01:00,Vizepräsident Wolfgang Kubicki,Fischer (Karlsruhe Land).,,Wolfgang Kubicki,FDP,Abgeordnete*r,20.0,0.55722,0.229822,0.212958


In [42]:
# Filter speakers with at least 2000 sentences
speaker_counts = df['Name'].value_counts()
selected_speakers = speaker_counts[speaker_counts >= 20000].index
filtered_df = df[df['Name'].isin(selected_speakers)]

In [44]:
selected_speakers

Index(['Petra Pau', 'Claudia Roth', 'Hermann Otto Solms', 'Norbert Lammert',
       'Wolfgang Kubicki', 'Wolfgang Thierse', 'Susanne Kastner',
       'Peter Friedrich', 'Wolfgang Schäuble', 'Ulla Schmidt',
       'Edelgard Bulmahn', 'Gerda Hasselfeldt', 'Thomas Oppermann',
       'Eduard Oswald'],
      dtype='object')

In [45]:
# Combine text for each speaker
speaker_texts = filtered_df.groupby('Name')['Text_Spoken'].apply(' '.join)

In [47]:
# Example: Fine-tuning process (high-level)
tokenizer = GPT2Tokenizer.from_pretrained('gpt3')
model = GPT2LMHeadModel.from_pretrained('gpt3')

OSError: gpt3 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

# Let us train GPT on our speakers

In [11]:
speaker_texts

Name
Agnieszka Brugger           Herr Präsident! Meine Damen und Herren! In kei...
Albert Rupprecht(Weiden)    Sehr geehrte Frau Präsidentin! Liebe Kolleginn...
Albrecht Glaser             Herr Präsident! Meine sehr verehrten Damen und...
Alexander Bonde             Herr Präsident! Verehrte Damen und Herren! Lie...
Alexander Dobrindt          Herr Präsident! Sehr geehrte Damen und Herren!...
                                                  ...                        
Wolfgang Wiehle             Sehr geehrter Herr Präsident! Kolleginnen und ...
Wolfgang Wieland            Frau Staatsekretärin, grüne Parlamentarier sin...
Wolfgang Zöller             Grüß Gott, Herr Präsident! Liebe Kolleginnen u...
Yvonne Magwas               Sehr geehrter Herr Präsident! Liebe Kolleginne...
Özcan Mutlu                 Sehr geehrter Herr Präsident! Liebe Kolleginne...
Name: Text_Spoken, Length: 722, dtype: object

In [12]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW, get_linear_schedule_with_warmup

In [21]:
class SpeakerDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        return {'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten()}

In [22]:
# Dataset preparation
dataset = SpeakerDataset(speaker_texts, tokenizer, max_length=512)
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

In [24]:
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from transformers import get_linear_schedule_with_warmup

In [28]:
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
# Set up optimizer and scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * epochs)

In [29]:
# Function for training one epoch
def train_epoch(model, data_loader, optimizer, device):
    model = model.train()
    total_loss = 0
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = input_ids.clone()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
    return total_loss / len(data_loader)

In [30]:
# Function for evaluating the model
def evaluate_model(model, data_loader, device):
    model = model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = input_ids.clone()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
    return total_loss / len(data_loader)

In [31]:
# Training and validation
epochs = 4  # Choose the number of epochs
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [34]:
from torch.utils.data import DataLoader, random_split

# Assuming you have a 'dataset' variable which is an instance of your custom Dataset class
# Split the dataset into training and validation sets
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create DataLoaders for training and validation sets
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)  # No need to shuffle the validation set

In [35]:
# Set pad_token to eos_token
tokenizer.pad_token = tokenizer.eos_token
for epoch in range(epochs):
    train_loss = train_epoch(model, train_loader, optimizer, device)
    val_loss = evaluate_model(model, val_loader, device)
    print(f"Epoch {epoch + 1}, Train Loss: {train_loss}, Val Loss: {val_loss}")

# Save the model
model.save_pretrained('/Users/merlesteffen/Documents/Education/WBS_Coding_School/Bootcamp/Sentiment_In_Session/training_models_on_politicians')
tokenizer.save_pretrained('/Users/merlesteffen/Documents/Education/WBS_Coding_School/Bootcamp/Sentiment_In_Session/training_models_on_politicians')

Epoch 1, Train Loss: 3.6319022469404265, Val Loss: 3.366922950744629
Epoch 2, Train Loss: 3.5257807882820686, Val Loss: 3.3341230869293215
Epoch 3, Train Loss: 3.496351201359819, Val Loss: 3.3341230869293215
Epoch 4, Train Loss: 3.502487589673298, Val Loss: 3.3341230869293215


('/Users/merlesteffen/Documents/Education/WBS_Coding_School/Bootcamp/Sentiment_In_Session/training_models_on_politicians/tokenizer_config.json',
 '/Users/merlesteffen/Documents/Education/WBS_Coding_School/Bootcamp/Sentiment_In_Session/training_models_on_politicians/special_tokens_map.json',
 '/Users/merlesteffen/Documents/Education/WBS_Coding_School/Bootcamp/Sentiment_In_Session/training_models_on_politicians/vocab.json',
 '/Users/merlesteffen/Documents/Education/WBS_Coding_School/Bootcamp/Sentiment_In_Session/training_models_on_politicians/merges.txt',
 '/Users/merlesteffen/Documents/Education/WBS_Coding_School/Bootcamp/Sentiment_In_Session/training_models_on_politicians/added_tokens.json')

# Using the model

In [39]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained('/Users/merlesteffen/Documents/Education/WBS_Coding_School/Bootcamp/Sentiment_In_Session/training_models_on_politicians')
tokenizer = GPT2Tokenizer.from_pretrained('/Users/merlesteffen/Documents/Education/WBS_Coding_School/Bootcamp/Sentiment_In_Session/training_models_on_politicians')

# Function to generate text
def generate_text(prompt, length=50):
    model.eval()
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    attention_mask = torch.ones(input_ids.shape, device=input_ids.device)
    output = model.generate(input_ids, attention_mask=attention_mask, max_length=length, num_return_sequences=1)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example usage
prompt = "Schreibe einen Text wie Petra Pau."  # Replace with your own prompt
generated_text = generate_text(prompt)
print(generated_text)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Schreibe einen Text wie Petra Pau. Herr Präsident! Liebe Kolleginnen und Kollegen! Ich muss einmal einmal einmal einmal einmal einmal
