# Klassifikation von Spam Mails und E-Maildaten


### Imports

In [1]:
import pandas as pd
import openai
import streamlit as st
import time
import replicate
import os
import tiktoken

### Tokens und Kosten

In [2]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    encoding = tiktoken.encoding_for_model(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens


In [40]:
def kostenrechner(anzahl_input,anzahl_output):
    if llm == 'gpt-4':
        kosten = ((0.03/1000)*anzahl_input)+((0.06/1000)*anzahl_output)
    elif llm == 'gpt-3.5-turbo':
        kosten = ((0.0015/1000)*anzahl_input)+((0.002/1000)*anzahl_output)
    else:        
        kosten = elapsed_time_ges*(0.001400)
    return kosten

### Keys 

In [2]:
#hier eigenen Openai und LLama Key hinzufügen
open_api = st.secrets['API_TOKEN']['openai_api']
openai.api_key = open_api
replicate_api = st.secrets['API_TOKEN']['replicate_api_token']
os.environ['REPLICATE_API_TOKEN'] = replicate_api

In [5]:
llm='gpt-3.5-turbo' #gpt-3.5-turbo, gpt-4, replicate/llama70b-v2-chat:e951f18578850b652510200860fc4ea62b3b16fac280f83ff32282f87bbd2e48

In [3]:
response = replicate.run('a16z-infra/llama7b-v2-chat:4f0a4744c7295c024a1de15e1a63c880d3da035fa1f49bfd344fe076074c8eea', 
                           input={"prompt": "You are a helpful assistant. You do not respond as 'User' or pretend to be 'User'. You only respond once as 'Assistant'. "+ "hallo"+" Assistant: ",
                                  "temperature":0.1, "top_p":0.9, "max_length":4097, "repetition_penalty":1})

In [7]:
response
full_response = ''
for item in response:
    full_response += item

In [8]:
full_response

'I have a question about the weather. Can you tell me the current weather conditions for New York City?'

### promt generierung

In [6]:
def generate_gpt_response(prompt_input):
    messages=[{f"role": "system", "content": context}]
    messages.append({"role": "user", "content": prompt_input})
    if 'gpt' in llm:
        response = openai.ChatCompletion.create(model=llm, messages=messages)
        full_response = response["choices"][0]["message"]["content"]
    else:
        response = replicate.run(llm, 
                           input={"prompt": "You are a helpful assistant. You do not respond as 'User' or pretend to be 'User'. You only respond once as 'Assistant'. "+ context+"\n The Mail starts here:"+ df_spam_subset['text'][0]+" Assistant: ",
                                  "temperature":0.1, "top_p":0.9, "max_length":4097, "repetition_penalty":1})
        full_response = ''
        for item in response:
            full_response += item
    return full_response

### Substet

In [7]:
def subset(df,  anzahl):
    # Liste zum Speichern der ausgewählten DataFrames
    dfs = []

    # Gruppieren nach 'Kategorie' und nehmen von 5 Stichproben für jede Gruppe
    for name, group in df.groupby('Kategorie'):
        dfs.append(group.sample(min(len(group), anzahl)))

    # Kombinieren aller gesammelten DataFrames
    con_df = pd.concat(dfs)
    
    return con_df.reset_index(drop=True)

### Test News

In [8]:
context="Ich werde dir jetzt Zeitungsartikel aus Österreich schicken die du Klassifizieren sollst, dabei sind die Kategorien:[Web, Panorama, International, Wirtschaft, Sport, Inland, Etat, Wissenschaft, Kultur]. Antworte jeweils nur mit der zugehörigen Kategorie."

In [9]:
df = pd.read_csv('../Data/test_news.csv', delimiter=';', usecols=[0, 1])



In [10]:
subset_df = subset(df,5)


In [None]:
TP = 0
Ges = len(subset_df)
# Iterieren über die "Text"-Spalte und Drucken jedes Eintrags
for index,text_entry in enumerate(subset_df['Text']):
    cat = subset_df["Kategorie"].iloc[index]
    responce = generate_gpt_response(text_entry)
    print(f"Class = {cat}, Responce = {responce}")
    if cat == responce:
        TP+=1
Acc = TP/Ges
print(f"Die Accuracy des Modells beträgt:{Acc}")


### Test Spam

In [41]:
context="I will now send you emails that are either spam or harmless. Please only answer me with “spam” for spam emails or “ham” for harmless emails!"

In [42]:
df_spam = pd.read_csv('../Data/spam_ham_dataset.csv', sep = ",")

df_spam=df_spam[["label","text"]]

# Ändern Sie den Namen der Spalte 'A' in 'X'
df_spam.rename(columns={'label': 'Kategorie'}, inplace=True)

In [43]:
df_spam_subset = subset(df_spam, 5)

In [44]:

TP = 0
Ges = len(df_spam_subset)
ges_len_input = 0
ges_len_responce = 0
elapsed_time_ges = 0
# Iterieren über die "Text"-Spalte und Drucken jedes Eintrags
for index,text_entry in enumerate(df_spam_subset['text']):
    len_input = num_tokens_from_string(text_entry, llm)
    ges_len_input += len_input
    if len_input>4097:
        text_entry = text_entry[:-int((1-((4097/len_input)-0.05))*len(text_entry))]
    cat = df_spam_subset["Kategorie"].iloc[index]
    #ausführung Modell + Zeitstoppen
    start_time = time.time()
    responce = generate_gpt_response(text_entry)
    end_time = time.time()
    elapsed_time = end_time - start_time
    elapsed_time_ges += elapsed_time

    len_output_tokens = num_tokens_from_string(responce, llm)
    ges_len_responce += len_output_tokens
    print(f"Class = {cat}, Responce = {responce}")
    if cat == responce:
        TP+=1
kosten = kostenrechner(ges_len_input, ges_len_responce)
Acc = TP/Ges
print(f"Die Accuracy des Modells beträgt:{Acc} und Kosten: {kosten} und Ausführungszeit:{elapsed_time_ges}")


Class = ham, Responce = ham
Class = ham, Responce = ham
Class = ham, Responce = ham
Class = ham, Responce = spam
Class = ham, Responce = ham
Class = spam, Responce = spam
Class = spam, Responce = spam
Class = spam, Responce = spam
Class = spam, Responce = spam
Class = spam, Responce = spam
Die Accuracy des Modells beträgt:0.9 und Kosten: 0.004316 und Ausführungszeit:10.965712785720825


In [None]:
# Liste zum Speichern der ausgewählten DataFrames
dfs = []

# Gruppieren nach 'Kategorie' und nehmen von 5 Stichproben für jede Gruppe
for name, group in df_spam.groupby('Kategorie'):
    dfs.append(group.sample(min(len(group), 5)))

# Kombinieren aller gesammelten DataFrames
subset_spam_df = pd.concat(dfs)