# Klassifikation von Spam Mails und E-Maildaten


### Imports

In [1]:
import pandas as pd
import openai
import time
import replicate
import os
import tiktoken
import toml

### Tokens und Kosten

In [2]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Return how many tokens `string` occupies for a given model.

    Note that `encoding_name` is handed to `tiktoken.encoding_for_model`,
    so despite its name it has to be a model name (the callers in this
    notebook pass the value of `llm`), not the name of an encoding.
    """
    tokenizer = tiktoken.encoding_for_model(encoding_name)
    return len(tokenizer.encode(string))


In [3]:
def kostenrechner(anzahl_input,anzahl_output):
    """Estimate the cost of a run for the globally selected model.

    The model is taken from the module-level variable `llm`. For the two
    OpenAI models the cost is derived from the token counts using the
    per-1000-token prices hard-coded below. For the Replicate model the
    token counts are ignored and the cost is the module-level
    `elapsed_time_ges` (accumulated request time in seconds) multiplied
    by a fixed rate.

    Args:
        anzahl_input: Total number of input (prompt) tokens.
        anzahl_output: Total number of output (completion) tokens.

    Returns:
        The estimated cost as a float (currency presumably USD -- confirm
        against the provider price lists).

    Raises:
        ValueError: If `llm` is not one of the models with a known price.
            Previously this case surfaced as an UnboundLocalError.
    """
    if llm == 'gpt-4':
        return ((0.03/1000)*anzahl_input)+((0.06/1000)*anzahl_output)
    if llm == 'gpt-3.5-turbo':
        return ((0.0015/1000)*anzahl_input)+((0.002/1000)*anzahl_output)
    if llm == 'replicate/llama70b-v2-chat:e951f18578850b652510200860fc4ea62b3b16fac280f83ff32282f87bbd2e48':
        # Billed by run time rather than by tokens.
        return elapsed_time_ges*(0.001400)
    raise ValueError(f"No pricing known for model {llm!r}")

### Keys 

In [4]:
# Load the secrets file that holds the API credentials
data = toml.load("../.streamlit/secrets.toml")
_api_tokens = data["API_TOKEN"]

# Replicate token: exported as an environment variable
# (presumably read from there by the replicate client -- confirm)
replicate_api_token = _api_tokens["replicate_api_token"]
os.environ['REPLICATE_API_TOKEN'] = replicate_api_token

# OpenAI key: set directly on the openai module
openai_api = _api_tokens["openai_api"]
openai.api_key = openai_api


In [5]:
# Model used by generate_gpt_response and kostenrechner. Values with a price
# entry in kostenrechner:
#   'gpt-3.5-turbo'
#   'gpt-4'
#   'replicate/llama70b-v2-chat:e951f18578850b652510200860fc4ea62b3b16fac280f83ff32282f87bbd2e48'
llm='gpt-3.5-turbo'

### Prompt-Generierung

In [6]:
def generate_gpt_response(prompt_input):
    """Send `prompt_input` to the globally selected model and return the reply text.

    Reads the module-level `llm` (model identifier) and `context`
    (instruction text). If `llm` contains 'gpt', the request goes to the
    OpenAI chat completion endpoint with `context` as system message and
    `prompt_input` as user message. Otherwise the request goes to
    Replicate with a single prompt string built from a fixed preamble,
    `context` and `prompt_input`; the pieces yielded by `replicate.run`
    are concatenated into one string.
    """
    if 'gpt' not in llm:
        replicate_prompt = "You are a helpful assistant. You do not respond as 'User' or pretend to be 'User'. You only respond once as 'Assistant'. "+ context + prompt_input +" Assistant: "
        pieces = replicate.run(
            llm,
            input={
                "prompt": replicate_prompt,
                "temperature":0.1,
                "top_p":0.9,
                "max_length":4097,
                "repetition_penalty":1,
            },
        )
        return ''.join(pieces)

    chat = [
        {"role": "system", "content": context},
        {"role": "user", "content": prompt_input},
    ]
    completion = openai.ChatCompletion.create(model=llm, messages=chat)
    return completion["choices"][0]["message"]["content"]

### Subset

In [7]:
def subset(df,  anzahl, cat_label, random_state=None):
    """Draw a random sample of at most `anzahl` rows per category.

    Args:
        df: DataFrame to sample from.
        anzahl: Maximum number of rows to draw for each category; groups
            with fewer rows are taken completely.
        cat_label: Name of the column that holds the category.
        random_state: Optional seed forwarded to `DataFrame.sample` so a
            run can be reproduced. The default `None` keeps the previous
            non-deterministic behaviour.

    Returns:
        A new DataFrame with the sampled rows of all categories stacked in
        group order and a fresh 0..n-1 index. If `df` yields no groups
        (e.g. it is empty), an empty DataFrame with the same columns is
        returned instead of letting `pd.concat` fail on an empty list.
    """
    samples = [
        group.sample(min(len(group), anzahl), random_state=random_state)
        for _, group in df.groupby(cat_label)
    ]
    if not samples:
        # pd.concat raises ValueError when given no objects.
        return df.iloc[0:0].reset_index(drop=True)
    return pd.concat(samples).reset_index(drop=True)

### Test News

In [8]:
# Instruction text for the news test. generate_gpt_response reads this global
# and sends it along with every title (as system message for OpenAI models,
# as part of the prompt string for Replicate).
context="I will send you titles of newspaper articles in English now, which you should classify. The categories are: [BUSINESS, ENTERTAINMENT, HEALTH, NATION, SCIENCE, SPORTS, TECHNOLOGY, WORLD]. Always respond with the respective category. Your answer should only be one word. \n The Title starts here:"

In [9]:
# Read the semicolon-separated news dataset and keep only the columns at
# positions 0 and 4 (presumably 'topic' and 'title', which the cells below
# access by name -- confirm against the CSV header).
df = pd.read_csv('../Data/labelled_newscatcher_dataset.csv', delimiter=';').iloc[:, [0, 4]]


In [10]:
# Evaluation sample: at most 20 randomly drawn titles per value of "topic".
subset_df = subset(df,20, "topic")


In [11]:
# Evaluate the selected model on the sampled news titles: count correct
# labels, token usage (OpenAI models only) and accumulated request time.
TP = 0
Ges = len(subset_df)
ges_len_input = 0
ges_len_responce = 0
elapsed_time_ges = 0
is_gpt = 'gpt' in llm
# generate_gpt_response sends `context` as system message with every request,
# so its tokens are part of the input of each call. Any per-message
# formatting overhead of the chat API is not included in this estimate.
context_tokens = num_tokens_from_string(context, llm) if is_gpt else 0
for text_entry, cat in zip(subset_df['title'], subset_df['topic']):
    if is_gpt:
        ges_len_input += context_tokens + num_tokens_from_string(text_entry, llm)
    # Run the model and time the request
    start_time = time.time()
    responce = generate_gpt_response(text_entry)
    elapsed_time = time.time() - start_time
    elapsed_time_ges += elapsed_time
    if is_gpt:
        ges_len_responce += num_tokens_from_string(responce, llm)
    print(f"Class = {cat}, Responce = {responce}, Text = {text_entry}")
    # Leading/trailing whitespace or newlines in the reply must not turn a
    # correct label into a miss.
    if cat == responce.strip():
        TP+=1
# Guard against an empty sample instead of dividing by zero.
Acc = TP/Ges if Ges else 0.0
kosten = kostenrechner(ges_len_input, ges_len_responce)
print(f"The accuracy of the model is: {Acc} and cost: {kosten} and execution time: {elapsed_time_ges}")


Class = BUSINESS, Responce = BUSINESS, Text = If Microsoft buys TikTok, it could be bad news for Google Cloud
Class = BUSINESS, Responce = BUSINESS, Text = Twitter, TikTok held preliminary talks about potential combination: WSJ
Class = BUSINESS, Responce = TECHNOLOGY, Text = World Food Packaging Robotics Market Outlook 2020-2024 - Global Market Forecast to Grow by $ 813.02 Million During 2020-2024
Class = BUSINESS, Responce = BUSINESS, Text = Lead Acid Battery for Energy Storage Market to Hit $9.73 Bn by 2027; Escalating Demand for Efficient Energy Storage Systems Worldwide to Feed Market Growth: Fortune Business Insights™
Class = BUSINESS, Responce = BUSINESS, Text = Uber CEO says service faces temporary shutdown in California over worker reclassification ruling
Class = BUSINESS, Responce = BUSINESS, Text = Global Anti-money Laundering Software Market Size, Share of the Industry Worldwide In 2019, By Country
Class = BUSINESS, Responce = HEALTH, Text = Travel news: Two more cruise line

### Test Spam

In [None]:


# Instruction text for the spam test; replaces the news instruction in the
# global `context` that generate_gpt_response reads.
context="Please identify the content that follows the word 'Subject' as either harmless or spam. Reply with 'ham' for harmless and 'spam' for spam. Your answer should only be one word."

In [None]:
# Read the comma-separated spam/ham dataset and keep only the label and the
# mail text.
df_spam = pd.read_csv('../Data/spam_ham_dataset.csv', sep = ",")[["label","text"]]


In [None]:
# Bare expression: shows the loaded frame as the notebook cell output.
df_spam

In [None]:
# Evaluation sample: at most 10 randomly drawn mails per value of "label".
df_spam_subset = subset(df_spam, 10,"label")

In [None]:
# Bare expression: shows the sampled frame as the notebook cell output.
df_spam_subset

In [None]:
# Evaluate the selected model on the sampled mails: count correct labels,
# token usage (OpenAI models only) and accumulated request time.
TP = 0
Ges = len(df_spam_subset)
ges_len_input = 0
ges_len_responce = 0
elapsed_time_ges = 0
is_gpt = 'gpt' in llm
# For non-OpenAI models the gpt-3.5-turbo tokenizer is used, but only to
# estimate the length for the truncation below.
token_model = llm if is_gpt else "gpt-3.5-turbo"
# generate_gpt_response sends `context` as system message with every request,
# so its tokens are part of the input of each call. Any per-message
# formatting overhead of the chat API is not included in this estimate.
context_tokens = num_tokens_from_string(context, llm) if is_gpt else 0
for text_entry, cat in zip(df_spam_subset['text'], df_spam_subset['label']):
    len_input = num_tokens_from_string(text_entry, token_model)
    if len_input>4097:
        # Too long for the 4097-token limit: drop characters from the end in
        # proportion to the token excess, with an extra 5 % safety margin.
        text_entry = text_entry[:-int((1-((4097/len_input)-0.05))*len(text_entry))]
        # Re-count so the cost reflects the text that is actually sent.
        len_input = num_tokens_from_string(text_entry, token_model)
    if is_gpt:
        ges_len_input += context_tokens + len_input

    # Run the model and time the request
    start_time = time.time()
    responce = generate_gpt_response(text_entry)
    elapsed_time = time.time() - start_time
    elapsed_time_ges += elapsed_time

    if is_gpt:
        ges_len_responce += num_tokens_from_string(responce, llm)
    print(f"Class = {cat}, Responce = {responce}")
    # Leading/trailing whitespace or newlines in the reply must not turn a
    # correct label into a miss.
    if cat == responce.strip():
        TP+=1
kosten = kostenrechner(ges_len_input, ges_len_responce)
# Guard against an empty sample instead of dividing by zero.
Acc = TP/Ges if Ges else 0.0
print(f"The accuracy of the model is: {Acc} and cost: {kosten} and execution time: {elapsed_time_ges}")


In [None]:
# Draw up to 5 random mails per label from the full spam/ham frame.
# df_spam was reduced to the columns "label" and "text" when it was loaded,
# so the grouping key has to be "label"; grouping by 'Kategorie' raised a
# KeyError.
dfs = [
    group.sample(min(len(group), 5))
    for name, group in df_spam.groupby('label')
]

# Combine the per-label samples into one frame
subset_spam_df = pd.concat(dfs)