# Klassifikation von Spam Mails und E-Maildaten


### Imports

In [1]:
import pandas as pd
import openai
import time
import replicate
import os
import tiktoken
import toml

### Tokens und Kosten

In [2]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Return how many tokens `string` is encoded into.

    Despite its name, `encoding_name` is passed to
    `tiktoken.encoding_for_model`, so callers must supply a model name
    (this notebook passes values such as "gpt-3.5-turbo").
    """
    tokens = tiktoken.encoding_for_model(encoding_name).encode(string)
    return len(tokens)


In [3]:
def kostenrechner(anzahl_input, anzahl_output, model=None, elapsed_seconds=None):
    """Estimate the cost of a run for one of the supported models.

    The two gpt models are priced per 1000 input and output tokens; the
    Replicate model is priced by accumulated run time instead of tokens.

    Args:
        anzahl_input: Total number of input tokens.
        anzahl_output: Total number of output tokens.
        model: Model identifier to price. When omitted, the notebook-level
            global `llm` is used (the original behaviour).
        elapsed_seconds: Accumulated run time used for the Replicate model.
            When omitted, the notebook-level global `elapsed_time_ges` is
            used (the original behaviour).

    Returns:
        The estimated cost as a number.

    Raises:
        ValueError: If the model identifier is not one of the priced models.
            Previously this case fell through every branch and failed with
            an UnboundLocalError on `kosten`.
    """
    # (input price, output price) per 1000 tokens
    token_prices_per_1k = {
        'gpt-4': (0.03, 0.06),
        'gpt-3.5-turbo': (0.0015, 0.002),
    }
    replicate_model = 'replicate/llama70b-v2-chat:e951f18578850b652510200860fc4ea62b3b16fac280f83ff32282f87bbd2e48'
    replicate_price_per_second = 0.001400

    if model is None:
        model = llm

    if model in token_prices_per_1k:
        price_in, price_out = token_prices_per_1k[model]
        return ((price_in/1000)*anzahl_input)+((price_out/1000)*anzahl_output)

    if model == replicate_model:
        if elapsed_seconds is None:
            elapsed_seconds = elapsed_time_ges
        return elapsed_seconds*replicate_price_per_second

    raise ValueError(f"No pricing known for model {model!r}")

### Keys 

In [4]:
# Load the secrets file (TOML) that holds the API credentials
data = toml.load("../.streamlit/secrets.toml")

# Both tokens are stored in the "API_TOKEN" table of that file
api_tokens = data["API_TOKEN"]

# Replicate token: exported through the process environment
replicate_api_token = api_tokens["replicate_api_token"]
os.environ['REPLICATE_API_TOKEN'] = replicate_api_token

# OpenAI key: set directly on the openai module
openai_api = api_tokens["openai_api"]
openai.api_key = openai_api


In [5]:
# Model used by every classification run below. It is read as a global by
# generate_gpt_response and kostenrechner. Any identifier containing 'gpt'
# is sent to OpenAI; everything else is sent to Replicate.
# Identifiers priced by kostenrechner:
llm='gpt-3.5-turbo' #gpt-3.5-turbo, gpt-4, replicate/llama70b-v2-chat:e951f18578850b652510200860fc4ea62b3b16fac280f83ff32282f87bbd2e48

### Prompt-Generierung

In [6]:
def generate_gpt_response(prompt_input):
    """Send `prompt_input` to the model selected by the global `llm`.

    The global `context` carries the task instructions. For model names
    containing 'gpt' it is sent as the system message of an OpenAI chat
    completion, with `prompt_input` as the user message. For any other
    model the request goes to Replicate as one prompt string built from a
    fixed preamble, `context` and `prompt_input`.

    Returns:
        The model's reply as a single string.
    """
    if 'gpt' in llm:
        chat = [
            {"role": "system", "content": context},
            {"role": "user", "content": prompt_input},
        ]
        completion = openai.ChatCompletion.create(model=llm, messages=chat)
        return completion["choices"][0]["message"]["content"]

    replicate_prompt = (
        "You are a helpful assistant. You do not respond as 'User' or pretend to be 'User'. You only respond once as 'Assistant'. "
        + context + prompt_input + " Assistant: "
    )
    chunks = replicate.run(
        llm,
        input={
            "prompt": replicate_prompt,
            "temperature":0.1,
            "top_p":0.9,
            "max_length":4097,
            "repetition_penalty":1,
        },
    )
    # The result is consumed piece by piece and glued into one string
    return ''.join(chunks)

### Subset

In [7]:
def subset(df, anzahl, cat_label, random_state=None):
    """Draw a random sample of up to `anzahl` rows from every category.

    Args:
        df: pandas DataFrame to sample from.
        anzahl: Maximum number of rows taken per category. Categories with
            fewer rows contribute all of their rows.
        cat_label: Name of the column whose values define the categories.
        random_state: Optional seed forwarded to `DataFrame.sample` so a run
            can be reproduced. The default `None` keeps the original
            unseeded sampling.

    Returns:
        A new DataFrame with the per-category samples concatenated in the
        order `groupby` yields the groups, re-indexed from 0. If `df` has no
        groups, an empty frame with the same columns is returned.
    """
    samples = [
        group.sample(n=min(len(group), anzahl), random_state=random_state)
        for _, group in df.groupby(cat_label)
    ]

    if not samples:
        # pd.concat raises ValueError when given an empty list
        return df.iloc[0:0].reset_index(drop=True)

    return pd.concat(samples).reset_index(drop=True)

### Test News

In [8]:
# Instruction prompt for the news-title test. generate_gpt_response reads it
# through the global `context` and sends it ahead of each title.
context="I will send you titles of newspaper articles in English now, which you should classify. The categories are: [BUSINESS, ENTERTAINMENT, HEALTH, NATION, SCIENCE, SPORTS, TECHNOLOGY, WORLD]. Always respond with the respective category. Your answer should only be one word. \n The Title starts here:"

In [9]:
# Read the semicolon-separated news dataset and keep only the columns at
# positions 0 and 4.
# NOTE(review): later cells access "topic" and "title", so these two
# positions presumably hold those columns - confirm against the CSV header.
df = pd.read_csv(
    '../Data/labelled_newscatcher_dataset.csv', delimiter=';'
).iloc[:, [0, 4]]


In [10]:
# Sample up to 10 news rows per value of the "topic" column
subset_df = subset(df, anzahl=10, cat_label="topic")


In [11]:
# Evaluate the selected model on the sampled news titles: count replies that
# exactly match the true topic and accumulate token counts and request time
# for the cost estimate.
TP = 0                # replies identical to the true label
Ges = len(subset_df)  # number of evaluated rows
ges_len_input = 0     # summed input tokens (only counted for gpt models)
ges_len_responce = 0  # summed output tokens (only counted for gpt models)
elapsed_time_ges = 0  # summed request time; read as a global by kostenrechner

# Token counts are only collected for gpt models; otherwise both sums stay 0
count_tokens = 'gpt' in llm

for text_entry, cat in zip(subset_df['title'], subset_df['topic']):
    if count_tokens:
        # NOTE(review): only the title is tokenized here; the `context`
        # prompt that generate_gpt_response sends along is not counted.
        ges_len_input += num_tokens_from_string(text_entry, llm)

    # Run the model and time the request
    start_time = time.time()
    responce = generate_gpt_response(text_entry)
    elapsed_time = time.time() - start_time
    elapsed_time_ges += elapsed_time

    if count_tokens:
        ges_len_responce += num_tokens_from_string(responce, llm)

    print(f"Class = {cat}, Responce = {responce}, Text = {text_entry}")
    if cat == responce:
        TP+=1

Acc = TP/Ges
kosten = kostenrechner(ges_len_input, ges_len_responce)
print(f"The accuracy of the model is: {Acc} and cost: {kosten} and execution time: {elapsed_time_ges}")


Class = BUSINESS, Responce = BUSINESS, Text = Sterling nears 5-month high vs dollar amid impasse on U.S. relief package
Class = BUSINESS, Responce = BUSINESS, Text = Global Steam Boilers Market for Thermal Power Plant & Oil Refinery, 2020-2025 - ResearchAndMarkets.com
Class = BUSINESS, Responce = BUSINESS, Text = New York Accuses Egg Producer of Price Gouging in Pandemic
Class = BUSINESS, Responce = BUSINESS, Text = Canadian Natural Resources buys Painted Pony for $461 million to boost Western Canada acreage
Class = BUSINESS, Responce = TECHNOLOGY, Text = Facebook, Snap Inc held talks to buy TikTok rival Dubsmash: Report
Class = BUSINESS, Responce = BUSINESS, Text = Real Rates Reversal Hitting Big Trades Including Gold and Nasdaq
Class = BUSINESS, Responce = BUSINESS, Text = Israeli jeweller creates gold and diamond-encrusted mask that costs $1.5 million
Class = BUSINESS, Responce = BUSINESS, Text = Electric Valve Market Challenges and Growth Factor By 2026| VETEC Ventiltechnik GmbH, A

### Test Spam

In [12]:


# Instruction prompt for the spam test. It replaces the news prompt in the
# global `context` that generate_gpt_response reads.
context="Please identify the content that follows the word 'Subject' as either harmless or spam. Reply with 'ham' for harmless and 'spam' for spam. Your answer should only be one word."

In [13]:
# Read the comma-separated spam/ham dataset and keep only the label and the
# mail text.
df_spam = pd.read_csv('../Data/spam_ham_dataset.csv', sep = ",")[["label","text"]]


In [14]:
# Display the reduced spam dataset as the cell output
df_spam

Unnamed: 0,label,text
0,ham,Subject: enron methanol ; meter # : 988291\r\n...
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see..."
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar..."
3,spam,"Subject: photoshop , windows , office . cheap ..."
4,ham,Subject: re : indian springs\r\nthis deal is t...
...,...,...
5166,ham,Subject: put the 10 on the ft\r\nthe transport...
5167,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...
5168,ham,Subject: calpine daily gas nomination\r\n>\r\n...
5169,ham,Subject: industrial worksheets for august 2000...


In [15]:
# Sample up to 10 mails per value of the "label" column
df_spam_subset = subset(df_spam, anzahl=10, cat_label="label")

In [16]:
# Display the sampled spam/ham subset as the cell output
df_spam_subset

Unnamed: 0,label,text
0,ham,"Subject: hpl nom for february 9 , 2001\r\n( se..."
1,ham,Subject: deal 338634 meter 985077\r\nthe above...
2,ham,Subject: fw : intrastate gas\r\nfyi : from leg...
3,ham,Subject: cornhusker\r\ndaren - - - ( re : the ...
4,ham,Subject: fun pics . htm\r\n-\r\n- fun pics . htm
5,ham,"Subject: union carbide , 96002201\r\nfax of co..."
6,ham,"Subject: fw : tufco deal 108058\r\ndarin ,\r\n..."
7,ham,Subject: assistant - realignment of duties\r\n...
8,ham,Subject: goldston outage\r\nper kevin at golds...
9,ham,Subject: point change to tagg deal 589257\r\nf...


In [17]:
# Evaluate the selected model on the sampled mails: count replies that
# exactly match the true label and accumulate token counts and request time
# for the cost estimate.
TP = 0                     # replies identical to the true label
Ges = len(df_spam_subset)  # number of evaluated rows
ges_len_input = 0          # summed input tokens (only counted for gpt models)
ges_len_responce = 0       # summed output tokens (only counted for gpt models)
elapsed_time_ges = 0       # summed request time; read as a global by kostenrechner

# Token counts are only collected for gpt models; otherwise both sums stay 0
count_tokens = 'gpt' in llm
# The mail length is always measured; non-gpt models use the gpt-3.5-turbo tokenizer
tokenizer_model = llm if count_tokens else "gpt-3.5-turbo"

for text_entry, cat in zip(df_spam_subset['text'], df_spam_subset["label"]):
    len_input = num_tokens_from_string(text_entry, tokenizer_model)
    if count_tokens:
        # NOTE(review): this adds the token count of the untruncated mail,
        # even when the text is shortened just below.
        ges_len_input += len_input

    if len_input>4097:
        # Cut characters from the end so that roughly the share
        # (4097/len_input - 0.05) of the text remains
        keep_fraction = (4097/len_input)-0.05
        cut_chars = int((1-keep_fraction)*len(text_entry))
        text_entry = text_entry[:-cut_chars]

    # Run the model and time the request
    start_time = time.time()
    responce = generate_gpt_response(text_entry)
    elapsed_time = time.time() - start_time
    elapsed_time_ges += elapsed_time

    if count_tokens:
        ges_len_responce += num_tokens_from_string(responce, llm)

    print(f"Class = {cat}, Responce = {responce}")
    if cat == responce:
        TP+=1

kosten = kostenrechner(ges_len_input, ges_len_responce)
Acc = TP/Ges
print(f"The accuracy of the model is: {Acc} and cost: {kosten} and execution time: {elapsed_time_ges}")


Class = ham, Responce = ham
Class = ham, Responce = spam
Class = ham, Responce = ham
Class = ham, Responce = ham
Class = ham, Responce = spam
Class = ham, Responce = spam
Class = ham, Responce = ham
Class = ham, Responce = ham
Class = ham, Responce = ham
Class = ham, Responce = ham
Class = spam, Responce = spam
Class = spam, Responce = spam
Class = spam, Responce = spam
Class = spam, Responce = spam
Class = spam, Responce = spam


In [None]:
# Collect up to 5 randomly chosen rows for every class of the spam dataset.
# The grouping key is "label": df_spam only has the columns "label" and
# "text", so the previous key 'Kategorie' raised a KeyError.
dfs = [
    group.sample(min(len(group), 5))
    for _, group in df_spam.groupby('label')
]

# Combine the per-class samples into one DataFrame
subset_spam_df = pd.concat(dfs)