In [1]:
import pandas as pd
import numpy as np

In [2]:
meta_df = pd.read_csv('data/MEISD_text.csv')

In [3]:
# For the multilabel classification we use:
columns = ['Utterances', 'dialog_ids', 'intensity', 'intensity2', 'intensity3']
meta_dfs = meta_df[columns].copy()

In [4]:
meta_dfs.head()

Unnamed: 0,Utterances,dialog_ids,intensity,intensity2,intensity3
0,look around you,1,,,
1,say hello to your competition,1,,,
2,eight of you will switch to an easier specialty,1,,,
3,five of you will crack under the pressure,1,,,
4,two of you will be asked to leave,1,,,


In [5]:
meta_dfs.loc[:, 'intensity'] = pd.to_numeric(meta_dfs['intensity'], errors='coerce')
meta_dfs.loc[:, 'intensity2'] = pd.to_numeric(meta_dfs['intensity2'], errors='coerce')
meta_dfs.loc[:, 'intensity3'] = pd.to_numeric(meta_dfs['intensity3'], errors='coerce')


#Zamień wartości na liczby całkowite
meta_dfs['intensity'] = pd.to_numeric(meta_dfs['intensity'], errors='coerce').fillna(0)
meta_dfs['intensity2'] = pd.to_numeric(meta_dfs['intensity2'], errors='coerce').fillna(0)
meta_dfs['intensity3'] = pd.to_numeric(meta_dfs['intensity3'], errors='coerce').fillna(0)

#Zamień wartości zawierające tylko białe znaki lub '`', 'neu', 'po' na NaN
meta_dfs['intensity'] = meta_dfs['intensity'].replace(r'^\s*$', np.nan, regex=True)
meta_dfs['intensity'] = meta_dfs['intensity'].replace(['`', 'neu', 'po'], np.nan)
meta_dfs['intensity2'] = meta_dfs['intensity2'].replace(r'^\s*$', np.nan, regex=True)
meta_dfs['intensity2'] = meta_dfs['intensity2'].replace(['`', 'neu', 'po'], np.nan)
meta_dfs['intensity3'] = meta_dfs['intensity3'].replace(r'^\s*$', np.nan, regex=True)
meta_dfs['intensity3'] = meta_dfs['intensity3'].replace(['`', 'neu', 'po'], np.nan)

#Użyj forward fill, aby uzupełnić brakujące wartości poprzedzającą wartością
meta_dfs['intensity'] = meta_dfs['intensity'].ffill()
meta_dfs['intensity2'] = meta_dfs['intensity2'].ffill()
meta_dfs['intensity3'] = meta_dfs['intensity3'].ffill()

#Usuń znaki niebędące cyframi (np. '`') za pomocą wyrażeń regularnych
meta_dfs['intensity'] = meta_dfs['intensity'].replace(r'\D', '', regex=True).astype(int)  # Usuwa wszystko, co nie jest cyfrą
meta_dfs['intensity2'] = meta_dfs['intensity2'].replace(r'\D', '', regex=True).astype(int)
meta_dfs['intensity3'] = meta_dfs['intensity3'].replace(r'\D', '', regex=True).astype(int)

In [6]:
# Hiperparametr kontrolujący siłę szumu (możesz dostosować)
alpha = 0.2  # Wpływ intensity_2 i intensity_3 na intensity_1
noise_std = 0.1  # Standardowe odchylenie losowego szumu

# Obliczamy szum na podstawie intensity_2 i intensity_3
def add_controlled_noise(row):
    base_intensity = row['intensity']

    # Tworzymy kontrolowany szum na podstawie intensity_2 i intensity_3
    influence = (row['intensity2'] + row['intensity3']) / 2 - 1.5
    random_noise = np.random.normal(0, noise_std)  # Losowy szum

    # Nowa wartość intensity_1
    new_intensity = base_intensity + alpha * influence + random_noise

    # Przycinamy wartości do zakresu [0, 3]
    return np.clip(new_intensity, 0, 3)

# Tworzymy nową kolumnę z zaszumionym intensity_1
meta_dfs['intensity_1_noisy'] = meta_dfs.apply(add_controlled_noise, axis=1)

In [ ]:
import os
import pandas as pd
import numpy as np
import json

In [7]:
meta_dfs

Unnamed: 0,Utterances,dialog_ids,intensity,intensity2,intensity3,intensity_1_noisy
0,look around you,1,0,0,0,0.000000
1,say hello to your competition,1,0,0,0,0.000000
2,eight of you will switch to an easier specialty,1,0,0,0,0.000000
3,five of you will crack under the pressure,1,0,0,0,0.000000
4,two of you will be asked to leave,1,0,0,0,0.000000
...,...,...,...,...,...,...
20012,"oh, that's right, you're a woman and you need ...",1125,1,2,0,0.911023
20013,i'll try again,1125,1,2,0,0.828618
20014,"please, pam, reconsider and have a bagel",1125,1,1,0,0.907747
20015,i have an early lunch,1125,1,2,0,0.903971


In [21]:
first_25_data = []
last_25_data = []

def process_group(group):
    num_rows = len(group)
    quarter_size = max(1, num_rows // 4)

    # First 25%
    first_25 = group.iloc[:quarter_size]
    first_25_avg_intensity = first_25.loc[first_25['intensity_1_noisy'] > 0, 'intensity_1_noisy'].mean()

    # Last 25%
    last_25 = group.iloc[-quarter_size:]
    last_25_avg_intensity = last_25.loc[last_25['intensity_1_noisy'] > 0, 'intensity_1_noisy'].mean()

    # Dodanie średniej jako nowej kolumny
    first_25 = first_25.assign(avg_intensity_noisy=first_25_avg_intensity)
    last_25 = last_25.assign(avg_intensity_noisy=last_25_avg_intensity)

    first_25_data.append(first_25)
    last_25_data.append(last_25)

meta_dfs.groupby('dialog_ids').apply(process_group)

first_25_df = pd.concat(first_25_data).reset_index(drop=True)
last_25_df = pd.concat(last_25_data).reset_index(drop=True)

  meta_dfs.groupby('dialog_ids').apply(process_group)


In [22]:
# Grupowanie z uwzględnieniem średniej intensywności
grouped_first_25 = first_25_df.groupby('dialog_ids').agg({
    'Utterances': ' '.join,
    'avg_intensity_noisy': 'first'
}).reset_index().fillna(0)

grouped_last_25 = last_25_df.groupby('dialog_ids').agg({
    'Utterances': ' '.join,
    'avg_intensity_noisy': 'first'
}).reset_index().fillna(0)

df = grouped_first_25[['dialog_ids', 'Utterances', 'avg_intensity_noisy']].fillna(0)

In [23]:
df.rename(columns={'avg_intensity_noisy':'label'}, inplace=True)

In [24]:
df

Unnamed: 0,dialog_ids,Utterances,label
0,1,look around you say hello to your competition ...,0.000000
1,2,"i'm george o'malley uh, we met at the mixer. y...",1.344341
2,3,seattle is surrounded by water on three sides ...,1.175248
3,4,yes no other reason? just a favor for an old p...,1.178085
4,5,if he doesn't respond to these tests in the ne...,1.571909
...,...,...,...
1119,1121,i was thinking exactly the same thing no. crim...,1.802966
1120,1122,"just to be clear, he backed down an insurance ...",2.286215
1121,1123,"hey, michael, did you fall into a koi pond? i ...",1.119796
1122,1124,"hey, boss, did you find nemo? i can name pixar...",1.005297


In [25]:
df.to_csv('C:/Users/juwieczo/DataspellProjects/meisd_project/datafirst_25_percent.csv', index=False)