In [12]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# My own (manually annotated) results.
# Load the table, discard the saved index column and translate the Estonian
# category headers into the English feature names used in the rest of the notebook.
data = (
    pd.read_csv('minu_tulemused.csv')
    .drop(columns=['Unnamed: 0'])
    .rename(
        columns={
            "ese": "belongings",
            "riide_ese": "clothing_items",
            "jook_toit": "edibles",
            "sisse_minna": "enclosures_and_vehicles",
            "geo_lok": "geographical_locations",
            "asutus": "institutions",
            "-": "nonsense_words",
            "org_ür_fir": "organisations_and_events",
            "isikunimi": "persons",
            "peal_seista": "platforms_and_floors",
            "suund": "spatial_directions",
            "piirkond": "spatial_region",
            "ajaperiood": "time_expressions",
            "taim": "vegetative_enclosure",
        }
    )
)
# The rename above turned '-' into 'nonsense_words'; re-create '-' as a duplicate
# because the label-collapsing step further down still reads a column named '-'.
data['-'] = data['nonsense_words']



In [13]:
# Show the annotated table after the column renaming.
data

Unnamed: 0,lemma,0,1,?,nonsense_words,time_expressions,institutions,belongings,geographical_locations,edibles,persons,organisations_and_events,spatial_region,clothing_items,platforms_and_floors,enclosures_and_vehicles,spatial_directions,vegetative_enclosure,-
0,Nõugaste,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,turbaauk,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,lillepaviljon,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,transpordikomisjon,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,turismihooaeg,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496,ovaalsaal,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
497,softivahetus,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
498,Elbrus,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
499,dj,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [14]:
# Collapses the columns -, 0, 1 and ? into a single column.
def märgendaKoht(rida):
    """Return the name of the first label column whose value in `rida` equals 1.

    The columns are checked in the fixed order '-', '?', '0', '1', so when
    several of them are set the earliest one in that order wins.

    Parameters:
        rida: a mapping-like row (e.g. a DataFrame row) indexable by the
            column names '-', '?', '0' and '1'.

    Returns:
        The matching column name as a string, or the literal string 'NaN'
        (not a missing value) when none of the four columns equals 1.
    """
    label_order = ('-', '?', '0', '1')
    # The generator is lazy: columns after the first hit are never read.
    return next((name for name in label_order if rida[name] == 1), 'NaN')

In [15]:
# Add the single 'koht' label column (one märgendaKoht call per row), then
# remove the four indicator columns it was derived from.
data = (
    data
    .assign(koht=lambda tabel: tabel.apply(märgendaKoht, axis=1))
    .drop(columns=['-', '0', '1', '?'])
)


In [16]:
# Show the table with the four label columns collapsed into 'koht'.
data

Unnamed: 0,lemma,nonsense_words,time_expressions,institutions,belongings,geographical_locations,edibles,persons,organisations_and_events,spatial_region,clothing_items,platforms_and_floors,enclosures_and_vehicles,spatial_directions,vegetative_enclosure,koht
0,Nõugaste,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
1,turbaauk,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
2,lillepaviljon,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
3,transpordikomisjon,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,turismihooaeg,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496,ovaalsaal,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
497,softivahetus,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
498,Elbrus,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
499,dj,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [17]:
# Features: the 14 semantic-category indicator columns of the annotated table
# ('lemma' and the label column 'koht' are deliberately left out).
feature_cols = ['nonsense_words', 'time_expressions', 'institutions', 'belongings',
                'geographical_locations', 'edibles', 'persons',
                'organisations_and_events', 'spatial_region',
                'clothing_items', 'platforms_and_floors', 'enclosures_and_vehicles',
                'spatial_directions', 'vegetative_enclosure']

X = data[feature_cols]
y = data['koht']  # target: the collapsed label built by märgendaKoht ('-', '?', '0', '1' or 'NaN')

# Hold out part of the rows to sanity-check the fit (default split size, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Seed the forest as well: without random_state the trained model, and therefore
# every label predicted with it further down, can differ from run to run.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Evaluate on the held-out rows
print(f"Accuracy on test set: {rf.score(X_test, y_test):.2f}")

Accuracy on test set: 1.00


In [18]:
# GPT-3.5: load its category assignments and label every row with the trained forest.
df_GPT35 = pd.read_csv('GPT3_5_tulemused.csv')
# Swap the 0/1 coding of 'nonsense_words'; any other value (e.g. NaN) passes through unchanged.
# NOTE(review): presumably the GPT files code this column inversely to the gold table — confirm.
df_GPT35['nonsense_words'] = df_GPT35['nonsense_words'].apply(lambda x: 1 if x == 0 else 0 if x == 1 else x)
# Only rows with a complete feature vector can be passed to the classifier.
X_pred = df_GPT35[feature_cols].dropna()
# Keep exactly those rows; take an explicit copy so the column assignment below
# writes to an independent frame instead of a slice of the loaded table.
df_GPT35 = df_GPT35.loc[X_pred.index].copy()
df_GPT35['koht'] = rf.predict(X_pred)

# To persist the labelled table:
#   df_GPT35.to_csv('predictions_with_rf_labels.csv', index=False)


In [19]:
# GPT-4: load its category assignments and label every row with the trained forest.
df_GPT4 = pd.read_csv('GPT4_0_tulemused.csv')
# Swap the 0/1 coding of 'nonsense_words'; any other value (e.g. NaN) passes through unchanged.
# NOTE(review): presumably the GPT files code this column inversely to the gold table — confirm.
df_GPT4['nonsense_words'] = df_GPT4['nonsense_words'].apply(lambda x: 1 if x == 0 else 0 if x == 1 else x)
# Only rows with a complete feature vector can be passed to the classifier.
X_pred = df_GPT4[feature_cols].dropna()
# Keep exactly those rows; take an explicit copy so the column assignment below
# writes to an independent frame instead of a slice of the loaded table.
df_GPT4 = df_GPT4.loc[X_pred.index].copy()
df_GPT4['koht'] = rf.predict(X_pred)

In [20]:
def kontrolliKasOnKoht(df_gpt, kontrollitav_lemma, õige_veerg, mudel):
    """Check whether a GPT table assigns the expected 'koht' label to a lemma.

    Parameters:
        df_gpt: DataFrame with at least the columns 'lemma' and 'koht'.
        kontrollitav_lemma: the lemma to look up in df_gpt['lemma'].
        õige_veerg: the expected (gold) label.
        mudel: model name; accepted for the callers' sake but not used here.

    Returns:
        1 if the first row whose lemma matches carries the expected label,
        0 if that row's label differs or the lemma is not present at all.
    """
    matches = df_gpt.loc[df_gpt['lemma'] == kontrollitav_lemma]
    if matches.empty:
        return 0
    # Only the first occurrence of the lemma is compared.
    return 1 if matches.iloc[0]['koht'] == õige_veerg else 0

In [21]:
# Share of gold-standard rows for which each GPT table carries the same 'koht' label.
# A lemma that is absent from a GPT table counts as a miss (the check returns 0).
õigeid_35 = 0
õigeid_4 = 0
kokku = 0

for _, rida in data.iterrows():
    lemma, oodatud = rida["lemma"], rida['koht']
    õigeid_35 += kontrolliKasOnKoht(df_GPT35, lemma, oodatud, '3.5')
    õigeid_4 += kontrolliKasOnKoht(df_GPT4, lemma, oodatud, '4o')
    kokku += 1

täpsus_35, täpsus_4 = õigeid_35 / kokku, õigeid_4 / kokku

print(f"GPT3.5 koha täpsus: {täpsus_35}")
print(f"GPT4o koha täpsus: {täpsus_4}")

GPT3.5 koha täpsus: 0.5109780439121756
GPT4o koha täpsus: 0.6287425149700598


In [22]:
from sklearn.metrics import precision_score, recall_score, f1_score

def get_predictions_and_labels(df_gpt, data):
    """Pair every gold label in `data` with the label found for the same lemma in `df_gpt`.

    Parameters:
        df_gpt: DataFrame with columns 'lemma' and 'koht' (the labels to evaluate).
        data: DataFrame with columns 'lemma' and 'koht' (the gold labels).

    Returns:
        (y_true, y_pred): two lists with one entry per row of `data`, in row
        order. y_pred holds the 'koht' of the first df_gpt row with the same
        lemma, or the placeholder "PUUDU" when the lemma does not occur there.
    """
    gold_labels, found_labels = [], []

    for _, gold_row in data.iterrows():
        lemma, gold = gold_row["lemma"], gold_row['koht']
        gold_labels.append(gold)

        candidates = df_gpt.loc[df_gpt['lemma'] == lemma]
        # "PUUDU" marks a lemma for which df_gpt has no row at all.
        found_labels.append("PUUDU" if candidates.empty else candidates.iloc[0]['koht'])

    return gold_labels, found_labels

# Gold vs. predicted 'koht' labels for both models, aligned row by row with `data`.
y_true_35, y_pred_35 = get_predictions_and_labels(df_GPT35, data)
y_true_4, y_pred_4 = get_predictions_and_labels(df_GPT4, data)

# Accuracy (täpsus): share of rows where the predicted label equals the gold label.
täpsus_35 = sum(1 for t, p in zip(y_true_35, y_pred_35) if t == p) / len(y_true_35)
täpsus_4 = sum(1 for t, p in zip(y_true_4, y_pred_4) if t == p) / len(y_true_4)

# Precision, recall and F1, macro-averaged over the label classes;
# zero_division=0 scores an undefined per-class value as 0 instead of warning.
# NOTE(review): a "PUUDU" placeholder in y_pred becomes an extra class in the
# macro average — confirm this is intended.
precision_35 = precision_score(y_true_35, y_pred_35, average='macro', zero_division=0)
recall_35 = recall_score(y_true_35, y_pred_35, average='macro', zero_division=0)
f1_35 = f1_score(y_true_35, y_pred_35, average='macro', zero_division=0)

precision_4 = precision_score(y_true_4, y_pred_4, average='macro', zero_division=0)
recall_4 = recall_score(y_true_4, y_pred_4, average='macro', zero_division=0)
f1_4 = f1_score(y_true_4, y_pred_4, average='macro', zero_division=0)

# Print results (the precision label previously read "Precison:" for GPT3.5)
print("GPT3.5:")
print("Täpsus (accuracy):", täpsus_35)
print("Precision:", precision_35)
print("Recall:", recall_35)
print("F1-score:", f1_35)

print("\nGPT4o:")
print("Täpsus (accuracy):", täpsus_4)
print("Precision:", precision_4)
print("Recall:", recall_4)
print("F1-score:", f1_4)


GPT3.5:
Täpsus (accuracy): 0.5109780439121756
Precison: 0.3814186811939621
Recall: 0.3636352053743358
F1-score: 0.3311140954312495

GPT4o:
Täpsus (accuracy): 0.6287425149700598
Precision: 0.4002665245202559
Recall: 0.41781747238268974
F1-score: 0.3815673126805058


In [52]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Base estimator for the hyper-parameter search. It gets its own name so the
# already fitted `rf` used by the GPT labelling cells is not replaced by an
# unfitted model (GridSearchCV fits clones, never the object passed in).
base_rf = RandomForestClassifier(random_state=42)

# Parameter grid to search: 3 x 3 x 3 = 27 candidates, all with balanced class weights
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [None, 10, 20],
    'min_samples_leaf': [1, 3, 5],
    'class_weight': ['balanced']
}

# Set up the grid search
grid_search = GridSearchCV(
    estimator=base_rf,
    param_grid=param_grid,
    cv=3,  # 3-fold cross-validation
    n_jobs=-1,  # use all CPU cores for faster training
    verbose=2   # print progress
)

# Search on the training split only; the test split stays untouched
grid_search.fit(X_train, y_train)

# Best parameters found
print("Best parameters:", grid_search.best_params_)

# Model refitted on the whole training split with the best parameters
best_rf = grid_search.best_estimator_

# Predictions of the tuned model for the held-out rows
y_pred = best_rf.predict(X_test)


Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best parameters: {'class_weight': 'balanced', 'max_depth': None, 'min_samples_leaf': 1, 'n_estimators': 100}
