## Modules import

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from matplotlib import pyplot as plt
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from sklearn.neighbors import KNeighborsClassifier
from time import time

In [40]:
from sklearn.linear_model import LogisticRegression

## Data import

In [2]:
data = pd.read_csv('lab2_oil_gas_field_construction_data.csv')
marked_data = data[((data.generalized_work_class.notna())) & (data.global_work_class.notna())]

embeddings_train = pd.read_csv('x_train.csv', index_col="Unnamed: 0")
embeddings_val = pd.read_csv('x_val.csv', index_col="Unnamed: 0")
embeddings_test = pd.read_csv('x_test.csv', index_col="Unnamed: 0")

In [3]:
test = pd.read_excel("lab2_test_dataset.xlsx")

In [4]:
test = test[~test["generalized_work_class"].isna()]
test = test[~test["global_work_class"].isna()]
test = test[~test["work_name"].isna()]
test = test[~test["index"].isna()]

In [19]:
data.loc[464317]

work_name                 монтаж кипиа и зра с электроприводом
generalized_work_class                                     NaN
global_work_class                                          NaN
upper_works                                                NaN
Name: 464317, dtype: object

In [91]:
len(test)

197881

In [5]:
test["index"] = test["index"].astype(int)
test.set_index(test["index"], inplace=True)
test.drop(columns=["index"], inplace=True)
test.head(2)

Unnamed: 0_level_0,work_name,generalized_work_class,global_work_class
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
507695,"монтаж шаровых кранов, дроссельной шайбы, запо...",Монтаж мк,Монтаж мк
464317,монтаж кипиа и зра с электроприводом,Монтаж ЗРА,Монтаж


In [6]:
embeddings_test = embeddings_test.loc[test.index]
len(embeddings_test)

197881

In [7]:
embeddings_train_all = pd.concat([embeddings_train, embeddings_val])

In [105]:
embeddings_train_all.tail(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
608828,0.0892,0.009223,-0.01907,-0.003863,-0.082208,0.064999,0.041418,0.081321,0.058563,0.048179,...,-0.076867,0.009255,0.016595,0.10606,0.001887,0.025737,-0.000173,-0.012725,-0.03737,0.060899
699546,0.09023,-0.012241,-0.046547,0.015321,-0.067068,0.079483,-0.094171,0.037245,0.058103,0.022777,...,-0.057203,-0.007346,0.027031,0.079532,-0.032168,0.055671,0.050986,-0.029939,-0.038647,0.056857


In [106]:
embeddings_val.tail(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
608828,0.0892,0.009223,-0.01907,-0.003863,-0.082208,0.064999,0.041418,0.081321,0.058563,0.048179,...,-0.076867,0.009255,0.016595,0.10606,0.001887,0.025737,-0.000173,-0.012725,-0.03737,0.060899
699546,0.09023,-0.012241,-0.046547,0.015321,-0.067068,0.079483,-0.094171,0.037245,0.058103,0.022777,...,-0.057203,-0.007346,0.027031,0.079532,-0.032168,0.055671,0.050986,-0.029939,-0.038647,0.056857


In [107]:
embeddings_train.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
225625,0.094319,-0.022503,-0.051255,0.020912,-0.077827,0.074578,-0.0882,0.032821,0.045553,0.025431,...,-0.064423,-0.016055,0.039017,0.074969,-0.019998,0.064222,0.051063,-0.027311,-0.024215,0.051081
238367,-0.043785,-0.008877,0.012008,0.06917,-0.061046,0.04538,-0.021549,0.034831,0.044076,0.008401,...,0.000976,-0.038714,-0.008661,0.057099,-0.018799,0.072227,0.054251,-0.022674,-0.061255,0.028011


In [48]:
embeddings_train_all.tail(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
608828,0.0892,0.009223,-0.01907,-0.003863,-0.082208,0.064999,0.041418,0.081321,0.058563,0.048179,...,-0.076867,0.009255,0.016595,0.10606,0.001887,0.025737,-0.000173,-0.012725,-0.03737,0.060899
699546,0.09023,-0.012241,-0.046547,0.015321,-0.067068,0.079483,-0.094171,0.037245,0.058103,0.022777,...,-0.057203,-0.007346,0.027031,0.079532,-0.032168,0.055671,0.050986,-0.029939,-0.038647,0.056857


In [110]:
len(marked_data)

296858

In [111]:
len(embeddings_train_all)

296858

In [46]:
y1_train_raw = marked_data.generalized_work_class.loc[embeddings_train_all.index]
y2_train_raw = marked_data.global_work_class.loc[embeddings_train_all.index]

In [47]:
y1_train_raw

225625       Монтаж теплоизоляции
238367    Заполнение полости свай
357855                  Монтаж мк
270696        Монтаж трубопровода
54329              Засыпка щебнем
                   ...           
253575          Изготовление свай
535045          Монтаж ростверков
474034        Монтаж трубопровода
608828             Гидроиспытания
699546       Монтаж теплоизоляции
Name: generalized_work_class, Length: 296858, dtype: object

## Targets encoding

In [50]:
generalized_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
global_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

In [51]:
y1_train = pd.Series(
    generalized_encoder.fit_transform(
        y1_train_raw.values.reshape(-1, 1)
    ).flatten(),
    index=y1_train_raw.index,
)

y2_train = pd.Series(
    global_encoder.fit_transform(
        y2_train_raw.values.reshape(-1, 1)
    ).flatten(),
    index=y2_train_raw.index,
)

In [52]:
y1_test = pd.Series(
    generalized_encoder.transform(
        test.generalized_work_class.values.reshape(-1, 1)
    ).flatten(),
    index=test.index,
)

y2_test = pd.Series(
    global_encoder.transform(
        test.global_work_class.values.reshape(-1, 1)
    ).flatten(),
    index=test.index,
)

## Testing

In [53]:
def show_metrics1(predictions):
    f1_score_1_micro = f1_score(y1_test, predictions, average='micro')
    f1_score_1_macro = f1_score(y1_test, predictions, average='macro')
    
    show_df = pd.DataFrame(
        {
        'generalized_work_class': [f1_score_1_micro, f1_score_1_macro]
        },
        index=['F1 micro', 'F1 macro']
    ).round(6)
    
    return show_df

In [54]:
def show_metrics2(predictions):
    f1_score_2_micro = f1_score(y2_test, predictions, average='micro')
    f1_score_2_macro = f1_score(y2_test, predictions, average='macro')
    
    show_df = pd.DataFrame(
        {
        'global_work_class': [f1_score_2_micro, f1_score_2_macro]
        },
        index=['F1 micro', 'F1 macro']
    ).round(6)
    
    return show_df

In [55]:
moment1 = time()
knn1 =  KNeighborsClassifier(weights="distance", metric='minkowski', n_neighbors=4)
knn1.fit(embeddings_train_all, y1_train)
moment2 = time()

knn1_preds = knn1.predict(embeddings_test)
moment3 = time()

print(f"Time for training, s: {(moment2-moment1):.2f}")
print(f"Time for testing, s: {(moment3-moment2):.2f}")

Time for training, s: 0.65
Time for testing, s: 168.69


In [57]:
moment1 = time()
knn2 =  KNeighborsClassifier(weights="distance", n_neighbors=4)
knn2.fit(embeddings_train_all, y2_train)
moment2 = time()

knn2_preds = knn2.predict(embeddings_test)
moment3 = time()

print(f"Time for training, s: {(moment2-moment1):.2f}")
print(f"Time for testing, s: {(moment3-moment2):.2f}")

Time for training, s: 0.64
Time for testing, s: 168.63


In [56]:
show_metrics1(knn1_preds)

Unnamed: 0,generalized_work_class
F1 micro,0.99573
F1 macro,0.971533


In [58]:
show_metrics2(knn2_preds)

Unnamed: 0,global_work_class
F1 micro,0.996387
F1 macro,0.9856


## Hypothesis testing

In [59]:
key_pairs_info = marked_data.groupby(["generalized_work_class", "global_work_class"]).size()

keys1, keys2 = [], []
for key1, key2 in list(key_pairs_info.index):
    keys1.append(key1)
    keys2.append(key2)
    
key_pairs_info_idx = pd.DataFrame(list(zip(keys1, keys2)), columns=["generalized_work_class", "global_work_class"])

In [60]:
sur_first_keys = key_pairs_info_idx.generalized_work_class[~key_pairs_info_idx.generalized_work_class.duplicated(keep=False)]

marked_data["Index"] = pd.Series(marked_data.index, index=marked_data.index)
sur_marked_data = marked_data.merge(sur_first_keys, 
                                    on="generalized_work_class").set_index("Index")
sur_keys_data = sur_marked_data[["generalized_work_class", "global_work_class"]].drop_duplicates()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  marked_data["Index"] = pd.Series(marked_data.index, index=marked_data.index)


In [61]:
row1 = pd.Series(
    generalized_encoder.transform(sur_keys_data \
                             .generalized_work_class \
                             .values.reshape(-1, 1)) \
                             .flatten(),
    name="generalized_work_class")

row2 = pd.Series(
    global_encoder.transform(sur_keys_data \
                             .global_work_class \
                             .values.reshape(-1, 1)) \
                             .flatten(),
    name="global_work_class")

sur_keys_encoded = pd.concat([row1, row2], axis=1)
sur_keys_encoded.head(2)

Unnamed: 0,generalized_work_class,global_work_class
0,132.0,54.0
1,157.0,26.0


In [62]:
knn1_preds_series = pd.Series(knn1_preds, index=test.index, name="generalized_work_class")
knn2_preds_series = pd.Series(knn2_preds, index=test.index, name="global_work_class")

In [64]:
knns_def_preds = knn1_preds_series.to_frame()\
            .merge(sur_keys_encoded, on="generalized_work_class", how="left")\
            .set_index(y1_test.index)
knns_def_preds.head(2)

Unnamed: 0_level_0,generalized_work_class,global_work_class
index,Unnamed: 1_level_1,Unnamed: 2_level_1
507695,139.0,34.0
464317,80.0,26.0


In [65]:
knn_idxs = knns_def_preds.global_work_class[knns_def_preds.global_work_class.isna()].index

In [66]:
knns_def_preds.loc[knn_idxs, "global_work_class"] = knn2_preds_series[knn_idxs]

In [67]:
knn_pred_gen = knns_def_preds.generalized_work_class.values
knn_pred_glob = knns_def_preds.global_work_class.values

In [68]:
show_metrics2(knn_pred_glob)

Unnamed: 0,global_work_class
F1 micro,0.996367
F1 macro,0.98568
