In [None]:
!pip install datasets open3d laspy

In [None]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
from datasets import load_dataset

import torch 
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR # или другой scheduler
from torch.utils.data import Dataset, DataLoader
import wandb

import gc
import laspy
import open3d as o3d

import warnings
warnings.simplefilter(action='ignore')

from tqdm.auto import tqdm
tqdm.pandas()

In [None]:
# Read the box annotations, discard exact duplicate rows and renumber the index from 0.
train_df = (
    pd.read_csv('/kaggle/input/power-line-security-zone-vegetation-detection/train.csv')
    .drop_duplicates()
    .reset_index(drop=True)
)
print('shape: ', train_df.shape)
train_df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the `class` column by integer ids.
le = LabelEncoder()
train_df['class'] = le.fit_transform(train_df['class'])

# id -> original label; the inference cell uses this mapping to decode predicted ids.
classes = dict(enumerate(le.classes_))

In [None]:
def group2box(row):
    """Pack one annotation row into a flat 8-element box vector.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with the keys
            center_x, center_y, center_z, size_x, size_y, size_z, yaw, class.

    Returns:
        np.ndarray with the eight values in exactly that order.
    """
    fields = ('center_x', 'center_y', 'center_z',
              'size_x', 'size_y', 'size_z',
              'yaw', 'class')
    return np.array([row[name] for name in fields])

# Attach the packed box vector to every row, then keep only the file name and that vector.
train_df = train_df.assign(answer=train_df.apply(group2box, axis=1))[['file_name', 'answer']]

In [None]:
# One row per file: `answer` holds an array of that file's box vectors,
# `counts` the number of entries in it.
new_df = (
    train_df
    .groupby('file_name')
    .agg(lambda boxes: np.array(boxes))
    .reset_index()
)
new_df['counts'] = new_df['answer'].map(len)

def reshape_arrays(row):
    """Join a file's per-box vectors into one 2-D array.

    Args:
        row: mapping-like object with `answer` (a sequence of 8-element arrays)
            and `counts` (how many arrays `answer` holds).

    Returns:
        np.ndarray of shape (counts, 8), one box per row.
    """
    n_boxes = row['counts']
    flat = np.concatenate(row['answer'])
    return flat.reshape(n_boxes, 8)
    
# Replace each file's collection of box vectors with a single (counts, 8) array.
new_df['answer'] = new_df.apply(reshape_arrays, axis=1)

In [None]:
ds = load_dataset("Eka-Korn/power_line_lidar_data")


def _read_counts(csv_path):
    """Load a counts CSV as one row per original column.

    The CSV is transposed so that its column names end up in an `index`
    column (with their last 4 characters removed — presumably a file
    extension, verify against the CSV) next to the integer-labelled value
    columns that the transpose produces.
    """
    table = pd.read_csv(csv_path).T.reset_index()
    table['index'] = table['index'].map(lambda name: name[:-4])
    return table


train_counts = _read_counts('/kaggle/input/power-line-counts/train_counts.csv')
test_counts = _read_counts('/kaggle/input/power-line-counts/test_counts.csv')

In [None]:
# DataLoader settings (passed as num_workers / batch_size when the loaders are built).
workers = 4
batch_size = 1  # CustomLoss only reads batch element 0 (`boxes[0]`), so keep this at 1

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
lr = 3e-4  # learning rate handed to Adam
warmup = 20  # NOTE(review): not referenced by any other visible cell
# weight_decay = 1e-5
epochs = 100  # number of passes of the training loop; also scales the scheduler horizon
warmup_steps = 10  # NOTE(review): not referenced by any other visible cell

In [None]:
class CustomDataset(Dataset):
    """Per-file point clouds sliced out of one concatenated table of points.

    Args:
        sub_ds: table supporting `sub_ds[start:end]` and returning something
            `pd.DataFrame` can consume with `x`, `y`, `z` columns.
        counts_df: one row per file with the file name in column `'index'`
            and the row range of that file in the columns labelled `0`
            (start) and `1` (end, exclusive).
        new_df: optional frame with `file_name` and `answer` columns holding
            the boxes of each file. When omitted, items contain points only,
            which makes the class usable for unlabeled data as well.

    Items are produced one file at a time; the number of points (and boxes)
    differs between files, so the default collate function presumably only
    works with `batch_size=1` — TODO confirm before raising the batch size.
    """

    def __init__(self, sub_ds, counts_df, new_df=None):
        self.ds = sub_ds
        self.counts_df = counts_df
        self.new_df = new_df

    def __getitem__(self, i):
        """Return `points` (or `(points, boxes)` when labels were supplied).

        `points` is a float tensor of shape (3, n_points) with rows x, y, z
        and the points ordered by ascending x.

        Raises:
            KeyError: labels were supplied but contain no entry for this file.
        """
        row = self.counts_df.iloc[i]
        # 0 / 1 are column *labels* created by the transpose of the counts CSV.
        start, end = int(row.loc[0]), int(row.loc[1])

        points = pd.DataFrame(self.ds[start:end])[['x', 'y', 'z']]
        points = points.sort_values(by='x').reset_index(drop=True)
        cur_df = torch.tensor(points.values.T, dtype=torch.float)

        if self.new_df is None:
            return cur_df

        matches = self.new_df.loc[self.new_df['file_name'] == row['index'], 'answer']
        if matches.empty:
            # An IndexError raised from __getitem__ would be read as "end of data"
            # by sequence-style iteration, so report the missing label explicitly.
            raise KeyError(f"no boxes found for file {row['index']!r}")
        return cur_df, matches.values[0]

    def __len__(self):
        return len(self.counts_df)

In [None]:
# Wrap each split together with its per-file row ranges; both look boxes up in `new_df`.
# NOTE(review): `new_df` is derived from train.csv only — confirm it also contains
# entries for the files of ds['test'], otherwise the lookup in __getitem__ fails.
train_dataset = CustomDataset(ds['train'], train_counts, new_df)
test_dataset = CustomDataset(ds['test'], test_counts, new_df)

In [None]:
# Both loaders rely on the default collate function (no custom collate_fn is passed).
# NOTE(review): the training loader does not shuffle — confirm this is intended.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    num_workers=workers,
    pin_memory=True,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    num_workers=workers,
    pin_memory=True,
)

In [None]:
class Detector(nn.Module):
    """1-D convolutional detector emitting one box proposal per point-cloud chunk.

    The input sequence is cut into equally long chunks along its last
    dimension; every chunk goes through three Conv1d + ReLU + MaxPool stages,
    is max-pooled over its remaining length and mapped by three linear layers
    to `num_coords + num_classes` numbers (box parameters followed by class
    logits).

    Args:
        input_dim: number of input channels (3 for x, y, z).
        num_coords: number of regressed box parameters per proposal.
        num_classes: number of class logits per proposal.
        alpha: multiplier applied to the requested number of boxes when the
            chunk length is computed, so that about `b * alpha` proposals are
            produced.
    """

    def __init__(self, input_dim, num_coords, num_classes, alpha):
        super().__init__()
        self.alpha = alpha
        # Layer sizes follow the constructor arguments; with (3, 7, 3) this is the
        # 3 -> 32 -> 128 -> 256 convolution stack followed by 256 -> 128 -> 64 -> 10.
        self.conv1 = nn.Conv1d(input_dim, 32, 11)
        self.conv2 = nn.Conv1d(32, 128, 11)
        self.conv3 = nn.Conv1d(128, 256, 11)

        self.pool = nn.MaxPool1d(2)
        self.lin1 = nn.Linear(256, 128)
        self.lin2 = nn.Linear(128, 64)
        # [cx, cy, cz, dx, dy, dz, yaw] + one logit per class
        self.lin3 = nn.Linear(64, num_coords + num_classes)

        self.relu = nn.ReLU()

    def forward(self, xyz, b):
        """Return a (num_chunks, num_coords + num_classes) tensor of proposals.

        Args:
            xyz: tensor whose dim 1 holds the channels and dim 2 the points.
                Chunks are concatenated along dim 0, so this assumes a batch
                of one cloud, i.e. shape (1, input_dim, n_points) — TODO confirm.
            b: number of boxes to budget for; the chunk length is
                `n_points // (b * alpha)`.

        Raises:
            ValueError: `b * alpha` is not positive, or the cloud has fewer
                than `b * alpha` points so the chunk length would be zero.
        """
        budget = b * self.alpha
        if budget <= 0:
            raise ValueError(f"b * alpha must be positive, got {budget}")
        emb_dim = int(xyz.shape[2] // budget)
        if emb_dim < 1:
            raise ValueError(
                f"cloud with {xyz.shape[2]} points is too small to be split into "
                f"{budget} chunks"
            )

        chunks = list(xyz.split(emb_dim, dim=2))
        # The last chunk may be shorter than the rest; drop it so all chunks stack.
        if chunks[-1].shape[-1] != emb_dim:
            chunks.pop()
        x = torch.cat(chunks, dim=0)

        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        x = self.pool(self.relu(self.conv3(x)))

        # Global max over the remaining sequence length -> one 256-vector per chunk.
        x = torch.max(x, 2)[0]
        x = self.relu(self.lin1(x))
        x = self.relu(self.lin2(x))
        x = self.lin3(x)
        return x

In [None]:
class CustomLoss(nn.Module):
    """Greedy-matching detection loss for a single sample.

    Every ground-truth box is assigned, in order, the not-yet-used prediction
    whose box parameters have the smallest SmoothL1 distance to it. The
    regression (SmoothL1) and classification (cross-entropy) losses are then
    computed on the matched predictions only.

    Layout assumptions taken from the slicing below: the last column of a
    ground-truth box is its class id, and the last three columns of a
    prediction are class logits.
    """

    def __init__(self):
        super().__init__()
        self.criterion_reg = nn.SmoothL1Loss()
        self.criterion_class = nn.CrossEntropyLoss()

    def forward(self, boxes, preds):
        """Return `(loss_reg, loss_class)` for one sample.

        Args:
            boxes: ground truth of shape (1, n_boxes, 8); only batch element 0
                is used (batch_size = 1).
            preds: predictions of shape (n_preds, 10).

        Raises:
            ValueError: there are fewer predictions than ground-truth boxes,
                so a one-to-one assignment is impossible.
        """
        # as batch_size = 1
        boxes = torch.as_tensor(boxes[0], device=preds.device)
        target_xyz = boxes[:, :-1].to(torch.float)
        target_class = boxes[:, -1].to(torch.long)

        num_boxes, num_preds = boxes.shape[0], preds.shape[0]
        if num_preds < num_boxes:
            raise ValueError(
                f"need at least {num_boxes} predictions to match every box, got {num_preds}"
            )

        pred_xyz = preds[:, :-3]
        free = list(range(num_preds))  # rows of `preds` still available for matching
        matched = []
        # The assignment itself is not differentiated; only the final losses are.
        with torch.no_grad():
            for target in target_xyz:
                best_pos, best_cost = len(free) - 1, float('inf')
                for pos, row in enumerate(free):
                    cost = self.criterion_reg(pred_xyz[row], target).item()
                    if cost < best_cost:
                        best_pos, best_cost = pos, cost
                matched.append(free.pop(best_pos))

        index = torch.tensor(matched, dtype=torch.long, device=preds.device)
        right_preds = preds.index_select(0, index)

        loss_reg = self.criterion_reg(right_preds[:, :-3], target_xyz)
        loss_class = self.criterion_class(right_preds[:, -3:], target_class)
        return loss_reg, loss_class

In [None]:
# Constructor arguments for Detector.
input_dim = 3  # input channels: x, y, z
num_coords = 7  # cx, cy, cz, dx, dy, dz, yaw
num_classes = 3 
# Detector.forward cuts each cloud into chunks of length n_points // (b * alpha), i.e.
# roughly b * alpha proposals for b boxes; must be >= 1 (presumably so that there are
# at least as many proposals as boxes to match — confirm).
alpha = 1.5
# The scheduler is stepped once per training batch, so its horizon is batches * epochs.
total_steps = len(train_loader) * epochs

In [None]:
# Build the network on the target device, with Adam at the configured learning rate
# (weight decay left at its default) and the matching-based loss.
model = Detector(input_dim, num_coords, num_classes, alpha)
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = CustomLoss().to(device)

In [None]:
# Cosine annealing of the learning rate towards eta_min=0 over `total_steps` scheduler steps.
scheduler = CosineAnnealingLR(optimizer, T_max=total_steps, eta_min=0)

In [None]:
# Start a Weights & Biases run; the training loop logs its per-epoch metrics to it.
wandb_project = "power_line_hack"
wandb_run_name = "custom_detection2"
wandb.init(project=wandb_project, name=wandb_run_name)

In [None]:
for epoch in range(epochs):
    print(f'Epoch: {epoch+1}')

    # ---- training ----
    model.train()
    # [regression, classification] of the most recent batch; NaN until a batch has run.
    train_loss = [float('nan'), float('nan')]
    for xyz, boxes in train_loader:
        # boxes.shape[1] is the number of ground-truth boxes of this file.
        preds = model(xyz.to(device), boxes.shape[1])
        loss_reg, loss_class = criterion(boxes.to(device), preds)
        train_loss = [loss_reg.item(), loss_class.item()]
        loss = loss_reg + loss_class
        print(f'Train Loss. Regression: {train_loss[0]}; Classification: {train_loss[1]}')

        optimizer.zero_grad()
        # Back-propagate the combined loss: inference thresholds on the class
        # probabilities, so the classification logits have to be trained as well.
        loss.backward()
        optimizer.step()
        scheduler.step()  # stepped per batch, matching T_max = batches * epochs

    # ---- validation ----
    model.eval()
    val_loss = [float('nan'), float('nan')]
    with torch.no_grad():
        for xyz, boxes in test_loader:
            preds = model(xyz.to(device), boxes.shape[1])
            loss_reg, loss_class = criterion(boxes.to(device), preds)
            val_loss = [loss_reg.item(), loss_class.item()]
            print(f'Validation Loss. Regression: {val_loss[0]}; Classification: {val_loss[1]}')

    # Only the values of the last batch of each phase are logged ("…/last").
    wandb.log({
            "epoch": epoch,
            "train/regression/last": train_loss[0],
            "train/classification/last": train_loss[1],
            "val/regression/last": val_loss[0],
            "val/classification/last": val_loss[1],
            "lr": optimizer.param_groups[0]["lr"]
        })

In [None]:
# torch.save(model.state_dict(), 'first_model_dict.pth')

## Make predictions

In [None]:
# Convert every file of the test folder into a down-sampled x/y/z CSV
# written to the current working directory as "<original name>.csv".
path = '/kaggle/input/power-line-security-zone-vegetation-detection/test/test'
las_files = os.listdir(path)
for las_file in las_files:
    print(las_file)
    las_filepath = os.path.join(path, las_file)
    las = laspy.read(las_filepath)

    # Coordinates relative to the offset stored in the file header.
    points = las.xyz - las.header.offset

    pcd = o3d.geometry.PointCloud()
    pcd.points = o3d.utility.Vector3dVector(points)
    downpcd = pcd.voxel_down_sample(voxel_size=0.4)

    df = pd.DataFrame(np.asarray(downpcd.points), columns=list('xyz')).assign(file_name=las_file)
    df.to_csv(las_file + '.csv', index=False)

    # Release the large per-file objects before the next file is read.
    del downpcd, points, df
    gc.collect()

In [None]:
output_dir = '/kaggle/working'
# Keep only the CSV files in the output directory. Directories (.virtual_documents,
# wandb) and any other artefacts are skipped by the extension filter instead of being
# removed by name, which raised ValueError when one of them was absent. CSVs that this
# notebook itself writes with relative paths (the counts table, the submission) are
# excluded so a rerun does not feed them back in as point clouds.
# Sorted so that the order — which defines the row offsets computed by make_count —
# is reproducible.
files = sorted(
    os.path.join(output_dir, name)
    for name in os.listdir(output_dir)
    if name.endswith('.csv') and name not in {'counts.csv', 'submision.csv', 'submission.csv'}
)

In [None]:
def make_count(files, out_path='counts.csv'):
    """Write the cumulative row range of every CSV in `files` to `out_path`.

    The files are treated as if they were concatenated in the given order.
    The output CSV has one column per file (named by the file's base name)
    and two rows: the index of the file's first row in the concatenation and
    the index one past its last row.

    Args:
        files: paths of the CSV files, in concatenation order. Files sharing
            a base name overwrite each other's column.
        out_path: where to write the table; defaults to `counts.csv` in the
            current working directory.
    """
    counts = {}
    offset = 0
    for file in files:
        n = len(pd.read_csv(file))
        counts[os.path.basename(file)] = [offset, offset + n]
        offset += n
    pd.DataFrame(counts).to_csv(out_path, index=False)

# Writes counts.csv (relative to the working directory) with the row range of every file in `files`.
make_count(files)

In [None]:
# Bundle the generated CSV files into a single "test" split and upload it.
dataset = load_dataset("csv", data_files={"test": files})
# The access token is read from the HF_TOKEN environment variable so that it never
# appears in the notebook; a missing variable fails fast with a KeyError.
dataset.push_to_hub('power_line_lidar_data_test', token=os.environ['HF_TOKEN'])

In [None]:
# Rebinds `ds` (until here the training dataset) to the test point clouds on the Hub.
# NOTE(review): presumably the dataset pushed by the previous cell — confirm the namespace.
ds = load_dataset("Eka-Korn/power_line_lidar_data_test")

In [None]:
class CustomDataset(Dataset):
    """Per-file point clouds sliced out of one concatenated table of points.

    This definition replaces the `CustomDataset` declared for training earlier
    in the notebook, so it accepts the same optional `new_df` argument: cells
    that build a labelled dataset keep working after this cell has run.

    Args:
        sub_ds: table supporting `sub_ds[start:end]` and returning something
            `pd.DataFrame` can consume with `x`, `y`, `z` columns.
        counts_df: one row per file with the file name in column `'index'`
            and the row range of that file in the columns labelled `0`
            (start) and `1` (end, exclusive).
        new_df: optional frame with `file_name` and `answer` columns holding
            the boxes of each file; omit it for unlabeled data.
    """

    def __init__(self, sub_ds, counts_df, new_df=None):
        self.ds = sub_ds
        self.counts_df = counts_df
        self.new_df = new_df

    def __getitem__(self, i):
        """Return `points`, or `(points, boxes)` when `new_df` was supplied.

        `points` is a float tensor of shape (3, n_points) with rows x, y, z
        and the points ordered by ascending x.

        Raises:
            KeyError: `new_df` was supplied but has no entry for this file.
        """
        row = self.counts_df.iloc[i]
        # 0 / 1 are column *labels* created by the transpose of the counts CSV.
        start, end = int(row.loc[0]), int(row.loc[1])

        points = pd.DataFrame(self.ds[start:end])[['x', 'y', 'z']]
        points = points.sort_values(by='x').reset_index(drop=True)
        cur_df = torch.tensor(points.values.T, dtype=torch.float)

        if self.new_df is None:
            return cur_df

        matches = self.new_df.loc[self.new_df['file_name'] == row['index'], 'answer']
        if matches.empty:
            raise KeyError(f"no boxes found for file {row['index']!r}")
        return cur_df, matches.values[0]

    def __len__(self):
        return len(self.counts_df)

In [None]:
# One row per column of the counts CSV: `index` is the column name without its
# last 4 characters, the integer-labelled columns hold the values of that column.
counts = pd.read_csv('/kaggle/input/counts/counts.csv').T.reset_index()
counts['index'] = counts['index'].str[:-4]

In [None]:
# Unlabeled test clouds, served one file at a time.
test_dataset = CustomDataset(ds['test'], counts)
test_loader = DataLoader(
    test_dataset,
    batch_size=1,
    num_workers=4,
    pin_memory=True,
)

In [None]:
# Passed to the model as its second argument in the inference loop; during training
# that argument was the true number of boxes of the file (boxes.shape[1]).
# NOTE(review): 100 is an unexplained guess (the original comment was a row of
# question marks) — confirm or tune.
b = 100

In [None]:
# Load the fully pickled model object. `map_location` lets a checkpoint saved on a GPU
# be opened on a CPU-only machine; `weights_only=False` is needed because the file holds
# the whole nn.Module, not just a state dict.
# SECURITY: unpickling executes arbitrary code — only load checkpoints you trust.
model = torch.load(
    '/kaggle/input/27nov/pytorch/default/1/first_model.pth',
    map_location=device,
    weights_only=False,
).to(device)

In [None]:
# Softmax over dim 1; the inference loop applies it to the last three columns of the
# model output to turn the class logits into probabilities.
sft = nn.Softmax(dim=1)

In [None]:
# Run the model over every test file and keep the proposals whose most likely
# class has a probability above 0.75.
model.eval()
predicts = []
with torch.no_grad():
    for i, xyz in enumerate(test_loader):
        preds = model(xyz.to(device), b)  # one row per proposal, 10 columns
        out_classes = sft(preds[:, -3:]).cpu().numpy()  # per-proposal class probabilities

        pred_classes, pred_coordinates, mxs = [], [], []
        for j, out_class in enumerate(out_classes):
            mx = max(out_class)
            if not mx > 0.75:
                continue
            pred_classes.append(classes[out_class.argmax()])  # decoded class name
            pred_coordinates.append(preds[j, :-3])            # box parameters (tensor)
            mxs.append(mx)                                    # confidence of the kept class

        entry = {'classes': pred_classes,
                 'coordinates': pred_coordinates,
                 'maximums': mxs,
                 'ind': i}
        predicts.append(entry)

In [None]:
# Print how many proposals passed the confidence threshold for each test file.
for pred in predicts:
    print(len(pred['classes']))
    # print(pred['classes'])

In [None]:
# Load the competition's sample submission; the bare expression displays its column names.
submission = pd.read_csv('/kaggle/input/power-line-security-zone-vegetation-detection/sample_submission.csv')
submission.columns

In [None]:
# `result` collects one column per kept prediction (it is transposed before saving);
# `all_count` is the running label of the next column.
result = pd.DataFrame()
all_count = 0

In [None]:
# Collect every kept prediction as one column {field: value}, labelled by a running
# counter, and attach all of them to `result` in a single step (inserting columns one
# at a time into a DataFrame is slow and fragments it).
# Local names are chosen so that the global `classes` id -> name mapping used by the
# inference cell is not overwritten.
new_columns = {}
for i, pred in enumerate(predicts):
    file_name = counts.iloc[i]['index']
    for class_name, coord_tensor, score in zip(pred['classes'], pred['coordinates'], pred['maximums']):
        coord = coord_tensor.cpu().numpy()
        new_columns[all_count] = {'file_name': file_name,
                                  'center_x': coord[0],
                                  'center_y': coord[1],
                                  'center_z': coord[2],
                                  'size_x': coord[3],
                                  'size_y': coord[4],
                                  'size_z': coord[5],
                                  'yaw': coord[6],
                                  'class': class_name,
                                  'score': score}
        all_count += 1

if new_columns:
    new_block = pd.DataFrame(new_columns)  # rows: fields, columns: prediction labels
    result = new_block if result.empty else pd.concat([result, new_block], axis=1)

In [None]:
# One row per prediction; the former column labels become a 1-based `id` column.
result = result.T.reset_index().rename(columns={'index': 'id'})
result['id'] += 1
# File name corrected from the misspelled 'submision.csv'.
result.to_csv('submission.csv', index=False)

## Visualize data

In [None]:
fig = plt.figure(figsize=(15, 10))
ax = fig.add_subplot(111, projection='3d')

# Plot the first 70000 points. All three coordinates use the same slice; x previously
# used the single element [70000] instead of [:70000].
# NOTE(review): `xyz_df` is not defined in any visible cell — load a point-cloud frame
# with x/y/z columns before running this cell.
ax.scatter(xyz_df['x'][:70000], xyz_df['y'][:70000], xyz_df['z'][:70000], s=1, alpha=0.5)

# ax.set_title(row['class'])
ax.set_xlabel("X")
ax.set_ylabel("Y")
ax.set_zlabel("Z")

plt.show()