# Preprocessing danych

In [None]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


In [None]:
# Price thresholds separating the three target classes:
# below `money` is "cheap", below `debt + money` is "average", the rest is "expensive".
money = 100000
debt = 250000

# Print the value range covered by each class.
for label, low, high in (
    ("cheap: ", 0, money),
    ("average: ", money, debt + money),
    ("expensive: ", debt + money, "inf"),
):
    print(label, low, " - ", high)

# Initial data viewing

In [None]:
# Read the raw training CSV. csv.reader returns every cell as a string, so the
# frame built from it holds text only; numeric conversion happens later.
# newline='' is what the csv module requires so that quoted fields containing
# line breaks are parsed correctly.
with open('zadanie_studenci/train_data.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    data = list(reader)

# First row is the header, the remaining rows are the records.
df = pd.DataFrame(data[1:], columns=data[0])
test_df = pd.read_csv('zadanie_studenci/test_data.csv')

# Cells read through csv.reader are never NaN: a missing value shows up as an
# empty string. Count both so that the report can actually be non-zero.
missing_values = (df.isnull() | df.eq('')).sum()
print(missing_values)

In [None]:
# Summary of the raw frame; every column still holds strings at this point
# because the frame was built from csv.reader output.
df.describe()

In [None]:
# Infer a coarse type for every column: "int" when all values parse as whole
# numbers, "float" when all values parse as numbers, "string" otherwise.
columns_with_types = {}

for column in df.columns:
    parsed = pd.to_numeric(df[column], errors='coerce')

    if parsed.isna().any():
        # At least one value could not be parsed as a number.
        inferred = "string"
    elif (parsed == parsed.astype(int)).all():
        inferred = "int"
    else:
        inferred = "float"

    print(f"{column} has type: {inferred}")
    columns_with_types[column] = inferred


In [None]:
# Display the inferred column -> type mapping.
columns_with_types

In [None]:
# Split the column names by inferred type: "int"/"float" columns are numerical,
# everything else is treated as categorical. Column order is preserved.
numeric_kinds = ("int", "float")

numerical_columns = [
    name for name, kind in columns_with_types.items() if kind in numeric_kinds
]
categorical_columns = [
    name for name, kind in columns_with_types.items() if kind not in numeric_kinds
]

print("Numerical columns: ", numerical_columns)
print("Categorical columns: ", categorical_columns)

In [None]:
# Convert the columns detected as numerical from strings to numbers;
# errors='coerce' turns anything unparsable into NaN instead of raising.
df[numerical_columns] = df[numerical_columns].apply(pd.to_numeric, errors='coerce')

In [None]:
# Show how often each distinct value occurs in every categorical column.
for name in categorical_columns:
    frequencies = df[name].value_counts().to_dict()
    print(f"{name} has unique values: {frequencies}")

In [None]:
# Make sure the sale price is an integer column and remember its range
# (printed further down for reference).
sale_price = df['SalePrice'].astype(int)
df['SalePrice'] = sale_price

min_target, max_target = sale_price.min(), sale_price.max()

def categorize_target(value):
    """Map a sale price to its class index.

    Returns 0 when ``value`` is below the module-level ``money`` threshold,
    1 when it is below ``debt + money``, and 2 otherwise.
    """
    if value < money:
        return 0
    return 1 if value < debt + money else 2

# Replace the raw price with the class label used as the training target.
df['price_category'] = df['SalePrice'].apply(categorize_target)
df = df.drop(columns='SalePrice')

# Class distribution of the new target.
target_counts = df['price_category'].value_counts().to_dict()

for caption, shown in (
    ("Target counts: ", target_counts),
    ("Max target: ", max_target),
    ("Min target: ", min_target),
):
    print(caption, shown)

# With this knowledge we start data preprocessing

- The plan: for now, leave the numerical values as they are and just convert the data to ints
- for columns TimeToBusStop and TimeToSubway, do index encoding, as I think it will actually be good for the model to have a correlation that 10-15 minutes is closer to 5-10 mins than to 0-5 mins
- turn year built into years old

# Label encoding

In [None]:
# Positions of the label-encoded columns in the CSV header row.
header = data[0]
idx_bus_stop = header.index("TimeToBusStop")
idx_subway = header.index("TimeToSubway")
idx_heating = header.index("HeatingType")
idx_apt_manage = header.index("AptManageType")

# Ordinal codes: a shorter walking time gets a larger integer.
bus_stop_mapping_str_to_int = {"0~5min": 3, "5min~10min": 2, "10min~15min": 1}
subway_mapping_str_to_int = {"no_bus_stop_nearby": 0, "0-5min": 4, "5min~10min": 3, "10min~15min": 2, "15min~20min": 1}
# Binary codes for the two-valued columns.
heat_mapping_str_to_int = {"individual_heating": 1, "central_heating": 0}
apt_mapping_str_to_int = {"management_in_trust": 1, "self_management": 0}

# Reverse lookups (code -> original label).
bus_stop_mapping_int_to_str = {code: label for label, code in bus_stop_mapping_str_to_int.items()}
subway_mapping_int_to_str = {code: label for label, code in subway_mapping_str_to_int.items()}
heat_mapping_int_to_str = {code: label for label, code in heat_mapping_str_to_int.items()}
apt_mapping_int_to_str = {code: label for label, code in apt_mapping_str_to_int.items()}

label_encodings = {
    'TimeToBusStop': bus_stop_mapping_str_to_int,
    'TimeToSubway': subway_mapping_str_to_int,
    'HeatingType': heat_mapping_str_to_int,
    'AptManageType': apt_mapping_str_to_int,
}

# Apply each mapping; labels that are not in a mapping become NaN.
for encoded_name, mapping in label_encodings.items():
    df[encoded_name] = df[encoded_name].map(mapping)

print(df[list(label_encodings)].head())

for encoded_name in label_encodings:
    print(f"Unique values in {encoded_name}:", df[encoded_name].unique())

In [None]:
# Inspect all columns (numeric and non-numeric) after label encoding.
df.describe(include='all')

In [None]:
# Replace the construction year with the building's age relative to 2015
# (the reference year is hard-coded).
df['YearBuilt'] = 2015 - df['YearBuilt']

In [None]:
# Inspect all columns again after turning YearBuilt into an age.
df.describe(include='all')

In [None]:
# One-hot encode the nominal columns that have no natural ordering.
df = pd.get_dummies(df, columns=['HallwayType', 'SubwayStation'])

# Turn the boolean indicator columns into integer 0/1 columns.
bool_columns = list(df.select_dtypes(include=['bool']).columns)
df = df.astype({name: int for name in bool_columns})

In [None]:
# Inspect all columns after one-hot encoding.
df.describe(include='all')

In [None]:
# 'SalePrice' has been replaced by 'price_category', so it must not be listed
# among the numerical feature columns. The membership check keeps this cell
# safe to re-run: list.remove raises ValueError when the item is already gone.
if 'SalePrice' in numerical_columns:
    numerical_columns.remove('SalePrice')

In [None]:
# Separate the label from the features. Note that `data` is rebound here from
# the raw CSV rows to the feature frame.
target = df.loc[:, 'price_category']
data = df.drop(columns='price_category')

# Numerical features keep the order of `numerical_columns`; the remaining
# columns come back sorted by name from Index.difference.
remaining_columns = data.columns.difference(numerical_columns)
data_num = data.loc[:, numerical_columns]
data_cat = data.loc[:, remaining_columns]

# Bootstrapping

In [None]:
def data_weights(dataframe):
    """Compute per-row sampling weights that favour rare categorical values.

    For every column each row's weight is multiplied by ``1 - f``, where ``f``
    is the relative frequency of the row's value in that column. For the
    label-encoded columns (TimeToBusStop, TimeToSubway, HeatingType,
    AptManageType) only rows whose value equals 1 are down-weighted, by the
    share of such rows.

    Args:
        dataframe: frame of categorical / indicator columns.

    Returns:
        pandas.Series aligned with ``dataframe.index`` and rescaled so that
        its mean is 1. An empty frame yields an empty Series.
    """
    value_one_only_columns = ('TimeToBusStop', 'TimeToSubway', 'HeatingType', 'AptManageType')
    n_rows = len(dataframe)
    weights = np.ones(n_rows)
    if n_rows == 0:
        # Nothing to weight; also avoids dividing by the mean of an empty array.
        return pd.Series(weights, index=dataframe.index)

    for col in dataframe.columns:
        column = dataframe[col]
        if col not in value_one_only_columns:
            # normalize=True already returns relative frequencies, so they
            # must not be divided by the number of rows a second time.
            for value, freq in column.value_counts(normalize=True).items():
                if freq >= 1.0:
                    # A constant column carries no information and would
                    # otherwise zero out every weight.
                    continue
                weights[(column == value).to_numpy()] *= (1 - freq)
        else:
            is_one = (column == 1).to_numpy()
            share = is_one.sum() / n_rows
            if share < 1.0:
                weights[is_one] *= (1 - share)

    weights_series = pd.Series(weights, index=dataframe.index)
    # Rescale so the average weight is exactly 1.
    weights_series = weights_series * (1 / weights_series.mean())
    return weights_series
# NOTE: this rebinds the name `data_weights` from the function to the Series it
# returns; later cells use it as the per-row weight Series.
data_weights = data_weights(data_cat)
# Sanity check: the function rescales the weights by their mean, so this
# should print a value of about 1.
print(data_weights.mean())

In [None]:
# indices = np.arange(len(data))
# t_indices = np.arange(0.8 * len(data)) # 80% for training
# train_indices = np.random.choice(
#     t_indices,
#     size=10000,
#     replace=True,
#     p=data_weights.iloc[t_indices].values / data_weights.iloc[t_indices].values.sum())
# val_indices = np.array([i for i in indices if i not in t_indices])
#
# train_data_num = data_num.iloc[train_indices]
# train_data_cat = data_cat.iloc[train_indices]
# train_target = target.iloc[train_indices]
#
# val_data_num = data_num.iloc[val_indices]
# val_data_cat = data_cat.iloc[val_indices]
# val_target = target.iloc[val_indices]


In [None]:
# Label-encoded columns that are fed to the model as numeric features.
ordinal_columns = ['TimeToBusStop', 'TimeToSubway', 'HeatingType', 'AptManageType']

# Boolean mask: a row goes to the training split when its uniform draw
# exceeds 0.1 (i.e. with probability 0.9); the rest form the validation split.
train_indices = np.random.rand(len(data)) > 0.1
holdout_mask = ~train_indices

# Move the label-encoded columns from the categorical to the numeric frame.
data_num = pd.concat([data_num, data_cat[ordinal_columns]], axis=1)
data_cat = data_cat.drop(columns=ordinal_columns)

train_data_num, val_data_num = data_num[train_indices], data_num[holdout_mask]
train_data_cat, val_data_cat = data_cat[train_indices], data_cat[holdout_mask]
train_target, val_target = target[train_indices], target[holdout_mask]

# normalize the data
- the scaler is fitted on the training split only and then applied unchanged to the validation and test data, so that there is no data leakage

In [None]:
# Learn the min/max of every numeric feature from the training split only,
# then apply that same scaling to both splits.
scaler = MinMaxScaler().fit(train_data_num)
train_data_num = scaler.transform(train_data_num)
val_data_num = scaler.transform(val_data_num)

In [None]:
# Convert the scaled numeric arrays, the one-hot frames and the labels to tensors.
train_data_num = torch.tensor(train_data_num, dtype=torch.float32)
train_data_cat = torch.tensor(train_data_cat.values, dtype=torch.float32)
train_target = torch.tensor(train_target.values, dtype=torch.int64)

val_data_num = torch.tensor(val_data_num, dtype=torch.float32)
val_data_cat = torch.tensor(val_data_cat.values, dtype=torch.float32)
val_target = torch.tensor(val_target.values, dtype=torch.int64)

# Per-row sampling weights restricted to the training rows
# (`train_indices` is a boolean mask over all rows).
sample_weights = torch.tensor(data_weights.iloc[train_indices].values, dtype=torch.float32)

train_dataset = torch.utils.data.TensorDataset(train_data_num, train_data_cat, train_target)
test_dataset = torch.utils.data.TensorDataset(val_data_num, val_data_cat, val_target)

# Draw one epoch's worth of training rows with replacement. The epoch size is
# the number of training rows; len(train_indices) would be the length of the
# boolean mask, i.e. the size of the whole data set including validation rows.
train_sampler = torch.utils.data.WeightedRandomSampler(
    weights=sample_weights,
    num_samples=len(train_dataset),
    replacement=True
)

In [None]:
class FlatClassifier(nn.Module):
    """Three-class MLP over numeric features and embedded one-hot features.

    The one-hot block first passes through a small two-layer "embedding" MLP
    that keeps its width; the result is concatenated with the numeric features
    and fed through three Linear -> BatchNorm -> activation stages (dropout
    after the first two) followed by a 3-logit output layer.

    Args:
        size_1: width of the first hidden layer.
        size_2: width of the second hidden layer; the third has ``size_2 // 2``.
        activation: activation module class (called without arguments).
        dropout: dropout probability used after the first two stages.
        num_features: number of numeric input features. Defaults to the width
            of the module-level ``train_data_num``.
        cat_features: number of one-hot input features. Defaults to the width
            of the module-level ``train_data_cat``.
    """

    def __init__(self, size_1, size_2, activation, dropout,
                 num_features=None, cat_features=None):
        super().__init__()
        # Fall back to the notebook's training tensors when no explicit input
        # widths are given, which is what existing callers rely on.
        if num_features is None:
            num_features = train_data_num.shape[1]
        if cat_features is None:
            cat_features = train_data_cat.shape[1]

        # Attribute names are kept unchanged so that saved state_dicts still load.
        self.emb_layer = nn.Sequential(
            nn.Linear(cat_features, cat_features * 2),
            nn.ReLU(),
            nn.Linear(cat_features * 2, cat_features)
        )
        self.layer1 = nn.Linear(num_features + cat_features, size_1)
        self.bn1 = nn.BatchNorm1d(size_1)
        self.act_1 = activation()
        self.d1 = nn.Dropout(dropout)
        self.layer2 = nn.Linear(size_1, size_2)
        self.bn2 = nn.BatchNorm1d(size_2)
        self.act_2 = activation()
        self.d2 = nn.Dropout(dropout)
        self.layer3 = nn.Linear(size_2, size_2 // 2)
        self.bn3 = nn.BatchNorm1d(size_2 // 2)
        self.act_3 = activation()
        self.output = nn.Linear(size_2 // 2, 3)

    def forward(self, x, cat_x):
        """Return the class logits for numeric batch ``x`` and one-hot batch ``cat_x``."""
        cat_x_embedded = self.emb_layer(cat_x)
        # Join numeric and embedded categorical features along the feature axis.
        x = torch.cat([x, cat_x_embedded], dim=1)
        x = self.d1(self.act_1(self.bn1(self.layer1(x))))
        x = self.d2(self.act_2(self.bn2(self.layer2(x))))
        x = self.act_3(self.bn3(self.layer3(x)))
        return self.output(x)

In [None]:
def get_accuracy(model, data_loader):
    """Return the fraction of correctly classified samples in ``data_loader``.

    Args:
        model: module called as ``model(x, cat_x)`` and returning one row of
            class logits per sample.
        data_loader: iterable of ``(x, cat_x, labels)`` batches.

    Returns:
        Accuracy in [0, 1]; 0.0 when the loader yields no samples.
    """
    # Run on whatever device the model lives on instead of relying on a global.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    correct = 0
    total = 0
    model.eval()
    with torch.no_grad():
        for x, cat_x, labels in data_loader:
            x, cat_x, labels = x.to(model_device), cat_x.to(model_device), labels.to(model_device)
            output = model(x, cat_x)
            # Multi-class prediction: index of the largest logit in each row.
            # (Thresholding the logits at 0 only makes sense for a single
            # binary output and does not match the shape of `labels`.)
            pred = output.argmax(dim=1)
            correct += (pred == labels).sum().item()
            total += labels.shape[0]
    return correct / total if total else 0.0

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(device)

In [None]:
def _evaluate(net, loader, loss_fn=None):
    """Evaluate ``net`` on ``loader`` without tracking gradients.

    Returns ``(mean_batch_loss, accuracy)``. The loss is the unweighted mean
    of the per-batch losses and is ``None`` when ``loss_fn`` is not given or
    the loader is empty; accuracy is 0.0 for an empty loader.
    """
    net.eval()
    batch_losses = []
    correct = 0
    total = 0
    with torch.no_grad():
        for num_batch, cat_batch, label_batch in loader:
            num_batch = num_batch.float().to(device)
            cat_batch = cat_batch.float().to(device)
            label_batch = label_batch.long().to(device)
            logits = net(num_batch, cat_batch)
            if loss_fn is not None:
                batch_losses.append(loss_fn(logits, label_batch).item())
            _, predicted = torch.max(logits, 1)
            total += label_batch.size(0)
            correct += (predicted == label_batch).sum().item()
    mean_loss = np.array(batch_losses).mean() if batch_losses else None
    return mean_loss, (correct / total if total else 0.0)


model = FlatClassifier(64, 32, nn.LeakyReLU, 0).to(device)

num_epochs = 300

# NOTE(review): BatchNorm1d raises in training mode on a batch of size 1;
# consider drop_last=True if the last training batch can end up that small.
train_loader = DataLoader(train_dataset, batch_size=32, sampler=train_sampler)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
total_steps = len(train_loader) * num_epochs

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.0007)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=10)

# Per-epoch history used for plotting.
iters = []
losses = []
val_losses = []
train_acc = []
val_acc = []

# Early stopping on validation accuracy.
best_val_acc = 0
patience = 100
counter = 0

for n in range(num_epochs):
    # --- one pass over the (weighted, resampled) training data ---
    model.train()
    epoch_losses = []
    for num_data, cat_data, target_batch in train_loader:
        num_data = num_data.float().to(device)
        cat_data = cat_data.float().to(device)
        target_batch = target_batch.long().to(device)

        outputs = model(num_data, cat_data)
        # CrossEntropyLoss already returns the batch mean (a scalar).
        loss = criterion(outputs, target_batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_losses.append(loss.item())

    loss_mean = np.array(epoch_losses).mean()

    # --- evaluation: validation loss and accuracy in a single pass ---
    val_loss_mean, test_accuracy = _evaluate(model, test_loader, criterion)
    # Training accuracy is measured on a fresh draw from the weighted sampler.
    _, train_accuracy = _evaluate(model, train_loader)

    # Record all histories together so they stay the same length even when
    # early stopping ends the loop below.
    iters.append(n)
    losses.append(loss_mean)
    val_losses.append(val_loss_mean)
    train_acc.append(train_accuracy)
    val_acc.append(test_accuracy)

    if test_accuracy > best_val_acc:
        best_val_acc = test_accuracy
        counter = 0
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        counter += 1
        if counter >= patience:
            print(f"Early stopping at epoch {n}")
            break

    # NOTE(review): the scheduler is driven by the training loss, not the
    # validation loss; kept as in the original.
    scheduler.step(loss_mean)

    print(f"Epoch {n} train_loss {loss_mean:.3f} val_loss {val_loss_mean:.3f} train_acc: {train_accuracy:.3f} test_acc: {test_accuracy:.3f}")

print("Loading best model from checkpoint...")
model = FlatClassifier(64, 32, nn.LeakyReLU, 0).to(device)
# map_location makes the checkpoint load onto the current device regardless
# of where it was saved.
model.load_state_dict(torch.load('best_model.pth', map_location=device))
model.eval()

_, best_train_accuracy = _evaluate(model, train_loader)
_, best_test_accuracy = _evaluate(model, test_loader)

print("Best Model Training Accuracy: {:.3f}".format(best_train_accuracy))
print("Best Model Validation Accuracy: {:.3f}".format(best_test_accuracy))

In [None]:
# The histories can differ in length, so plot only the common prefix.
max_len = min(map(len, (iters, losses, val_losses, train_acc, val_acc)))
print(f"Number of epochs tracked: {max_len}")

epochs_shown = iters[:max_len]
# (title, y-axis label, training series, validation series) for each panel.
panels = (
    ("Loss Curves", "Loss", losses, val_losses),
    ("Training Curve", "Training Accuracy", train_acc, val_acc),
)

plt.figure(figsize=(10,4))
for position, (panel_title, y_label, train_series, val_series) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.title(panel_title)
    plt.plot(epochs_shown, train_series[:max_len], label="Train")
    plt.plot(epochs_shown, val_series[:max_len], label="Validation")
    plt.xlabel("Iterations")
    plt.ylabel(y_label)
    plt.legend(loc='best')
plt.show()

# Ładujemy zbiór testowy i generujemy predykcje do pliku

In [None]:
# Read the raw test CSV the same way as the training file. csv.reader returns
# every cell as a string; newline='' is what the csv module requires so that
# quoted fields containing line breaks are parsed correctly.
with open('zadanie_studenci/test_data.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    data = list(reader)

# First row is the header, the remaining rows are the records.
df_t = pd.DataFrame(data[1:], columns=data[0])

# Cells read through csv.reader are never NaN: a missing value shows up as an
# empty string, so count both.
missing_values = (df_t.isnull() | df_t.eq('')).sum()


# do the same data preprocessing as in training dataset

In [None]:
# Convert the numerical feature columns of the test frame from strings to
# numbers; errors='coerce' turns anything unparsable into NaN.
df_t[numerical_columns] = df_t[numerical_columns].apply(pd.to_numeric, errors='coerce')

In [None]:
# Label-encode with the same mappings that were used for the training data.
df_t['TimeToBusStop'] = df_t['TimeToBusStop'].map(bus_stop_mapping_str_to_int)
df_t['TimeToSubway'] = df_t['TimeToSubway'].map(subway_mapping_str_to_int)
df_t['HeatingType'] = df_t['HeatingType'].map(heat_mapping_str_to_int)
df_t['AptManageType'] = df_t['AptManageType'].map(apt_mapping_str_to_int)

# Building age relative to 2015, as for the training data.
df_t['YearBuilt'] = 2015 - df_t['YearBuilt']

df_t = pd.get_dummies(df_t, columns=['HallwayType', 'SubwayStation'])

# get_dummies only creates columns for categories present in this file, so the
# test frame can end up with a different one-hot layout than the model was
# trained on. Align it with the training one-hot columns.
# NOTE(review): assumes `data_cat` still holds exactly the one-hot columns of
# the training frame - confirm before reusing this cell elsewhere.
train_dummy_columns = list(data_cat.columns)
for dummy_name in train_dummy_columns:
    if dummy_name not in df_t.columns:
        # Category never occurs in the test file: indicator is all zeros.
        df_t[dummy_name] = 0
unseen_dummy_columns = [
    name for name in df_t.columns
    if name.startswith(('HallwayType_', 'SubwayStation_'))
    and name not in train_dummy_columns
]
# Categories the model has never seen cannot be fed to it.
df_t = df_t.drop(columns=unseen_dummy_columns)

bool_columns = df_t.select_dtypes(include=['bool']).columns.tolist()
df_t[bool_columns] = df_t[bool_columns].astype(int)

# Same column order as the frame the scaler was fitted on:
# numerical columns first, then the four label-encoded columns.
test_data_num = pd.concat([
    df_t[numerical_columns],
    df_t[['TimeToBusStop', 'TimeToSubway', 'HeatingType', 'AptManageType']]
], axis=1)

# Reuse the scaler fitted on the training split (no re-fitting on test data).
test_data_scaled = scaler.transform(test_data_num)

df_t[test_data_num.columns] = test_data_scaled

In [None]:
# Inspect the fully preprocessed test frame.
df_t.describe(include='all')

In [None]:
# Get numerical data
val_data_num = df_t[numerical_columns]

# Add the categorical features that should be treated as numerical
cat_features_to_move = ['TimeToBusStop', 'TimeToSubway', 'HeatingType', 'AptManageType']
val_data_num = pd.concat([val_data_num, df_t[cat_features_to_move]], axis=1)

# Get remaining categorical data (excluding the ones moved to numerical)
val_data_cat = df_t[df_t.columns.difference(numerical_columns)]
val_data_cat = val_data_cat.drop(columns=cat_features_to_move)

# Convert to tensors
val_data_num = torch.tensor(val_data_num.values, dtype=torch.float32)
val_data_cat = torch.tensor(val_data_cat.values, dtype=torch.float32)

test_data = torch.utils.data.TensorDataset(val_data_num, val_data_cat)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

model.eval()
predictions = []
with torch.no_grad():
    for num_data, cat_data in test_loader:
        num_data = num_data.float().to(device)
        cat_data = cat_data.float().to(device)
        outputs = model(num_data, cat_data)
        _, predicted = torch.max(outputs.data, 1)
        predictions.extend(predicted.cpu().numpy())

In [None]:
# Quick look at the first ten predicted class indices.
print(predictions[:10])

# save the predictions as a csv file

In [None]:
# Write one predicted class per line, without a header row.
with open('zadanie_studenci/predictions.csv', 'w', newline='') as f:
    csv.writer(f).writerows([pred] for pred in predictions)

# Co jeszcze dodać?
- koniec notebooka lab4 - wagi do przykładów
- inny preprocessing danych, jak się będzie chciało
- model nie przekopiowany z notebooka z labów
- eksperymentacja z optymalizatorem
- embedding, jak się będzie chciało
- walka z przeuczeniem: batchnorm, regularyzacja, weight decay itd.