# HW 3

<font color='red'>**DEADLINE: 27.04.2024 23:59**</font>


Run the cells below first.

In [None]:
import numpy as np
import pandas as pd
import scipy.linalg as sla
import matplotlib.pyplot as plt
import time
from tqdm import tqdm
import os
%matplotlib inline
# Fix the seed and the random state so that NumPy-based sampling is reproducible.
# NOTE(review): `seed` is not referenced anywhere else in the visible notebook;
# presumably it is meant for seeding other libraries -- confirm before relying on it.
seed=42
random_state=42
# Seeds NumPy's global random generator, which the dataset generators below draw from.
np.random.seed(random_state)

In [None]:
def order_points_clockwise(pts):
    """Return the rows of ``pts`` sorted by polar angle around their centroid.

    Parameters:
        pts (np.ndarray): Array whose first two columns hold x and y coordinates.

    Returns:
        np.ndarray: The same rows, reordered by ascending ``arctan2`` angle
        measured from the mean point.

    Note:
        Despite the name, ascending angles correspond to a counter-clockwise
        walk when the y axis points up. The direction is irrelevant for the
        convexity check in ``is_convex_quad``, which accepts either turn sign.
    """
    centroid = pts.mean(axis=0)
    offsets = pts - centroid
    # Angle of every point as seen from the centroid, in (-pi, pi].
    polar = np.arctan2(offsets[:, 1], offsets[:, 0])
    ranking = np.argsort(polar)
    return pts[ranking]

def is_convex_quad(points, regime="ordered"):
    """Tell whether four 2-D points form a convex quadrilateral.

    Parameters:
        points: Four (x, y) pairs (anything ``np.array`` can convert).
        regime (str): ``"unordered"`` first sorts the points by angle around
            their centroid (via ``order_points_clockwise``); any other value
            keeps the order in which the points were given.

    Returns:
        bool: True when every non-zero turn along the closed path
        P0 -> P1 -> P2 -> P3 -> P0 has the same sign.

    Note:
        Collinear triples (zero cross product) are ignored, so four collinear
        points are reported as convex.
    """
    quad = np.array(points)
    if regime == "unordered":
        # Sort around the centroid so the answer no longer depends on input order.
        quad = order_points_clockwise(quad)

    corner_count = 4
    turn_signs = []
    for start in range(corner_count):
        p = quad[start]
        q = quad[(start + 1) % corner_count]
        r = quad[(start + 2) % corner_count]
        # z-component of the cross product (q - p) x (r - q): the turn direction at q.
        turn = (q[0] - p[0]) * (r[1] - q[1]) - (q[1] - p[1]) * (r[0] - q[0])
        direction = np.sign(turn)
        if direction != 0:
            turn_signs.append(direction)

    left_turns = sum(1 for direction in turn_signs if direction > 0)
    right_turns = sum(1 for direction in turn_signs if direction < 0)
    return left_turns == len(turn_signs) or right_turns == len(turn_signs)


def generate_quadrilateral_dataset(n_samples=1000, xy_range=(-10, 10), random_state=None, regime="ordered"):
    """Generate random quadrilaterals together with a convexity label.

    Parameters:
        n_samples (int): Number of quadrilaterals to generate.
        xy_range (tuple): (low, high) bounds used for both x and y coordinates.
        random_state (int or None): Seed for NumPy's global random generator.
            ``None`` leaves the generator state untouched; any integer,
            including 0, reseeds it.
        regime (str): Forwarded to ``is_convex_quad`` ("ordered" or "unordered").

    Returns:
        pd.DataFrame: One row per sample with columns x1, y1, ..., x4, y4
        (points in the order they were sampled) and an integer ``label``
        column (1 = convex, 0 = not convex).
    """
    # Compare against None instead of relying on truthiness: 0 is a valid seed.
    if random_state is not None:
        np.random.seed(random_state)

    rows = []
    for _ in range(n_samples):
        points = np.random.uniform(xy_range[0], xy_range[1], size=(4, 2))
        label = int(is_convex_quad(points, regime=regime))
        rows.append(points.flatten().tolist() + [label])

    columns = [f'{coord}{i+1}' for i in range(4) for coord in ['x', 'y']] + ['label']
    return pd.DataFrame(rows, columns=columns)


def plot_pie_charts(y_true, y_pred):
    """Draw one pie chart per true class showing how often it was predicted correctly.

    Parameters:
        y_true: True labels (list, NumPy array or pandas Series; flattened).
        y_pred: Predicted labels, aligned position-by-position with ``y_true``.

    Raises:
        ValueError: If ``y_true`` is empty or the two inputs differ in length.

    For every distinct value in ``y_true`` a pie is drawn with a 'True' slice
    (samples of that class predicted as that class) and a 'False' slice
    (samples of that class predicted as anything else).
    """
    # Flatten to plain arrays so that (N,), (N, 1), lists and pandas Series with
    # arbitrary indexes are all compared positionally.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.size == 0:
        raise ValueError("y_true is empty")
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {y_true.size} and {y_pred.size}"
        )

    labels = np.unique(y_true)

    # squeeze=False keeps `axes` two-dimensional even when only one class is
    # present; otherwise a single Axes object is returned and cannot be indexed.
    _, axes = plt.subplots(1, len(labels), figsize=(18, 6), squeeze=False) # Adjust figure size as needed

    for ax, label in zip(axes[0], labels):
        belongs_to_class = y_true == label
        total_elements = int(np.sum(belongs_to_class))
        true_positives = int(np.sum(belongs_to_class & (y_pred == label)))

        proportions = [true_positives, total_elements - true_positives]

        ax.pie(proportions, labels=['True', 'False'], autopct='%1.1f%%', startangle=90,
               colors=['lightgreen', 'lightcoral'])
        ax.set_title(f"Class {label}")

    plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def plot_polygon_from_row(df, row_index, n_points=4, show_labels=True):
    """
    Draw the closed polygon stored in one dataset row, connecting points in column order.

    Parameters:
        df (pd.DataFrame): Dataset with columns x1, y1, ..., x<n>, y<n> and 'label'.
        row_index (int): Positional index of the row to draw (looked up with ``iloc``).
        n_points (int): How many (x, y) pairs to read from the row.
        show_labels (bool): Whether to annotate the vertices as P1, P2, ...

    The outline is green when the row's label is truthy and red otherwise.
    """
    record = df.iloc[row_index]
    vertices = np.array(
        [(record[f'x{k}'], record[f'y{k}']) for k in range(1, n_points + 1)]
    )
    # Repeat the first vertex at the end so the drawn path is closed.
    outline = np.vstack([vertices, vertices[0]])

    convex_flag = record['label']
    if convex_flag:
        line_color = 'green'
    else:
        line_color = 'red'

    plt.figure(figsize=(10, 10))
    plt.plot(outline[:, 0], outline[:, 1], marker='o', color=line_color, linewidth=2)

    if show_labels:
        # Annotate each original vertex once (the duplicated closing point is skipped).
        for number, (px, py) in enumerate(vertices, start=1):
            plt.text(px, py, f'P{number}', fontsize=12, ha='right')

    plt.title(f"Quadrilateral (Convex: {bool(convex_flag)})", fontsize=14)
    axis = plt.gca()
    axis.set_aspect('equal')
    plt.grid(True)
    plt.show()

# Task 0 (Bonus: 4 points)
At the seminar we discussed the probability that four random points form a convex quadrilateral.

We saw that the probability of the class "convex" is around 27%. However, that was only for a **fixed order of the points**. The updated version below has two regimes:

"ordered" checks the points in the exact order given: (0,0), (1,0), (0,1), (1,1) is labeled as 0, because the edges (1,0) - (0,1) and (1,1) - (0,0) intersect.

"unordered" checks the points with regard to their convex hull, so the previous example is labeled as 1.


As a bonus task we ask you to formally find the exact probabilities in both cases:
## For the "unordered" case (2 points)

## For the "ordered" case (2 points)



## Take a look at examples in the "ordered" regime

In [None]:
# Build a reproducible 1000-sample dataset labeled in the "ordered" regime.
df = generate_quadrilateral_dataset(n_samples=1000, random_state=42, regime="ordered")
# n_points is passed explicitly: a later cell redefines plot_polygon_from_row with a
# default of 5 points, which would make this call fail (no 'x5' column) if the
# cells are re-run out of order.
plot_polygon_from_row(df, n_points=4, row_index=10)

In [None]:
# n_points is explicit so the call keeps working after plot_polygon_from_row is
# redefined further down with a default of 5 points.
plot_polygon_from_row(df, n_points=4, row_index=11)

This is because of the order in which the points are listed: in the "ordered" regime the edges follow that order.

In [None]:
# n_points is explicit so the call keeps working after plot_polygon_from_row is
# redefined further down with a default of 5 points.
plot_polygon_from_row(df, n_points=4, row_index=110)

## Now take a look at the same examples in the "unordered" regime





In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial import ConvexHull

def plot_polygon_from_row(df, row_index, n_points=5, show_labels=True, show_hull=True):
    """
    Plots the points stored in a DataFrame row, optionally with their convex hull.

    Parameters:
        df (pd.DataFrame): The dataset containing flattened point coordinates
            (columns x1, y1, ..., x<n>, y<n>) and a 'label' column.
        row_index (int): Positional index of the row to visualize (uses ``iloc``).
        n_points (int): Number of points stored in the row (default 5). Pass 4
            for datasets produced by generate_quadrilateral_dataset.
        show_labels (bool): Whether to show point indices on the plot.
        show_hull (bool): Whether to display the convex hull of the points.

    Points and hull are green when the row's label is truthy and red otherwise.
    scipy's ConvexHull raises for degenerate input (for example collinear
    points); that error is not handled here.
    """
    row = df.iloc[row_index]
    points = np.array([[row[f'x{i+1}'], row[f'y{i+1}']] for i in range(n_points)])

    # Get label
    is_convex = row['label']
    color = 'green' if is_convex else 'red'

    plt.figure(figsize=(10, 10))

    # Plot all points
    plt.scatter(points[:, 0], points[:, 1], c=color, s=100)

    # Show point labels
    if show_labels:
        for number, (x, y) in enumerate(points, start=1):
            plt.text(x, y, f'P{number}', fontsize=12, ha='right')

    # Plot convex hull
    if show_hull and n_points >= 3:
        hull = ConvexHull(points)
        # For 2-D input hull.vertices lists the hull corners in counter-clockwise
        # order; repeating the first index closes the outline, so every hull edge
        # is drawn exactly once with a single plot call.
        outline = np.append(hull.vertices, hull.vertices[0])
        plt.plot(points[outline, 0], points[outline, 1], color=color, linewidth=2)

    plt.title(f"Polygon (Convex: {bool(is_convex)})", fontsize=14)
    plt.gca().set_aspect('equal')
    plt.grid(True)
    plt.show()

In [None]:
# Same seed and sample count as the "ordered" demo, but labels are computed in
# the "unordered" regime. This rebinds the global `df`.
df = generate_quadrilateral_dataset(n_samples=1000, random_state=42, regime="unordered")
# n_points=4 overrides the 5-point default of the hull-based plot helper.
plot_polygon_from_row(df, n_points=4, row_index=10)

In [None]:
# Row 11 of the current `df`, drawn with the convex hull overlay.
plot_polygon_from_row(df,n_points=4, row_index=11)

In [None]:
# Row 110 of the current `df`, drawn with the convex hull overlay.
plot_polygon_from_row(df, n_points=4, row_index=110)

# Task 1 (10 points)

We would like you to solve the **unordered regime classification problem** for **5 points**. You will need to complete several steps, following the same approach demonstrated during the seminar.

We will evaluate your model's performance using the **F1 score**.

> **Why F1 score?**  
> In imbalanced classification problems, accuracy can be misleading—especially if one class dominates. For example, if 90% of your data belongs to one class, a model that always predicts that class would achieve 90% accuracy, despite having no real predictive power.  
> The **F1 score** provides a better measure of model quality in such cases, as it balances **precision** and **recall**, ensuring that both false positives and false negatives are taken into account.

---

### 🔧 Hints:

1. If your model trains slowly, try using a **GPU** to accelerate the training process.

2. If your model is underperforming:
   - First, try increasing the **hidden size** (i.e., the *width* of the model).
   - Only if that doesn’t help, consider increasing the **number of layers** (i.e., the *depth* of the model).

## Generate data (2.5 points)

In [None]:
def all_points_on_hull(points):
    """Check whether every given point is a vertex of the points' convex hull.

    Parameters:
        points: Sequence of (x, y) pairs (anything ``np.array`` can convert).

    Returns:
        bool: True if the hull computed by scipy's ``ConvexHull`` has as many
        vertices as there are points. Inputs with two or fewer points are
        reported as True without building a hull.
    """
    pts = np.array(points)
    n_pts = len(pts)
    # Too few points to build a hull from; treated as trivially "on the hull".
    if n_pts < 3:
        return True
    return ConvexHull(pts).vertices.size == n_pts


def generate_polygon_dataset(n_samples=10000, n_points=5, xy_range=(-10, 10)):
    """
    Generate a dataset of random n-gons and labels indicating whether they are convex.

    NOTE: this is a template placeholder. The body below is intentionally
    empty and currently returns None; the behaviour described here is the
    intended contract, not what the code does yet.

    Parameters:
        n_samples (int): Number of polygons to generate.
        n_points (int): Number of vertices per polygon.
        xy_range (tuple): Min and max range for x and y coordinates.

    NOTE(review): the template also documented a ``random_state`` (int or
    None) seed, but the signature has no such parameter -- add it when
    implementing if reproducibility is required.

    Returns:
        pd.DataFrame: Dataset with flattened point coordinates and convexity labels.
    """
    # TODO(student): implement the generator; the bare return yields None.
    return



In [None]:
N_samples = 10**6 #you can change it
# Task 1 is stated for 5 points. N_points was used below without ever being
# defined, which raised a NameError; define it once and use it for both the
# generator and the plot so the two always agree.
N_points = 5
df = generate_polygon_dataset(n_samples=N_samples, n_points=N_points)
plot_polygon_from_row(df, n_points=N_points, row_index=11)

In [None]:
labels = #...
# Count the occurrences of each label (0 and 1)
label_counts = {}
for label in labels:
    label_counts[label] = label_counts.get(label, 0) + 1

# Extract labels and counts for the pie chart
labels = list(label_counts.keys())
sizes = list(label_counts.values())

# Create the pie chart
plt.figure(figsize=(6, 6))  # Adjust figure size as needed
plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.title('Distribution of Planarity Labels')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split


# TODO(student): template placeholder -- split the coordinate columns and the
# 'label' column of `df` into train and test parts (train_test_split is
# imported above for this purpose).
X_train, X_test, y_train, y_test = #...

## Create classes `MLP` and `MyDataset` for our problem (2.5 points)

Your task is to implement the `MLP` and `MyDataset` classes to solve the **unordered regime problem** with **5 points**.

In the seminar, we used the following model configuration as a baseline:

- `hidden_size = 256`
- `3 hidden layers`

However, you are encouraged to experiment and choose your own model architecture. Feel free to adjust the **hidden size**, **number of layers**, and **activation functions** to achieve the best possible performance.

In [None]:
import torch
import torch.nn as nn
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Show which device was selected.
print(device)

In [None]:
# Define the MLP model
class MLP(nn.Module):
    # Template placeholder: both methods are intentionally empty and must be
    # implemented before the model can be used.
    def __init__(self, input_size, hidden_size, output_size):
        # TODO(student): call super().__init__() and create the layers here.
        return

    def forward(self, x):
        # TODO(student): return the model output for the batch `x`.
        # NOTE: the training cell uses nn.BCELoss and the evaluation cell
        # thresholds the output at 0.5, so the output is expected to be a
        # probability in [0, 1].
        return


# TODO(student): replace the None placeholders with the real sizes.
input_size = None
hidden_size = None
output_size = None
# Move the model to `device`: the evaluation cell sends every input batch to
# `device`, so a model left on the CPU fails with a device mismatch when CUDA
# is selected. This requires MLP.__init__ to call super().__init__().
mlp = MLP(input_size, hidden_size, output_size).to(device)


In [None]:
from torch.utils.data import Dataset, DataLoader

class MyDataset(Dataset):
    # Template placeholder: all three methods are intentionally empty.
    def __init__(self, X, y):
        # TODO(student): store the features `X` and the labels `y`.
        return
    def __len__(self):
        # TODO(student): return the number of samples.
        return

    def __getitem__(self, idx):
        # TODO(student): return the (features, label) pair at position `idx`;
        # the evaluation cell unpacks every batch as `inputs, labels`.
        return

# Number of samples per batch.
BATCH_SIZE = 32

# Wrap the train and test splits from the train_test_split cell.
train_dataset = MyDataset(X_train, y_train)
test_dataset = MyDataset(X_test, y_test)

# TODO(student): template placeholders -- build the DataLoader objects here.
train_dataloader = #...
test_dataloader = #... Don't shuffle

# Number of batches in each loader.
print(len(train_dataloader), len(test_dataloader))
# Batches times batch size: presumably an estimate of the sample count, which
# over-counts when the last batch is partial (assumes the loaders use
# batch_size=BATCH_SIZE) -- TODO confirm.
print(len(train_dataloader)*BATCH_SIZE, len(test_dataloader)*BATCH_SIZE)

## Write a training algorithm and train your model (5 points)

In [None]:
# Binary cross-entropy on probabilities: the model output must already lie in
# [0, 1] (nn.BCELoss does not apply a sigmoid itself).
criterion = nn.BCELoss()
# Adam over all parameters of `mlp`.
opt = torch.optim.Adam(mlp.parameters(), lr=0.0003) #you can choose your own learning rate

In [None]:

# Directory created up front; presumably the place to save model checkpoints
# from the loop below -- TODO confirm.
model_dir = "model_weights"
os.makedirs(model_dir, exist_ok=True)  # Create the directory if it doesn't exist


num_epochs = 5
# The loss lists are plotted after the loop; the loop is expected to fill them
# (presumably one value per epoch, matching the 'Epoch' axis label).
train_losses = []
test_losses = []
# Presumably for tracking the best epoch so far -- nothing in the template
# updates these yet.
best_test_loss = float('inf') # Initialize with a very high value
best_epoch = 0

# TODO(student): template placeholder -- the loop body is intentionally empty
# and must be implemented before this cell can run.
for epoch in tqdm(range(num_epochs)):
    # Training

    # Testing




# Plot the losses
plt.plot(train_losses, label='Train Loss')
plt.plot(test_losses, label='Test Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.title('Train and Test Loss')
plt.show()

In [None]:
from sklearn.metrics import log_loss, accuracy_score, f1_score

# Switch to inference mode (affects layers such as dropout / batch norm).
mlp.eval()
y_pred_prob = []
y_true = []

# Gradients are not needed for evaluation.
with torch.no_grad():
    for inputs, labels in test_dataloader:
        # Flatten every sample to a feature vector and move it to the model's device.
        inputs = inputs.reshape(inputs.shape[0], -1).float().to(device)
        outputs = mlp(inputs)
        # reshape(-1) yields one scalar per sample whether the model returns
        # shape (batch, 1) or (batch,).
        y_pred_prob.extend(outputs.cpu().numpy().reshape(-1))
        # Labels are only compared on the CPU, so they never need to visit `device`.
        y_true.extend(labels.cpu().numpy().reshape(-1))

y_pred_prob = np.asarray(y_pred_prob, dtype=float)
# Integer class ids, even if the dataset stores labels as floats for BCELoss.
y_true = np.asarray(y_true).astype(int)

# Convert probabilities to binary predictions
y_pred = (y_pred_prob > 0.5).astype(int)

# Metrics
logloss_mlp = log_loss(y_true, y_pred_prob)
accuracy_mlp = accuracy_score(y_true, y_pred)
f1_mlp = f1_score(y_true, y_pred)

print(f"MLP Log Loss: {logloss_mlp:.4f}, Accuracy: {accuracy_mlp:.4f}, F1 Score: {f1_mlp:.4f}")

Your target F1_score is 0.8

In [None]:
# y_test is compared position-by-position with y_pred; this is valid only
# because the test dataloader is created without shuffling ("Don't shuffle").
plot_pie_charts(np.array(y_test), np.array(y_pred))