In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression  # Example classifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [4]:
# Read the training data and the evaluation data from their CSV files in one pass.
shapes_df, shapes_eval_df = map(
    pd.read_csv,
    ('data/shapes_dataset.csv', 'data/shapes_eval_dataset.csv'),
)

In [None]:
# Preview the first ten rows of the training frame (positional slice).
shapes_df.iloc[:10]

Unnamed: 0,class,x1,y1,x2,y2,x3,y3,x4,y4,x5,...,x496,y496,x497,y497,x498,y498,x499,y499,x500,y500
0,triangle,-2.360064,-1.350912,-2.463916,-1.508069,-2.369937,-1.404027,-2.320119,-1.379858,-2.274563,...,-2.374778,-1.289691,-2.350988,-1.320375,-2.449795,-1.310276,0.04431,4.860049,0.021297,4.929705
1,triangle,-2.474225,-1.567552,-2.436462,-1.458902,-2.63054,-1.529414,-2.450999,-1.254607,-2.323863,...,-2.399504,-1.41994,-2.277535,-1.468667,-2.653858,-1.464417,0.04431,4.860049,0.021297,4.929705
2,triangle,-2.51148,-1.386699,-2.535617,-1.448272,-2.368618,-1.132085,-2.328835,-1.528182,-2.421894,...,-2.255682,-1.325829,-2.310839,-1.44812,-2.564651,-1.413198,0.04431,4.860049,0.021297,4.929705
3,triangle,-2.502244,-1.449171,-2.411072,-1.347141,-2.34517,-1.336013,-2.472723,-1.473162,-2.302615,...,-2.446268,-1.536691,-2.562772,-1.441618,-2.492226,-1.228363,0.04431,4.860049,0.021297,4.929705
4,triangle,-2.522535,-1.276709,-2.537713,-1.50479,-2.287307,-1.48787,-2.249531,-1.354192,-2.394983,...,-2.303287,-1.419298,-2.302452,-1.481298,-2.586208,-1.593286,0.04431,4.860049,0.021297,4.929705
5,triangle,-2.734248,-1.413408,-2.510482,-1.389658,-2.345338,-1.423859,-2.243113,-1.461246,-2.317036,...,-2.428555,-1.510122,-2.566477,-1.472422,-2.551274,-1.554806,0.04431,4.860049,0.021297,4.929705
6,triangle,-2.421398,-1.558294,-2.4174,-1.531226,-2.567901,-1.450546,-2.500068,-1.364314,-2.298065,...,-2.355224,-1.291498,-2.56819,-1.492265,-2.510377,-1.343607,0.04431,4.860049,0.021297,4.929705
7,triangle,-2.370601,-1.472492,-2.364374,-1.476796,-2.535756,-1.406836,-2.520105,-1.389812,-2.498934,...,-2.612154,-1.349919,-2.514604,-1.269339,-2.524224,-1.306563,0.04431,4.860049,0.021297,4.929705
8,triangle,-2.571395,-1.388862,-2.566444,-1.462326,-2.407375,-1.543029,-2.540811,-1.394188,-2.26721,...,-2.375094,-1.519026,-2.347047,-1.454908,-2.606707,-1.347764,0.04431,4.860049,0.021297,4.929705
9,triangle,-2.425368,-1.366725,-2.391463,-1.440488,-2.527746,-1.52341,-2.256621,-1.299808,-2.367263,...,-2.451571,-1.461273,-2.583419,-1.652752,-2.310216,-1.361717,0.04431,4.860049,0.021297,4.929705


In [5]:
# Columns that must never reach the model:
#   * 'class'      - the target label itself.
#   * 'Unnamed: 0' - the row index that was written into the CSV (it shows up
#                    in the preview as 0, 1, 2, ...). A row counter describes
#                    no property of a shape, and if the file is ordered by
#                    class (the first ten rows are all 'triangle') it leaks the
#                    label and inflates the test metrics.
NON_FEATURE_COLUMNS = ['class', 'Unnamed: 0']

# errors='ignore' lets the same column list be applied to both frames even if
# one of them lacks a listed column; a missing 'class' in the training frame
# still fails on the lookup below.
X_train_full = shapes_df.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')
y_train_full = shapes_df['class']
# The evaluation frame may not carry a 'class' column at all.
X_eval = shapes_eval_df.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')

In [6]:
# Reserve 20% of the labelled rows as a held-out test set; the fixed seed
# keeps the split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_full,
    y_train_full,
    random_state=42,
    test_size=0.2,
)

In [7]:
# Preprocessing chain: fill missing values with the column mean, then
# standardise each feature.
preprocessor = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
)
# Fit on the training split only, so statistics from the held-out rows never
# influence the imputation means or the scaling parameters.
preprocessor.fit(X_train)

In [8]:
# Push every feature frame through the already-fitted preprocessor; the
# unpacking order matches the order of the frames in the tuple.
X_train_processed, X_test_processed, X_eval_processed = (
    preprocessor.transform(frame) for frame in (X_train, X_test, X_eval)
)

In [None]:
# Single-step pipeline wrapping the classifier. Preprocessing is applied
# separately via `preprocessor`, so it is not a step here.
pipeline = make_pipeline(LogisticRegression(random_state=42))
# `model` is the very same estimator object that sits inside `pipeline`
# (each entry of `steps` is a (name, estimator) pair).
model = pipeline.steps[-1][1]

In [12]:
# Train on the processed training split, then label the held-out test split.
# `fit` returns the fitted estimator, so the two calls can be chained.
y_pred_test = model.fit(X_train_processed, y_train).predict(X_test_processed)

In [13]:
# Held-out performance: overall accuracy, per-class metrics, confusion matrix.
print("Test Accuracy:", accuracy_score(y_test, y_pred_test))
print("\nTest Classification Report:\n", classification_report(y_test, y_pred_test))
print("\nTest Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test))

# 7. Predict on the evaluation dataset
y_pred_eval = model.predict(X_eval_processed)

# 8. Submission: Count the predicted "stars"
star_class = 'star'  # Assuming 'star' is the class name
# Fail loudly if the label is not one the model was trained on; otherwise a
# typo or a renamed class would silently report zero stars.
assert star_class in model.classes_, (
    f"{star_class!r} is not a trained class: {list(model.classes_)}"
)
# Element-wise comparison on the prediction array, counted in one vectorised
# pass instead of a Python-level loop; int() keeps the result a plain integer.
predicted_stars_count = int((y_pred_eval == star_class).sum())

print("\nPredicted Number of Stars in Evaluation Dataset:", predicted_stars_count)

Test Accuracy: 1.0

Test Classification Report:
               precision    recall  f1-score   support

        star       1.00      1.00      1.00       988
    triangle       1.00      1.00      1.00      1012

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000


Test Confusion Matrix:
 [[ 988    0]
 [   0 1012]]

Predicted Number of Stars in Evaluation Dataset: 8913
