In [1]:
import numpy as np
import pandas as pd 
import random
# Seed both the stdlib and NumPy global RNGs from one constant so that any
# stochastic step later in the notebook is repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [2]:
# Read the wine measurements into a DataFrame and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="wine-clustering.csv")
df.head(n=5)

Unnamed: 0,Alcohol,Malic_Acid,Ash,Ash_Alcanity,Magnesium,Total_Phenols,Flavanoids,Nonflavanoid_Phenols,Proanthocyanins,Color_Intensity,Hue,OD280,Proline
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [4]:
# Build the feature matrix as a new frame, independent of df.
# Dropping "outlier" (when present) keeps this cell idempotent: a later cell
# writes that label column onto df, and re-running this cell afterwards would
# otherwise feed the predicted label back into the model as a feature.
X = df.drop(columns=["outlier"], errors="ignore")

# Only numeric columns can go through the median imputer and the scaler.
numerical_cols = X.select_dtypes(include="number").columns.tolist()

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import LocalOutlierFactor


In [6]:
# Numeric preprocessing: fill missing values with the column median, then
# standardise each feature.
num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Send the listed numeric columns through num_pipe; any other column in X is
# discarded rather than passed through.
processor = ColumnTransformer(
    [("num", num_pipe, numerical_cols)],
    remainder="drop",
)

# Preprocessing followed by Local Outlier Factor. contamination=0.05 sets the
# expected share of outliers, which fixes the decision threshold.
lof_pipe = Pipeline([
    ("process", processor),
    ("lof", LocalOutlierFactor(n_neighbors=20, contamination=0.05)),
])

# One label per row of X: 1 marks an inlier, -1 an outlier.
labels = lof_pipe.fit_predict(X)
df["outlier"] = labels

In [7]:
# Count how many rows were labelled as inliers (1) versus outliers (-1).
df["outlier"].value_counts()


outlier
 1    169
-1      9
Name: count, dtype: int64

In [8]:
# Two-column submission frame: the row index as "Id" followed by its outlier
# label, written to disk without the DataFrame index.
submission = df[["outlier"]].copy()
submission.insert(0, "Id", df.index)
submission.to_csv("submission.csv", index=False)
