In [None]:
import polars as pl
import numpy as np
import pyarrow as pa

import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature

from polars_tools import down_cast_numeric_cols

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split , GridSearchCV

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score

In [None]:
"""
Notes

Classifier models to investigate:
- Logistic Regression
- K-nearest neighbours
- naive bayes
- support vector
- Decision Tree
- Random Forest (isn't this an ensemble?)
- Some interpretable ensemble (there is one in my notes)
"""

In [None]:
# Lazily scan the CSV (path is relative to the working directory);
# no data is materialised until .collect() is called on the frame.
lf = pl.scan_csv(r"data.csv")

In [None]:
# Remove the unnamed ("") column from the frame.
# (Presumably it comes from a trailing delimiter in the CSV header - TODO confirm.)
# The selection stays lazy instead of collecting the whole frame just to drop a
# column, and pl.exclude ignores names that are absent, so re-running this cell
# is a harmless no-op rather than a "column not found" error.
lf = lf.select(pl.exclude([""]))

# Data Exploration

## Basics (Exploration)

In [None]:
# Materialise the lazy frame and show its (rows, columns) shape.
lf.collect().shape

In [None]:
# Summary statistics for the columns of the lazy frame.
lf.describe()

## Show correlation matrix Heatmap

In [None]:
## Re-label the diagnosis: "B" -> 1, every other value -> 0
is_benign = pl.col("diagnosis") == pl.lit("B")
diagnosis_as_int = pl.when(is_benign).then(1).otherwise(0)
lf = lf.with_columns(diagnosis_as_int.alias("diagnosis"))

In [None]:
# Materialise the lazy frame and display it (visual check of the current columns).
lf.collect()

In [None]:
# Keep every column except "id" and materialise the result.
# NOTE: despite the `lf_` prefix, `lf_no_id` is an eager DataFrame (output of .collect()).
lf_without_id = lf.select(pl.exclude(["id"]))
lf_no_id = lf_without_id.collect()

In [None]:
# Fit a StandardScaler on all non-id columns (this includes "diagnosis").
transformer = StandardScaler()
transformer.fit(lf_no_id)
transformer

In [None]:
# Apply the fitted scaler; the output is a plain array, so it is wrapped back
# into polars using the original column names.
trans_arr = transformer.transform(lf_no_id)
scaled_column_names = lf_no_id.columns
lf_scaled = pl.LazyFrame(trans_arr, schema=scaled_column_names)

# Eager version of the scaled data, used by the cells below.
df = lf_scaled.collect()

In [None]:
# Preview: put the "id" column back next to the scaled columns (display only).
id_column = lf.select(pl.col("id"))
pl.concat([id_column, lf_scaled], how="horizontal").collect()

In [None]:
## Encoding of "diagnosis" after standardisation:
# Malignant: -1.297676
# Benign: 0.770609


# Findings: - Drop all features whose correlation with the target lies between -0.4 and 0.4.
#           - Look at multicollinearity --> for those features that have high correlation with
#             the target variable, check if they have high correlation with each other and
#             then drop one of them.
df = lf_scaled.collect()

# Correlation matrix obtained as the covariance of the standardised data.
# StandardScaler divides by the population standard deviation (ddof=0), so the
# covariance has to use ddof=0 as well. NumPy's default (ddof=1) would inflate
# every entry by n / (n - 1), leaving a diagonal that is not exactly 1.
covariance_matrix = np.cov(df, rowvar=False, ddof=0)

# Visualize the matrix (covariance of standardised data == correlation)
plt.figure(figsize=(100, 100), dpi=70)
ax = sns.heatmap(covariance_matrix, annot=True, fmt=".2f", cmap='coolwarm', cbar=True , xticklabels=df.columns , yticklabels=df.columns)

# Add title
plt.title("Correlation Matrix Heatmap")
plt.show()

In [None]:
# Convert the polars DataFrame to pandas; as the cell's last expression it is displayed.
df.to_pandas()

## Pairplot

# Create Baseline Models

### Create train-test split

In [None]:
# Replace the standardised "diagnosis" column with the 0/1 labels held in `lf`,
# keeping the target as the first column followed by the scaled features.
diagnosis_labels = lf.select(pl.col("diagnosis")).collect()
scaled_features = df.select(pl.exclude(["diagnosis"]))
df = pl.concat([diagnosis_labels, scaled_features], how="horizontal")

In [None]:
# Build features / target from the *unscaled* data (`lf_no_id`: all columns except "id",
# with "diagnosis" already re-labelled to 0/1).
X = lf_no_id.select(
    pl.exclude(["diagnosis"])
)
y = lf_no_id.select(pl.col("diagnosis"))

# Split first, scale afterwards: fitting the scaler on the full data set would leak
# the test rows' mean / variance into the training features.
X_train, X_test, y_train, y_test = train_test_split(X, y ,test_size=0.33, random_state=42)

# Fit the scaler on the training split only and apply the same transformation to both
# splits. The results are wrapped back into polars DataFrames so the following cell
# can keep calling .to_numpy() on them.
feature_scaler = StandardScaler().fit(X_train)
X_train = pl.DataFrame(feature_scaler.transform(X_train), schema=X_train.columns)
X_test = pl.DataFrame(feature_scaler.transform(X_test), schema=X_test.columns)

In [None]:
# Convert the polars splits to NumPy arrays for scikit-learn.
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()
# The targets are single-column frames, so .to_numpy() yields shape (n, 1);
# flatten them to the 1-D (n,) layout scikit-learn estimators expect for `y`
# (a column vector triggers a DataConversionWarning on fit).
y_train = y_train.to_numpy().ravel()
y_test = y_test.to_numpy().ravel()

### K-Nearest Neighbor

### Naive Bayes

### Support Vector Machines

### Decision Tree

### One big run

# Grid Search

### KNN

### Gaussian Naive Bayes

### Support Vector Machine

### Decision Tree Classifier

# Additional Data Cleaning