In [6]:
# TODO: Task 1 Create a classificator with 97% accuracy using sklearn library ( MNIST dataset)
# TODO: Task 2 create shift function to augment data
# TODO: Task 3 Resolve Titanic dataset using sklearn library
# TODO: Task 4 Create SPAM classificator using sklearn library

In [22]:
from sklearn.datasets import fetch_openml
from sklearn.datasets import load_digits
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier

In [8]:
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [9]:
digits = load_digits()

In [10]:
estimator = KNeighborsClassifier(n_jobs=-1)

In [11]:
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [12]:
estimator.fit(X_train, y_train)

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [13]:
y_pred = estimator.predict(X_test)

In [17]:
score = f1_score( y_test, y_pred, average="macro")
score

0.9694818676216178

In [20]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1381
           1       0.95      0.99      0.97      1575
           2       0.99      0.95      0.97      1398
           3       0.97      0.97      0.97      1428
           4       0.98      0.96      0.97      1365
           5       0.96      0.96      0.96      1263
           6       0.97      0.99      0.98      1375
           7       0.96      0.98      0.97      1459
           8       0.99      0.93      0.96      1365
           9       0.96      0.96      0.96      1391

    accuracy                           0.97     14000
   macro avg       0.97      0.97      0.97     14000
weighted avg       0.97      0.97      0.97     14000



In [24]:
# Best Parameters: {'n_neighbors': 3, 'weights': 'distance'}
param_grid = {
    "n_neighbors": [3, 4, 5],
    # "weights": ["uniform", "distance"],
    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
}

search = GridSearchCV(estimator, param_grid, cv=cv, scoring="accuracy", n_jobs=-1, verbose=1, return_train_score=False)
search.fit(X_train, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


 0.96414285 0.96528572]


0,1,2
,estimator,KNeighborsClassifier()
,param_grid,"{'n_neighbors': [3, 5, ...], 'weights': ['uniform', 'distance']}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,StratifiedKFo... shuffle=True)
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_neighbors,3
,weights,'distance'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [25]:
best_model = search.best_estimator_
best_params = search.best_params_
best_score = search.best_score_
print("Best Parameters:", best_params)
print("Best Cross-Validation Score:", best_score)

Best Parameters: {'n_neighbors': 3, 'weights': 'distance'}
Best Cross-Validation Score: 0.9689107231946176


In [26]:
y_pred = best_model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1381
           1       0.96      0.99      0.98      1575
           2       0.99      0.96      0.98      1398
           3       0.97      0.97      0.97      1428
           4       0.98      0.96      0.97      1365
           5       0.97      0.96      0.97      1263
           6       0.98      0.99      0.98      1375
           7       0.96      0.98      0.97      1459
           8       0.99      0.95      0.97      1365
           9       0.95      0.97      0.96      1391

    accuracy                           0.97     14000
   macro avg       0.97      0.97      0.97     14000
weighted avg       0.97      0.97      0.97     14000



In [27]:
# make shift function
import numpy as np


def shift_image(image, dx, dy, shift_px=2):
    """Shift the image in the specified direction."""
    image = image.reshape(28, 28)
    shifted_image = np.roll(image, shift=shift_px * dx, axis=1)
    shifted_image = np.roll(shifted_image, shift=shift_px * dy, axis=0)
    if dx > 0:
        shifted_image[:, :shift_px] = 0
    elif dx < 0:
        shifted_image[:, -shift_px:] = 0
    if dy > 0:
        shifted_image[:shift_px, :] = 0
    elif dy < 0:
        shifted_image[-shift_px:, :] = 0
    return shifted_image.reshape(784)

In [30]:
X_train = np.array(X_train).astype(np.float32)
y_train = np.array(y_train).astype(int)

X_train_augmented = [image for image in X_train]
y_train_augmented = [label for label in y_train]

for dx, dy in ((1, 0), (-1, 0), (0, 1), (0, -1)):
    X_shifted = np.array([shift_image(image, dx, dy) for image in X_train])
    X_train_augmented.extend(X_shifted)
    y_train_augmented.extend(y_train)

In [31]:
# let's train the model again
X_train_augmented = np.array(X_train_augmented)
y_train_augmented = np.array(y_train_augmented)

In [32]:
best_model.fit(X_train_augmented, y_train_augmented)

0,1,2
,n_neighbors,3
,weights,'distance'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [33]:
y_pred = best_model.predict(X_test)



In [35]:
y_train = np.array(y_train).astype(int)
y_test = np.array(y_test).astype(int)

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      1381
           1       0.96      1.00      0.98      1575
           2       0.99      0.95      0.97      1398
           3       0.97      0.98      0.98      1428
           4       0.98      0.96      0.97      1365
           5       0.97      0.97      0.97      1263
           6       0.97      0.99      0.98      1375
           7       0.96      0.98      0.97      1459
           8       0.99      0.94      0.96      1365
           9       0.96      0.97      0.96      1391

    accuracy                           0.97     14000
   macro avg       0.97      0.97      0.97     14000
weighted avg       0.97      0.97      0.97     14000

