In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
train = pd.read_csv('data/dz3/train.csv')

In [3]:
y_data = train['label']
x_data = train.drop("label", axis=1)
x_train, x_valid, y_train, y_valid = train_test_split(x_data, y_data, test_size=0.2, random_state=42)

In [4]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

standardized_x_train = StandardScaler().fit_transform(x_train)

In [5]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

standardized_x_train = StandardScaler().fit_transform(x_train)

pca_train = PCA(20)
pca_data = pca_train.fit_transform(standardized_x_train)

print("explained variance ratio = {}".format(np.sum(pca_train.explained_variance_ratio_)))

explained variance ratio = 0.3894352123182725


In [7]:
from sklearn.preprocessing import label_binarize
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier

y_train_bin = label_binarize(y_train, classes=np.arange(y_train.max()))

indexes = np.arange(len(x_train), dtype=int)
np.random.shuffle(indexes)
x_local = pca_data[indexes[:4000]]
y_local_bin = y_train_bin[indexes[:4000]]

clf = OneVsRestClassifier(SVC()).fit(x_local, y_local_bin)

In [8]:
standardized_x_valid = StandardScaler().fit_transform(x_valid)

pca_valid = pca_train.transform(standardized_x_valid)
predict_y = clf.predict(pca_valid)

In [9]:
from sklearn.metrics import accuracy_score

y_valid_bin = label_binarize(y_valid, classes=np.arange(y_valid.max()))

print("Accuracy:", accuracy_score(y_valid_bin, predict_y))

Accuracy: 0.8796428571428572


In [10]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

standardized_x_train = StandardScaler().fit_transform(x_train)

pca_train = PCA(20)
pca_data = pca_train.fit_transform(standardized_x_train)

print("explained variance ratio = {}".format(np.sum(pca_train.explained_variance_ratio_)))

explained variance ratio = 0.3894352123182725


In [12]:
from sklearn.ensemble import RandomForestClassifier

indexes = np.arange(len(x_train), dtype=int)
np.random.shuffle(indexes)
x_local = pca_data[indexes[:4000]]
y_local = np.array(y_train)[indexes[:4000]]

forest = RandomForestClassifier(n_estimators = 500)
forest = forest.fit(x_local, y_local)

In [13]:
standardized_x_valid = StandardScaler().fit_transform(x_valid)

pca_valid = pca_train.transform(standardized_x_valid)
predict_y = forest.predict(pca_valid)

In [14]:
from sklearn.metrics import accuracy_score

y_valid_bin = label_binarize(y_valid, classes=np.arange(y_valid.max()))
predict_y_bin = label_binarize(predict_y, classes=np.arange(predict_y.max()))

print("Accuracy:", accuracy_score(y_valid_bin, predict_y_bin))

Accuracy: 0.905


better result

In [15]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.pipeline import Pipeline

preprocessor = Pipeline([
    ('scaler', StandardScaler()),
    ('feature_selection', SelectKBest(score_func=mutual_info_classif, k=100)), 
    ('pca', PCA(n_components=30))])

x_train_processed = preprocessor.fit_transform(x_train, y_train)    
x_valid_processed = preprocessor.transform(x_valid)
y_train_bin = label_binarize(y_train, classes=np.arange(y_train.max()))
y_valid_bin = label_binarize(y_valid, classes=np.arange(y_valid.max()))

In [16]:
svm = OneVsRestClassifier(SVC(kernel='rbf', C=10))
svm.fit(x_train_processed, y_train_bin)
svm_pred = svm.predict(x_valid_processed)
print(f"SVM Accuracy: {accuracy_score(y_valid_bin, svm_pred):.4f}")

SVM Accuracy: 0.9267


In [17]:
forest = RandomForestClassifier(n_estimators=500, n_jobs=-1)
forest.fit(x_train_processed, y_train)
rf_pred = forest.predict(x_valid_processed)
rf_pred_bin = label_binarize(rf_pred, classes=np.arange(rf_pred.max()))
print(f"Random Forest Accuracy: {accuracy_score(y_valid_bin, rf_pred_bin):.4f}")

Random Forest Accuracy: 0.8985


somehow worse

## Better score with custop fitures

In [2]:
train = pd.read_csv('data/dz3/train.csv')
y_data = train['label']
x_data = train.drop("label", axis=1)

In [3]:
x_data_reshaped = x_data.values.reshape(-1, 28, 28)

In [4]:
def extract_mnist_features(images):
    features = []
    for image in images:
        # Calculate horizontal and vertical symmetry
        h_symmetry = np.mean(np.abs(image - np.fliplr(image)))
        v_symmetry = np.mean(np.abs(image - np.flipud(image)))
        
        # Calculate center of mass
        y_coord, x_coord = np.indices(image.shape)
        total_mass = image.sum()
        if total_mass == 0:
            com_y, com_x = image.shape[0]/2, image.shape[1]/2
        else:
            com_y = np.sum(y_coord * image) / total_mass
            com_x = np.sum(x_coord * image) / total_mass
        
        # Add number of pixels above mean intensity
        mean_intensity = np.mean(image)
        active_pixels = np.sum(image > mean_intensity)
        
        features.append([h_symmetry, v_symmetry, com_y, com_x, active_pixels])
    
    return np.array(features)

In [5]:
mnist_features = extract_mnist_features(x_data_reshaped)

In [6]:
x_combined = np.hstack([
    x_data,  
    mnist_features * 10
])

In [7]:
x_train, x_valid, y_train, y_valid = train_test_split(
    x_combined, y_data, test_size=0.2, random_state=42, stratify=y_data
)
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_valid_scaled = scaler.transform(x_valid)

In [8]:
pca = PCA(n_components=50) 
x_train_pca = pca.fit_transform(x_train_scaled)
x_valid_pca = pca.transform(x_valid_scaled)
print(f"Explained variance with 50 components: {pca.explained_variance_ratio_.sum():.4f}")

Explained variance with 50 components: 0.5606


In [9]:
svm = SVC(kernel='rbf', C=5, gamma='scale')
svm.fit(x_train_pca, y_train)
svm_pred = svm.predict(x_valid_pca)
print(f"SVM Accuracy: {accuracy_score(y_valid, svm_pred):.4f}")

SVM Accuracy: 0.9702


pretty cool

In [10]:
forest = RandomForestClassifier(
    n_estimators=500,
    max_features='sqrt',
    min_samples_split=4,
    max_depth=None,
    n_jobs=-1
)
forest.fit(x_train_pca, y_train)
rf_pred = forest.predict(x_valid_pca)
print(f"Random Forest Accuracy: {accuracy_score(y_valid, rf_pred):.4f}")

Random Forest Accuracy: 0.9435


nit great