https://habr.com/ru/companies/otus/articles/892828/

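Всё это связано с уменьшением размерности, потому что в многомерных пространствах проявляется ***проклятие размерности***: с ростом числа признаков минимальное и максимальное расстояния между точками сближаются, и методы, основанные на расстояниях, работают хуже.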

In [1]:
import numpy as np
from sklearn.metrics import pairwise_distances

def analyze_distances(n_samples=500, dimensions=(2, 10, 50, 100), seed=42):
    """Show how pairwise distances concentrate as dimensionality grows.

    For every dimensionality in ``dimensions`` the function draws
    ``n_samples`` points with coordinates uniform in [0, 1), computes all
    pairwise distances with ``pairwise_distances`` (default metric) and
    reports the smallest and the largest distance between distinct points
    together with their ratio. One summary line per dimensionality is printed.

    Args:
        n_samples: Number of random points per experiment; must be at least 2
            so that at least one pair of distinct points exists.
        dimensions: Iterable of feature-space dimensionalities to test.
        seed: Seed of the private random generator. The default reproduces
            the values produced by the previously hard-coded
            ``np.random.seed(42)``.

    Returns:
        dict mapping each dimensionality to a dict with the keys
        ``"min_distance"``, ``"max_distance"`` and ``"ratio"``.

    Raises:
        ValueError: If ``n_samples`` is smaller than 2.
    """
    if n_samples < 2:
        raise ValueError(
            f"n_samples must be at least 2 to form a pair of points, got {n_samples}"
        )

    # A private legacy generator yields the same stream as np.random.seed(seed)
    # followed by np.random.rand, but leaves the global NumPy RNG untouched, so
    # calling this function does not silently change other cells' randomness.
    rng = np.random.RandomState(seed)
    results = {}

    for dim in dimensions:
        X = rng.rand(n_samples, dim)
        distances = pairwise_distances(X)
        # Keep only the strict upper triangle: this drops the zero diagonal
        # (distance of a point to itself) and counts every pair exactly once.
        pair_distances = distances[np.triu_indices_from(distances, k=1)]
        min_dist = pair_distances.min()
        max_dist = pair_distances.max()
        ratio = min_dist / max_dist
        results[dim] = {
            "min_distance": min_dist,
            "max_distance": max_dist,
            "ratio": ratio
        }
        print(f"Размерность: {dim:3d} | min: {min_dist:.4f} | max: {max_dist:.4f} | ratio: {ratio:.4f}")

    return results

# Run the experiment with the default sample size and dimensionalities;
# the returned dict is displayed as the cell output.
analyze_distances()

Размерность:   2 | min: 0.0013 | max: 1.3384 | ratio: 0.0010
Размерность:  10 | min: 0.3085 | max: 2.2094 | ratio: 0.1396
Размерность:  50 | min: 1.7976 | max: 3.8377 | ratio: 0.4684
Размерность: 100 | min: 2.8983 | max: 5.0679 | ratio: 0.5719


{2: {'min_distance': 0.0013390939410155658,
  'max_distance': 1.3383860002906764,
  'ratio': 0.001000528951083421},
 10: {'min_distance': 0.30845729063082716,
  'max_distance': 2.2094214702988606,
  'ratio': 0.1396099815166109},
 50: {'min_distance': 1.7975654492089295,
  'max_distance': 3.837663503973125,
  'ratio': 0.4684010068490668},
 100: {'min_distance': 2.8983301608063816,
  'max_distance': 5.067892946570015,
  'ratio': 0.5719004310791512}}

In [2]:
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score

# Generate a synthetic classification dataset: 1000 samples with 100 features,
# 20 of them informative (fixed random_state for reproducibility).
X, y = make_classification(n_samples=1000, n_features=100, n_informative=20, random_state=42)
# Display the feature matrix as the cell output.
X

array([[ 0.09601292, -1.50969981, -2.6717549 , ...,  1.07064231,
         0.15347747,  0.03005139],
       [-0.48565995, -0.7503502 , -2.33290065, ..., -0.14931178,
        -0.91263811,  0.35515428],
       [-1.06701611, -0.67048521, -0.03887184, ..., -0.01547561,
        -0.66396581,  0.18237464],
       ...,
       [ 2.79647507,  0.23340362, -4.8164783 , ..., -0.04279975,
         0.81667463, -0.67012307],
       [ 1.17834399, -0.45134362,  1.9273969 , ...,  0.26923306,
         1.28363179,  0.52844825],
       [-0.38523769,  1.23926446,  2.35778962, ...,  2.05301162,
        -0.58700922, -0.4037699 ]])

In [7]:

# Hold out 30% of the samples as a test set (fixed random_state for a reproducible split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Reduce dimensionality with PCA, keeping 20 components.
pca = PCA(n_components=20, random_state=42)
# Display the estimator; it has not been fitted in this cell.
pca


In [11]:

# Fit PCA on the training split and project that split onto the components.
X_train_pca = pca.fit_transform(X_train)
print(X_train_pca.shape)
# Display the projected training features.
X_train_pca

(700, 20)


array([[  3.13439049,  -6.95501313,  -0.37848104, ...,  -0.47861029,
         -2.04654452,  -1.2129644 ],
       [ 16.09887969,   6.93823854,  -0.13678045, ...,  -0.30649929,
         -0.30392795,   0.84060067],
       [-15.19288911,  -7.30303409,   1.97557102, ...,   1.00915538,
          0.41169479,   0.32039885],
       ...,
       [-11.26287216,   2.11471981,  -0.87702889, ...,   4.20867945,
         -2.1647208 ,  -0.67611049],
       [  9.69761824,  19.41145288,   0.7157658 , ...,  -0.04188145,
         -1.6124884 ,   0.4954145 ],
       [ 16.7759087 ,  -1.64793838,   1.35088087, ...,   0.81049717,
         -2.03310011,   1.08402915]])

In [12]:

# Project the test split with transform() only, so PCA is not refitted on test data.
X_test_pca = pca.transform(X_test)
print(X_test_pca.shape)
# Display the projected test features.
X_test_pca



(300, 20)


array([[-10.56844611,   7.08826175,   1.77995875, ...,   1.15012233,
          0.05673289,   2.74716264],
       [ -1.24888245,  18.22110535, -10.4303768 , ...,   0.18266853,
         -2.91193744,  -1.98367685],
       [ -2.42089975,  -3.40006887,  -2.13562362, ...,   1.68082756,
         -1.79362209,   0.71048621],
       ...,
       [ -5.45144905,  -0.82603219,   2.49565294, ...,   2.1011426 ,
          1.61870287,  -1.67620223],
       [ 13.96989645,  -5.43791939,   8.29532371, ...,   0.61832845,
         -1.29756961,   0.35002392],
       [ -3.67516443,  -9.73789439,  -1.69022055, ...,   1.01768129,
          0.29662195,   1.12561238]])

In [13]:

# Create a k-nearest-neighbours classifier with 5 neighbours (fitting happens in a later cell).
knn = KNeighborsClassifier(n_neighbors=5)
knn


In [14]:

# Train KNN on the PCA-reduced training features.
knn.fit(X_train_pca, y_train)


In [15]:

# Predict labels for the PCA-reduced test features and display them.
y_pred = knn.predict(X_test_pca)
y_pred

array([0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0])

In [16]:


# Report test-set accuracy of the predictions stored in y_pred.
print(f"Accuracy после PCA: {accuracy_score(y_test, y_pred):.4f}")

Accuracy после PCA: 0.9367


In [19]:
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score

# Rebuild the synthetic dataset (1000 samples, 100 features, 20 informative)
# and the 70/30 train/test split with fixed seeds.
X, y = make_classification(
    n_samples=1000,
    n_features=100,
    n_informative=20,
    random_state=42,
)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Baseline: fit KNN directly on the raw features (no PCA) and predict the test split.
# fit() returns the estimator itself, so construction and training can be chained.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred = knn.predict(X_test)

print(f"Accuracy без PCA: {accuracy_score(y_test, y_pred):.4f}")

Accuracy без PCA: 0.9067


# Подбор гиперпараметров Random Forest (включая оптимальное количество деревьев)

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Build a synthetic dataset (1500 samples, 20 features, 10 informative)
# and hold out 30% of it for testing.
X, y = make_classification(
    n_samples=1500,
    n_features=20,
    n_informative=10,
    random_state=42,
)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Hyper-parameter grid to enumerate: 3 * 3 * 3 * 2 = 54 combinations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_features': ['sqrt', 'log2', None],
    'min_samples_split': [2, 5, 10],
    'bootstrap': [True, False],
}

rf = RandomForestClassifier(random_state=42)
# 5-fold cross-validated search using all CPU cores; fit() returns the search object.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
).fit(X_train, y_train)

# Keep the estimator refitted with the best parameters and score it on the held-out split.
best_rf = grid_search.best_estimator_
print(f"Лучшие параметры: {grid_search.best_params_}")
y_pred = best_rf.predict(X_test)
print(f"Accuracy оптимизированного Random Forest: {accuracy_score(y_test, y_pred):.4f}")

Fitting 5 folds for each of 54 candidates, totalling 270 fits


0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to di

Лучшие параметры: {'bootstrap': False, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 200}
Accuracy оптимизированного Random Forest: 0.8867


In [22]:
# Show the refit time recorded by the grid search (per scikit-learn docs:
# seconds used to refit the best model on the whole training set).
grid_search.refit_time_

0.8433449268341064

In [23]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# 1. Load and prepare the data
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Split the data into training and test sets (30% held out).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# 2. Define the parameter grid to search.
#    These are the C and gamma values to try: more values make the search
#    more thorough, but also slower.
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1],
    'kernel': ['rbf'],  # RBF kernel only
}

# 3. Use GridSearchCV to find the best parameters.
#    GridSearchCV trains and evaluates the model for every parameter combination
#    and picks the best one by the chosen metric (accuracy by default).
#    cv=3 means 3-fold cross-validation; fit() returns the search object itself.
grid = GridSearchCV(
    SVC(),
    param_grid,
    refit=True,
    verbose=2,
    cv=3,
).fit(X_train, y_train)

# 4. Print the best parameters and the best model
print("Лучшие параметры:", grid.best_params_)
print("Лучшая модель:", grid.best_estimator_)

# 5. Evaluate the refitted model on the test set
y_pred = grid.predict(X_test)
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] END .....................C=0.1, gamma=0.001, kernel=rbf; total time=   0.0s
[CV] END .....................C=0.1, gamma=0.001, kernel=rbf; total time=   0.0s
[CV] END .....................C=0.1, gamma=0.001, kernel=rbf; total time=   0.0s
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1

In [10]:
import pandas as pd

# Four categorical labels, one of them ('a') repeated.
s = pd.Series(list('abca'))

# Print the series framed as "s = Series { ... }": header, series and closing
# brace are emitted by a single print, one item per line.
print(f's = {type(s).__name__} {{', s, '}', sep='\n')

# One-hot encode the labels; the resulting frame is the cell output.
pd.get_dummies(s)

s = Series {
0    a
1    b
2    c
3    a
dtype: object
}


Unnamed: 0,a,b,c
0,True,False,False
1,False,True,False
2,False,False,True
3,True,False,False
