In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### 1. Считаем данные

In [None]:
# All four files share one layout: ';'-separated values with no header row.
# They are read in the order train features, train labels, test features, test labels.
X_train, y_train, X_test, y_test = (
    pd.read_csv(csv_path, delimiter=';', header=None)
    for csv_path in (
        "./Data/x_train.csv",
        "./Data/y_train.csv",
        "./Data/x_test.csv",
        "./Data/y_test.csv",
    )
)

In [None]:
# Preview the first rows of the training features.
X_train.head()

In [None]:
# TODO: are the classes balanced? Inspect the label counts of y_train here.

In [None]:
# Convert the labels to a NumPy array. A single-column frame would otherwise
# become an (n_samples, 1) column vector, while the scikit-learn estimators
# used later expect 1-D labels, so that one column is flattened.
# Re-running this cell is safe: an already 1-D array is left untouched.
y_train = np.array(y_train)
if y_train.ndim == 2 and y_train.shape[1] == 1:
    y_train = y_train.ravel()

### 2. Визуализация

In [None]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

In [None]:
# t-SNE is stochastic: fix the seed so the 2-D embedding (and the scatter plot
# drawn from `manifold`) is the same on every Restart & Run All.
tsne = TSNE(random_state=42, verbose=1)
manifold = tsne.fit_transform(X_train)

In [None]:
# Scatter plot of the current 2-D `manifold`, one colour per class label.
# Every label found in y_train must be a key of cdict (0..4).
cdict = {0: 'red', 1: 'blue', 2: 'green', 3: 'yellow', 4: 'black'}
targets = np.unique(y_train)

plt.figure(figsize=(10, 10))
for label in targets:
    # Row positions of the samples that carry this label.
    rows = np.where(y_train == label)[0]
    xs, ys = manifold[rows, 0], manifold[rows, 1]
    plt.scatter(xs, ys, c=cdict[label], s=10, alpha=0.5, label=label)
plt.legend()
plt.show()

In [None]:
# Project X_train onto its first two principal components.
# Note: this overwrites `manifold`, which the plotting cell below reads.
pca = PCA(n_components=2)
manifold = pca.fit_transform(X_train)

In [None]:
# Draw whatever 2-D projection is currently stored in `manifold`,
# coloured by class. cdict only knows the labels 0..4.
cdict = {0: 'red', 1: 'blue', 2: 'green', 3: 'yellow', 4: 'black'}
targets = np.unique(y_train)

plt.figure(figsize=(10, 10))
for cls in targets:
    # First index array of np.where = row numbers of this class.
    member_rows = np.where(y_train == cls)[0]
    plt.scatter(
        manifold[member_rows, 0],
        manifold[member_rows, 1],
        c=cdict[cls],
        s=10,
        alpha=0.5,
        label=cls,
    )
plt.legend()
plt.show()

Забыл сказать: оба метода предполагают, что данные центрированы и масштабированы.

In [None]:
# TODO: centre and scale the features here. This cell is still a placeholder,
# so the t-SNE / PCA cells below receive X_train exactly as it was loaded.

In [None]:
# Re-run t-SNE on X_train. The seed is fixed because t-SNE is stochastic:
# without it, differences between two runs cannot be attributed to the data.
tsne = TSNE(random_state=42, verbose=1)
manifold = tsne.fit_transform(X_train)

In [None]:
# Class-coloured scatter plot of the 2-D points held in `manifold`.
# The colour table covers the labels 0..4 only.
cdict = {0: 'red', 1: 'blue', 2: 'green', 3: 'yellow', 4: 'black'}
targets = np.unique(y_train)

plt.figure(figsize=(10, 10))
for target in targets:
    # Rows of `manifold` that belong to the current class.
    sample_rows = np.where(y_train == target)[0]
    first_axis = manifold[sample_rows, 0]
    second_axis = manifold[sample_rows, 1]
    plt.scatter(first_axis, second_axis, c=cdict[target], s=10, alpha=0.5, label=target)
plt.legend()
plt.show()

In [None]:
# Refit a two-component PCA on X_train and store the projection in `manifold`
# (replacing the t-SNE embedding computed just before).
pca = PCA(n_components=2)
manifold = pca.fit_transform(X_train)

In [None]:
# Plot the two columns of `manifold` against each other, one scatter series
# (and one legend entry) per class label. Labels must be keys of cdict.
cdict = {0: 'red', 1: 'blue', 2: 'green', 3: 'yellow', 4: 'black'}
targets = np.unique(y_train)

plt.figure(figsize=(10, 10))
for class_id in targets:
    # np.where(...)[0] gives the row indices where the label matches.
    hits = np.where(y_train == class_id)[0]
    plt.scatter(manifold[hits, 0], manifold[hits, 1],
                c=cdict[class_id], s=10, alpha=0.5, label=class_id)
plt.legend()
plt.show()

### 3. Посмотрим на RF на сырых данных

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [None]:
def get_cv_score(X, Y, cv=5, n_estimators=100, random_state=42):
    """Print the mean cross-validated accuracy of a random forest on (X, Y).

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    Y : array-like of shape (n_samples,) or (n_samples, 1)
        Class labels. A single-column 2-D input is flattened to 1-D, which is
        the shape the classifier expects; any other shape is passed unchanged.
    cv : int or cross-validation splitter, default 5
        Forwarded to ``cross_val_score``.
    n_estimators : int, default 100
        Number of trees in the forest.
    random_state : int or None, default 42
        Seed for the forest, so that repeated calls on the same data print the
        same score. Pass ``None`` to get an unseeded forest.

    Returns
    -------
    None
        The score is printed, not returned.
    """
    labels = np.asarray(Y)
    if labels.ndim == 2 and labels.shape[1] == 1:
        # Column vector (e.g. a one-column DataFrame) -> 1-D label array.
        labels = labels.ravel()

    rf_clf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    scores = cross_val_score(rf_clf, X, labels, scoring='accuracy', cv=cv, n_jobs=-1)
    print("Accuracy: {0}".format(np.mean(scores)))

In [None]:
# Cross-validated random-forest accuracy on X_train / y_train.
get_cv_score(X_train, y_train)

### 4. Feature processing

In [None]:
# Copies of the training features and labels; the feature-processing cells
# below operate on X and y.
X = X_train.copy()
y = y_train.copy()

In [None]:
def plot_importance(clf, X):
    """Draw the feature importances of a fitted tree ensemble as a sorted bar chart.

    Bars are ordered from the most to the least important feature. The error
    bars show the standard deviation of each feature's importance across the
    individual estimators in ``clf.estimators_``.

    Parameters
    ----------
    clf : fitted ensemble
        Must expose ``feature_importances_`` and ``estimators_`` (each
        estimator exposing ``feature_importances_`` as well).
    X : pandas.DataFrame or 2-D array
        Only its number of columns and, when present, its column labels are
        used. For inputs without a ``columns`` attribute (plain arrays) the
        ticks are labelled with the column positions.

    Raises
    ------
    ValueError
        If the number of importances differs from the number of columns of X.
    """
    importances = np.asarray(clf.feature_importances_)
    n_features = X.shape[1]
    if importances.shape[0] != n_features:
        raise ValueError(
            "clf has {0} feature importances but X has {1} columns".format(
                importances.shape[0], n_features))

    std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)
    # Column positions sorted by decreasing importance.
    indices = np.argsort(importances)[::-1]

    columns = getattr(X, "columns", None)
    tick_labels = indices if columns is None else np.asarray(columns)[indices]

    positions = range(n_features)
    plt.figure(figsize=(20, 8))
    plt.title("Feature importances")
    plt.bar(positions, importances[indices],
            color="r", yerr=std[indices], align="center")
    plt.xticks(positions, tick_labels)
    plt.xlim([-1, n_features])
    plt.show()

In [None]:
# Fit a 100-tree random forest on X / y and plot its per-feature importances.
rf_clf = RandomForestClassifier(n_estimators=100).fit(X, y)
plot_importance(rf_clf, X)

In [None]:
from sklearn.feature_selection import SelectFromModel

In [None]:
# Select features based on random-forest importances.
# For a random forest SelectFromModel's default threshold is the mean
# importance, so the columns at or above it are kept. The seed makes the
# selected subset reproducible. The result, X_sfm, is scored in the next cell.
sfm = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42))
X_sfm = sfm.fit_transform(X, y)

In [None]:
# CV accuracy on the importance-selected feature subset; expects X_sfm to have
# been created by the feature-selection step above.
get_cv_score(X_sfm, y)

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV

# Recursive feature elimination with cross-validation: starting from all
# features, drop one per step (step=1) and score every subset with 2-fold
# stratified CV accuracy.
rf_clf = RandomForestClassifier(n_estimators=100)
rfecv = RFECV(estimator=rf_clf, step=1, cv=StratifiedKFold(2),
              scoring='accuracy')
rfecv.fit(X, y)

print("Optimal number of features : %d" % rfecv.n_features_)

# `grid_scores_` was removed in scikit-learn 1.2; the mean CV score per feature
# subset now lives in `cv_results_`. Fall back to the old attribute on
# releases that do not have `cv_results_` yet.
if hasattr(rfecv, "cv_results_"):
    mean_cv_scores = rfecv.cv_results_["mean_test_score"]
else:
    mean_cv_scores = rfecv.grid_scores_

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(mean_cv_scores) + 1), mean_cv_scores)
plt.show()

In [None]:
# Reduce X to the features selected by the fitted RFECV.
X_rfecv = rfecv.transform(X)

In [None]:
# CV accuracy on the RFECV-selected feature subset.
get_cv_score(X_rfecv, y)

In [None]:
# PCA with n_components left at its default; X_pca is X projected onto the
# fitted components.
pca = PCA()
X_pca = pca.fit_transform(X)

In [None]:
# Share of variance explained by each principal component: a line plot, plus
# the raw values as the cell's output.
plt.plot(pca.explained_variance_ratio_)
pca.explained_variance_ratio_

In [None]:
# CV accuracy using all principal components as features.
get_cv_score(X_pca, y)

In [None]:
# CV accuracy using only the first 6 principal components.
# NOTE(review): 6 is presumably read off the explained-variance plot — confirm.
get_cv_score(X_pca[:, :6], y)

In [None]:
# Absolute weight of each original feature in the first principal component.
plt.bar(range(X.shape[1]), abs(pca.components_[0]))