In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Load all three datasets
df1 = pd.read_csv('data/S07-hw-dataset-01.csv')
df2 = pd.read_csv('data/S07-hw-dataset-02.csv')

# Dataset 03 gets fixed column names below. Peek at its first line first:
# if that line contains text it is a header and must be consumed as such,
# otherwise reading with header=None would turn it into a bogus data row.
_ds3_first_row = pd.read_csv('data/S07-hw-dataset-03.csv', header=None, nrows=1).iloc[0]
_ds3_has_header = any(isinstance(value, str) for value in _ds3_first_row)
df3 = pd.read_csv('data/S07-hw-dataset-03.csv', header=0 if _ds3_has_header else None)
# Positional rename; assumes the first column is the sample identifier
# followed by four features -- TODO confirm against the data description.
df3.columns = ['sample_id', 'feature1', 'feature2', 'feature3', 'feature4']

# Keep sample_id of every dataset so labels can be joined back later
sample_ids_ds1 = df1['sample_id']
sample_ids_ds2 = df2['sample_id']
sample_ids_ds3 = df3['sample_id']

# The identifier is not a feature: drop it from the feature matrices
X_ds1 = df1.drop(columns=['sample_id'])
X_ds2 = df2.drop(columns=['sample_id'])
X_ds3 = df3.drop(columns=['sample_id'])

# Helper that prints a first-look summary of a dataset
def initial_analysis(df, dataset_name):
    """Print and display a quick exploratory summary of ``df``.

    Shows the shape, the first five rows, ``DataFrame.info()``, descriptive
    statistics, per-column missing-value counts and the split of columns into
    numeric and categorical ones.

    Args:
        df: DataFrame to summarise.
        dataset_name: Label used in the printed banner.

    Returns:
        None. All output goes to stdout / the notebook display.
    """
    separator = '=' * 50
    print(f"\n{separator}")
    print(f"Анализ {dataset_name}")
    print(separator)
    print(f"Размер датасета: {df.shape}")
    print("\nПервые 5 строк:")
    display(df.head())
    print("\nИнформация о данных:")
    # info() prints the report itself and returns None, so wrapping it in
    # display() only added a stray "None" to the output.
    df.info()
    print("\nБазовые статистики:")
    display(df.describe())
    print("\nКоличество пропусков:")
    print(df.isnull().sum())

    # Column types: 'number' covers every numeric width (int32, float32, ...),
    # not just int64/float64.
    numerical_features = df.select_dtypes(include='number').columns.tolist()
    categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist()

    print(f"\nЧисловые признаки: {numerical_features}")
    print(f"Категориальные признаки: {categorical_features}")

# Run the first-look summary for every dataset, in order
for _frame, _label in ((df1, "Dataset 01"), (df2, "Dataset 02"), (df3, "Dataset 03")):
    initial_analysis(_frame, _label)

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Load the data
# Datasets 1 and 2 are read as-is
df1 = pd.read_csv('data/S07-hw-dataset-01.csv')
df2 = pd.read_csv('data/S07-hw-dataset-02.csv')

# Dataset 03 may or may not start with a header line. Peek at the first line:
# any text value means it is a header. Reading it with the right `header`
# setting keeps the header out of the data (re-reading with header=None after
# detecting a header would turn that line into a bogus sample).
_ds3_first_row = pd.read_csv('data/S07-hw-dataset-03.csv', header=None, nrows=1).iloc[0]
_ds3_has_header = any(isinstance(value, str) for value in _ds3_first_row)
df3 = pd.read_csv('data/S07-hw-dataset-03.csv', header=0 if _ds3_has_header else None)

if df3.shape[1] == 4 and 'sample_id' not in df3.columns:
    # Features only: synthesise a running sample_id
    df3.insert(0, 'sample_id', range(len(df3)))
elif df3.shape[1] != 5:
    print(f"Неожиданное количество столбцов: {df3.shape[1]}")

# Positional rename: first column is the identifier, the rest are
# feature1..featureN. Exactly one name per column, whatever the width.
# Assumes the identifier is the first column -- TODO confirm.
df3.columns = ['sample_id'] + [f'feature{i}' for i in range(1, df3.shape[1])]

# Keep identifiers so cluster labels can be joined back later
sample_ids_ds1 = df1['sample_id']
sample_ids_ds2 = df2['sample_id']
sample_ids_ds3 = df3['sample_id']

# Feature matrices without the identifier column
X_ds1 = df1.drop(columns=['sample_id'])
X_ds2 = df2.drop(columns=['sample_id'])
X_ds3 = df3.drop(columns=['sample_id'])

In [None]:
def preprocess_data(X, numeric_threshold=0.5):
    """Turn a raw feature frame into a scaled numeric matrix.

    Steps: coerce number-like columns to numeric, one-hot encode the columns
    that are genuinely categorical, impute missing values with the column
    mean and standardise with ``StandardScaler``.

    Args:
        X: DataFrame of features. It is not modified.
        numeric_threshold: Minimum fraction of a non-numeric column's
            non-missing values that must parse as numbers for the column to be
            treated as numeric (unparseable entries then become NaN and are
            imputed). Columns below the threshold are treated as categorical.

    Returns:
        The array returned by ``StandardScaler.fit_transform`` on the imputed
        features.
    """
    # Work on a copy so the caller's frame stays untouched
    X_processed = X.copy()
    categorical_features = []

    for col in X_processed.columns:
        column = X_processed[col]
        if pd.api.types.is_numeric_dtype(column):
            continue  # already numeric, nothing to convert

        try:
            converted = pd.to_numeric(column, errors='coerce')
        except (TypeError, ValueError):
            # dtype that cannot be coerced at all
            categorical_features.append(col)
            continue

        # errors='coerce' never raises on bad values; it silently yields NaN.
        # Accept the conversion only if enough values actually parsed,
        # otherwise a text column would be wiped out to all-NaN.
        n_present = column.notna().sum()
        n_parsed = converted.notna().sum()
        if n_present > 0 and n_parsed < numeric_threshold * n_present:
            categorical_features.append(col)
        else:
            X_processed[col] = converted

    if categorical_features:
        print(f"Обнаружены категориальные признаки: {categorical_features}")
        # One-hot encode; float dtype keeps the matrix purely numeric
        X_processed = pd.get_dummies(
            X_processed, columns=categorical_features, drop_first=True, dtype=float
        )

    # Fill missing values with the column mean
    imputer = SimpleImputer(strategy='mean')
    X_imputed = imputer.fit_transform(X_processed)

    # Standardise the imputed features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_imputed)

    return X_scaled

# Run the preprocessing pipeline on each dataset in turn
print("\nПрепроцессинг Dataset 01...")
X_processed_ds1 = preprocess_data(X=X_ds1)

print("\nПрепроцессинг Dataset 02...")
X_processed_ds2 = preprocess_data(X=X_ds2)

# Dataset 03: dump its structure (shape, dtypes, first rows) before preprocessing
print(
    "\nПрепроцессинг Dataset 03...",
    "Структура Dataset 03 перед препроцессингом:",
    f"Размер: {X_ds3.shape}",
    "Типы данных:",
    X_ds3.dtypes,
    "Первые 2 строки:",
    X_ds3.head(2),
    sep="\n",
)

X_processed_ds3 = preprocess_data(X=X_ds3)

print("\nПрепроцессинг успешно завершен для всех датасетов")

In [None]:
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

def run_clustering_algorithms(X, dataset_name, max_clusters=15, eps_values=None, min_samples=5):
    """Sweep KMeans over k and DBSCAN over eps, scoring every configuration.

    Args:
        X: Feature matrix (NumPy array; it is indexed with boolean masks).
        dataset_name: Label used in progress messages.
        max_clusters: Largest k tried for KMeans. The sweep starts at k=2 and
            is capped at ``n_samples - 1`` because the silhouette score is
            undefined beyond that.
        eps_values: Iterable of DBSCAN ``eps`` values to try. Defaults to
            ``np.linspace(0.1, 3.0, 30)``.
        min_samples: DBSCAN ``min_samples`` used for every ``eps``.

    Returns:
        dict with two keys:

        * ``'kmeans'``: ``best_k`` (highest silhouette), ``labels`` for that k,
          the per-k lists ``silhouette`` / ``db_score`` / ``ch_score``, the
          matching ``k_values`` and ``best_metrics`` for the chosen k.
        * ``'dbscan'``: ``None`` when no eps yields a positive silhouette,
          otherwise ``best_eps``, ``min_samples``, ``noise_ratio``,
          ``n_clusters``, ``labels``, the full per-eps ``results`` list and
          ``best_metrics``. DBSCAN metrics are computed on non-noise points
          only.

    Raises:
        ValueError: If there are too few samples to try any k >= 2.
    """
    results = {}

    n_samples = len(X)
    k_values = list(range(2, min(max_clusters, n_samples - 1) + 1))
    if not k_values:
        raise ValueError(
            f"Need at least 3 samples and max_clusters >= 2 to cluster {dataset_name}"
        )

    # ---- KMeans: sweep k, keep labels so the winner needs no refit ----
    print(f"\nЗапуск KMeans для {dataset_name}...")
    silhouette_scores = []
    db_scores = []
    ch_scores = []
    kmeans_labels = []

    for k in k_values:
        labels = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(X)
        kmeans_labels.append(labels)
        silhouette_scores.append(silhouette_score(X, labels))
        db_scores.append(davies_bouldin_score(X, labels))
        ch_scores.append(calinski_harabasz_score(X, labels))

    # Best k by silhouette. The fit is seeded, so the labels and metrics from
    # the sweep are exactly what a refit would give.
    best_idx = int(np.argmax(silhouette_scores))
    best_k = k_values[best_idx]

    results['kmeans'] = {
        'best_k': best_k,
        'labels': kmeans_labels[best_idx],
        'k_values': k_values,
        'silhouette': silhouette_scores,
        'db_score': db_scores,
        'ch_score': ch_scores,
        'best_metrics': {
            'silhouette': silhouette_scores[best_idx],
            'db_score': db_scores[best_idx],
            'ch_score': ch_scores[best_idx]
        }
    }

    # ---- DBSCAN: sweep eps at a fixed min_samples ----
    print(f"Запуск DBSCAN для {dataset_name}...")
    if eps_values is None:
        eps_values = np.linspace(0.1, 3.0, 30)

    dbscan_results = []
    dbscan_labels = []

    for eps in eps_values:
        labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X)
        dbscan_labels.append(labels)

        # DBSCAN marks noise with the label -1
        clustered = labels != -1
        n_clustered = int(np.sum(clustered))
        n_clusters = len(np.unique(labels[clustered])) if n_clustered > 0 else 0
        noise_ratio = float(np.sum(~clustered) / len(labels))

        # Metrics on non-noise points only; they need at least two clusters
        # and fewer clusters than points.
        if 1 < n_clusters < n_clustered:
            silhouette = silhouette_score(X[clustered], labels[clustered])
            db = davies_bouldin_score(X[clustered], labels[clustered])
            ch = calinski_harabasz_score(X[clustered], labels[clustered])
        else:
            silhouette = db = ch = np.nan

        dbscan_results.append({
            'eps': float(eps),
            'min_samples': min_samples,
            'noise_ratio': noise_ratio,
            'silhouette': silhouette,
            'db_score': db,
            'ch_score': ch,
            'n_clusters': n_clusters
        })

    # Candidates: configurations with a positive silhouette (NaN > 0 is False)
    candidates = [i for i, r in enumerate(dbscan_results) if r['silhouette'] > 0]
    if candidates:
        best_i = max(candidates, key=lambda i: dbscan_results[i]['silhouette'])
        best_dbscan = dbscan_results[best_i]

        results['dbscan'] = {
            'best_eps': best_dbscan['eps'],
            'min_samples': best_dbscan['min_samples'],
            'noise_ratio': best_dbscan['noise_ratio'],
            'n_clusters': best_dbscan['n_clusters'],
            'labels': dbscan_labels[best_i],
            'results': dbscan_results,
            'best_metrics': {
                'silhouette': best_dbscan['silhouette'],
                'db_score': best_dbscan['db_score'],
                'ch_score': best_dbscan['ch_score']
            }
        }
    else:
        results['dbscan'] = None

    return results

# Cluster each preprocessed dataset with the default search settings
results_ds1 = run_clustering_algorithms(X=X_processed_ds1, dataset_name='Dataset 01')
results_ds2 = run_clustering_algorithms(X=X_processed_ds2, dataset_name='Dataset 02')
results_ds3 = run_clustering_algorithms(X=X_processed_ds3, dataset_name='Dataset 03')

In [None]:
from sklearn.decomposition import PCA

def plot_pca_clusters(X, labels, title, filename):
    """Save a 2-D PCA scatter plot of ``X`` coloured by cluster label.

    Args:
        X: Feature matrix to project onto its first two principal components.
        labels: One cluster label per row of ``X``; used as the point colour.
        title: Figure title.
        filename: File name (inside ``artifacts/figures/``) to save to.

    Returns:
        None. The figure is written to disk and closed, not shown.
    """
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X)

    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        scatter = ax.scatter(
            X_pca[:, 0],
            X_pca[:, 1],
            c=labels,
            cmap='viridis',
            alpha=0.6,
            s=30,
            edgecolors='w',
            linewidth=0.5
        )

        fig.colorbar(scatter, ax=ax, label='Cluster')
        ax.set_title(title, fontsize=14)
        ax.set_xlabel(f'PCA 1 ({pca.explained_variance_ratio_[0]:.1%} variance)', fontsize=12)
        ax.set_ylabel(f'PCA 2 ({pca.explained_variance_ratio_[1]:.1%} variance)', fontsize=12)
        ax.grid(alpha=0.3)

        # savefig does not create missing directories
        out_path = os.path.join('artifacts', 'figures', filename)
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        fig.savefig(out_path, dpi=300, bbox_inches='tight')
    finally:
        # Always release the figure, even if plotting or saving failed
        plt.close(fig)

def plot_metrics_vs_param(param_values, metric_values, param_name, metric_name, title, filename):
    """Save a line plot of a metric against a hyper-parameter.

    Args:
        param_values: X-axis values (one per tried parameter value).
        metric_values: Y-axis values, same length as ``param_values``.
        param_name: X-axis label.
        metric_name: Y-axis label.
        title: Figure title.
        filename: File name (inside ``artifacts/figures/``) to save to.

    Returns:
        None. The figure is written to disk and closed, not shown.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        ax.plot(param_values, metric_values, 'b-o', linewidth=2, markersize=6)
        ax.set_title(title, fontsize=14)
        ax.set_xlabel(param_name, fontsize=12)
        ax.set_ylabel(metric_name, fontsize=12)
        ax.grid(alpha=0.3)

        # savefig does not create missing directories
        out_path = os.path.join('artifacts', 'figures', filename)
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        fig.savefig(out_path, dpi=300, bbox_inches='tight')
    finally:
        # Always release the figure, even if plotting or saving failed
        plt.close(fig)

# One entry per dataset: (display number, file tag, features, clustering results)
_plot_targets = [
    ('01', 'ds1', X_processed_ds1, results_ds1),
    ('02', 'ds2', X_processed_ds2, results_ds2),
    ('03', 'ds3', X_processed_ds3, results_ds3),
]

# PCA scatter of the best KMeans partition for every dataset
# (KMeans labels are plotted regardless of which algorithm is later chosen)
for _number, _tag, _features, _results in _plot_targets:
    _kmeans = _results['kmeans']
    plot_pca_clusters(
        _features,
        _kmeans['labels'],
        f'PCA: KMeans Clusters (k={_kmeans["best_k"]}) - Dataset {_number}',
        f'pca_clusters_{_tag}.png'
    )

# KMeans: silhouette vs k
for _number, _tag, _features, _results in _plot_targets:
    _scores = _results['kmeans']['silhouette']
    plot_metrics_vs_param(
        # The sweep starts at k=2; derive the axis from the number of scores
        # so it cannot drift from the max_clusters actually used.
        range(2, 2 + len(_scores)),
        _scores,
        'Number of clusters (k)',
        'Silhouette Score',
        f'Silhouette Score vs k - Dataset {_number}',
        f'silhouette_vs_k_{_tag}.png'
    )

# DBSCAN: silhouette vs eps (only where DBSCAN produced a usable clustering)
for _number, _tag, _features, _results in _plot_targets:
    if _results['dbscan'] is None:
        continue
    _sweep = _results['dbscan']['results']
    plot_metrics_vs_param(
        [r['eps'] for r in _sweep],
        [r['silhouette'] for r in _sweep],
        'eps',
        'Silhouette Score',
        f'Silhouette Score vs eps - Dataset {_number}',
        f'silhouette_vs_eps_{_tag}.png'
    )

In [None]:
from sklearn.metrics import adjusted_rand_score

def check_stability(X, n_clusters=4, n_runs=5, dataset_name='Dataset'):
    """Measure how much KMeans partitions change across random seeds.

    KMeans is fitted ``n_runs`` times with ``random_state`` 0..n_runs-1 and
    the Adjusted Rand Index is computed between each pair of consecutive
    runs. The ARI series is saved as
    ``artifacts/figures/stability_<dataset_name>.png`` (name lower-cased,
    spaces replaced by underscores).

    Args:
        X: Feature matrix.
        n_clusters: Number of clusters for every run.
        n_runs: Number of differently seeded fits.
        dataset_name: Label used in the plot title and file name.

    Returns:
        ``(mean, std)`` of the consecutive-run ARI values, or ``(nan, nan)``
        when fewer than two runs were requested.
    """
    all_labels = [
        KMeans(n_clusters=n_clusters, random_state=seed, n_init=10).fit_predict(X)
        for seed in range(n_runs)
    ]
    # ARI between run i-1 and run i
    ari_scores = [
        adjusted_rand_score(previous, current)
        for previous, current in zip(all_labels, all_labels[1:])
    ]

    fig, ax = plt.subplots(figsize=(8, 5))
    try:
        ax.plot(range(1, len(ari_scores) + 1), ari_scores, 'bo-', linewidth=2, markersize=8)
        ax.set_title(f'Устойчивость KMeans (k={n_clusters}) - {dataset_name}', fontsize=14)
        ax.set_xlabel('Номер запуска', fontsize=12)
        ax.set_ylabel('Adjusted Rand Index', fontsize=12)
        # ARI can be negative; widen the axis instead of hiding such points
        ax.set_ylim(min([0.0] + ari_scores), 1.1)
        ax.grid(alpha=0.3)

        # savefig does not create missing directories
        out_path = os.path.join(
            'artifacts', 'figures',
            f'stability_{dataset_name.lower().replace(" ", "_")}.png'
        )
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        fig.savefig(out_path, dpi=300)
    finally:
        plt.close(fig)

    if not ari_scores:
        # Nothing to compare with fewer than two runs
        return float('nan'), float('nan')
    return np.mean(ari_scores), np.std(ari_scores)

# Seed-stability of KMeans on Dataset 02, using the k selected for it earlier
print("\nПроверка устойчивости для Dataset 02...")
_stability_ds2 = check_stability(
    X_processed_ds2,
    dataset_name='Dataset 02',
    n_clusters=results_ds2['kmeans']['best_k'],
)
mean_ari, std_ari = _stability_ds2
print(f"Средний ARI: {mean_ari:.4f}, Стандартное отклонение: {std_ari:.4f}")

In [None]:
import json

def select_best_algorithm(results, dataset_name, max_noise_ratio=0.3):
    """Pick ``'kmeans'`` or ``'dbscan'`` for a dataset.

    DBSCAN is chosen only when it produced a result, its best silhouette is
    strictly higher than KMeans' and its noise ratio is strictly below
    ``max_noise_ratio``. In every other case KMeans is chosen.

    Args:
        results: Mapping with a ``'kmeans'`` entry and a ``'dbscan'`` entry
            (the latter may be ``None``). Reads ``best_metrics['silhouette']``
            from both and ``noise_ratio`` from the DBSCAN entry.
        dataset_name: Accepted for call-site symmetry; not used in the decision.
        max_noise_ratio: Largest acceptable share of points DBSCAN marked as
            noise (exclusive).

    Returns:
        ``'dbscan'`` or ``'kmeans'``.
    """
    dbscan = results['dbscan']
    if not dbscan:
        # DBSCAN found no usable clustering
        return 'kmeans'

    kmeans_score = results['kmeans']['best_metrics']['silhouette']
    dbscan_score = dbscan['best_metrics']['silhouette']

    if dbscan_score > kmeans_score and dbscan['noise_ratio'] < max_noise_ratio:
        return 'dbscan'
    return 'kmeans'

# Choose the best algorithm for every dataset
best_alg_ds1 = select_best_algorithm(results_ds1, 'Dataset 01')
best_alg_ds2 = select_best_algorithm(results_ds2, 'Dataset 02')
best_alg_ds3 = select_best_algorithm(results_ds3, 'Dataset 03')

print(f"\nЛучший алгоритм для Dataset 01: {best_alg_ds1}")
print(f"Лучший алгоритм для Dataset 02: {best_alg_ds2}")
print(f"Лучший алгоритм для Dataset 03: {best_alg_ds3}")


def _metrics_entry(results):
    """Build the metrics_summary record for one dataset."""
    kmeans_best = results['kmeans']['best_metrics']
    dbscan = results['dbscan']
    # DBSCAN may have produced nothing; its fields are then exported as null
    dbscan_best = dbscan['best_metrics'] if dbscan else {}
    return {
        "kmeans": {
            "silhouette": kmeans_best['silhouette'],
            "davies_bouldin": kmeans_best['db_score'],
            "calinski_harabasz": kmeans_best['ch_score']
        },
        "dbscan": {
            "silhouette": dbscan_best.get('silhouette'),
            "davies_bouldin": dbscan_best.get('db_score'),
            "calinski_harabasz": dbscan_best.get('ch_score'),
            "noise_ratio": dbscan['noise_ratio'] if dbscan else None
        }
    }


def _config_entry(results, best_algorithm):
    """Build the best_configs record for one dataset."""
    is_kmeans = best_algorithm == 'kmeans'
    is_dbscan = best_algorithm == 'dbscan'
    return {
        "best_algorithm": best_algorithm,
        "parameters": {
            "k": results['kmeans']['best_k'] if is_kmeans else None,
            "eps": results['dbscan']['best_eps'] if is_dbscan else None,
            "min_samples": results['dbscan']['min_samples'] if is_dbscan else None
        },
        "selection_criterion": "silhouette_score"
    }


def _json_default(value):
    """Let json serialise NumPy scalars by converting them to Python numbers."""
    if isinstance(value, np.generic):
        return value.item()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


def _save_labels(sample_ids, labels, path):
    """Write a sample_id -> cluster_label table to ``path`` and return it."""
    frame = pd.DataFrame({
        'sample_id': sample_ids,
        'cluster_label': labels
    })
    frame.to_csv(path, index=False)
    return frame


# Neither open() nor to_csv() creates missing directories
os.makedirs('artifacts/labels', exist_ok=True)

# Build and save metrics_summary.json
metrics_summary = {
    "dataset_01": _metrics_entry(results_ds1),
    "dataset_02": _metrics_entry(results_ds2),
    "dataset_03": _metrics_entry(results_ds3)
}

with open('artifacts/metrics_summary.json', 'w') as f:
    json.dump(metrics_summary, f, indent=4, default=_json_default)

# Build and save best_configs.json
best_configs = {
    "dataset_01": _config_entry(results_ds1, best_alg_ds1),
    "dataset_02": _config_entry(results_ds2, best_alg_ds2),
    "dataset_03": _config_entry(results_ds3, best_alg_ds3)
}

with open('artifacts/best_configs.json', 'w') as f:
    json.dump(best_configs, f, indent=4, default=_json_default)

# Save the cluster labels of the chosen model for every dataset
best_labels_ds1 = results_ds1[best_alg_ds1]['labels']
best_labels_df1 = _save_labels(sample_ids_ds1, best_labels_ds1, 'artifacts/labels/labels_hw07_ds1.csv')

best_labels_ds2 = results_ds2[best_alg_ds2]['labels']
best_labels_df2 = _save_labels(sample_ids_ds2, best_labels_ds2, 'artifacts/labels/labels_hw07_ds2.csv')

best_labels_ds3 = results_ds3[best_alg_ds3]['labels']
best_labels_df3 = _save_labels(sample_ids_ds3, best_labels_ds3, 'artifacts/labels/labels_hw07_ds3.csv')

print("\nВсе артефакты успешно сохранены:")
print("- metrics_summary.json")
print("- best_configs.json")
print("- labels_hw07_ds1.csv, labels_hw07_ds2.csv, labels_hw07_ds3.csv")
# The number of figures depends on how many datasets DBSCAN succeeded on,
# so no fixed count is reported.
print("- графики в artifacts/figures/")