In [10]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import random

def generate_collision_dataset(num_features, num_samples):
    if num_features < 4:
        raise ValueError("Количество признаков должно быть не менее 4")
    
    df = pd.DataFrame()

    for obj_num in [1, 2]:
        for i in range(num_features):
            feature_type = i % 4
            
            if feature_type == 0: 
                col_values = np.random.randint(0, 2, size=num_samples)
                col_name = f"Obj{obj_num}_Feature{i+1}_binary"
            elif feature_type == 1: 
                categories = ['A', 'B', 'C']
                col_values = random.choices(categories, k=num_samples)
                col_name = f"Obj{obj_num}_Feature{i+1}_nominal"
            elif feature_type == 2: 
                levels = ['Low', 'Medium-Low', 'Medium', 'Medium-High', 'High']
                col_values = random.choices(levels, k=num_samples)
                col_name = f"Obj{obj_num}_Feature{i+1}_ordinal"
            else: 
                col_values = np.random.normal(0, 1, size=num_samples)
                col_name = f"Obj{obj_num}_Feature{i+1}_numeric"
            
            df[col_name] = col_values
    
    numeric_cols = [col for col in df.columns if 'numeric' in col]
    if len(numeric_cols) > 0:
        sum_numeric = df[numeric_cols].sum(axis=1)
        threshold = sum_numeric.median()
        df['Collision'] = (sum_numeric > threshold).astype(int)
    else:
        df['Collision'] = np.random.randint(0, 2, size=num_samples)
    
    return df

dataset = generate_collision_dataset(num_features=5, num_samples=100)
print(dataset.head())

   Obj1_Feature1_binary Obj1_Feature2_nominal Obj1_Feature3_ordinal  \
0                     0                     A           Medium-High   
1                     0                     C                  High   
2                     0                     B            Medium-Low   
3                     0                     C           Medium-High   
4                     0                     C                   Low   

   Obj1_Feature4_numeric  Obj1_Feature5_binary  Obj2_Feature1_binary  \
0              -0.419768                     0                     1   
1              -0.155999                     0                     1   
2               1.677286                     0                     1   
3              -0.094192                     0                     1   
4               0.708900                     0                     1   

  Obj2_Feature2_nominal Obj2_Feature3_ordinal  Obj2_Feature4_numeric  \
0                     A           Medium-High              -0.915146

In [11]:
dataset_params = [
    {'num_samples': 50, 'num_features': 5},
    {'num_samples': 80, 'num_features': 6},
    {'num_samples': 100, 'num_features': 4},
    
    {'num_samples': 200, 'num_features': 8},
    {'num_samples': 300, 'num_features': 9},
    {'num_samples': 500, 'num_features': 10},
    
    {'num_samples': 600, 'num_features': 12},
    {'num_samples': 800, 'num_features': 15},
    {'num_samples': 1000, 'num_features': 11},
    
    {'num_samples': 1500, 'num_features': 7},
    {'num_samples': 2000, 'num_features': 9},
    {'num_samples': 3000, 'num_features': 12}
]

datasets = []
for i, params in enumerate(dataset_params, 1):
    df = generate_collision_dataset(**params)
    datasets.append(df)
    df.to_csv(f'dataset_{i}.csv', index=False)
    print(f"Датасет {i} создан: {params['num_samples']} samples, {params['num_features']} features")

Датасет 1 создан: 50 samples, 5 features
Датасет 2 создан: 80 samples, 6 features
Датасет 3 создан: 100 samples, 4 features
Датасет 4 создан: 200 samples, 8 features
Датасет 5 создан: 300 samples, 9 features
Датасет 6 создан: 500 samples, 10 features
Датасет 7 создан: 600 samples, 12 features
Датасет 8 создан: 800 samples, 15 features
Датасет 9 создан: 1000 samples, 11 features
Датасет 10 создан: 1500 samples, 7 features
Датасет 11 создан: 2000 samples, 9 features
Датасет 12 создан: 3000 samples, 12 features


In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import time

X = dataset.drop('Collision', axis=1)
y = dataset['Collision']

X = pd.get_dummies(X)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

def evaluate_models(dataset):

    models = {
        'Logistic Regression': LogisticRegression(max_iter=1000),
        'Decision Tree': DecisionTreeClassifier(),
        'Random Forest': RandomForestClassifier(n_estimators=100),
        'SVM': SVC(),
        'KNN': KNeighborsClassifier(),
        'Naive Bayes': GaussianNB()
    }
    
    results = {}
    
    for name, model in models.items():
        start_time = time.time()
        
        model.fit(X_train, y_train)
        
        y_pred = model.predict(X_test)
        
        accuracy = accuracy_score(y_test, y_pred)
        training_time = time.time() - start_time
        
        results[name] = {
            'accuracy': accuracy,
            'time': training_time,
            'memory': model.__sizeof__()
        }
    
    return results

sample_results = evaluate_models(datasets[0])

for model, metrics in sample_results.items():
    print(f"{model}:")
    print(f"  Accuracy: {metrics['accuracy']:.4f}")
    print(f"  Time: {metrics['time']:.4f} sec")
    print(f"  Memory: {metrics['memory']} bytes")
    print()

Logistic Regression:
  Accuracy: 0.9333
  Time: 0.0028 sec
  Memory: 32 bytes

Decision Tree:
  Accuracy: 0.9000
  Time: 0.0016 sec
  Memory: 32 bytes

Random Forest:
  Accuracy: 0.9333
  Time: 0.0524 sec
  Memory: 32 bytes

SVM:
  Accuracy: 0.9000
  Time: 0.0018 sec
  Memory: 32 bytes

KNN:
  Accuracy: 0.8333
  Time: 0.0032 sec
  Memory: 32 bytes

Naive Bayes:
  Accuracy: 0.7667
  Time: 0.0015 sec
  Memory: 32 bytes



In [13]:
import joblib
import pickle

def save_model(model, filename, method='joblib'):
    if method == 'joblib':
        joblib.dump(model, filename)
    elif method == 'pickle':
        with open(filename, 'wb') as f:
            pickle.dump(model, f)
    else:
        raise ValueError("Неизвестный метод сохранения")

best_model = SVC().fit(X_train, y_train)
save_model(best_model, 'best_collision_model.joblib')

loaded_model = joblib.load('best_collision_model.joblib')