In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
import numpy as np

# Read the CSV file: only the first 10,000 rows, and only the columns listed
# below, are loaded.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# own copy of the data before running the notebook elsewhere.
file_path = r"D:\Koulu\AIDA_S24\AIDA_project\aida-projekti-local\archive\used_cars_data.csv"

selected_columns = [
    'back_legroom',
    'body_type',
    'city',
    'city_fuel_economy',
    'daysonmarket',
    'engine_displacement',
    'engine_type',
    'fleet',
    'front_legroom',
    'fuel_tank_volume',
    'fuel_type',
    'has_accidents',
    'height',
    'highway_fuel_economy',
    'horsepower',
    'isCab',
    'is_new',
    'latitude',
    'length',
    'longitude',
    'major_options',
    'make_name',
    'maximum_seating',
    'mileage',
    'model_name',
    'price',
    'seller_rating',
    'transmission_display',
    'wheel_system_display',
    'width',
    'year',
]

df = pd.read_csv(file_path, usecols=selected_columns, nrows=10000)

# Binary target: 1 = quick sale (on the market for fewer than 60 days),
# 0 = slow sale. The comparison is vectorised over the whole column; a missing
# daysonmarket value compares as False and is therefore labelled 0.
QUICK_SALE_MAX_DAYS = 60
df['quick_sale'] = (df['daysonmarket'] < QUICK_SALE_MAX_DAYS).astype('int64')

# The target is derived from daysonmarket, so the raw column is removed to
# keep it out of the feature set.
df = df.drop(columns=['daysonmarket'])

# Drop columns that contain nothing but missing values.
df = df.dropna(axis=1, how='all')

# Feature matrix (everything except the target) and target vector.
X = df.drop(columns=['quick_sale'])
y = df['quick_sale']

# Define the preprocessing steps. Columns are routed by dtype: float64/int64
# columns go to the numeric branch, object/bool columns to the categorical one.
# NOTE(review): columns of any other dtype are passed to neither branch.
numerical_features = X.select_dtypes(include=['float64', 'int64']).columns
categorical_features = X.select_dtypes(include=['object', 'bool']).columns

# Numeric branch: fill missing values with the column mean, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. Categories not seen during fit are ignored at
# prediction time instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Combine both branches into one column-wise preprocessor.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Full model: preprocessing followed by a random forest with a fixed seed.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Hold out 30% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit preprocessing and classifier together on the training rows only.
model.fit(X_train, y_train)

# Predict once and reuse the predictions for every metric below.
y_pred = model.predict(X_test)

# Compute and print the accuracy from the predictions already made, rather
# than running the whole pipeline over the test set a second time.
print("Test accuracy: %.2f" % accuracy_score(y_test, y_pred))

# Print the confusion matrix and the classification report.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Fetch the feature importances from the fitted classifier.
importances = model.named_steps['classifier'].feature_importances_

# Column names in the order the preprocessor emits them: the numeric columns
# first, then the one-hot encoded categorical columns.
fitted_preprocessor = model.named_steps['preprocessor']
numeric_names = fitted_preprocessor.transformers_[0][2].tolist()
onehot_names = list(
    fitted_preprocessor.transformers_[1][1]['onehot'].get_feature_names_out(categorical_features)
)
feature_names = numeric_names + onehot_names

# The names are assembled separately from the transformed matrix, so verify
# that they line up with the importances before pairing them. A mismatch
# (e.g. a transformer dropping a column during fit) would otherwise attach
# importances to the wrong feature names without any error.
if len(feature_names) != len(importances):
    raise ValueError(
        "Feature name count (%d) does not match importance count (%d)"
        % (len(feature_names), len(importances))
    )

# Report only the features that contribute more than 1% of total importance.
for name, importance in zip(feature_names, importances):
    if importance > 0.01:
        print("Feature: %20s - Relative importance: %4.1f%%" % (name, importance * 100))

Test accuracy: 0.80
[[ 517  454]
 [ 155 1874]]
              precision    recall  f1-score   support

           0       0.77      0.53      0.63       971
           1       0.80      0.92      0.86      2029

    accuracy                           0.80      3000
   macro avg       0.79      0.73      0.74      3000
weighted avg       0.79      0.80      0.79      3000

Feature:    city_fuel_economy - Relative importance:  1.6%
Feature:  engine_displacement - Relative importance:  1.2%
Feature: highway_fuel_economy - Relative importance:  1.8%
Feature:           horsepower - Relative importance:  1.8%
Feature:             latitude - Relative importance:  2.2%
Feature:            longitude - Relative importance:  2.2%
Feature:              mileage - Relative importance:  6.3%
Feature:                price - Relative importance:  8.7%
Feature:        seller_rating - Relative importance:  2.1%
Feature:                 year - Relative importance:  3.4%
Feature:         is_new_False - Rela