In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from collections import Counter

# Load the two vehicle status exports and stack them into one frame.
# pd.concat is the public API for this; DataFrame._append is a private helper.
# ignore_index=True gives the combined frame a unique 0..n-1 index instead of
# repeating the row labels of both files.
vehicle_status = pd.read_csv('vehicle_status.csv', delimiter=';', low_memory=False)
vehicle_status1 = pd.read_csv('vehicle_status1.csv', delimiter=';', low_memory=False)
vehicle_status = pd.concat([vehicle_status, vehicle_status1], ignore_index=True)

# Load other datasets (assuming they are used later in the process)
vehicle_roadworthiness = pd.read_csv('vehicle_roadworthiness.csv', delimiter=',')
rikke_koodid = pd.read_csv('rike.csv')

# Keep only rows that are registered, have a known county and belong to one of
# the four vehicle categories of interest.
kept_categories = ['M1G', 'M1', 'N1G', 'N1']
keep_row = (
    (vehicle_status['YLDINE_STAATUS'] == 'REGISTREERITUD')
    & (vehicle_status['VK/OM MAAKOND'] != 'MÄÄRAMATA')
    & vehicle_status['Kategooria'].isin(kept_categories)
)
# .copy() makes data_status an independent frame, so the recoding below
# modifies it unambiguously and cannot trigger SettingWithCopyWarning.
data_status = vehicle_status.loc[keep_row].copy()

# Collapse these four gearbox types into the single label 'MUU'.
gearbox_recode = dict.fromkeys(
    ['KONSTANTNE_YLEKANNE', 'DCT', 'AUTOMAT_MANUAAL', 'AMT'], 'MUU'
)
data_status['KAIGUKASTI_TYYP'] = data_status['KAIGUKASTI_TYYP'].replace(gearbox_recode)

# Merge engine type variants into their base labels.
engine_recode = {
    'BENSIIN_KATALYSAATOR': 'BENSIIN',
    'BENSIIN_HYBRIID': 'BENSIIN-ELEKTER',
    'DIISEL_HYBRIID': 'DIISEL-ELEKTER',
}
data_status['MOOTORI_TYYP'] = data_status['MOOTORI_TYYP'].replace(engine_recode)

In [4]:
# Settings for this experiment. A fixed seed makes the split, the forest and
# the sampled collections identical on every Restart & Run All.
SEED = 42
N_COLLECTIONS = 50      # how many random collections to predict
COLLECTION_SIZE = 25    # cars per collection
FEATURE_COLUMNS = ['Mark', 'Mudel', 'VARV']
TARGET_COLUMN = 'VK/OM MAAKOND'


def check_prediction_accuracy(X_test, y_test, model):
    """Return the fraction of rows for which the model predicts the true label.

    Parameters
    ----------
    X_test : feature matrix accepted by ``model.predict``.
    y_test : true labels, one per row of ``X_test``.
    model : fitted estimator exposing ``predict``.

    Returns
    -------
    float
        Mean of the element-wise equality between predictions and ``y_test``.
    """
    predictions = model.predict(X_test)
    # Compare by position; np.asarray drops any pandas index on y_test.
    accuracy = np.mean(predictions == np.asarray(y_test))
    return accuracy


if data_status is not None:
    print("DataFrame loaded successfully.")

    # Filter and select relevant columns
    data_status = data_status[
        (data_status['YLDINE_STAATUS'] != 'PEATATUD')
        & (data_status[TARGET_COLUMN] != 'MÄÄRAMATA')
    ]
    relevant_columns = ['Mark', 'Mudel', 'VK/OM MAAKOND', 'VARV']
    data = data_status[relevant_columns]

    # One-hot encode categorical features with sparse output.
    # The encoder is fitted on all rows so that every category seen at
    # prediction time is known to it.
    encoder = OneHotEncoder(sparse_output=True)
    features = encoder.fit_transform(data[FEATURE_COLUMNS])
    labels = data[TARGET_COLUMN]

    # Split features, labels and the raw rows with the same shuffle, so that
    # test_data holds exactly the rows behind X_test / y_test.
    X_train, X_test, y_train, y_test, train_data, test_data = train_test_split(
        features, labels, data,
        test_size=0.2, stratify=labels, random_state=SEED,
    )

    # Train a class-weighted random forest (100 trees).
    model = RandomForestClassifier(
        n_estimators=100, class_weight='balanced', random_state=SEED
    )
    model.fit(X_train, y_train)

    # Repeat prediction for N_COLLECTIONS collections.
    # A single generator is shared across iterations: each collection is
    # different, but the whole sequence is reproducible.
    rng = np.random.default_rng(SEED)
    collection_size = min(COLLECTION_SIZE, len(test_data))
    for _ in range(N_COLLECTIONS):
        print("Cycle")
        # Draw the collection from held-out rows only, so the model is never
        # asked about cars it was trained on.
        sample_data = test_data.sample(collection_size, random_state=rng)
        sample_features = encoder.transform(sample_data[FEATURE_COLUMNS])

        # Predict 'VK/OM MAAKOND' for the collection
        collection_prediction = model.predict(sample_features)
        most_common_prediction = Counter(collection_prediction).most_common(1)[0][0]
        print("Most common predicted county for the collection:", most_common_prediction)

    # Report accuracy on the held-out test split
    accuracy = check_prediction_accuracy(X_test, y_test, model)
    print(f"Model accuracy: {accuracy}")
else:
    print("DataFrame not loaded.")

DataFrame loaded successfully.
Cycle
Most common predicted county for the collection: HIIU MAAKOND
Cycle
Most common predicted county for the collection: LÄÄNE MAAKOND
Cycle
Most common predicted county for the collection: RAPLA MAAKOND
Cycle
Most common predicted county for the collection: JÄRVA MAAKOND
Cycle
Most common predicted county for the collection: PÕLVA MAAKOND
Cycle
Most common predicted county for the collection: PÕLVA MAAKOND
Cycle
Most common predicted county for the collection: JÕGEVA MAAKOND
Cycle
Most common predicted county for the collection: HARJU MAAKOND
Cycle
Most common predicted county for the collection: VÕRU MAAKOND
Cycle
Most common predicted county for the collection: LÄÄNE MAAKOND
Cycle
Most common predicted county for the collection: TARTU MAAKOND
Cycle
Most common predicted county for the collection: HIIU MAAKOND
Cycle
Most common predicted county for the collection: PÕLVA MAAKOND
Cycle
Most common predicted county for the collection: HARJU MAAKOND
Cycle