<a href="https://colab.research.google.com/github/IgorDiamandi/base_ML_Project/blob/BestResult/Colab_version.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import gdown
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from pathlib import Path

def download_from_gdrive(url, filename):
    # Extract the file ID from the URL
    file_id = url.split('/')[-2]
    download_url = f"https://drive.google.com/uc?id={file_id}"

    # Download the file
    if Path(filename).exists():
        print(f"File '{filename}' already exists. Skipping download.")
    else:
        gdown.download(download_url, filename, quiet=False)
        print(f"File downloaded as: {filename}")

train = 'https://drive.google.com/file/d/1guqSpDv1Q7ZZjSbXMYGbrTvGns0VCyU5/view?usp=drive_link'
valid = 'https://drive.google.com/file/d/1j7x8xhMimKbvW62D-XeDfuRyj9ia636q/view?usp=drive_link'
# Example usage

download_from_gdrive(train, 'train.csv')
download_from_gdrive(valid, 'valid.csv')

Downloading...
From (original): https://drive.google.com/uc?id=1guqSpDv1Q7ZZjSbXMYGbrTvGns0VCyU5
From (redirected): https://drive.google.com/uc?id=1guqSpDv1Q7ZZjSbXMYGbrTvGns0VCyU5&confirm=t&uuid=ee4530b2-1a7a-4756-b980-54cff6c44b49
To: /content/train.csv
100%|██████████| 116M/116M [00:01<00:00, 59.6MB/s]


File downloaded as: train.csv


Downloading...
From: https://drive.google.com/uc?id=1j7x8xhMimKbvW62D-XeDfuRyj9ia636q
To: /content/valid.csv
100%|██████████| 3.32M/3.32M [00:00<00:00, 34.2MB/s]

File downloaded as: valid.csv





In [4]:
# model functions

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

def train_and_evaluate_model(X_train, X_test, y_train, y_test, tree_depth, level_of_parallelism, number_of_trees,
                             max_features):
    for depth in tree_depth:
        model = RandomForestRegressor(
                random_state=100,
                n_jobs=level_of_parallelism,
                n_estimators=number_of_trees,
                max_depth=depth,
                max_features=max_features)


        print('Fitting the model...')
        model.fit(X_train, y_train)

        print('Testing the model...')
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        rmse_test = get_rmse(y_test, y_test_pred)
        rmse_train = get_rmse(y_train, y_train_pred)

        print(f'Tree depth - {depth}')
        print(f'STD Test - {y_test.std()}')
        print(f'STD Train - {y_train.std()}')
        print(f'RMSE Test - {rmse_test}')
        print(f'RMSE Train - {rmse_train}')

    return model


In [5]:
# data functions
import zipfile
from datetime import datetime
import numpy as np
from scipy.stats import zscore
import pandas as pd


#Function to retrieve statistic parameters from numerical columns of the train dataframe
def compute_statistics(df):
    numeric_df = df.select_dtypes(include='number')

    mean_values = numeric_df.mean()
    iqr_values = numeric_df.quantile(0.75) - numeric_df.quantile(0.25)
    zscore_values = (numeric_df.mean() / numeric_df.std()).mean()

    # Mean without extremes (assuming extremes are values outside 1.5*IQR)
    def mean_without_extremes(series):
        Q1 = series.quantile(0.25)
        Q3 = series.quantile(0.75)
        IQR = Q3 - Q1
        filtered_series = series[(series >= Q1 - 1.5 * IQR) & (series <= Q3 + 1.5 * IQR)]
        return filtered_series.mean()

    mean_no_extremes_values = numeric_df.apply(mean_without_extremes)

    statistics_df = pd.DataFrame({
        'mean': mean_values,
        'iqr': iqr_values,
        'zscore': [zscore_values] * len(numeric_df.columns),
        'mean_without_extremes': mean_no_extremes_values
    })

    statistics_df = statistics_df.T
    return statistics_df


def get_stat_value(method, column, statistics_df):
  if method not in statistics_df.index:
    raise ValueError(f"Method {method} not found in statistics DataFrame")
  return statistics_df.loc[method, column]


def replace_outliers(df, column, method, statistics_df):
    stat_value = get_stat_value(method, column, statistics_df)

    # Define the threshold for identifying outliers
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Replace outliers with the statistical method value
    df[column] = df[column].apply(lambda x: stat_value if x < lower_bound or x > upper_bound else x)

    return df


def replace_nans(df, column, method, statistics_df):
    stat_value = get_stat_value(method, column, statistics_df)
    df[column] = df[column].fillna(stat_value)

    return df


def get_rmse(y, y_pred):
    return mean_squared_error(y, y_pred) ** 0.5


def split_product_class_series(series):
    equipment_type = []
    details = []

    for item in series:
        if pd.isna(item):
            equipment_type.append(None)
            details.append(None)
        else:
            split_item = item.split(' - ', 1)
            equipment_type.append(split_item[0])
            details.append(split_item[1] if len(split_item) > 1 else None)

    return pd.Series(equipment_type), pd.Series(details)

In [6]:
df = pd.read_csv('train.csv')
df_valid = pd.read_csv('valid.csv')

  df = pd.read_csv('train.csv')


In [7]:
# get df columns datatypes

df.dtypes

SalesID                       int64
SalePrice                     int64
MachineID                     int64
ModelID                       int64
datasource                    int64
auctioneerID                float64
YearMade                      int64
MachineHoursCurrentMeter    float64
UsageBand                    object
saledate                     object
fiModelDesc                  object
fiBaseModel                  object
fiSecondaryDesc              object
fiModelSeries                object
fiModelDescriptor            object
ProductSize                  object
fiProductClassDesc           object
state                        object
ProductGroup                 object
ProductGroupDesc             object
Drive_System                 object
Enclosure                    object
Forks                        object
Pad_Type                     object
Ride_Control                 object
Stick                        object
Transmission                 object
Turbocharged                

In [8]:
# drop bad columns

df.drop(['MachineHoursLeft'], axis=1, inplace=True)
df_valid.drop(['MachineHoursLeft'], axis=1, inplace=True)

Index(['UsageBand', 'saledate', 'fiModelDesc', 'fiBaseModel',
       'fiSecondaryDesc', 'fiModelSeries', 'fiModelDescriptor', 'ProductSize',
       'fiProductClassDesc', 'state', 'ProductGroup', 'ProductGroupDesc',
       'Drive_System', 'Enclosure', 'Forks', 'Pad_Type', 'Ride_Control',
       'Stick', 'Transmission', 'Turbocharged', 'Blade_Extension',
       'Blade_Width', 'Enclosure_Type', 'Engine_Horsepower', 'Hydraulics',
       'Pushblock', 'Ripper', 'Scarifier', 'Tip_Control', 'Tire_Size',
       'Coupler', 'Coupler_System', 'Grouser_Tracks', 'Hydraulics_Flow',
       'Track_Type', 'Undercarriage_Pad_Width', 'Stick_Length', 'Thumb',
       'Pattern_Changer', 'Grouser_Type', 'Backhoe_Mounting', 'Blade_Type',
       'Travel_Controls', 'Differential_Type', 'Steering_Controls'],
      dtype='object')

In [80]:
# convert 'YearMade' column to 'Age'

df['Age'] = 2024 - df['YearMade']
df['AgeAtLastSale'] = (pd.to_datetime(df['saledate']) - pd.to_datetime(df['YearMade'].clip(lower=1900), format='%Y')).dt.days
df.drop('YearMade', axis=1, inplace=True)

df_valid['Age'] = 2024 - df_valid['YearMade']
df_valid['AgeAtLastSale'] = (pd.to_datetime(df_valid['saledate']) - pd.to_datetime(df_valid['YearMade'].clip(lower=1900), format='%Y')).dt.days
df_valid.drop('YearMade', axis=1, inplace=True)

In [82]:
# split 'fiProductDesc' column

df['ProductClassName'],df['ProductClassCharacteristic'] = split_product_class_series(df['fiProductClassDesc'])
df_valid['ProductClassName'],df_valid['ProductClassCharacteristic'] = split_product_class_series(df_valid['fiProductClassDesc'])

In [83]:
# convert 'MachineID' column into 'TimesAppearing' column

df['TimesAppearing'] = df['MachineID'].map(df['MachineID'].value_counts())
df.drop('MachineID', axis=1, inplace=True)

df_valid['TimesAppearing'] = df_valid['MachineID'].map(df_valid['MachineID'].value_counts())
df_valid.drop('MachineID', axis=1, inplace=True)

In [85]:
# remove duplicated columns: 'ProductGroupDesc'

df.drop(['ProductGroupDesc','ProductClassName'], axis=1, inplace=True)
df_valid.drop(['ProductGroupDesc','ProductClassName'], axis=1, inplace=True)

In [87]:
# replace NaN with -1 in 'ID' columns

df[['datasource', 'auctioneerID']] = df[['datasource', 'auctioneerID']].fillna(-1)
df_valid[['datasource', 'auctioneerID']] = df_valid[['datasource', 'auctioneerID']].fillna(-1)

In [88]:
#Split train and test data

X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['SalePrice']), df['SalePrice'], test_size=0.3, random_state=100)

In [89]:
# get statistics dataframe using compute_statistics function from the X_train
statistics_df = compute_statistics(X_train)

In [92]:
""" Use replace_outliers function in order to replace outliers
in the 'MachineHoursCurrentMeter' column using statistics_df and IQR method """

X_train = replace_outliers(X_train, 'MachineHoursCurrentMeter', 'iqr', statistics_df)
X_test = replace_outliers(X_test, 'MachineHoursCurrentMeter', 'iqr', statistics_df)
df_valid = replace_outliers(df_valid, 'MachineHoursCurrentMeter', 'iqr', statistics_df)

In [93]:
""" Use replace_outliers function in order to replace outliers
in the 'Age' column using statistics_df and mean method """

X_train = replace_outliers(X_train, 'Age', 'mean_without_extremes', statistics_df)
X_test = replace_outliers(X_test, 'Age', 'mean_without_extremes', statistics_df)
df_valid = replace_outliers(df_valid, 'Age', 'mean_without_extremes', statistics_df)

In [94]:
""" Use replace_nans function in order to replace NaN values
in the 'MachineHoursCurrentMeter' column using statistics_df and IQR method """

X_train = replace_nans(X_train, 'MachineHoursCurrentMeter', 'iqr', statistics_df)
X_test = replace_nans(X_test, 'MachineHoursCurrentMeter', 'iqr', statistics_df)
df_valid = replace_nans(df_valid, 'MachineHoursCurrentMeter', 'iqr', statistics_df)

In [98]:
# replace NaN with 'Missing' in textual columns

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
for col in X_train.columns:
    if X_train[col].dtype == 'object':
        if X_train[col].apply(type).nunique() > 1:
            X_train[col] = X_train[col].fillna('Missing').astype(str)
        X_train[col] = le.fit_transform(X_train[col])

for col in X_test.columns:
    if X_test[col].dtype == 'object':
        if X_test[col].apply(type).nunique() > 1:
            X_test[col] = X_test[col].fillna('Missing').astype(str)
        X_test[col] = le.fit_transform(X_test[col])


for col in df_valid.columns:
    if df_valid[col].dtype == 'object':
        if df_valid[col].apply(type).nunique() > 1:
            df_valid[col] = df_valid[col].fillna('Missing').astype(str)
        df_valid[col] = le.fit_transform(df_valid[col])



In [99]:
# convert categorical values to lables in all 3 dataframes
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

# Create the column transformer with OneHotEncoder
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough'
)

# Fit the preprocessor on X_train and transform all dataframes
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)
X_valid_transformed = preprocessor.transform(df_valid)

# Convert transformed arrays back to dataframes with proper column names
X_train_transformed = pd.DataFrame(X_train_transformed, columns=preprocessor.get_feature_names_out())
X_test_transformed = pd.DataFrame(X_test_transformed, columns=preprocessor.get_feature_names_out())
X_valid_transformed = pd.DataFrame(X_valid_transformed, columns=preprocessor.get_feature_names_out())

In [106]:
# Train model
model = train_and_evaluate_model(X_train_transformed, X_test_transformed, y_train, y_test, [16],
                                 -1, 20, 0.5)

Fitting the model...
Testing the model...
Tree depth - 16
STD Test - 22995.709788960918
STD Train - 23054.44179044156
RMSE Test - 11573.133611443149
RMSE Train - 6057.5839482426145


In [102]:
# Use Model
y_valid_pred = model.predict(X_valid_transformed)

In [103]:
# Create the prediction DataFrame with only 'SalesID' and 'Predicted_SalePrice'
df_predictions = pd.DataFrame({
    'SalesID': X_valid_transformed['SalesID'],
    'SalePrice': y_valid_pred
})
df_predictions.to_csv('valid_predictions.csv', index=False)

In [104]:
from google.colab import files
files.download('valid_predictions.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [112]:
# print features importance order it descending

feature_importances = model.feature_importances_
feature_names = X_train.columns

importance_list = [(importance, feature) for feature, importance in zip(feature_names, feature_importances)]
importance_list.sort(reverse=True)

for importance, feature in importance_list:
    print(f"{feature}: {importance}")


AgeAtLastSale: 0.19799973933897025
ProductSize: 0.18456433912211362
Age: 0.10021552339870568
fiBaseModel: 0.06665314620390204
ModelID: 0.062032727261508006
Enclosure: 0.05932902460296898
fiProductClassDesc: 0.049593214926255
fiModelDesc: 0.04231276728386864
fiSecondaryDesc: 0.03463507852907022
Coupler_System: 0.03062715674315703
ProductClassCharacteristic: 0.030154666622460115
SalesID: 0.01767546463973773
fiModelDescriptor: 0.01579055099517206
Grouser_Tracks: 0.010833819732083156
Tire_Size: 0.009275402600574701
ProductGroup: 0.00842372027153318
Enclosure_Type: 0.0074547381414718904
saledate: 0.005834653788172878
Hydraulics_Flow: 0.005596966099010357
state: 0.004700372195093061
Blade_Width: 0.0046516516541280925
Ripper: 0.004318177286488369
fiModelSeries: 0.004231965967083417
Pushblock: 0.004176817442604224
Hydraulics: 0.003863466317511056
Tip_Control: 0.0031296014216171996
auctioneerID: 0.002851691830721068
MachineHoursCurrentMeter: 0.0022348654203995
Engine_Horsepower: 0.0021801216492