<a href="https://colab.research.google.com/github/IgorDiamandi/base_ML_Project/blob/BestResult/Colab_version.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import gdown
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from pathlib import Path

def download_from_gdrive(url, filename):
    # Extract the file ID from the URL
    file_id = url.split('/')[-2]
    download_url = f"https://drive.google.com/uc?id={file_id}"

    # Download the file
    if Path(filename).exists():
        print(f"File '{filename}' already exists. Skipping download.")
    else:
        gdown.download(download_url, filename, quiet=False)
        print(f"File downloaded as: {filename}")

train = 'https://drive.google.com/file/d/1guqSpDv1Q7ZZjSbXMYGbrTvGns0VCyU5/view?usp=drive_link'
valid = 'https://drive.google.com/file/d/1j7x8xhMimKbvW62D-XeDfuRyj9ia636q/view?usp=drive_link'
# Example usage

download_from_gdrive(train, 'train.csv')
download_from_gdrive(valid, 'valid.csv')

Downloading...
From (original): https://drive.google.com/uc?id=1guqSpDv1Q7ZZjSbXMYGbrTvGns0VCyU5
From (redirected): https://drive.google.com/uc?id=1guqSpDv1Q7ZZjSbXMYGbrTvGns0VCyU5&confirm=t&uuid=49f8d0be-2fbc-4513-aa9e-cf8ac9f8e1cb
To: /content/train.csv
100%|██████████| 116M/116M [00:01<00:00, 81.5MB/s]


File downloaded as: train.csv


Downloading...
From: https://drive.google.com/uc?id=1j7x8xhMimKbvW62D-XeDfuRyj9ia636q
To: /content/valid.csv
100%|██████████| 3.32M/3.32M [00:00<00:00, 22.7MB/s]

File downloaded as: valid.csv





In [2]:
# model functions

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

def train_and_evaluate_model(X_train, X_test, y_train, y_test, tree_depth, level_of_parallelism, number_of_trees,
                             max_features):
    for depth in tree_depth:
        model = RandomForestRegressor(
                random_state=100,
                n_jobs=level_of_parallelism,
                n_estimators=number_of_trees,
                max_depth=depth,
                max_features=max_features)


        print('Fitting the model...')
        model.fit(X_train, y_train)

        print('Testing the model...')
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        rmse_test = get_rmse(y_test, y_test_pred)
        rmse_train = get_rmse(y_train, y_train_pred)

        print(f'Tree depth - {depth}')
        print(f'STD Test - {y_test.std()}')
        print(f'STD Train - {y_train.std()}')
        print(f'RMSE Test - {rmse_test}')
        print(f'RMSE Train - {rmse_train}')

    return model


In [3]:
# data functions

import zipfile
from datetime import datetime
import numpy as np
from scipy.stats import zscore
import pandas as pd

def handle_outliers_iqr(df, columns):
    for column in columns:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        df[column] = np.where(df[column] < lower_bound, lower_bound, df[column])
        df[column] = np.where(df[column] > upper_bound, upper_bound, df[column])

    return df


# Function to handle outliers using Z-score method
def handle_outliers_zscore(df, columns, threshold=3):
    for column in columns:
        z_scores = zscore(df[column])
        abs_z_scores = np.abs(z_scores)
        filtered_entries = abs_z_scores < threshold
        df = df[filtered_entries]
    return df

# Function to handle outliers using 6-Sigma method
def handle_outliers_six_sigma(df, columns):
    for column in columns:
        mean = df[column].mean()
        std = df[column].std()
        lower_bound = mean - 3 * std
        upper_bound = mean + 3 * std
        outliers = (df[column] < lower_bound) | (df[column] > upper_bound)
        df.loc[outliers, column] = mean
    return df

def mean_without_extremums(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    filtered_df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    mean_value = filtered_df[column].mean()
    return mean_value


def replace_outliers_with_mean(df, columns, threshold=3):
    for column in columns:
        mean_value = mean_without_extremums(df, column)
        z_scores = zscore(df[column])
        outliers = np.abs(z_scores) > threshold
        print(f'Column: {column}')
        print(f'Outliers - Mean without extremums: {mean_value}')
        print(f'Outliers - {df.loc[outliers, column]}')
        df.loc[outliers, column] = mean_value

    return df


def remove_columns_with_many_nulls(df, threshold=0.5):
    null_percentage = df.isnull().mean()
    columns_to_remove = null_percentage[null_percentage > threshold].index
    df_cleaned = df.drop(columns=columns_to_remove)

    return df_cleaned


def replace_null_with_mean(df, columns):
    for column in columns:
        mean_value = df[column].mean()
        df[column].fillna(mean_value, inplace=True)
    return df



def handle_missing_values(features):
    numeric_cols = features.select_dtypes(include=['number']).columns
    non_numeric_cols = features.select_dtypes(exclude=['number']).columns

    features[numeric_cols] = features[numeric_cols].fillna(features[numeric_cols].median())
    features[non_numeric_cols] = features[non_numeric_cols].fillna(features[non_numeric_cols].mode().iloc[0])

    return features

def get_rmse(y, y_pred):
    return mean_squared_error(y, y_pred) ** 0.5


def split_product_class_series(series):
    equipment_type = []
    details = []

    for item in series:
        if pd.isna(item):
            equipment_type.append(None)
            details.append(None)
        else:
            split_item = item.split(' - ', 1)
            equipment_type.append(split_item[0])
            details.append(split_item[1] if len(split_item) > 1 else None)

    return pd.Series(equipment_type), pd.Series(details)

In [4]:
df = pd.read_csv('train.csv')
df_valid = pd.read_csv('valid.csv')

  df = pd.read_csv('train.csv')


In [5]:
# convert 'YearMade' column to 'Age'

df['Age'] = 2024 - df['YearMade']
df.drop('YearMade', axis=1, inplace=True)

df_valid['Age'] = 2024 - df_valid['YearMade']
df_valid.drop('YearMade', axis=1, inplace=True)

In [7]:
# handle outliers in the 'SalePrice' and the 'MachineCurrentMeter' columns using IQR method
df = handle_outliers_iqr(df, ['SalePrice', 'MachineHoursCurrentMeter'])
df_valid = handle_outliers_iqr(df_valid, ['MachineHoursCurrentMeter'])

In [8]:
# replace outliers in the 'Age' column using mean

df = replace_outliers_with_mean(df, ['Age'], 1)
df_valid = replace_outliers_with_mean(df_valid, ['Age'], 1)

Column: Age
Outliers - Mean without extremums: 30.16899511495046
Outliers - 9         1024
21        1024
33        1024
35        1024
36        1024
          ... 
401066    1024
401069    1024
401072    1024
401073    1024
401075    1024
Name: Age, Length: 38185, dtype: int64
Column: Age
Outliers - Mean without extremums: 23.986942862752798
Outliers - 0        1024
3        1024
6        1024
7        1024
8        1024
         ... 
11515    1024
11523    1024
11526    1024
11529    1024
11546    1024
Name: Age, Length: 1206, dtype: int64


In [9]:
# split 'fiProductDesc' column

df['ProductClassName'],df['ProductClassCharacteristic'] = split_product_class_series(df['fiProductClassDesc'])
df_valid['ProductClassName'],df_valid['ProductClassCharacteristic'] = split_product_class_series(df_valid['fiProductClassDesc'])

In [10]:
# convert 'MachineID' column into 'TimesAppearing' column

df['TimesAppearing'] = df['MachineID'].map(df['MachineID'].value_counts())
df.drop('MachineID', axis=1, inplace=True)

df_valid['TimesAppearing'] = df_valid['MachineID'].map(df_valid['MachineID'].value_counts())
df_valid.drop('MachineID', axis=1, inplace=True)

In [11]:
# remove duplicated columns: 'ProductGroupDesc'

df.drop(['ProductGroupDesc','ProductClassName'], axis=1, inplace=True)
df_valid.drop(['ProductGroupDesc','ProductClassName'], axis=1, inplace=True)

In [12]:
# replace NaN values with 'missing' for non-numeric fields and with -1 for numeric

for col in df.columns:
    if df[col].dtype == 'object':
        df[col].fillna('missing', inplace=True)
    else:
        df[col].fillna(-1, inplace=True)


for col in df_valid.columns:
    if df_valid[col].dtype == 'object':
        df_valid[col].fillna('missing', inplace=True)
    else:
        df_valid[col].fillna(-1, inplace=True)

In [14]:
# replace outliers in numeric columns with IQR method using replace_outliers_with_mean() function

df = replace_outliers_with_mean(df, ['MachineHoursCurrentMeter', 'SalePrice'], 1)
df_valid = replace_outliers_with_mean(df_valid, ['MachineHoursCurrentMeter'], 1)

Column: MachineHoursCurrentMeter
Outliers - Mean without extremums: -0.7793994316502052
Outliers - 4          722.0
5          508.0
10        1414.0
13        1921.0
16        1972.0
           ...  
397268    1721.0
397998    1631.0
399782    1528.0
400861    2196.0
401055    1120.0
Name: MachineHoursCurrentMeter, Length: 24648, dtype: float64
Column: SalePrice
Outliers - Mean without extremums: 25646.164709184908
Outliers - 2         10000.0
3         38500.0
4         11000.0
12        36000.0
16        13500.0
           ...   
401115    10000.0
401117    10000.0
401120    10500.0
401121    11000.0
401122    11500.0
Name: SalePrice, Length: 140318, dtype: float64
Column: MachineHoursCurrentMeter
Outliers - Mean without extremums: 187.50752290434633
Outliers - 1        4412.0
3        4682.0
7        3998.0
9        3211.0
15       3145.0
          ...  
10092    3817.0
10251    5200.0
10275    3397.0
11003    2200.0
11360    2683.0
Name: MachineHoursCurrentMeter, Length: 1544, dty

In [15]:
df.head()

Unnamed: 0,SalesID,SalePrice,ModelID,datasource,auctioneerID,MachineHoursCurrentMeter,UsageBand,saledate,fiModelDesc,fiBaseModel,...,Pattern_Changer,Grouser_Type,Backhoe_Mounting,Blade_Type,Travel_Controls,Differential_Type,Steering_Controls,Age,ProductClassCharacteristic,TimesAppearing
0,1139246,30141.526245,3157,121,3.0,68.0,Low,11/16/2006 0:00,521D,521,...,missing,missing,missing,missing,missing,Standard,Conventional,20.0,110.0 to 120.0 Horsepower,1
1,1139248,30141.526245,77,121,3.0,-0.779399,Low,3/26/2004 0:00,950FII,950,...,missing,missing,missing,missing,missing,Standard,Conventional,28.0,150.0 to 175.0 Horsepower,1
2,1139249,25646.164709,7009,121,3.0,-0.779399,High,2/26/2004 0:00,226,226,...,missing,missing,missing,missing,missing,missing,missing,23.0,1351.0 to 1601.0 Lb Operating Capacity,3
3,1139251,25646.164709,332,121,3.0,-0.779399,High,5/19/2011 0:00,PC120-6E,PC120,...,missing,missing,missing,missing,missing,missing,missing,23.0,12.0 to 14.0 Metric Tons,1
4,1139253,25646.164709,17311,121,3.0,-0.779399,Medium,7/23/2009 0:00,S175,S175,...,missing,missing,missing,missing,missing,missing,missing,17.0,1601.0 to 1751.0 Lb Operating Capacity,1


In [16]:
df_valid.head()

Unnamed: 0,SalesID,ModelID,datasource,auctioneerID,MachineHoursCurrentMeter,UsageBand,saledate,fiModelDesc,fiBaseModel,fiSecondaryDesc,...,Pattern_Changer,Grouser_Type,Backhoe_Mounting,Blade_Type,Travel_Controls,Differential_Type,Steering_Controls,Age,ProductClassCharacteristic,TimesAppearing
0,1222837,1376,121,3,0.0,missing,1/5/2012 0:00,375L,375,missing,...,None or Unspecified,Double,missing,missing,missing,missing,missing,23.986943,66.0 to 90.0 Metric Tons,1
1,1222839,36526,121,3,187.507523,Medium,1/5/2012 0:00,TX300LC2,TX300,LC,...,Yes,Double,missing,missing,missing,missing,missing,18.0,28.0 to 33.0 Metric Tons,1
2,1222841,4587,121,3,726.217192,Medium,1/5/2012 0:00,270LC,270,missing,...,None or Unspecified,Double,missing,missing,missing,missing,missing,24.0,24.0 to 28.0 Metric Tons,1
3,1222843,1954,121,3,187.507523,Low,1/5/2012 0:00,892DLC,892,D,...,None or Unspecified,Double,missing,missing,missing,missing,missing,23.986943,28.0 to 33.0 Metric Tons,1
4,1222845,4701,121,3,726.217192,Medium,1/4/2012 0:00,544H,544,H,...,missing,missing,missing,missing,missing,Standard,Conventional,22.0,120.0 to 135.0 Horsepower,1


In [17]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
for col in df.columns:
    if df[col].dtype == 'object':
        # Check if column contains both strings and numbers
        if df[col].apply(type).nunique() > 1:
            # Handle mixed types (example: fill numeric values with a placeholder string)
            df[col] = df[col].fillna('Missing').astype(str)
        df[col] = le.fit_transform(df[col])
df.head()

Unnamed: 0,SalesID,SalePrice,ModelID,datasource,auctioneerID,MachineHoursCurrentMeter,UsageBand,saledate,fiModelDesc,fiBaseModel,...,Pattern_Changer,Grouser_Type,Backhoe_Mounting,Blade_Type,Travel_Controls,Differential_Type,Steering_Controls,Age,ProductClassCharacteristic,TimesAppearing
0,1139246,30141.526245,3157,121,3.0,68.0,1,668,949,295,...,3,3,2,10,7,3,1,20.0,8,1
1,1139248,30141.526245,77,121,3.0,-0.779399,1,1722,1724,526,...,3,3,2,10,7,3,1,28.0,20,1
2,1139249,25646.164709,7009,121,3.0,-0.779399,0,1382,330,109,...,3,3,2,10,7,4,5,23.0,15,3
3,1139251,25646.164709,332,121,3.0,-0.779399,0,2394,3673,1374,...,3,3,2,10,7,4,5,23.0,9,1
4,1139253,25646.164709,17311,121,3.0,-0.779399,2,3179,4207,1528,...,3,3,2,10,7,4,5,17.0,25,1


In [18]:
for col in df_valid.columns:
    if df_valid[col].dtype == 'object':
        # Check if column contains both strings and numbers
        if df_valid[col].apply(type).nunique() > 1:
            # Handle mixed types (example: fill numeric values with a placeholder string)
            df_valid[col] = df_valid[col].fillna('Missing').astype(str)
        df_valid[col] = le.fit_transform(df_valid[col])

In [19]:
# show correlation of 'SalePrice' column with reso of the columns

df.corr()['SalePrice'].sort_values()

ProductSize                  -0.129917
Age                          -0.123912
fiModelDescriptor            -0.113487
Coupler                      -0.100409
Hydraulics                   -0.085575
Enclosure_Type               -0.083240
Grouser_Tracks               -0.074485
Tire_Size                    -0.072210
Coupler_System               -0.071141
Blade_Extension              -0.070311
Stick_Length                 -0.069379
Scarifier                    -0.068241
Engine_Horsepower            -0.066736
Tip_Control                  -0.065632
Blade_Width                  -0.063510
Hydraulics_Flow              -0.062499
Pushblock                    -0.057916
Ripper                       -0.046507
UsageBand                    -0.046254
Travel_Controls              -0.043382
Thumb                        -0.043040
Grouser_Type                 -0.040975
Differential_Type            -0.040967
Pattern_Changer              -0.039997
Steering_Controls            -0.033783
Enclosure                

In [21]:
target = df['SalePrice']
features = df.drop(columns=['SalePrice'])

In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.3, random_state=100)

model = train_and_evaluate_model(X_train, X_test, y_train, y_test, [15], -1, 20,
                                 0.5)

Fitting the model...
Testing the model...
Tree depth - 15
STD Test - 3940.2197109280073
STD Train - 3959.6717755222676
RMSE Test - 3226.557745266436
RMSE Train - 2945.2377809521568


In [23]:
X_valid = df_valid
y_valid_pred = model.predict(X_valid)

# Create the prediction DataFrame with only 'SalesID' and 'Predicted_SalePrice'
df_predictions = pd.DataFrame({
    'SalesID': df_valid['SalesID'],
    'SalePrice': y_valid_pred
})
df_predictions.to_csv('valid_predictions.csv', index=False)

In [24]:
from google.colab import files
files.download('valid_predictions.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# print features importance order it descending

feature_importances = model.feature_importances_
feature_names = X_train.columns

importance_list = [(importance, feature) for feature, importance in zip(feature_names, feature_importances)]
importance_list.sort(reverse=True)

for importance, feature in importance_list:
    print(f"{feature}: {importance}")


Age: 0.19937407506585206
ProductSize: 0.10691889839352231
SalesID: 0.08207153754382275
ModelID: 0.07300923476058667
ProductClassCharacteristic: 0.05319805945419628
fiSecondaryDesc: 0.05011915152936151
fiModelDesc: 0.04547696431017013
fiProductClassDesc: 0.04328313362428725
fiBaseModel: 0.03834234202871468
fiModelDescriptor: 0.03582192624646638
saledate: 0.030655716543375417
state: 0.020760749250534297
ProductGroup: 0.020564596106261848
Tire_Size: 0.016754501107996818
Enclosure: 0.015835870041758633
auctioneerID: 0.015216829140987745
MachineHoursCurrentMeter: 0.014636762750531834
Drive_System: 0.013355963856597172
Pad_Type: 0.012103941017270674
Coupler_System: 0.011823749254768676
Track_Type: 0.011386670539874175
datasource: 0.009893314497543201
Ripper: 0.007958961846226305
fiModelSeries: 0.007183775188782225
Hydraulics: 0.007051763375061887
Forks: 0.007007770072411807
TimesAppearing: 0.005472211436897077
Grouser_Tracks: 0.005382902937514509
Ride_Control: 0.005382876577428912
Coupler: 0