<a href="https://colab.research.google.com/github/IgorDiamandi/base_ML_Project/blob/BestResult/Colab_version.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [72]:
import gdown
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from pathlib import Path

def download_from_gdrive(url, filename):
    # Extract the file ID from the URL
    file_id = url.split('/')[-2]
    download_url = f"https://drive.google.com/uc?id={file_id}"

    # Download the file
    if Path(filename).exists():
        print(f"File '{filename}' already exists. Skipping download.")
    else:
        gdown.download(download_url, filename, quiet=False)
        print(f"File downloaded as: {filename}")

train = 'https://drive.google.com/file/d/1guqSpDv1Q7ZZjSbXMYGbrTvGns0VCyU5/view?usp=drive_link'
valid = 'https://drive.google.com/file/d/1j7x8xhMimKbvW62D-XeDfuRyj9ia636q/view?usp=drive_link'
# Example usage

download_from_gdrive(train, 'train.csv')
download_from_gdrive(valid, 'valid.csv')

File 'train.csv' already exists. Skipping download.
File 'valid.csv' already exists. Skipping download.


In [76]:
df = pd.read_csv('train.csv')

  df = pd.read_csv('train.csv')


In [73]:
# model functions

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

def train_and_evaluate_model(X_train, X_test, y_train, y_test, tree_depth, level_of_parallelism, number_of_trees,
                             max_features):
    for depth in tree_depth:
        model = RandomForestRegressor(
                random_state=100,
                n_jobs=level_of_parallelism,
                n_estimators=number_of_trees,
                max_depth=depth,
                max_features=max_features)


        print('Fitting the model...')
        model.fit(X_train, y_train)

        print('Testing the model...')
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        rmse_test = get_rmse(y_test, y_test_pred)
        rmse_train = get_rmse(y_train, y_train_pred)

        print(f'Tree depth - {depth}')
        print(f'STD Test - {y_test.std()}')
        print(f'STD Train - {y_train.std()}')
        print(f'RMSE Test - {rmse_test}')
        print(f'RMSE Train - {rmse_train}')

    return model


In [74]:
# helper functions

import zipfile
from datetime import datetime
import numpy as np
from scipy.stats import zscore
import pandas as pd

def handle_outliers_iqr(df, columns):
    for column in columns:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        df[column] = np.where(df[column] < lower_bound, lower_bound, df[column])
        df[column] = np.where(df[column] > upper_bound, upper_bound, df[column])

    return df


# Function to handle outliers using Z-score method
def handle_outliers_zscore(df, columns, threshold=3):
    for column in columns:
        z_scores = zscore(df[column])
        abs_z_scores = np.abs(z_scores)
        filtered_entries = abs_z_scores < threshold
        df = df[filtered_entries]
    return df

# Function to handle outliers using 6-Sigma method
def handle_outliers_six_sigma(df, columns):
    for column in columns:
        mean = df[column].mean()
        std = df[column].std()
        lower_bound = mean - 3 * std
        upper_bound = mean + 3 * std
        outliers = (df[column] < lower_bound) | (df[column] > upper_bound)
        df.loc[outliers, column] = mean
    return df

def mean_without_extremums(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    filtered_df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    mean_value = filtered_df[column].mean()
    return mean_value


def replace_outliers_with_mean(df, columns, threshold=3):
    for column in columns:
        mean_value = mean_without_extremums(df, column)
        z_scores = zscore(df[column])
        outliers = np.abs(z_scores) > threshold
        print(f'Column: {column}')
        print(f'Outliers - Mean without extremums: {mean_value}')
        print(f'Outliers - {df.loc[outliers, column]}')
        df.loc[outliers, column] = mean_value

    return df


def remove_columns_with_many_nulls(df, threshold=0.5):
    null_percentage = df.isnull().mean()
    columns_to_remove = null_percentage[null_percentage > threshold].index
    df_cleaned = df.drop(columns=columns_to_remove)

    return df_cleaned


def replace_null_with_mean(df, columns):
    for column in columns:
        mean_value = df[column].mean()
        df[column].fillna(mean_value, inplace=True)
    return df



def handle_missing_values(features):
    numeric_cols = features.select_dtypes(include=['number']).columns
    non_numeric_cols = features.select_dtypes(exclude=['number']).columns

    features[numeric_cols] = features[numeric_cols].fillna(features[numeric_cols].median())
    features[non_numeric_cols] = features[non_numeric_cols].fillna(features[non_numeric_cols].mode().iloc[0])

    return features

def get_rmse(y, y_pred):
    return mean_squared_error(y, y_pred) ** 0.5


def split_product_class_series(series):
    equipment_type = []
    details = []

    for item in series:
        if pd.isna(item):
            equipment_type.append(None)
            details.append(None)
        else:
            split_item = item.split(' - ', 1)
            equipment_type.append(split_item[0])
            details.append(split_item[1] if len(split_item) > 1 else None)

    return pd.Series(equipment_type), pd.Series(details)

In [75]:
# convert 'YearMade' column to 'Age'

df['Age'] = 2023 - df['YearMade']
df.drop('YearMade', axis=1, inplace=True)

KeyError: 'YearMade'

In [67]:
# handle outliers in the 'SalePrice' and the 'MachineCurrentMeter' columns using IQR method
df = handle_outliers_iqr(df, ['SalePrice', 'MachineHoursCurrentMeter'])

In [66]:
# replace outliers in the 'Age' column using mean

df = replace_outliers_with_mean(df, ['Age'], 1)

Column: Age
Outliers - Mean without extremums: 29.16899511495046
Outliers - 9         1023
21        1023
33        1023
35        1023
36        1023
          ... 
401066    1023
401069    1023
401072    1023
401073    1023
401075    1023
Name: Age, Length: 38185, dtype: int64


In [70]:
# split 'fiProductDesc' column

df['ProductClassName'],df['ProductClassCharacteristic'] = split_product_class_series(df['fiProductClassDesc'])

AttributeError: 'int' object has no attribute 'split'

In [41]:
# convert 'MachineID' column into 'TimesAppearing' column

df['TimesAppearing'] = df['MachineID'].map(df['MachineID'].value_counts())
df.drop('MachineID', axis=1, inplace=True)

In [42]:
# remove duplicated columns: 'ProductGroupDesc'

df.drop('ProductGroupDesc', axis=1, inplace=True)

In [43]:
# replace NaN values with 'missing' for non-numeric fields and with -1 for numeric

for col in df.columns:
    if df[col].dtype == 'object':
        df[col].fillna('missing', inplace=True)
    else:
        df[col].fillna(-1, inplace=True)

In [44]:
df.head()

Unnamed: 0,SalesID,SalePrice,ModelID,datasource,auctioneerID,MachineHoursCurrentMeter,UsageBand,saledate,fiModelDesc,fiBaseModel,...,Thumb,Pattern_Changer,Grouser_Type,Backhoe_Mounting,Blade_Type,Travel_Controls,Differential_Type,Steering_Controls,Age,TimesAppearing
0,1139246,66000,3157,121,3.0,68.0,Low,11/16/2006 0:00,521D,521,...,missing,missing,missing,missing,missing,missing,Standard,Conventional,19,1
1,1139248,57000,77,121,3.0,4640.0,Low,3/26/2004 0:00,950FII,950,...,missing,missing,missing,missing,missing,missing,Standard,Conventional,27,1
2,1139249,10000,7009,121,3.0,2838.0,High,2/26/2004 0:00,226,226,...,missing,missing,missing,missing,missing,missing,missing,missing,22,3
3,1139251,38500,332,121,3.0,3486.0,High,5/19/2011 0:00,PC120-6E,PC120,...,missing,missing,missing,missing,missing,missing,missing,missing,22,1
4,1139253,11000,17311,121,3.0,722.0,Medium,7/23/2009 0:00,S175,S175,...,missing,missing,missing,missing,missing,missing,missing,missing,16,1


In [53]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
for col in df.columns:
    if df[col].dtype == 'object':
        # Check if column contains both strings and numbers
        if df[col].apply(type).nunique() > 1:
            # Handle mixed types (example: fill numeric values with a placeholder string)
            df[col] = df[col].fillna('Missing').astype(str)
        df[col] = le.fit_transform(df[col])
df.head()

Unnamed: 0,SalesID,SalePrice,ModelID,datasource,auctioneerID,MachineHoursCurrentMeter,UsageBand,saledate,fiModelDesc,fiBaseModel,...,Thumb,Pattern_Changer,Grouser_Type,Backhoe_Mounting,Blade_Type,Travel_Controls,Differential_Type,Steering_Controls,Age,TimesAppearing
0,1139246,66000,3157,121,3.0,68.0,1,668,949,295,...,3,3,3,2,10,7,3,1,19,1
1,1139248,57000,77,121,3.0,4640.0,1,1722,1724,526,...,3,3,3,2,10,7,3,1,27,1
2,1139249,10000,7009,121,3.0,2838.0,0,1382,330,109,...,3,3,3,2,10,7,4,5,22,3
3,1139251,38500,332,121,3.0,3486.0,0,2394,3673,1374,...,3,3,3,2,10,7,4,5,22,1
4,1139253,11000,17311,121,3.0,722.0,2,3179,4207,1528,...,3,3,3,2,10,7,4,5,16,1


In [30]:
# show correlation of 'SalePrice' column with reso of the columns

df.corr()['SalePrice'].sort_values()

ProductSize                -0.318978
Hydraulics_Flow            -0.306398
Coupler_System             -0.300528
Grouser_Tracks             -0.299573
Forks                      -0.229199
Turbocharged               -0.210779
fiSecondaryDesc            -0.203243
Hydraulics                 -0.192896
Pad_Type                   -0.189073
Tire_Size                  -0.174933
Age                        -0.156453
fiModelDescriptor          -0.153304
Stick_Length               -0.148793
Enclosure                  -0.138893
Steering_Controls          -0.125037
Blade_Width                -0.124854
Travel_Controls            -0.097374
Drive_System               -0.088841
UsageBand                  -0.084059
Stick                      -0.081081
Transmission               -0.069397
Coupler                    -0.066162
auctioneerID               -0.046865
ModelID                    -0.042125
saledate                   -0.018850
state                      -0.018149
fiModelSeries              -0.013087
U

In [54]:
target = df['SalePrice']
features = df.drop(columns=['SalePrice'])

In [62]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=100)

model = train_and_evaluate_model(X_train, X_test, y_train, y_test, [15], -1, 20,
                                 0.5)

Fitting the model...
Testing the model...
Tree depth - 15
STD Test - 22995.709788960918
STD Train - 23054.44179044156
RMSE Test - 9921.012221605435
RMSE Train - 8778.209233415591


In [64]:
# print features importance order it descending

feature_importances = model.feature_importances_
feature_names = X_train.columns
for feature, importance in zip(feature_names, feature_importances):
    print(f"{feature}: {importance}")


SalesID: 0.04628255181739576
ModelID: 0.07579313606288543
datasource: 0.005178239738481358
auctioneerID: 0.006046298602582372
MachineHoursCurrentMeter: 0.010332455854937151
UsageBand: 0.001851387013145455
saledate: 0.009056317902174562
fiModelDesc: 0.04977452533889872
fiBaseModel: 0.05972596756276794
fiSecondaryDesc: 0.03749051964396135
fiModelSeries: 0.005168438896148778
fiModelDescriptor: 0.03960065502102241
ProductSize: 0.1825214475231875
fiProductClassDesc: 0.05835408890514362
state: 0.0064759901825845005
ProductGroup: 0.006531724448395673
Drive_System: 0.0034319953967272337
Enclosure: 0.0628545495038255
Forks: 0.0014273619835003775
Pad_Type: 0.002273624808732749
Ride_Control: 0.0015599813862514891
Stick: 0.001388533688163324
Transmission: 0.001266412560359112
Turbocharged: 1.1806690057480538e-05
Blade_Extension: 0.004530164581305538
Blade_Width: 0.0034614905796355438
Enclosure_Type: 0.0021930105281405435
Engine_Horsepower: 0.00046122393776830455
Hydraulics: 0.0016671366104981224
P