<a href="https://colab.research.google.com/github/IgorDiamandi/base_ML_Project/blob/BestResult/Colab_version.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [62]:
import gdown
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from pathlib import Path

def download_from_gdrive(url, filename):
    # Extract the file ID from the URL
    file_id = url.split('/')[-2]
    download_url = f"https://drive.google.com/uc?id={file_id}"

    # Download the file
    if Path(filename).exists():
        print(f"File '{filename}' already exists. Skipping download.")
    else:
        gdown.download(download_url, filename, quiet=False)
        print(f"File downloaded as: {filename}")

train = 'https://drive.google.com/file/d/1guqSpDv1Q7ZZjSbXMYGbrTvGns0VCyU5/view?usp=drive_link'
valid = 'https://drive.google.com/file/d/1j7x8xhMimKbvW62D-XeDfuRyj9ia636q/view?usp=drive_link'
# Example usage

download_from_gdrive(train, 'train.csv')
download_from_gdrive(valid, 'valid.csv')

File 'train.csv' already exists. Skipping download.
File 'valid.csv' already exists. Skipping download.


In [72]:
from sklearn.ensemble import RandomForestRegressor


def train_and_evaluate_model(X_train, X_test, y_train, y_test, tree_depth, level_of_parallelism, number_of_trees,
                             max_features, min_samples_split, min_samples_leaf):
    for depth in tree_depth:
        model = RandomForestRegressor(
                random_state=100,
                n_jobs=level_of_parallelism,
                n_estimators=number_of_trees,
                max_depth=depth,
                max_features=max_features,
                min_samples_split=min_samples_split,
                min_samples_leaf=min_samples_leaf)


        print('Fitting the model...')
        model.fit(X_train, y_train)

        print('Testing the model...')
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        rmse_test = get_rmse(y_test, y_test_pred)
        rmse_train = get_rmse(y_train, y_train_pred)

        print(f'Tree depth - {depth}')
        print(f'STD Test - {y_test.std()}')
        print(f'STD Train - {y_train.std()}')
        print(f'RMSE Test - {rmse_test}')
        print(f'RMSE Train - {rmse_train}')
        print(f'Test/Train Ratio - {1-rmse_train/rmse_test}')

    return model

In [66]:
!pip install sklearn.preprocessing

Collecting sklearn.preprocessing
  Downloading sklearn_preprocessing-0.1.0-py3-none-any.whl (10 kB)
Installing collected packages: sklearn.preprocessing
Successfully installed sklearn.preprocessing-0.1.0


In [69]:
import pandas as pd
import re
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline


#import category_encoders as ce

# Function to retrieve statistic parameters from numerical columns of the train dataframe
def compute_statistics(df):
    numeric_df = df.select_dtypes(include='number')

    mean_values = numeric_df.mean()
    iqr_values = numeric_df.quantile(0.75) - numeric_df.quantile(0.25)
    zscore_values = (numeric_df.mean() / numeric_df.std()).mean()

    # Mean without extremes (assuming extremes are values outside 1.5*IQR)
    def mean_without_extremes(series):
        Q1 = series.quantile(0.25)
        Q3 = series.quantile(0.75)
        IQR = Q3 - Q1
        filtered_series = series[(series >= Q1 - 1.5 * IQR) & (series <= Q3 + 1.5 * IQR)]
        return filtered_series.mean()

    mean_no_extremes_values = numeric_df.apply(mean_without_extremes)

    statistics_df = pd.DataFrame({
        'mean': mean_values,
        'iqr': iqr_values,
        'zscore': [zscore_values] * len(numeric_df.columns),
        'mean_without_extremes': mean_no_extremes_values
    })

    statistics_df = statistics_df.T
    return statistics_df


def get_stat_value(method, column, statistics_df):
    if method not in statistics_df.index:
        raise ValueError(f"Method {method} not found in statistics DataFrame")
    return statistics_df.loc[method, column]


def replace_outliers(df, column, method, statistics_df):
    stat_value = get_stat_value(method, column, statistics_df)

    # Define the threshold for identifying outliers
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Replace outliers with the statistical method value
    df[column] = df[column].apply(lambda x: stat_value if x < lower_bound or x > upper_bound else x)

    return df


def replace_nans(df, column, method, statistics_df):
    stat_value = get_stat_value(method, column, statistics_df)
    df[column] = df[column].fillna(stat_value)

    return df


def get_rmse(y, y_pred):
    return mean_squared_error(y, y_pred) ** 0.5


def split_product_class_series(series):
    equipment_type = []
    details = []

    for item in series:
        if pd.isna(item):
            equipment_type.append(None)
            details.append(None)
        else:
            split_item = item.split(' - ', 1)
            equipment_type.append(split_item[0])
            details.append(split_item[1] if len(split_item) > 1 else None)

    return pd.Series(equipment_type), pd.Series(details)


def replace_nan_with_string(df):
    le = LabelEncoder()
    for col in df.columns:
        if df[col].dtype == 'object':
            if df[col].apply(type).nunique() > 1:
                df[col] = df[col].fillna('Missing').astype(str)
            df[col] = le.fit_transform(df[col])


# function to apply target encoding to selected columns
def apply_target_encoding(df_train, df_test, df_valid, target_column, columns_to_encode):
    X_train = df_train.drop(target_column, axis=1)
    y_train = df_train[target_column]

    target_encoder = TargetEncoder(columns_to_encode)
    X_train_encoded = target_encoder.fit_transform(X_train, y_train)
    X_test_encoded = target_encoder.transform(df_test.drop(target_column, axis=1))
    X_valid_encoded = target_encoder.transform(df_valid.drop(target_column, axis=1))

    X_train_encoded = pd.concat([X_train_encoded, y_train.reset_index(drop=True)], axis=1)
    X_test_encoded = pd.concat([X_test_encoded, df_test[target_column].reset_index(drop=True)], axis=1)
    X_valid_encoded = pd.concat([X_valid_encoded, df_valid[target_column].reset_index(drop=True)], axis=1)

    return X_train_encoded, X_test_encoded, X_valid_encoded;


def apply_one_hot_encoder(df_train, df_test, df_valid, columns_to_encode):
    one_hot_encoder = OneHotEncoder(categories='auto', handle_unknown='ignore')
    preprocessor = ColumnTransformer(transformers=[
        ('onehot', one_hot_encoder, columns_to_encode),
    ],
        remainder='passthrough'
    )
    X_train_encoded = preprocessor.fit_transform(df_train)
    X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=preprocessor.get_feature_names_out())
    X_Test_encoded = preprocessor.transform(df_test)
    X_Test_encoded_df = pd.DataFrame(X_Test_encoded, columns=preprocessor.get_feature_names_out())
    X_valid_encoded = preprocessor.transform(df_valid)
    X_valid_encoded_df = pd.DataFrame(X_valid_encoded, columns=preprocessor.get_feature_names_out())

    return X_train_encoded_df, X_Test_encoded_df, X_valid_encoded_df;


def unite_sparse_columns(df, columns_to_unite, new_column_name):
    columns_to_unite = [col for col in columns_to_unite if col != new_column_name]

    existing_columns = [col for col in columns_to_unite if col in df.columns]
    missing_columns = [col for col in columns_to_unite if col not in df.columns]

    if missing_columns:
        print(f"Warning: The following columns are missing and will be ignored: {missing_columns}")

    if not existing_columns:
        raise ValueError("None of the specified columns to unite exist in the DataFrame.")

    df[new_column_name + '_non_null_count'] = df[existing_columns].notnull().sum(axis=1)

    df[new_column_name + '_any_non_null'] = df[existing_columns].notnull().any(axis=1).astype(int)

    if df[existing_columns].apply(lambda col: col.map(lambda x: isinstance(x, (int, float)))).all().all():
        df[new_column_name + '_sum'] = df[existing_columns].sum(axis=1, skipna=True)

    if df[existing_columns].apply(lambda col: col.map(lambda x: isinstance(x, str))).all().all():
        df[new_column_name + '_mode'] = df[existing_columns].mode(axis=1)[0]

    df = df.drop(columns=existing_columns)

    return df


def missing_values_imputation(df, target_column, feature_columns):

    df_notnull = df.dropna(subset=[target_column])
    df_null = df[df[target_column].isnull()]

    if df_null.empty:
        return df

    X = df_notnull[feature_columns]
    y = df_notnull[target_column]
    X_null = df_null[feature_columns]

    categorical_features = [col for col in feature_columns if df[col].dtype == 'object']

    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ],
        remainder='passthrough'
    )

    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', DecisionTreeClassifier(random_state=0))
    ])

    model.fit(X, y)

    df.loc[df[target_column].isnull(), target_column] = model.predict(X_null)

    return df


def split_fiProductClassDesc(df, column_name):
    # Function to parse each value in the fiProductClassDesc column
    def parse_fiProductClassDesc(value):
        match = re.match(r'^(.*) - (\d+\.\d+) to (\d+\.\d+) (.*)$', value)
        if match:
            category, low_value, high_value, characteristic = match.groups()
            return category, float(low_value), float(high_value), characteristic
        else:
            return None, None, None, None

    parsed_values = df[column_name].apply(parse_fiProductClassDesc)

    df['Category'] = parsed_values.apply(lambda x: x[0])
    df['LowValue'] = parsed_values.apply(lambda x: x[1])
    df['HighValue'] = parsed_values.apply(lambda x: x[2])
    df['Characteristic'] = parsed_values.apply(lambda x: x[3])

    unique_characteristics = df['Characteristic'].dropna().unique()
    for characteristic in unique_characteristics:
        df[f'post_fi_{characteristic}'] = df.apply(
            lambda row: f"{row['LowValue']} to {row['HighValue']}" if row['Characteristic'] == characteristic else None,
            axis=1
        )

    df = df.drop(columns=['LowValue', 'HighValue', 'Characteristic'])

    return df

In [79]:
DTYPE_SPEC = {
    13: 'str',
    39: 'str',
    40: 'str',
    41: 'str'}

df = pd.read_csv('train.csv', dtype=DTYPE_SPEC)
df_valid = pd.read_csv('valid.csv', dtype=DTYPE_SPEC)

In [76]:
import pandas as pd
from sklearn.model_selection import train_test_split


# Constants
LEVEL_OF_PARALLELISM = -1
NUMBER_OF_TREES = 100
TREE_DEPTH = 8
MIN_SAMPLES_SPLIT = 3
MIN_SAMPLES_LEAF = 2
MAX_FEATURES = 0.9

In [77]:
KILL_THEM_COLUMNS = ['uControl_any_non_null', 'uMounting_any_non_null', 'uHydraulics_non_null_count',
                     'uControl_non_null_count', 'fiProductClassDesc']

SIZE_FIT_COLUMNS = ['fiModelDesc', 'post_fi_Horsepower', 'Drive_System', 'Stick_Length', 'Undercarriage_Pad_Width',
                    'Pad_Type', 'Differential_Type']

DUPLICATE_COLUMNS = ['fiBaseModel', 'fiSecondaryDesc', 'fiModelSeries', 'fiModelDescriptor']
COLUMN_GROUPS = {
    'uBladeStick': ['Blade_Extension', 'Stick_Length', 'Stick',], #'Blade_Type' 'Blade_Width'
    'uTrack': ['Pad_Type', 'Grouser_Type', 'Grouser_Tracks'], #'Track_Type' 'Undercarriage_Pad_Width'
    'uMounting': ['Backhoe_Mounting', 'Forks', 'Pushblock', 'Ripper', 'Scarifier', 'Thumb'],
    'uControl': ['Travel_Controls', 'Ride_Control', 'Transmission',
                 'Pattern_Changer', 'Tip_Control', 'Coupler', 'Coupler_System'], #'Steering_Controls'
    'uHydraulics': ['Turbocharged'], #'Hydraulics' 'Hydraulics_Flow'
    'uDrive': ['Differential_Type', 'Drive_System']
    }

In [80]:
df = split_fiProductClassDesc(df, 'fiProductClassDesc')
df_valid = split_fiProductClassDesc(df_valid, 'fiProductClassDesc')

df = missing_values_imputation(df, 'ProductSize', SIZE_FIT_COLUMNS)
df_valid = missing_values_imputation(df_valid, 'ProductSize', SIZE_FIT_COLUMNS)

In [81]:
for new_column, columns in COLUMN_GROUPS.items():
    df = unite_sparse_columns(df, columns, new_column)
    df_valid = unite_sparse_columns(df_valid, columns, new_column)

In [82]:
df.drop(KILL_THEM_COLUMNS, axis=1, inplace=True)
df_valid.drop(KILL_THEM_COLUMNS, axis=1, inplace=True)

df.drop(DUPLICATE_COLUMNS, axis=1, inplace=True)
df_valid.drop(DUPLICATE_COLUMNS, axis=1, inplace=True)

In [83]:
df['saleyear'] = pd.to_datetime(df['saledate']).dt.year
df_valid['saleyear'] = pd.to_datetime(df_valid['saledate']).dt.year
df['age'] = 2024 - df['YearMade']
df_valid['age'] = 2024 - df_valid['YearMade']
df['AgeAtSale'] = df['saleyear'] - df['YearMade']
df_valid['AgeAtSale'] = df_valid['saleyear'] - df_valid['YearMade']
df.drop(['saledate', 'saleyear', 'YearMade'], axis=1, inplace=True)
df_valid.drop(['saledate', 'saleyear', 'YearMade'], axis=1, inplace=True)

In [84]:
df['TimesAppearing'] = df['MachineID'].map(df['MachineID'].value_counts())
df.drop('MachineID', axis=1, inplace=True)
df_valid['TimesAppearing'] = df_valid['MachineID'].map(df_valid['MachineID'].value_counts())
df_valid.drop('MachineID', axis=1, inplace=True)

In [95]:
df[['datasource', 'auctioneerID']] = df[['datasource', 'auctioneerID']].fillna(0.0)
df_valid[['datasource', 'auctioneerID']] = df_valid[['datasource', 'auctioneerID']].fillna(0.0)

In [97]:
#Split train and test data
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['SalePrice']), df['SalePrice'], test_size=0.3, random_state=42)

In [98]:
statistics_df = compute_statistics(X_train)

""" Use replace_outliers function in order to replace outliers
in the 'MachineHoursCurrentMeter' column using statistics_df and IQR method """
X_train = replace_outliers(X_train, 'MachineHoursCurrentMeter', 'mean_without_extremes', statistics_df)
X_test = replace_outliers(X_test, 'MachineHoursCurrentMeter', 'mean_without_extremes', statistics_df)
df_valid = replace_outliers(df_valid, 'MachineHoursCurrentMeter', 'mean_without_extremes', statistics_df)

""" Use replace_outliers function in order to replace outliers
in the 'Age' columns using statistics_df and mean method """
X_train = replace_outliers(X_train, 'AgeAtSale', 'mean_without_extremes', statistics_df)
X_test = replace_outliers(X_test, 'AgeAtSale', 'mean_without_extremes', statistics_df)
df_valid = replace_outliers(df_valid, 'AgeAtSale', 'mean_without_extremes', statistics_df)

""" Use replace_nans function in order to replace NaN values
in the 'MachineHoursCurrentMeter' column using statistics_df and IQR method """
X_train = replace_nans(X_train, 'MachineHoursCurrentMeter', 'iqr', statistics_df)
X_test = replace_nans(X_test, 'MachineHoursCurrentMeter', 'iqr', statistics_df)
X_valid = replace_nans(df_valid, 'MachineHoursCurrentMeter', 'iqr', statistics_df)

replace_nan_with_string(X_train)
replace_nan_with_string(X_test)
replace_nan_with_string(df_valid)

In [99]:
X_train_transformed, X_test_transformed, X_valid_transformed=(
    apply_one_hot_encoder(X_train, X_test, df_valid, X_train.select_dtypes(exclude=['number']).columns.tolist()))

In [100]:
model = train_and_evaluate_model(X_train_transformed, X_test_transformed, y_train, y_test, [TREE_DEPTH],
                                 LEVEL_OF_PARALLELISM, NUMBER_OF_TREES, MAX_FEATURES, MIN_SAMPLES_SPLIT,
                                MIN_SAMPLES_LEAF)

Fitting the model...
Testing the model...
Tree depth - 8
STD Test - 22932.400534045228
STD Train - 23081.526018913544
RMSE Test - 11324.008170534435
RMSE Train - 10856.934226244897
Test/Train Ratio - 0.04124634469135091


In [102]:
feature_importance = model.feature_importances_
feature_names = X_train.columns
importance_list = [(importance, feature) for feature, importance in zip(feature_names, feature_importance)]
importance_list.sort(reverse=True)
print('-------FeatureImportance-------')
for importance, feature in importance_list:
    print(f"{feature}: {importance}")

-------FeatureImportance-------
ProductSize: 0.28563368382457277
AgeAtSale: 0.2203272488746134
age: 0.09778244883449391
Enclosure: 0.074361648129583
ModelID: 0.0706229160308301
uMounting_non_null_count: 0.06464128006253068
fiModelDesc: 0.05340445439980177
post_fi_Horsepower: 0.04914993664323257
Hydraulics_Flow: 0.017210790148984842
Enclosure_Type: 0.010940113048986202
Blade_Type: 0.007606210983805998
Engine_Horsepower: 0.007202839303828809
post_fi_Metric Tons: 0.006991378166983932
uTrack_non_null_count: 0.0052482551209697146
SalesID: 0.005153438808763006
ProductGroupDesc: 0.0038924675152081935
ProductGroup: 0.003568123005345316
uDrive_non_null_count: 0.0031550520692196393
uDrive_any_non_null: 0.002895880914681679
Category: 0.0023901601202956195
uTrack_any_non_null: 0.0020211155225105617
post_fi_Lb Operating Capacity: 0.0018583386601856277
uBladeStick_any_non_null: 0.0012221138008103789
uBladeStick_non_null_count: 0.0008382628651069526
Track_Type: 0.0005763183466264921
Hydraulics: 0.000

In [103]:
y_valid_pred = model.predict(X_valid_transformed)
# Create the prediction DataFrame with only 'SalesID' and 'Predicted_SalePrice'
df_predictions = pd.DataFrame({
    'SalesID': X_valid_transformed['remainder__SalesID'].astype(int),
    'SalePrice': y_valid_pred
})
df_predictions.to_csv('valid_predictions_0.04.csv', index=False)