<a href="https://colab.research.google.com/github/IgorDiamandi/base_ML_Project/blob/BestResult/Colab_version.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [42]:
import gdown
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from pathlib import Path

def download_from_gdrive(url, filename):
    """Download a Google Drive share link to ``filename`` unless that file already exists.

    Args:
        url: share link whose second-to-last '/'-separated piece is the file id
            (e.g. ``.../file/d/<id>/view?...``).
        filename: local path to write to; an existing file is left untouched.
    """
    # Pull the id out of the share link and build the direct-download address
    drive_id = url.split('/')[-2]
    direct_link = f"https://drive.google.com/uc?id={drive_id}"

    # Nothing to do when a previous run already fetched the file
    if Path(filename).exists():
        print(f"File '{filename}' already exists. Skipping download.")
        return

    gdown.download(direct_link, filename, quiet=False)
    print(f"File downloaded as: {filename}")

# Google Drive share links for the training and validation CSV files.
train = 'https://drive.google.com/file/d/1guqSpDv1Q7ZZjSbXMYGbrTvGns0VCyU5/view?usp=drive_link'
valid = 'https://drive.google.com/file/d/1j7x8xhMimKbvW62D-XeDfuRyj9ia636q/view?usp=drive_link'
# Fetch both files into the working directory; files that already exist are not downloaded again.

download_from_gdrive(train, 'train.csv')
download_from_gdrive(valid, 'valid.csv')

File 'train.csv' already exists. Skipping download.
File 'valid.csv' already exists. Skipping download.


In [43]:
from sklearn.ensemble import RandomForestRegressor


def train_and_evaluate_model(X_train, X_test, y_train, y_test, tree_depth, level_of_parallelism, number_of_trees,
                             max_features, min_samples_split, min_samples_leaf):
    """Fit one random forest per requested depth, print its metrics, and return the last forest.

    Args:
        X_train, X_test: feature matrices used for fitting and for hold-out evaluation.
        y_train, y_test: targets; get_rmse applies expm1 to them, so they are expected
            on the log1p scale. Their ``.std()`` is printed as a reference point.
        tree_depth: iterable of ``max_depth`` values; one forest is trained per value.
        level_of_parallelism: forwarded as ``n_jobs``.
        number_of_trees: forwarded as ``n_estimators``.
        max_features, min_samples_split, min_samples_leaf: forwarded unchanged.

    Returns:
        The RandomForestRegressor fitted for the last depth in ``tree_depth``.

    Raises:
        ValueError: if ``tree_depth`` is empty (previously this surfaced as an
            UnboundLocalError on the final ``return``).
    """
    depths = list(tree_depth)
    if not depths:
        raise ValueError('tree_depth must contain at least one depth value')

    model = None
    for depth in depths:
        model = RandomForestRegressor(
                random_state=100,
                n_jobs=level_of_parallelism,
                n_estimators=number_of_trees,
                max_depth=depth,
                max_features=max_features,
                min_samples_split=min_samples_split,
                min_samples_leaf=min_samples_leaf)

        print('Fitting the model...')
        model.fit(X_train, y_train)

        print('Testing the model...')
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        # RMSE is reported on the original (expm1) scale of the target
        rmse_test = get_rmse(y_test, y_test_pred)
        rmse_train = get_rmse(y_train, y_train_pred)

        print(f'Tree depth - {depth}')
        print(f'STD Test - {y_test.std()}')
        print(f'STD Train - {y_train.std()}')
        print(f'RMSE Test - {rmse_test}')
        print(f'RMSE Train - {rmse_train}')
        # Relative gap between train and test error: 0 means no gap, larger means more overfitting
        print(f'Test/Train Ratio - {1-rmse_train/rmse_test}')

    return model

In [44]:
!pip install sklearn.preprocessing



In [45]:
import re

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import FeatureHasher
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.tree import DecisionTreeClassifier


def compute_statistics(df):
    """Summarise every numeric column of ``df``.

    Returns a DataFrame with one row per statistic ('mean', 'iqr', 'zscore',
    'mean_without_extremes') and one column per numeric column of ``df``.
    The 'zscore' row holds a single scalar, the average of mean/std over all
    numeric columns, repeated for every column.
    """
    numeric = df.select_dtypes(include='number')

    def trimmed_mean(values):
        # Mean of the values lying inside the 1.5 * IQR fences
        lower_q, upper_q = values.quantile(0.25), values.quantile(0.75)
        spread = upper_q - lower_q
        within = (values >= lower_q - 1.5 * spread) & (values <= upper_q + 1.5 * spread)
        return values[within].mean()

    column_means = numeric.mean()
    mean_over_std = (column_means / numeric.std()).mean()

    summary = pd.DataFrame({
        'mean': column_means,
        'iqr': numeric.quantile(0.75) - numeric.quantile(0.25),
        'zscore': [mean_over_std for _ in numeric.columns],
        'mean_without_extremes': numeric.apply(trimmed_mean),
    })

    # Statistics become the index so callers can look them up with .loc[stat, column]
    return summary.T


def get_stat_value(method, column, statistics_df):
    """Look up the statistic ``method`` for ``column`` in ``statistics_df``.

    Raises:
        ValueError: if ``method`` is not a row label of ``statistics_df``.
    """
    if method in statistics_df.index:
        return statistics_df.loc[method, column]
    raise ValueError(f"Method {method} not found in statistics DataFrame")


def replace_outliers(dfs, column, method, statistics_df):
    """Return copies of ``dfs`` in which outliers of ``column`` are overwritten.

    A value counts as an outlier when it falls outside the 1.5 * IQR fences computed
    from that same DataFrame's ``column``. Every outlier is replaced with the single
    value ``statistics_df.loc[method, column]``. The input frames are not modified.
    """
    replacement = get_stat_value(method, column, statistics_df)

    cleaned = []
    for frame in dfs:
        # Fences come from each frame's own quartiles
        first_q = frame[column].quantile(0.25)
        third_q = frame[column].quantile(0.75)
        spread = third_q - first_q
        low = first_q - 1.5 * spread
        high = third_q + 1.5 * spread

        result = frame.copy()
        result[column] = result[column].apply(
            lambda value: replacement if value < low or value > high else value)
        cleaned.append(result)

    return cleaned


def replace_nans(dfs, column, method, statistics_df):
    """Return copies of ``dfs`` with missing values of ``column`` filled in.

    The fill value is ``statistics_df.loc[method, column]``. Like replace_outliers,
    this works on copies, so the caller's frames are left untouched.

    Args:
        dfs: iterable of DataFrames that all contain ``column``.
        column: name of the column to fill.
        method: row label of ``statistics_df`` to take the fill value from.
        statistics_df: statistics table indexed by method name.

    Returns:
        A new list of filled DataFrames, in the same order as ``dfs``.

    Raises:
        ValueError: if ``method`` is not present in ``statistics_df``.
    """
    stat_value = get_stat_value(method, column, statistics_df)

    filled_dfs = []
    for df in dfs:
        # Copy first: the inputs may be slices of another frame (e.g. from train_test_split)
        filled = df.copy()
        filled[column] = filled[column].fillna(stat_value)
        filled_dfs.append(filled)
    return filled_dfs


def get_rmse(y_log, y_pred_log):
    """Root-mean-squared error measured after undoing the log1p transform on both inputs."""
    actual, predicted = np.expm1(y_log), np.expm1(y_pred_log)
    squared_error = mean_squared_error(actual, predicted)
    return squared_error ** 0.5

# NOTE(review): the helper below is disabled by wrapping it in a bare string literal, so it is
# never defined or executed. It splits each item on the first ' - ' into two parallel Series.
# Presumably superseded by split_fiProductClassDesc — confirm before deleting.
'''
def split_product_class_series(series):
    equipment_type = []
    details = []

    for item in series:
        if pd.isna(item):
            equipment_type.append(None)
            details.append(None)
        else:
            split_item = item.split(' - ', 1)
            equipment_type.append(split_item[0])
            details.append(split_item[1] if len(split_item) > 1 else None)

    return pd.Series(equipment_type), pd.Series(details)
'''


def apply_one_hot_encoder(dfs, columns_to_encode, excluded_columns=None):
    """One-hot encode ``columns_to_encode`` in every DataFrame, fitting on the first one only.

    Args:
        dfs: sequence of DataFrames with identical columns; ``dfs[0]`` is used to fit the
            encoder and the rest are only transformed (categories unseen during fit
            are ignored).
        columns_to_encode: columns handed to the OneHotEncoder; all other columns
            are passed through unchanged.
        excluded_columns: column names dropped from the encoded output if present.
            Names are matched against the transformer's output names, which carry
            'onehot__' / 'remainder__' prefixes, so unprefixed names match nothing.
            NOTE(review): the notebook passes unprefixed names, so this drop is
            currently a no-op — confirm whether that is intended.

    Returns:
        A list of new DataFrames (fresh RangeIndex) with the encoded feature names
        as columns, in the same order as ``dfs``.
    """
    # None instead of a mutable [] default; an explicit list still works as before
    if excluded_columns is None:
        excluded_columns = []

    one_hot_encoder = OneHotEncoder(categories='auto', handle_unknown='ignore')
    preprocessor = ColumnTransformer(
        transformers=[
            ('onehot', one_hot_encoder, columns_to_encode)
        ],
        remainder='passthrough'
    )

    # Fit and transform the first dataframe to get the feature names
    X_encoded = [preprocessor.fit_transform(dfs[0])]
    feature_names = preprocessor.get_feature_names_out()

    # Transform the remaining dataframes with the already fitted encoder
    X_encoded.extend(preprocessor.transform(df) for df in dfs[1:])

    def _to_dense(matrix):
        # ColumnTransformer returns a sparse matrix only while the output density is below
        # its sparse_threshold; otherwise it is already a dense ndarray with no .toarray()
        return matrix.toarray() if hasattr(matrix, 'toarray') else matrix

    # Create DataFrames from the encoded arrays
    encoded_dfs = [pd.DataFrame(_to_dense(X), columns=feature_names) for X in X_encoded]

    # Remove excluded columns (names that are not present are silently skipped)
    return [df.drop(columns=excluded_columns, errors='ignore') for df in encoded_dfs]


def unite_sparse_columns(df, columns_to_unite, new_column_name):
    """Collapse a group of columns into a few summary columns.

    Adds ``<new_column_name>_non_null_count`` and ``<new_column_name>_any_non_null``
    to ``df``; adds ``_sum`` when every cell of the group is an int/float and
    ``_mode`` when every cell is a str. Returns a DataFrame without the grouped
    columns. Requested columns that are missing are reported and skipped.

    Raises:
        ValueError: if none of the requested columns exist in ``df``.
    """
    requested = [name for name in columns_to_unite if name != new_column_name]

    # Sort the requested names into those the frame has and those it lacks
    present, absent = [], []
    for name in requested:
        bucket = present if name in df.columns else absent
        bucket.append(name)

    if absent:
        print(f"Warning: The following columns are missing and will be ignored: {absent}")

    if not present:
        raise ValueError("None of the specified columns to unite exist in the DataFrame.")

    def every_cell_is(kinds):
        # True only when each individual cell of the group is an instance of ``kinds``
        flags = df[present].apply(lambda series: series.map(lambda cell: isinstance(cell, kinds)))
        return flags.all().all()

    df[new_column_name + '_non_null_count'] = df[present].notnull().sum(axis=1)
    df[new_column_name + '_any_non_null'] = df[present].notnull().any(axis=1).astype(int)

    if every_cell_is((int, float)):
        df[new_column_name + '_sum'] = df[present].sum(axis=1, skipna=True)

    if every_cell_is(str):
        df[new_column_name + '_mode'] = df[present].mode(axis=1)[0]

    return df.drop(columns=present)


def missing_values_imputation(df, target_column, feature_columns):
    """Fill missing ``target_column`` values with predictions from a decision tree.

    The tree is trained on the rows of ``df`` where the target is present, using
    ``feature_columns`` as inputs (numeric features mean-imputed, object-dtype
    features filled with 'missing' and one-hot encoded). ``df`` is updated in
    place and returned; it is returned untouched when nothing is missing.
    """
    known_rows = df.dropna(subset=[target_column])
    unknown_rows = df[df[target_column].isnull()]

    # Nothing to impute
    if unknown_rows.empty:
        return df

    # Split the features by dtype: object columns are treated as categorical
    categorical_features, numerical_features = [], []
    for col in feature_columns:
        if df[col].dtype == 'object':
            categorical_features.append(col)
        else:
            numerical_features.append(col)

    categorical_branch = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', SimpleImputer(strategy='mean'), numerical_features),
            ('cat', categorical_branch, categorical_features)
        ]
    )

    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', DecisionTreeClassifier(random_state=0, criterion='entropy'))
    ])
    model.fit(known_rows[feature_columns], known_rows[target_column])

    # Write the predictions into exactly the rows whose target was missing
    df.loc[df[target_column].isnull(), target_column] = model.predict(unknown_rows[feature_columns])

    return df


def split_fiProductClassDesc(df, column_name):
    """Split '<category> - <low> to <high> <characteristic>' descriptions into features.

    Adds a 'Category' column and, for each distinct characteristic found, a
    'post_fi_<characteristic>' column holding the midpoint of the low/high range on
    the rows with that characteristic and NaN elsewhere. Cells that are missing, not
    strings, or do not match the pattern get no category and NaN in every
    'post_fi_*' column.

    Args:
        df: input DataFrame; it is not modified.
        column_name: name of the column holding the description strings.

    Returns:
        A copy of ``df`` with the new columns appended.
    """
    pattern = re.compile(r'^(.*) - (\d+\.\d+) to (\d+\.\d+) (.*)$')

    def parse_fiProductClassDesc(value):
        # Missing cells arrive as NaN (a float); re.match would raise TypeError on them
        if not isinstance(value, str):
            return None, None, None, None
        match = pattern.match(value)
        if match is None:
            return None, None, None, None
        category, low_value, high_value, characteristic = match.groups()
        return category, float(low_value), float(high_value), characteristic

    parsed = [parse_fiProductClassDesc(value) for value in df[column_name]]
    categories = [item[0] for item in parsed]
    low_values = pd.Series([item[1] for item in parsed], index=df.index, dtype='float64')
    high_values = pd.Series([item[2] for item in parsed], index=df.index, dtype='float64')
    characteristics = pd.Series([item[3] for item in parsed], index=df.index, dtype='object')

    # Midpoint of the parsed range; NaN wherever the description did not parse
    average_values = (low_values + high_values) / 2

    result = df.copy()
    result['Category'] = categories

    for characteristic in characteristics.dropna().unique():
        # Keep the midpoint only on rows carrying this characteristic (vectorised, no row-wise apply)
        matches = characteristics == characteristic
        result[f'post_fi_{characteristic}'] = average_values.where(matches).to_numpy()

    return result


def apply_lable_encoding(dfs, columns):
    """Label-encode ``columns`` consistently across several DataFrames.

    One LabelEncoder per column is fitted on the unique values pooled from every
    DataFrame in ``dfs``, so the same value gets the same code in each frame.
    The listed columns are overwritten in place with the encoder output.

    Args:
        dfs: list of DataFrames that all contain ``columns``; modified in place.
        columns: names of the columns to encode.

    Returns:
        The same ``dfs`` list that was passed in.
    """
    encoders = {col: LabelEncoder() for col in columns}

    # Combine unique values for each column across all DataFrames
    for col in columns:
        unique_values = pd.concat([df[col] for df in dfs]).unique()
        encoders[col].fit(unique_values)

    # Replace every value that is not among the fitted classes with the '<unknown>' placeholder.
    # NOTE(review): NaN does not compare equal to itself, so missing values presumably land in
    # this bucket even though they were part of the fit — confirm.
    for df in dfs:
        for col in columns:
            df[col] = df[col].map(lambda s: '<unknown>' if s not in encoders[col].classes_ else s)

    # Add '<unknown>' to the encoder classes and transform the columns
    # NOTE(review): the placeholder is appended after fitting, so classes_ may no longer be
    # sorted or of its original dtype — confirm LabelEncoder.transform tolerates that.
    for col in columns:
        encoders[col].classes_ = np.append(encoders[col].classes_, '<unknown>')

    for df in dfs:
        for col in columns:
            df[col] = encoders[col].transform(df[col])

    return dfs

In [46]:
#region Constants
# Random forest hyperparameters handed to train_and_evaluate_model.
LEVEL_OF_PARALLELISM = -1  # forwarded as n_jobs
NUMBER_OF_TREES = 100  # forwarded as n_estimators
TREE_DEPTH = [17]  # one forest is trained per listed max_depth
MIN_SAMPLES_SPLIT = 3
MIN_SAMPLES_LEAF = 2
MAX_FEATURES = 0.8

# NOTE(review): not referenced by any cell visible in this notebook — confirm it is still needed.
DONT_ENCODE_THEM_COLUMNS = ['ModelID', 'Income', 'AgeAtSale', 'SalesID', 'MachineHoursCurrentMeter']

# Columns dropped from both datasets in the "Dropping" cell.
KILL_THEM_COLUMNS = ['uControl_any_non_null', 'uMounting_any_non_null', 'uHydraulics_non_null_count',
                     'uControl_non_null_count', 'fiProductClassDesc', 'post_fi_Metric Tons',
                     'post_fi_Ft Standard Digging Depth', 'post_fi_Lb Operating Capacity']

# Features used to predict missing 'ProductSize' values.
SIZE_FIT_COLUMNS = ['fiModelDesc', 'post_fi_Horsepower',
                    'Drive_System', 'Stick_Length', 'Undercarriage_Pad_Width',
                    'post_fi_Metric Tons']

# Features used to predict missing 'UsageBand' values.
USAGE_FIT_COLUMNS = ['AgeAtSale', 'MachineHoursCurrentMeter']

# Columns dropped as duplicates in the "Dropping" cell.
DUPLICATE_COLUMNS = ['fiBaseModel', 'fiSecondaryDesc', 'fiModelSeries', 'fiModelDescriptor']

# Columns that are label-encoded; also passed as the excluded columns of the one-hot step.
LABEL_ENCODING_COLUMNS = ['fiModelDesc', 'Enclosure', 'post_fi_Horsepower']

# Maps each summary-column prefix to the group of source columns that unite_sparse_columns collapses.
COLUMN_GROUPS = {
    'uBladeStick': ['Blade_Extension', 'Stick_Length', 'Stick'],
    'uTrack': ['Pad_Type', 'Grouser_Type', 'Grouser_Tracks'],
    'uMounting': ['Backhoe_Mounting', 'Forks', 'Pushblock', 'Ripper', 'Scarifier', 'Thumb'],
    'uControl': ['Travel_Controls', 'Ride_Control', 'Transmission',
                 'Pattern_Changer', 'Tip_Control', 'Coupler', 'Coupler_System'],
    'uHydraulics': ['Turbocharged', 'Hydraulics', 'Hydraulics_Flow'],
    'uDrive': ['Differential_Type', 'Drive_System']
}

In [47]:
# Read four columns as strings. The keys are integers rather than column names.
# NOTE(review): presumably these are positional column indices of mixed-type columns —
# confirm against the CSV header.
DTYPE_SPEC = {
    13: 'str',
    39: 'str',
    40: 'str',
    41: 'str'}

# Load the training and validation sets with the same dtype overrides.
df = pd.read_csv('train.csv', dtype=DTYPE_SPEC)
df_valid = pd.read_csv('valid.csv', dtype=DTYPE_SPEC)

In [48]:

#region Adding External Data
# adding external states demographics data to both datasets
#df = pd.merge(df, df_state_demographics, on='state', how='left')
#df_valid = pd.merge(df_valid, df_state_demographics, on='state', how='left')
#endregion

NameError: name 'df_state_demographics' is not defined

In [49]:
import re
#region Product size handling
# Break 'fiProductClassDesc' into a 'Category' column plus one 'post_fi_<characteristic>' column
# per characteristic found in the descriptions.
df = split_fiProductClassDesc(df, 'fiProductClassDesc')
df_valid = split_fiProductClassDesc(df_valid, 'fiProductClassDesc')

# Predict missing 'ProductSize' values from SIZE_FIT_COLUMNS; each dataset fits its own
# classifier on its own rows.
df = missing_values_imputation(df, 'ProductSize', SIZE_FIT_COLUMNS)
df_valid = missing_values_imputation(df_valid, 'ProductSize', SIZE_FIT_COLUMNS)
#endregion

In [50]:
#region Merging bad columns to one bad column
# grouping columns with technical details
# Each group's source columns are replaced by '<group>_non_null_count' and '<group>_any_non_null'
# summaries (plus '_sum' / '_mode' when every cell in the group is numeric / a string).
for new_column, columns in COLUMN_GROUPS.items():
    df = unite_sparse_columns(df, columns, new_column)
    df_valid = unite_sparse_columns(df_valid, columns, new_column)
#endregion

In [51]:
#region Convert dates to periods and categories
# Derive 'salemonth', 'saledayofweek' and 'AgeAtSale' (sale year minus 'YearMade') for both
# datasets. 'saledate' is parsed once per frame instead of once per derived column, and the
# sale year is used directly rather than stored in a temporary 'saleyear' column.
for frame in (df, df_valid):
    sale_dates = pd.to_datetime(frame['saledate'])

    frame['salemonth'] = sale_dates.dt.month
    frame['saledayofweek'] = sale_dates.dt.dayofweek
    frame['AgeAtSale'] = sale_dates.dt.year - frame['YearMade']

    # drop the source columns that the derived features replace
    frame.drop(['saledate', 'YearMade'], axis=1, inplace=True)
#endregion


In [52]:
#region Usage_band handling
# Predict missing 'UsageBand' values from USAGE_FIT_COLUMNS; each dataset fits its own
# classifier on the rows where 'UsageBand' is present.
df = missing_values_imputation(df, 'UsageBand', USAGE_FIT_COLUMNS)
df_valid = missing_values_imputation(df_valid, 'UsageBand', USAGE_FIT_COLUMNS)
#endregion

In [53]:
#region convert 'MachineID' column into 'TimesAppearing' column
# Replace each machine id with the number of rows that id has within the same dataset.
for frame in (df, df_valid):
    appearances = frame['MachineID'].value_counts()
    frame['TimesAppearing'] = frame['MachineID'].map(appearances)
    frame.drop('MachineID', axis=1, inplace=True)
#endregion

In [54]:
# region Replace NaN with 0 in 'ID' columns
# Missing ids become 0.0 in both datasets.
for frame in (df, df_valid):
    frame[['datasource', 'auctioneerID']] = frame[['datasource', 'auctioneerID']].fillna(0.0)
#endregion

In [55]:
#region Dropping
# drop unimportant columns
# (drop raises KeyError if any listed column is absent, so both datasets must contain all of them)
df.drop(KILL_THEM_COLUMNS, axis=1, inplace=True)
df_valid.drop(KILL_THEM_COLUMNS, axis=1, inplace=True)

# drop duplicated columns
df.drop(DUPLICATE_COLUMNS, axis=1, inplace=True)
df_valid.drop(DUPLICATE_COLUMNS, axis=1, inplace=True)
#endregion

In [56]:
#region Split train and test data + log normalization of the target column
# Hold out 30% of the training data for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['SalePrice']), df['SalePrice'], test_size=0.3, random_state=42)

# The model is trained on log1p(SalePrice); get_rmse and the prediction cell undo this with expm1.
y_train = np.log1p(y_train)
y_test = np.log1p(y_test)
#endregion

In [57]:
#region Get and use statistics dataframe
# Statistics are computed from the training split only and then applied to all three datasets.
statistics_df = compute_statistics(X_train)

# Values outside each frame's own 1.5 * IQR fences are overwritten with the 'iqr' statistic.
# NOTE(review): this replaces outliers with the IQR itself (a spread, not a typical value) —
# confirm that 'iqr' rather than e.g. 'mean_without_extremes' is intended.
X_train, X_test, df_valid = replace_outliers([X_train, X_test, df_valid],
                                             'MachineHoursCurrentMeter', 'iqr', statistics_df)

X_train, X_test, df_valid = replace_outliers([X_train, X_test, df_valid],
                                             'AgeAtSale', 'iqr', statistics_df)

# Missing meter readings are filled with the training mean computed without extreme values.
X_train, X_test, df_valid = replace_nans([X_train, X_test, df_valid],
                                         'MachineHoursCurrentMeter',
                                         'mean_without_extremes', statistics_df)
#endregion

In [58]:
#region Applying encodings
# label encoding
# (encoders are fitted on the values pooled from all three datasets)
X_train, X_test, df_valid = apply_lable_encoding([X_train, X_test, df_valid],
                                                 LABEL_ENCODING_COLUMNS)

# one-hot encoding
# Every remaining non-numeric column of X_train is one-hot encoded; the encoder is fitted on
# X_train and only applied to the other two datasets.
# NOTE(review): LABEL_ENCODING_COLUMNS is passed as the columns to exclude from the output, yet
# the feature-importance output still lists e.g. 'remainder__fiModelDesc' — the exclusion appears
# to match nothing because output names are prefixed. Confirm the intent.
X_train_transformed, X_test_transformed, X_valid_transformed = (
    apply_one_hot_encoder([X_train, X_test, df_valid],
                          X_train.select_dtypes(exclude=['number']).columns.tolist(),
                          LABEL_ENCODING_COLUMNS))
#endregion

In [59]:
# Sanity check: list the columns of X_train_transformed that still contain missing values
# (an empty Index means there are none).

null_columns = X_train_transformed.columns[X_train_transformed.isnull().any()]
print(null_columns)

Index([], dtype='object')


In [63]:
#region Train model
# Train one forest per depth in TREE_DEPTH on the encoded training split, print train/test
# metrics for each, and keep the forest fitted for the last depth.
model = train_and_evaluate_model(X_train_transformed, X_test_transformed, y_train, y_test, TREE_DEPTH,
                                             LEVEL_OF_PARALLELISM, NUMBER_OF_TREES, MAX_FEATURES, MIN_SAMPLES_SPLIT,
                                             MIN_SAMPLES_LEAF)
#endregion

Fitting the model...
Testing the model...
Tree depth - 17
STD Test - 0.6936844473042342
STD Train - 0.6935474937562746
RMSE Test - 8473.899814198632
RMSE Train - 7392.37850675058
Test/Train Ratio - 0.12762970192730927


In [64]:
#region Feature importance
# Pair every importance with its column name and list them from most to least important.
# Whole (importance, feature) tuples are compared, so ties fall back to the feature name.
feature_names = X_train_transformed.columns
feature_importance = model.feature_importances_
importance_list = sorted(zip(feature_importance, feature_names), reverse=True)

print('-------Feature Importance-------')
for importance, feature in importance_list:
    print(f"{feature}: {importance}")
#endregion

-------Feature Importance-------
remainder__AgeAtSale: 0.18319647032666417
onehot__ProductGroup_SSL: 0.10878678470402525
onehot__ProductGroupDesc_Skid Steer Loaders: 0.10365088647965759
remainder__ModelID: 0.09717687921482698
remainder__post_fi_Horsepower: 0.07051023742534929
onehot__ProductSize_Mini: 0.06841548228038705
remainder__fiModelDesc: 0.06802027364864201
onehot__ProductGroupDesc_Backhoe Loaders: 0.04809801079511014
onehot__ProductGroup_BL: 0.0430840371886523
remainder__SalesID: 0.03229840025946031
onehot__ProductSize_Large / Medium: 0.026490216621974337
onehot__ProductSize_Medium: 0.024688198364073856
remainder__Enclosure: 0.019222466700292627
onehot__ProductSize_Small: 0.012886399618552362
onehot__Category_Skid Steer Loader: 0.011432717842182362
onehot__ProductSize_Compact: 0.011232924773585345
remainder__uMounting_non_null_count: 0.005877213216760793
remainder__salemonth: 0.005872529664112986
onehot__ProductSize_Large: 0.004279326474067164
onehot__Blade_Type_Semi U: 0.00407

In [65]:
#region Use Model
# The model was trained on log1p(SalePrice), so expm1 converts its output back to prices.
y_valid_log_pred = model.predict(X_valid_transformed)
y_valid_pred = np.expm1(y_valid_log_pred);

# Create the prediction DataFrame with only 'SalesID' and the predicted 'SalePrice'
df_predictions = pd.DataFrame({
    'SalesID': df_valid['SalesID'].astype(int),  # Ensure SalesID is included in df_valid
    'SalePrice': y_valid_pred
})

# Write the predictions to CSV without the index column.
df_predictions.to_csv('valid_predictions_0.2.csv', index=False)
#endregion