In [None]:
import datetime

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from xgboost import XGBRegressor


In [None]:
import shutil

In [None]:
# Extract the dataset zip into the current working directory
# (the source path looks like a Colab Google Drive mount -- confirm before running elsewhere).
shutil.unpack_archive(
    filename="/content/drive/MyDrive/Data Car Resale Value Prediction (3).zip",
    extract_dir=".",
)

In [None]:
# Load the extracted CSV; the file is decoded with the "Latin" codec rather than pandas' default UTF-8.
data = pd.read_csv("Data/autos.csv", encoding="Latin")

In [None]:
def infer_columns(X: pd.DataFrame):
    """Cast low-cardinality text columns of ``X`` to ``category`` dtype, in place.

    For every column whose dtype is ``object`` or ``category`` the number of
    distinct values (a missing value counts as one distinct value) is printed
    next to the column name. Columns with fewer than 10 distinct values are
    then converted to ``category``.

    Parameters
    ----------
    X : pd.DataFrame
        Frame to inspect. It is modified in place.

    Returns
    -------
    None
    """
    cat_cols = []
    for colname in X.columns:
        column = X[colname]
        if column.dtype == "object" or column.dtype == "category":
            uniq_len = len(column.unique())
            print(colname, uniq_len)
            if uniq_len < 10:
                cat_cols.append(colname)

    for catcol in cat_cols:
        # Replace the whole column with X[...] instead of X.loc[:, ...]:
        # .loc writes the new values into the existing object column and keeps
        # its dtype on current pandas, so the category conversion was lost.
        X[catcol] = X[catcol].astype("category")


# Run the categorical-dtype inference on the loaded frame; the helper
# assigns into `data` itself and returns nothing, so there is no rebinding.
infer_columns(data)


def convert_appropriate_dtypes(X: pd.DataFrame):
    """Parse the date columns and downcast the registration month, in place.

    ``dateCrawled``, ``lastSeen`` and ``dateCreated`` are run through
    ``pd.to_datetime``; ``monthOfRegistration`` is run through
    ``pd.to_numeric`` with ``downcast="integer"``. Each converted column is
    assigned back onto ``X``; nothing is returned.
    """
    # (column, converter) pairs, applied in the listed order.
    conversions = (
        ("dateCrawled", pd.to_datetime),
        ("lastSeen", pd.to_datetime),
        ("monthOfRegistration", lambda values: pd.to_numeric(values, downcast="integer")),
        ("dateCreated", pd.to_datetime),
    )
    for column, convert in conversions:
        X[column] = convert(X[column])


# Parse the date columns and downcast monthOfRegistration on `data`;
# the helper reassigns the columns on the frame it is given and returns nothing.
convert_appropriate_dtypes(data)


def remove_redundant_data(X: pd.DataFrame):
    """Drop the columns judged uninformative from ``X``, in place.

    The reasons listed next to each column are the original author's notes
    about the data set. Columns are removed one at a time, in the listed
    order, and nothing is returned.
    """
    uninformative = (
        "seller",        # contains only one kind of value
        "offerType",     # almost all rows are "Angebot"; only 12 are "Gesuch"
        "dateCrawled",   # correlated with dateCreated
        "nrOfPictures",  # carries no information: the only value is 0
    )
    for column in uninformative:
        X.drop(columns=column, inplace=True)


# Drop seller, offerType, dateCrawled and nrOfPictures from `data`;
# the helper drops with inplace=True, so `data` is not rebound.
remove_redundant_data(data)

def filter_registration_year(X: pd.DataFrame, min_year: int = 1900, max_year=None):
    """Return the rows of ``X`` whose ``yearOfRegistration`` lies strictly inside a range.

    Parameters
    ----------
    X : pd.DataFrame
        Frame with a ``yearOfRegistration`` column. It is not modified.
    min_year : int, default 1900
        Exclusive lower bound; rows with ``yearOfRegistration <= min_year``
        are discarded.
    max_year : int or None, default None
        Exclusive upper bound. ``None`` means the current calendar year, so
        the result then depends on the date the code is run; pass a fixed
        year for a reproducible filter.

    Returns
    -------
    pd.DataFrame
        An independent copy of the selected rows, keeping their original
        index labels.
    """
    if max_year is None:
        max_year = datetime.date.today().year

    in_range = (X.yearOfRegistration > min_year) & (X.yearOfRegistration < max_year)

    # Return an explicit copy: callers go on to modify the result in place,
    # which is ambiguous (SettingWithCopyWarning) on a bare .loc selection.
    return X.loc[in_range].copy()

# This helper returns the selected rows rather than changing its argument,
# so `data` has to be rebound to the result.
data = filter_registration_year(data)


def _replace_labels(X: pd.DataFrame, column: str, mapping: dict) -> None:
    """Rewrite the values of ``X[column]`` according to ``mapping`` and store the result back on ``X``."""
    values = X[column]
    if isinstance(values.dtype, pd.CategoricalDtype):
        # Rename the categories instead of calling replace(): this keeps the
        # categorical dtype, and labels missing from the mapping pass through.
        X[column] = values.cat.rename_categories(mapping)
    else:
        X[column] = values.replace(mapping)


def translate(X: pd.DataFrame):
    """Translate the German labels of three columns of ``X`` to English, in place.

    The columns ``notRepairedDamage``, ``gearbox`` and ``fuelType`` are
    rewritten; values without an entry in the corresponding mapping (and
    missing values) are left as they are. Nothing is returned.

    Each translated column is assigned back onto ``X`` rather than calling
    ``X.<column>.replace(..., inplace=True)``: that form operates on a
    temporary Series and does not reach ``X`` under pandas Copy-on-Write.
    """
    # NOTE(review): "Yes" is capitalised while "no" is not; the labels are
    # kept exactly as they were because later code may depend on them.
    nrd_translate = {
        "ja": "Yes",
        "nein": "no",
    }
    _replace_labels(X, "notRepairedDamage", nrd_translate)

    gb_translate = {
        "automatik": "automatic",
        "manuell": "manual"
    }
    _replace_labels(X, "gearbox", gb_translate)

    fueltype_translate = {
        "benzin": "petrol",
        "andere": "other",
        "elektro": "electro"
    }
    _replace_labels(X, "fuelType", fueltype_translate)

    
# Translate the labels in notRepairedDamage, gearbox and fuelType to English;
# the helper works on `data` itself and returns nothing.
translate(data)

In [None]:
# Store the three translated label columns as pandas categoricals.
data["gearbox"] = data.gearbox.astype("category")
# fuelType must be cast from its own values: building it from data.gearbox
# (as this cell did before) overwrote every fuel type with the gearbox label.
data["fuelType"] = data.fuelType.astype("category")
data["notRepairedDamage"] = data.notRepairedDamage.astype("category")