### Library Imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

### Import Dataset

In [2]:
url = "https://raw.githubusercontent.com/Lughaidh-w/Laptop-Prices/main/laptop_price1.csv"
df = pd.read_csv(url)

### Tuning


In [3]:
# target variable
target = "Price_euros"

# split variables
test_size = 0.3
random_state = 42

scaler = StandardScaler()

### Basic Data Preparation

In [4]:
def data_preparation(df):
    # columns
    categorical_columns = df.select_dtypes(include=['category', 'object']).columns.tolist()
    continuous_columns = df.select_dtypes(include=['float64', 'int32']).columns.tolist()
    continuous_columns.remove(target)
    # train test split
    X_train, X_test, y_train, y_test = train_test_split(df.drop(target, axis=1), df[target], 
                                                        test_size=test_size, random_state=random_state)
    # encoding
    X_train_dummy = encoding(X_train, categorical_columns, continuous_columns)
    X_test_dummy = encoding(X_test, categorical_columns, continuous_columns)
    # scaling
    X_train_scaled = scalar_standardise(X_train, continuous_columns)
    X_test_scaled = scalar_standardise(X_test, continuous_columns)
    # concat df
    X_train_processed = concat(X_train_scaled, X_train_dummy)
    X_test_processed = concat(X_test_scaled, X_test_dummy)
    return X_train_processed, X_test_processed
    
def encoding(df, categorical_columns, continuous_columns):
    df_dummy = pd.get_dummies(df, columns=categorical_columns)
    df_dummy = df_dummy.drop(continuous_columns, axis=1)
    return df_dummy

def scalar_standardise(df, continuous_columns):
    df_cont = df[continuous_columns]
    #scaler = StandardScaler()
    scaled_data = scaler.fit_transform(df_cont)
    df_scaled = pd.DataFrame(scaled_data, columns=df_cont.columns, index=df.index)
    
def concat(df_scaled, df_dummies):
    df_processed = pd.concat([df_scaled, df_dummies], axis=1)
    df_processed = df_processed.sort_index(axis=0)
    return df_processed

In [5]:
X_train_processed, X_test_processed = data_preparation(df)
display(X_train_processed.head())

Unnamed: 0,laptop_ID,Company_Acer,Company_Apple,Company_Asus,Company_Chuwi,Company_Dell,Company_Fujitsu,Company_Google,Company_HP,Company_Huawei,...,Weight_4.0kg,Weight_4.14kg,Weight_4.2kg,Weight_4.36kg,Weight_4.3kg,Weight_4.42kg,Weight_4.4kg,Weight_4.6kg,Weight_4.7kg,Weight_4kg
0,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
