In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.linear_model import Lasso, LassoCV
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from mlxtend.regressor import StackingCVRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

from scipy.stats import skew, norm

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

%matplotlib inline
warnings.filterwarnings("ignore")

In [None]:
# Folder holding the competition CSVs. Defaults to the original author's
# location; set the IMC_DATA_DIR environment variable to run the notebook
# on another machine without editing this cell.
import os

DATA_DIR = os.environ.get('IMC_DATA_DIR', 'C:\\Users\\user\\Desktop\\Jupyter\\CUAI\\3월_IMC')
dtrain = pd.read_csv(os.path.join(DATA_DIR, 'Adv_IMC_train.csv'))
dtest = pd.read_csv(os.path.join(DATA_DIR, 'Adv_IMC_test.csv'))

# Let's take a look into the Data

In [None]:
# Preview the first rows of the training data to see the raw column formats.
dtrain.head()

### TypeName
We can see 'TypeName' has the same values in the train and test data

In [None]:
# Distinct 'TypeName' values in train and in test, printed one list per line
# so the two can be compared by eye.
print(dtrain['TypeName'].unique().tolist())
print(dtest['TypeName'].unique().tolist())

### Inches
We can use the 'Inches' feature by deleting the NaN values and changing the rest into floats.

### ScreenResolution
In the 'ScreenResolution' column every value ends with a (number)x(number) pair, so we can extract that pair and store it in separate columns. A few phrases also show up repeatedly in the values.

For example) 'Full HD', 'Touchscreen', 'Quad HD+' etc.
I believe that these words have meaning when it comes to predicting the prices of the laptops so we will process them into individual columns as well.

In [None]:
# Distinct raw 'ScreenResolution' strings in the training data.
dtrain['ScreenResolution'].unique().tolist()

### Cpu
We can see that the Cpu feature contains both Intel and AMD CPUs, which we will have to convert into individual features.

In [None]:
# Distinct raw 'Cpu' strings in the training data.
dtrain['Cpu'].unique().tolist()

### Ram
Ram can easily be processed by just deleting the 'GB'

In [None]:
# Distinct raw 'Ram' strings in the training data.
dtrain['Ram'].unique().tolist()

### Memory
We can do the same stuff we did with the ScreenResolution column to the Memory column but just a little differently.

In [None]:
# Distinct raw 'Memory' strings in the training data.
dtrain['Memory'].unique().tolist()

### Gpu
basically the same thing as the Cpu

In [None]:
# Distinct raw 'Gpu' strings in the training data.
dtrain['Gpu'].unique().tolist()

### OpSys
We can see that the values in the train data and the test data are different. 'Android' appears only in the train data, so I treated it as noise and dropped those rows (I also didn't want to spend a column on the 'Android' value alone). The Windows variants are then merged into one column and the Mac variants into another, using integer codes (in the code below: every Windows version = 1; Mac OS X = 1, macOS = 2).

In [None]:
# Distinct 'OpSys' values in the training data.
dtrain.OpSys.unique().tolist()

In [None]:
# Distinct 'OpSys' values in the test data, for comparison with the train list.
dtest.OpSys.unique().tolist()

### Weight
The Weight column can also be used by just deleting the 'kg' suffix and then converting it to a float, much like what we did with 'Ram'.

In [None]:
# First few raw 'Weight' values in the training data.
dtrain.Weight.head()

# Drop noise
row 889 - has Samsung Cortex for Cpu and ARM Mali for Gpu(which is not in the test set and has nothing in common with the other index's)

rows 16, 287, 314, 920, 956 - the Cpu is listed only as 'Intel Core M', with no further model information. (Row 889 is the Samsung Cortex / ARM Mali laptop described above.)

row 219 contains 'Intel Iris Pro Graphics' which doesn't contain any information about its serial number nor generation.

row 268 & 712 contains 'Android' in OpSys which doesn't exist in the test set and there are only 2 rows containing 'Android' so I assumed the two rows as noise and dropped them

In [None]:
# Locate every cell equal to 'Android'; np.where returns the matching
# (row positions, column positions) of the underlying value array.
np.where(dtrain.values == 'Android')

In [None]:
# Positional indices of the rows identified above as noise (found with
# np.where on the raw values and by inspecting individual Gpu / OpSys cells).
noise_positions = [16, 219, 268, 287, 314, 712, 889, 920, 956]
dtrain = dtrain.drop(index=dtrain.index[noise_positions])

# Drop duplicates

In [None]:
# Keep the first occurrence of each fully duplicated row and discard the rest.
# (dtrain.duplicated().sum() shows how many rows are affected.)
dtrain = dtrain.drop_duplicates(keep='first')

# Drop NaN in Inches

In [None]:
# Drops every row that has a missing value in ANY column, not only 'Inches'.
dtrain = dtrain.dropna(axis=0, how='any')

# Reset index

In [None]:
# Renumber the remaining rows 0..n-1 and discard the old index labels.
dtrain = dtrain.reset_index(drop=True)

# Skewed Data

In [None]:
# Skewness and kurtosis of the raw price distribution.
print(f"Skewness: {dtrain['price'].skew():f}")
print(f"Kurtosis: {dtrain['price'].kurt():f}")

In [None]:
# Standardise the price and look at both tails of the distribution.
# StandardScaler is already imported in the first cell of the notebook.
# A one-column DataFrame supplies the 2-D input the scaler expects; indexing a
# Series with [:, np.newaxis] is no longer supported by pandas >= 2.0.
price_scaled = StandardScaler().fit_transform(dtrain[['price']])
price_sorted = price_scaled[price_scaled[:, 0].argsort()]  # sort once, slice twice
low_range = price_sorted[:10]
high_range = price_sorted[-10:]

print('outer range (low) of the distribution:')
print(low_range)
print('\nouter range (high) of the distribution:')
print(high_range)

In [None]:
# Replace the price by log(1 + price). Running this cell twice applies the
# transform twice, so re-run from the data-loading cell if needed.
dtrain = dtrain.assign(price=np.log1p(dtrain['price']))

In [None]:
# Show the price column after the log1p transform applied in the previous cell.
dtrain['price']

# Combine train and test features

In [None]:
# Split the target off the training frame, then stack the train and test
# predictors (train rows first) so both get identical preprocessing.
train_labels = dtrain['price'].reset_index(drop=True)
train = dtrain.drop(columns=['price'])
test = dtest
features = pd.concat([train, test], ignore_index=True)
features.shape

# Preprocessing 'Cpu' Column

In [None]:
def CPU_transform(data):
    """Derive numeric CPU features from the free-text 'Cpu' column.

    The 'Cpu' column is rewritten in place with spaces replaced by '+'. A
    'GHz' float column and one integer column per CPU family are added; each
    family column holds the digits of the model token that follows the family
    prefix, or 0 when the row belongs to another family.

    Every regular-expression replacement passes ``regex=True`` explicitly:
    pandas 2.0 changed the default of ``Series.str.replace`` to ``regex=False``,
    under which none of the patterns below would match.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a string 'Cpu' column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, with the added columns.
    """

    def _strip(series, pattern, repl=""):
        # Regex substitution on a string Series.
        return series.str.replace(pattern, repl, regex=True)

    def _to_int(series):
        # Blank or missing entries become 0; anything else must parse as a number.
        non_blank = series.where(series.str.strip() != "")
        return pd.to_numeric(non_blank).fillna(0).astype(int)

    def _model_code(substitutions):
        # Apply the family-specific substitutions, keep only the token before
        # the next '+', and reduce that token to its digits.
        token = data["Cpu"]
        for pattern, repl in substitutions:
            token = _strip(token, pattern, repl)
        token = _strip(token, r"[+].+")
        token = _strip(token, r"\D")
        return _to_int(token)

    # '+' as separator keeps the patterns below free of whitespace handling.
    data['Cpu'] = data["Cpu"].str.replace(" ", "+", regex=False)

    # Clock speed: drop everything up to the last '+', then the 'GHz' suffix.
    data['GHz'] = _strip(_strip(data["Cpu"], r".+[+]"), r"GHz").astype(float)

    # Column name -> ordered (pattern, replacement) pairs removing the family prefix.
    families = [
        # Intel Xeon E3
        ('Cpu_Xeon', [(r"Intel[+]Xeon[+]E3[-]", "")]),
        # Intel Core M - model names are mapped to small codes chosen with
        # price / release date in mind.
        ('Cpu_CoreM', [(r"Intel[+]Core[+]M[+]", ""), (r"M\d[-]", ""),
                       (r"6Y30", "1"), (r"6Y54", "1"), (r"6Y75", "2"), (r"7Y30", "3")]),
        # Intel Pentium Dual / Quad Core
        ('Cpu_Pentium2', [(r"Intel[+]Pentium[+]Dual[+]Core[+]", "")]),
        ('Cpu_Pentium4', [(r"Intel[+]Pentium[+]Quad[+]Core[+]", "")]),
        # Intel Atom
        ('Cpu_Atom', [(r"Intel[+]Atom[+]\D\d[-]", "")]),
        # Intel Celeron Dual / Quad Core
        ('Cpu_Celeron2', [(r"Intel[+]Celeron[+]Dual[+]Core[+]", "")]),
        ('Cpu_Celeron4', [(r"Intel[+]Celeron[+]Quad[+]Core[+]", "")]),
        # Intel Core i-series
        ('Cpu_i3', [(r"Intel[+]Core[+]i3[+]", "")]),
        ('Cpu_i5', [(r"Intel[+]Core[+]i5[+]", "")]),
        ('Cpu_i7', [(r"Intel[+]Core[+]i7[+]", "")]),
        # AMD A-series
        ('Cpu_AMD_A', [(r"AMD[+]A", ""), (r"[-].+[+].+[-]", ""), (r"[-]\D+[+]", "")]),
        # AMD Ryzen
        ('Cpu_AMD_Ryzen', [(r"AMD[+]Ryzen[+]", "")]),
        # AMD FX
        ('Cpu_AMD_FX', [(r"AMD[+]FX[+]", "")]),
        # AMD E-series
        ('Cpu_AMD_E', [(r"AMD[+]E[-]\D+", ""), (r".+[-]", "")]),
    ]
    for column, substitutions in families:
        data[column] = _model_code(substitutions)

    return data

In [None]:
# Apply the CPU feature extraction to the combined train+test frame.
features = CPU_transform(features)

# Preprocessing 'Gpu' Column

In [None]:
def GPU_transform(data):
    """Derive numeric GPU features from the free-text 'Gpu' column.

    The 'Gpu' column is rewritten in place (one alias normalised, spaces
    replaced by '+', generic 'Intel HD Graphics' entries given a model number
    based on the CPU). One numeric column per GPU family is then added, holding
    the model number for rows of that family and 0 elsewhere.

    Compared with the earlier row-by-row version this implementation
    * passes ``regex=True`` explicitly (the pandas 2.0 default is ``False``),
    * assigns with ``.loc`` instead of chained indexing, which is unreliable
      and does nothing under copy-on-write,
    * splits the M / MX model tokens with vectorised string methods instead of
      re-splitting the whole column for every row, so it no longer needs a
      0..n-1 index and does not create temporary helper columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with string 'Gpu' and 'Cpu' columns; 'Cpu' must already be in
        the '+'-separated form the comparisons below use. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, with the added columns.
    """

    def _sub(series, pattern, repl=""):
        # Regex substitution on a string Series.
        return series.str.replace(pattern, repl, regex=True)

    def _chain(series, *patterns):
        # Remove each pattern in turn.
        for pattern in patterns:
            series = _sub(series, pattern)
        return series

    def _to_number(series, dtype=int):
        # Blank or missing entries become 0; anything else must parse as a number.
        non_blank = series.where(series.str.strip() != "")
        return pd.to_numeric(non_blank).fillna(0).astype(dtype)

    HEAD = r"[+].+"                          # everything from the first '+' onwards
    TI = r"Nvidia[+]GeForce[+]GTX.+Ti"       # any GTX ... Ti card

    # AMD R17M-M1-70 is the same as AMD Radeon R7 M530
    data['Gpu'] = data["Gpu"].str.replace("AMD R17M-M1-70", "AMD Radeon R7 M530", regex=False)
    data['Gpu'] = data["Gpu"].str.replace(" ", "+", regex=False)

    # Intel HD Graphics
    # Reference - https://en.wikipedia.org/wiki/List_of_Intel_graphics_processing_units
    # Generic entries get the model number implied by the CPU they ship with.
    generic_hd = data['Gpu'] == 'Intel+HD+Graphics'
    cpu = data['Cpu']
    data.loc[generic_hd & cpu.isin(['Intel+Atom+x5-Z8350+1.44GHz',
                                    'Intel+Atom+X5-Z8350+1.44GHz',
                                    'Intel+Celeron+Dual+Core+N3060+1.6GHz']),
             'Gpu'] = 'Intel+HD+Graphics+400'
    data.loc[generic_hd & (cpu == 'Intel+Pentium+Quad+Core+N3710+1.6GHz'),
             'Gpu'] = 'Intel+HD+Graphics+405'
    data.loc[generic_hd & cpu.isin(['Intel+Core+i5+7200U+2.5GHz',
                                    'Intel+Core+i7+7600U+2.8GHz']),
             'Gpu'] = 'Intel+HD+Graphics+620'

    gpu = data["Gpu"]

    # The author notes that the remaining generic 'Intel HD Graphics' entries
    # are all 8th generation, so they are grouped under a single flag (1).
    # Missing Gpu values are flagged as well, as in the original implementation.
    stripped = _sub(gpu, r"Intel[+]HD[+]Graphics")
    is_generic = stripped.isna() | (stripped.str.strip() == "")
    data['Gpu_HDG_default'] = _to_number(_chain(stripped, HEAD, r"\D")).where(~is_generic, 1)
    data['Gpu_Intel_HD'] = _to_number(_chain(gpu, r"Intel[+]HD[+]Graphics[+]", HEAD, r"\D"))

    # Intel UHD Graphics
    data['Gpu_Intel_UHD'] = _to_number(_chain(gpu, r"Intel[+]UHD[+]\D+[+]", HEAD, r"\D"))

    # Intel Iris (default)/Plus
    data['Gpu_Intel_Iris'] = _to_number(
        _chain(gpu, r"Intel[+]Iris[+]Plus[+]\D+", r"Intel[+]Iris[+]\D+", HEAD, r"\D"))

    # Nvidia GeForce GTX Ti: simple 0/1 flag
    data['Gpu_Nvidia_GTX_Ti'] = _to_number(_chain(_sub(gpu, TI, "1"), HEAD, r"\D"))

    # Ti cards are neutralised to "0" first so they do not leak into the
    # M / MX / plain GTX columns below.
    ti_masked = _sub(gpu, TI, "0")

    # Nvidia GeForce GTX M: the digits before 'M' when the model token splits
    # into exactly two parts around 'M' (e.g. '960M' -> 960).
    m_token = _chain(ti_masked,
                     r"Nvidia[+]GeForce[+]GTX.+MX",
                     r"Nvidia[+]GeForce[+]GTX[+]",
                     HEAD,
                     r"[^\d+M]")
    m_parts = m_token.str.split('M')
    m_head = m_parts.str[0].where(m_parts.str.len() == 2)
    data['Gpu_Nvidia_GTX_M'] = _to_number(_sub(m_head, r"[.+]"))

    # Nvidia GeForce GTX MX: same idea, keyed on a single 'MX' in the token.
    mx_token = _chain(ti_masked,
                      r"Nvidia[+]GeForce[+]GTX[+]",
                      HEAD,
                      r"[^\d+MX]")
    has_single_mx = mx_token.str.split('MX').str.len() == 2
    mx_head = mx_token.str.split('M').str[0].where(has_single_mx)
    data['Gpu_Nvidia_GTX_MX'] = _to_number(_sub(mx_head, r"[.+]"))

    # Nvidia GeForce GTX (plain). The 'Nvidia[+]GTX[+]' step keeps the
    # 'Nvidia GTX 980 SLI' entry, which the author did not want to drop.
    data['Gpu_Nvidia_GTX'] = _to_number(
        _chain(ti_masked,
               r"Nvidia[+]GeForce[+]GTX.+MX?",
               r"Nvidia[+]GeForce[+]GTX[+]?",
               r"<.+>",
               r"Nvidia[+]GTX[+]",
               HEAD,
               r"\D+"))

    # Nvidia GeForce GT (GTX entries are removed first)
    data['Gpu_Nvidia_GT'] = _to_number(
        _chain(gpu, r"Nvidia[+]GeForce[+]GTX.+", r"Nvidia[+]GeForce[+]GT[+]", HEAD, r"\D+"))

    # Nvidia Quadro
    data['Gpu_Nvidia_Q'] = _to_number(_chain(gpu, r"Nvidia[+]Quadro[+]", HEAD, r"\D+"))

    # Nvidia GeForce without GT/GTX (M, MX and plain are not distinguished)
    data['Gpu_Nvidia_MX'] = _to_number(
        _chain(gpu, r"Nvidia[+]GeForce[+]", r"GT.+", HEAD, r"\D+"))

    # AMD Radeon R-series, encoded as <series>.<model> (e.g. 'R7 M445' -> 7.445)
    amd_r = gpu
    for pattern, repl in [(r"AMD[+]Radeon[+]R", ""), (r"AMD[+]FirePro.+", ""),
                          (r"Intel.+", ""), (r"Nvidia.+", ""),
                          (r"[+]M", "."), (r"X", "10"), (r"[+]", "."),
                          (r"AMD.Radeon.+", ""), (r"Graphics", ""), (r"AMD.R", "")]:
        amd_r = _sub(amd_r, pattern, repl)
    data['Gpu_AMD_R'] = _to_number(amd_r, dtype=float)

    # AMD FirePro
    data['Gpu_AMD_FP'] = _to_number(_chain(gpu, r"AMD[+]FirePro[+]", HEAD, r"\D+"))

    # AMD Pro
    data['Gpu_AMD_Pro'] = _to_number(_chain(gpu, r"AMD[+]Radeon[+]Pro[+]", HEAD, r"\D+"))

    # Number that directly follows 'AMD Radeon'
    data['Gpu_AMD_NUM'] = _to_number(
        _chain(gpu,
               r"AMD[+]Radeon[+]R.+",
               r"AMD[+]Radeon[+]Pro.+",
               r"AMD[+]Radeon[+]",
               r"AMD[+]FirePro.+",
               r"Intel.+",
               r"Nvidia.+",
               r"\D+"))

    return data

In [None]:
# Apply the GPU feature extraction; it reads the 'Cpu' column in the form
# written by CPU_transform, so this cell must run after that one.
features = GPU_transform(features)

# ScreenResolution

In [None]:
def SR_transform(data):
    """Add 0/1 flags for the marketing phrases found in 'ScreenResolution'.

    One integer column is added per phrase: 1 when the phrase occurs in the
    row's 'ScreenResolution' text, 0 otherwise (missing values give 0).

    The previous findall / explode / regex-replace chain passed the column
    name to ``Series.explode`` (where it was taken as ``ignore_index``) and
    depended on the pre-2.0 regex default of ``str.replace``; a literal
    substring test gives the same flags without either dependency.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a string 'ScreenResolution' column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, with the added flag columns.
    """
    phrases = [
        ('SR_4K', '4K Ultra HD'),
        ('SR_Touch', 'Touchscreen'),
        ('SR_QuadHD', 'Quad HD'),
        ('SR_Retina', 'Retina Display'),
        ('SR_FullHD', 'Full HD'),
    ]
    resolution = data['ScreenResolution']
    for column, phrase in phrases:
        data[column] = resolution.str.contains(phrase, regex=False, na=False).astype(int)

    return data

In [None]:
# Add the screen-phrase flags; must run before SR_XY_transform, which
# overwrites the 'ScreenResolution' text with a pixel count.
features = SR_transform(features)

In [None]:
def Memory_transform(data):
    """Split the 'Memory' text into per-technology storage sizes.

    'Memory' is normalised in place ('.0' removed, 'GB' removed, 'TB'
    replaced by '000') and split at the first '+' into a first and an optional
    second drive. For each drive the size (digits only) and a 0/1 flag per
    technology are stored in helper columns ('first', 'second', 'HDD1' ...
    'Flash_Storage2'); the totals go to 'HDD', 'SSD', 'Hybrid' and
    'Flash_Storage'.

    Fixes over the previous version: the second-drive default is assigned
    directly instead of through a chained ``fillna(inplace=True)`` (which has
    no effect under copy-on-write), frames in which no row has a second drive
    no longer raise ``KeyError``, and the regex replacements are explicit.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'Memory' column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, with the helper and total columns added.

    Raises
    ------
    ValueError
        If a first-drive description contains no digits at all.
    """
    memory = data['Memory'].astype(str)
    memory = memory.str.replace(r'\.0', '', regex=True)      # '1.0TB' -> '1TB'
    memory = memory.str.replace('GB', '', regex=False)
    memory = memory.str.replace('TB', '000', regex=False)    # sizes end up in GB
    data['Memory'] = memory

    drives = memory.str.split("+", n=1, expand=True)
    first_text = drives[0].str.strip()
    if drives.shape[1] > 1:
        second_text = drives[1].fillna("0")
    else:
        # No row has a second drive.
        second_text = pd.Series("0", index=data.index)

    data["first"] = first_text.str.replace(r'\D', '', regex=True).astype(int)
    data["second"] = second_text.str.replace(r'\D', '', regex=True).astype(int)

    technologies = [("HDD", "HDD"), ("SSD", "SSD"),
                    ("Hybrid", "Hybrid"), ("Flash_Storage", "Flash Storage")]
    for suffix, text in (("1", first_text), ("2", second_text)):
        for column, phrase in technologies:
            data[column + suffix] = text.str.contains(phrase, regex=False).astype(int)

    # Total capacity per technology over both drives.
    for column, _ in technologies:
        data[column] = (data["first"] * data[column + "1"]
                        + data["second"] * data[column + "2"])

    return data

In [None]:
# Add the storage-size features derived from the 'Memory' text.
features = Memory_transform(features)

In [None]:
def SR_XY_transform(data):
    """Extract the pixel dimensions from 'ScreenResolution'.

    Adds integer 'xres' and 'yres' columns taken from the trailing
    '<width>x<height>' pair and overwrites 'ScreenResolution' with their
    product (total pixel count).

    The earlier version stripped the descriptive prefix with a regex that left
    the leading '4' of values such as '4K Ultra HD 3840x2160' in place, giving
    xres = 43840. Capturing the two numbers directly avoids that.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose 'ScreenResolution' strings end in '<width>x<height>'.
        Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, with 'xres' and 'yres' added.

    Raises
    ------
    ValueError
        If a value has no trailing '<width>x<height>' pair (the missing
        result cannot be converted to int).
    """
    dimensions = data["ScreenResolution"].str.extract(r'(\d+)x(\d+)\s*$')
    data["xres"] = dimensions[0].astype(int)
    data["yres"] = dimensions[1].astype(int)
    data["ScreenResolution"] = (data["xres"] * data["yres"]).astype(int)

    return data

In [None]:
# Replace the 'ScreenResolution' text by a pixel count and add 'xres' / 'yres'.
features = SR_XY_transform(features)

In [None]:
def Ram_transform(data):
    """Convert the 'Ram' column from text such as '8GB' to an integer.

    The frame is modified in place and also returned.
    """
    ram_without_unit = data["Ram"].str.replace('GB', '')
    data["Ram"] = ram_without_unit.astype(int)
    return data

In [None]:
# Convert 'Ram' to integers.
features = Ram_transform(features)

In [None]:
def Weight_transform(data):
    """Convert the 'Weight' column from text such as '1.37kg' to a float.

    The frame is modified in place and also returned.
    """
    weight_without_unit = data["Weight"].str.replace('kg', '')
    data["Weight"] = weight_without_unit.astype(float)
    return data

In [None]:
# Convert 'Weight' to floats.
features = Weight_transform(features)

In [None]:
def Drop_transform(data):
    """Return a copy of the frame without the raw text and helper columns.

    Removes 'Cpu', 'Gpu' and 'Memory' together with the per-drive helper
    columns ('first', 'second', 'HDD1' ... 'Flash_Storage2').
    """
    raw_text_columns = ['Cpu', 'Gpu', 'Memory']
    drive_helpers = ['first', 'second']
    for slot in ('1', '2'):
        for technology in ('HDD', 'SSD', 'Hybrid', 'Flash_Storage'):
            drive_helpers.append(technology + slot)

    return data.drop(columns=raw_text_columns + drive_helpers)

In [None]:
# Remove the raw text columns and the intermediate helper columns.
features = Drop_transform(features)

# Skewed data process

In [None]:
# Skewness of each continuous feature, to decide which ones to log-transform.
skewcolumn = ['Inches','ScreenResolution','Ram','Weight','GHz','HDD','SSD','Hybrid','Flash_Storage','xres','yres']
for column in skewcolumn:
    print(f'{column} : {features[column].skew()}')

In [None]:
# Apply log(1 + x) to the selected features. Running this cell twice applies
# the transform twice.
for column in ('ScreenResolution', 'Weight', 'xres', 'yres'):
    features[column] = np.log1p(features[column])

In [None]:
# Display the engineered feature frame.
features

# One-Hot Encode

In [None]:
def one_hot_TypeName(data):
    """Return the one-hot (dummy) encoding of the 'TypeName' column."""
    return pd.get_dummies(data['TypeName'])

In [None]:
# Dummy columns for 'TypeName'; merged into `features` by input_dummy().
TN = one_hot_TypeName(features)

In [None]:
def one_hot_OpSys(data):
    """One-hot encode 'OpSys', merging the Windows and Mac variants.

    The Windows dummies ('Windows 7', 'Windows 10 S', 'Windows 10') are summed
    into a single 'Windows' column. The Mac dummies are combined into 'MacOS'
    with Mac OS X coded 1 and macOS coded 2 (the author ranks macOS higher).
    All other operating systems keep their own dummy column.

    Dummies are created as integers so the sums are arithmetic (with boolean
    dummies, the default since pandas 2.0, '+' is a logical OR), and a variant
    that does not occur in ``data`` is treated as an all-zero column instead
    of raising ``KeyError``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an 'OpSys' column.

    Returns
    -------
    pandas.DataFrame
        Remaining dummy columns followed by 'Windows' and 'MacOS'.
    """
    windows_variants = ['Windows 7', 'Windows 10 S', 'Windows 10']
    mac_variants = ['Mac OS X', 'macOS']

    OS = pd.get_dummies(data['OpSys'], dtype=int)
    absent = [name for name in windows_variants + mac_variants if name not in OS.columns]
    OS = OS.reindex(columns=list(OS.columns) + absent, fill_value=0)

    OS['Windows'] = OS['Windows 7'] + OS['Windows 10 S'] + OS['Windows 10']
    OS = OS.drop(windows_variants, axis=1)
    OS['MacOS'] = OS['Mac OS X']*1 + OS['macOS']*2  # macOS is ranked above Mac OS X
    OS = OS.drop(mac_variants, axis=1)
    return OS

In [None]:
# Dummy columns for 'OpSys'; merged into `features` by input_dummy().
OS = one_hot_OpSys(features)

In [None]:
# Distinct manufacturers in the combined train+test frame.
features.Company.unique().tolist()

In [None]:
def one_hot_Company(data):
    """Return the one-hot (dummy) encoding of the 'Company' column."""
    return pd.get_dummies(data['Company'])

In [None]:
# Dummy columns for 'Company'; merged into `features` by input_dummy().
CP = one_hot_Company(features)

In [None]:
def one_hot_Product(data):
    """Return the one-hot (dummy) encoding of the 'Product' column."""
    return pd.get_dummies(data['Product'])

In [None]:
# Dummy columns for 'Product'; merged into `features` by input_dummy().
Pr = one_hot_Product(features)

In [None]:
def input_dummy():
    """Copy every dummy column of TN, OS, CP and Pr into the global `features`.

    Frames are processed in that order; when a column name already exists in
    `features`, it is overwritten by the later assignment.
    """
    for dummy_frame in (TN, OS, CP, Pr):
        for column in dummy_frame.columns:
            features[column] = dummy_frame[column]

input_dummy()

In [None]:
# Keep only non-object columns; the original text columns are dropped here
# now that their dummy encodings have been added.
features = features.select_dtypes(exclude=['object'])

In [None]:
# Cast every column to float64. astype returns a new frame, so the result has
# to be assigned back; previously it was only displayed and then discarded.
features = features.astype(np.float64)
features.head()

In [None]:
# Check the dtype of every remaining feature column.
features.dtypes

In [None]:
# List the final feature names.
features.columns.tolist()

In [None]:
# Undo the earlier concatenation: the first len(train_labels) rows are the
# training rows, the rest are the test rows.
n_train_rows = len(train_labels)
train = features.iloc[:n_train_rows]
test = features.iloc[n_train_rows:]
train.shape, train_labels.shape, test.shape

# BayesianOptimizer
With the BayesianOptimizer we will find the ideal values for the parameters of each model.

In [None]:
from bayes_opt import BayesianOptimization
import xgboost as xgb
from xgboost import XGBRegressor, XGBRegressor
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics

# Models
To find out what each parameter means, follow the link given under each model's heading.

## LightGBM
LightGBM - https://neptune.ai/blog/lightgbm-parameters-guide

## XGB Regressor
https://xgboost.readthedocs.io/en/latest/parameter.html

In [None]:
# Objective function for the search (XGBRegressor)
def XGB_cv(max_depth, learning_rate, n_estimators, gamma
             ,min_child_weight, subsample
             ,colsample_bytree, reg_alpha, reg_lambda, objective='reg:linear', silent=True, nthread=-1):
    """Return the negated mean 5-fold CV RMSE of an XGBRegressor.

    `max_depth` and `n_estimators` are truncated to int because the optimiser
    proposes floats. `silent` is accepted but not used. The score is negated
    so that maximising it minimises the error. Reads the global `train` and
    `train_labels`.
    """
    # Define the model
    model_params = dict(
        max_depth=int(max_depth),
        learning_rate=learning_rate,
        n_estimators=int(n_estimators),
        gamma=gamma,
        min_child_weight=min_child_weight,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        objective=objective,
        nthread=nthread,
    )
    model = XGBRegressor(**model_params)

    # Compute the metric: one RMSE per fold
    fold_mse = -cross_val_score(model, train, train_labels, scoring="neg_mean_squared_error", cv=5)
    fold_rmse = np.sqrt(fold_mse)

    # Return the metric the optimiser will maximise
    return -fold_rmse.mean()

In [None]:
# Hyperparameter ranges to explore: name -> (lower bound, upper bound)
pbounds = dict(
    max_depth=(3, 8),
    learning_rate=(0.001, 0.1),
    n_estimators=(1000, 10000),
    gamma=(0, 1),
    min_child_weight=(0, 3),
    subsample=(0.5, 1),
    colsample_bytree=(0.2, 1),
    reg_alpha=(0, 5),
    reg_lambda=(0, 10),
)

bo = BayesianOptimization(
    f=XGB_cv,
    pbounds=pbounds,
    verbose=2,
    random_state=42,
)

bo.maximize(init_points=10, n_iter=30, acq='ei', xi=0.01)

print(bo.max)

In [None]:
#{'target': -0.1841129313945096, 'params': {'colsample_bytree': 0.5152651581814524, 'gamma': 0.021071952664826532, 'learning_rate': 0.07603995749131016, 'max_depth': 6.809844347927643, 'min_child_weight': 2.761301161457484, 'n_estimators': 6136.355874258778, 'reg_alpha': 1.3188365258147017, 'reg_lambda': 1.4406194115140336, 'subsample': 0.7777929105084442}}

## Ridge
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html

In [None]:
from sklearn.metrics import r2_score, mean_squared_error

# Objective function for the search (Ridge)
def Ridge_cv(alpha):
    """Return the negated mean 5-fold CV RMSE of a RobustScaler + Ridge pipeline.

    Reads the global `train` and `train_labels`.
    """
    # Define the model
    regressor = Ridge(alpha=alpha)
    model = make_pipeline(RobustScaler(), regressor)

    # Compute the metric: one RMSE per fold
    fold_mse = -cross_val_score(model, train, train_labels, scoring="neg_mean_squared_error", cv=5)
    fold_rmse = np.sqrt(fold_mse)

    # Return the metric the optimiser will maximise
    return -fold_rmse.mean()

In [None]:
# Hyperparameter ranges to explore: name -> (lower bound, upper bound)
pbounds = dict(alpha=(0.001, 10))

bo = BayesianOptimization(
    f=Ridge_cv,
    pbounds=pbounds,
    verbose=2,
    random_state=42,
)
bo.maximize(init_points=20, n_iter=60, acq='ei', xi=0.01)
print(bo.max)

In [None]:
#{'target': -0.2162339615853202, 'params': {'alpha': 0.5817780380698264}}

## Lasso
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html

In [None]:
from sklearn.metrics import r2_score, mean_squared_error

# Objective function for the search (Lasso)
def Lasso_cv(alpha):
    """Return the negated mean 5-fold CV RMSE of a RobustScaler + Lasso pipeline.

    Reads the global `train` and `train_labels`.
    """
    # Define the model
    regressor = Lasso(alpha=alpha)
    model = make_pipeline(RobustScaler(), regressor)

    # Compute the metric: one RMSE per fold
    fold_mse = -cross_val_score(model, train, train_labels, scoring="neg_mean_squared_error", cv=5)
    fold_rmse = np.sqrt(fold_mse)

    # Return the metric the optimiser will maximise
    return -fold_rmse.mean()

In [None]:
# Hyperparameter ranges to explore: name -> (lower bound, upper bound)

pbounds = dict(alpha=(1e-15, 1))

bo = BayesianOptimization(
    f=Lasso_cv,
    pbounds=pbounds,
    verbose=2,
    random_state=42,
)
bo.maximize(init_points=20, n_iter=60, acq='ei', xi=0.01)
print(bo.max)

In [None]:
#{'target': -0.24154720402835123, 'params': {'alpha': 1.1634755367141103e-05}}

## Elastic Net
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html

In [None]:
from sklearn.metrics import r2_score, mean_squared_error

# Objective function for the search (ElasticNet)
def ElasticNet_cv(alpha):
    """Return the negated mean 5-fold CV RMSE of a RobustScaler + ElasticNet pipeline.

    Reads the global `train` and `train_labels`.
    """
    # Define the model
    regressor = ElasticNet(alpha=alpha)
    model = make_pipeline(RobustScaler(), regressor)

    # Compute the metric: one RMSE per fold
    fold_mse = -cross_val_score(model, train, train_labels, scoring="neg_mean_squared_error", cv=5)
    fold_rmse = np.sqrt(fold_mse)

    # Return the metric the optimiser will maximise
    return -fold_rmse.mean()

In [None]:
# Hyperparameter ranges to explore: name -> (lower bound, upper bound)
pbounds = dict(alpha=(1e-15, 1))

bo = BayesianOptimization(
    f=ElasticNet_cv,
    pbounds=pbounds,
    verbose=2,
    random_state=42,
)
bo.maximize(init_points=20, n_iter=60, acq='ei', xi=0.01)
print(bo.max)

In [None]:
# {'target': -0.24719560709406352, 'params': {'alpha': 1.1634755367141103e-05}}

## SVR
https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html

In [None]:
from sklearn.metrics import r2_score, mean_squared_error

# Objective function for the search (Support Vector Regressor)
def SVR_cv(C, epsilon, gamma):
    """Return the negated mean 5-fold CV RMSE of a RobustScaler + SVR pipeline.

    Reads the global `train` and `train_labels`.
    """
    # Define the model
    regressor = SVR(C=C, epsilon=epsilon, gamma=gamma)
    model = make_pipeline(RobustScaler(), regressor)

    # Compute the metric: one RMSE per fold
    fold_mse = -cross_val_score(model, train, train_labels, scoring="neg_mean_squared_error", cv=5)
    fold_rmse = np.sqrt(fold_mse)

    # Return the metric the optimiser will maximise
    return -fold_rmse.mean()

In [None]:
# Hyperparameter ranges to explore: name -> (lower bound, upper bound)
pbounds = dict(
    C=(0.1, 100),
    epsilon=(1e-8, 0.1),
    gamma=(1e-8, 0.1),
)

bo = BayesianOptimization(
    f=SVR_cv,
    pbounds=pbounds,
    verbose=2,
    random_state=42,
)
bo.maximize(init_points=20, n_iter=60, acq='ei', xi=0.01)
print(bo.max)

In [None]:
# {'target': -0.26173172662582117, 'params': {'C': 51.44116726098502, 'epsilon': 0.0388359599090226, 'gamma': 0.0006357750751616731}}

## GBR
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html

In [None]:
from sklearn.metrics import r2_score, mean_squared_error

# Objective function for the search (Gradient Boosting Regressor)
def GBR_cv(n_estimators, max_depth, min_samples_leaf, min_samples_split,learning_rate=0.001, max_features='sqrt', loss='huber'):
    """Return the negated mean 5-fold CV RMSE of a GradientBoostingRegressor.

    The count-like arguments are truncated to int because the optimiser
    proposes floats. Reads the global `train` and `train_labels`.
    """
    # Define the model
    model_params = dict(
        n_estimators=int(n_estimators),
        learning_rate=learning_rate,
        max_depth=int(max_depth),
        max_features=max_features,
        min_samples_leaf=int(min_samples_leaf),
        min_samples_split=int(min_samples_split),
        loss=loss,
    )
    model = GradientBoostingRegressor(**model_params)

    # Compute the metric: one RMSE per fold
    fold_mse = -cross_val_score(model, train, train_labels, scoring="neg_mean_squared_error", cv=5)
    fold_rmse = np.sqrt(fold_mse)

    # Return the metric the optimiser will maximise
    return -fold_rmse.mean()

In [None]:
# Hyperparameter ranges to explore: name -> (lower bound, upper bound)

pbounds = dict(
    n_estimators=(1000, 10000),
    learning_rate=(0.001, 0.1),
    max_depth=(2, 8),
    min_samples_leaf=(5, 50),
    min_samples_split=(5, 50),
)

bo = BayesianOptimization(
    f=GBR_cv,
    pbounds=pbounds,
    verbose=2,
    random_state=42,
)
bo.maximize(init_points=20, n_iter=60, acq='ei', xi=0.01)
print(bo.max)

In [None]:
# {'target': -0.1757488282065423, 'params': {'learning_rate': 0.07105448853347894, 'max_depth': 4.2815300710953945, 'min_samples_leaf': 5.429323528975727, 'min_samples_split': 45.77064151175414, 'n_estimators': 4919.741973374068}}

## Random Forest Regressor
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

In [None]:
from sklearn.metrics import r2_score, mean_squared_error

# Objective function for the search (Random Forest Regressor)
def RF_cv(n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features=None, oob_score=True):
    """Return the negated mean 5-fold CV RMSE of a RandomForestRegressor.

    The count-like arguments are truncated to int because the optimiser
    proposes floats. Reads the global `train` and `train_labels`.

    `oob_score` now defaults to the boolean True; the previous default was
    the string 'True', which scikit-learn's parameter validation rejects.
    The strings 'True' / 'False' are still accepted and converted to the
    boolean they spell.
    """
    if isinstance(oob_score, str):
        oob_score = oob_score.strip().lower() == 'true'

    # Define the model
    model = RandomForestRegressor(n_estimators=int(n_estimators),
                                  max_depth=int(max_depth),
                                  min_samples_split=int(min_samples_split),
                                  min_samples_leaf=int(min_samples_leaf),
                                  max_features=max_features,
                                  oob_score=oob_score
                                  )

    # Compute the metric: one RMSE per fold
    rmse = np.sqrt(-cross_val_score(model, train, train_labels, scoring="neg_mean_squared_error", cv=5))

    # Return the metric the optimiser will maximise
    return -rmse.mean()

In [None]:
# Hyperparameter ranges to explore: name -> (lower bound, upper bound)
pbounds = dict(
    n_estimators=(1000, 6000),
    max_depth=(10, 30),
    min_samples_split=(2, 8),
    min_samples_leaf=(2, 8),
)

bo = BayesianOptimization(
    f=RF_cv,
    pbounds=pbounds,
    verbose=2,
    random_state=42,
)
bo.maximize(init_points=10, n_iter=30, acq='ei', xi=0.01)
print(bo.max)

In [None]:
#{'target': -0.2080683544796602, 'params': {'max_depth': 22.237057894447588, 'min_samples_leaf': 2.836963163912251, 'min_samples_split': 3.752867891211309, 'n_estimators': 2831.8092164684585}}

## KNN Regressor

In [None]:
from sklearn.metrics import r2_score, mean_squared_error

# Objective function for the search (K-Neighbors)
def KNR_cv(n_neighbors, weights='distance'):
    """Return the negated mean 5-fold CV RMSE of a KNeighborsRegressor.

    `n_neighbors` is truncated to int because the optimiser proposes floats.
    Reads the global `train` and `train_labels`.
    """
    # Define the model
    neighbour_count = int(n_neighbors)
    model = KNeighborsRegressor(n_neighbors=neighbour_count, weights=weights)

    # Compute the metric: one RMSE per fold
    fold_mse = -cross_val_score(model, train, train_labels, scoring="neg_mean_squared_error", cv=5)
    fold_rmse = np.sqrt(fold_mse)

    # Return the metric the optimiser will maximise
    return -fold_rmse.mean()

In [None]:
# Hyperparameter ranges to explore: name -> (lower bound, upper bound)
pbounds = dict(n_neighbors=(3, 10))

bo = BayesianOptimization(
    f=KNR_cv,
    pbounds=pbounds,
    verbose=2,
    random_state=42,
)
bo.maximize(init_points=20, n_iter=60, acq='ei', xi=0.01)
print(bo.max)

In [None]:
#{'target': -0.27512379240601337, 'params': {'n_neighbors': 4.092130483097056}}

## Set Models

In [None]:
# Five shuffled cross-validation folds with a fixed seed, shared by cv_rmse.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [None]:
# Error metrics
def rmsle(y, y_pred):
    """Return the square root of the mean squared error between ``y`` and ``y_pred``.

    NOTE(review): no log transform is applied here, so the result is an RMSLE
    only if both inputs are already on a log scale -- confirm against callers.
    """
    squared_error = mean_squared_error(y, y_pred)
    return np.sqrt(squared_error)

def mse(y, y_pred):
    """Return the mean squared error between ``y`` and ``y_pred``.

    Both arguments may be any array-like (list, tuple, NumPy array, pandas
    Series). They are converted with ``np.asarray`` first so that:

    * plain Python sequences are accepted, and
    * two pandas Series are compared position by position rather than being
      aligned on their index labels, which would yield NaN for labels that do
      not match.

    Parameters
    ----------
    y : array-like of true values.
    y_pred : array-like of predicted values, same length as ``y``.

    Returns
    -------
    The mean of the squared element-wise differences.
    """
    residuals = np.asarray(y) - np.asarray(y_pred)
    return np.mean(np.square(residuals))

def cv_rmse(model, train=train):
    """Return the per-fold RMSE of ``model`` under the shared ``kf`` splitter.

    Parameters
    ----------
    model : estimator handed to ``cross_val_score``.
    train : feature matrix; the default is the global ``train`` as it was
        bound when this function was defined.

    Returns
    -------
    Array with one RMSE value per fold, scored against the global
    ``train_labels``.
    """
    neg_mse_per_fold = cross_val_score(model, train, train_labels, scoring="neg_mean_squared_error", cv=kf)
    # The scorer reports negated MSE; flip the sign before taking the root
    return np.sqrt(-neg_mse_per_fold)

In [None]:
# XGBoost Regressor
xgboost = XGBRegressor(learning_rate=0.076,
                       n_estimators=6000,
                       max_depth=6,
                       min_child_weight=2,
                       gamma=0.021071952664826532,
                       subsample=0.7777929105084442,
                       colsample_bytree=0.5152651581814524,
                       # 'reg:linear' is the deprecated name of the squared-error objective
                       objective='reg:squarederror',
                       # n_jobs replaces the deprecated nthread argument
                       n_jobs=-1,
                       scale_pos_weight=1,
                       reg_alpha=1.3188365258147017,
                       reg_lambda=1.4406194115140336,
                       verbosity=0,
                       # Single seed: the deprecated seed=27 conflicted with random_state
                       random_state=42)

# Ridge, Lasso and ElasticNet behave similarly, so only the best-performing
# of the three (Ridge) is carried into the stack below.

# Ridge Regressor
ridge = make_pipeline(RobustScaler(), Ridge(alpha=0.5817780380698264))

# Lasso Regressor
lasso = make_pipeline(RobustScaler(), Lasso(alpha=1.1634755367141103e-05))

# Elastic Net Regressor
elasticnet = make_pipeline(RobustScaler(), ElasticNet(alpha=1.1634755367141103e-05))

# Support Vector Regressor
svr = make_pipeline(RobustScaler(), SVR(C=51.44116726098502, epsilon=0.0388359599090226, gamma=0.0006357750751616731))

# Gradient Boosting Regressor
gbr = GradientBoostingRegressor(n_estimators=5000,
                                learning_rate=0.07105448853347894,
                                max_depth=4,
                                max_features='sqrt',
                                min_samples_leaf=5,
                                min_samples_split=45,
                                loss='huber',
                                random_state=42)

# Random Forest Regressor
rf = RandomForestRegressor(n_estimators=3000,
                          max_depth=22,
                          min_samples_split=3,
                          min_samples_leaf=2,
                          max_features=None,
                          oob_score=True,
                          random_state=42)

# K-Neighbors Regressor
kn = KNeighborsRegressor(n_neighbors=4, weights='distance')

# Stack six of the models above (lasso and elasticnet are left out), with gbr
# as the meta-regressor. random_state fixes the stacker's internal CV shuffling
# so repeated runs produce the same ensemble.
stack_gen = StackingCVRegressor(regressors=(xgboost, ridge, svr, gbr, rf, kn),
                                meta_regressor=gbr,
                                use_features_in_secondary=True,
                                random_state=42)

# Train Models

In [None]:
scores = {}

In [None]:
score = cv_rmse(xgboost)
print("xgboost: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['xgb'] = (score.mean(), score.std())

In [None]:
score = cv_rmse(ridge)
print("ridge: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['ridge'] = (score.mean(), score.std())

In [None]:
score = cv_rmse(lasso)
print("lasso: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['lasso'] = (score.mean(), score.std())

In [None]:
score = cv_rmse(elasticnet)
print("elasticnet: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['elasticnet'] = (score.mean(), score.std())

In [None]:
score = cv_rmse(svr)
print("SVR: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['svr'] = (score.mean(), score.std())

In [None]:
score = cv_rmse(gbr)
print("gbr: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['gbr'] = (score.mean(), score.std())

In [None]:
score = cv_rmse(rf)
print("rf: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['rf'] = (score.mean(), score.std())

In [None]:
score = cv_rmse(kn)
print("kn: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['kn'] = (score.mean(), score.std())

# Fit Models

In [None]:
stack_gen_model = stack_gen.fit(np.array(train), np.array(train_labels))
print('complete : stack_gen')

In [None]:
xgb_model_full_data = xgboost.fit(train, train_labels)
print('complete : xgboost')

In [None]:
ridge_model_full_data = ridge.fit(train, train_labels)
print('complete : Ridge')

In [None]:
svr_model_full_data = svr.fit(train, train_labels)
print('complete : Svr')

In [None]:
gbr_model_full_data = gbr.fit(train, train_labels)
print('complete : GradientBoosting')

In [None]:
rf_model_full_data = rf.fit(train, train_labels)
print('complete : RandomForest')

In [None]:
kn_model_full_data = kn.fit(train, train_labels)
print('complete : KNR')

# Blend Models

In [None]:
# Blend two models instead of trusting one, to prevent overfitting
def blended_predictions(X):
    """Return the equal-weight average of the GBR and stacked-ensemble predictions.

    Parameters
    ----------
    X : feature matrix; passed as-is to the gradient boosting model and
        converted with ``np.array`` for the stacked model.

    Returns
    -------
    ``0.5 * gbr prediction + 0.5 * stack prediction`` for every row of ``X``.
    """
    gbr_part = 0.5 * gbr_model_full_data.predict(X)
    stack_part = 0.5 * stack_gen_model.predict(np.array(X))
    return gbr_part + stack_part

In [None]:
# Score the blended model on the training data.
# The blended prediction is computed once and reused for both metrics.
train_blend = blended_predictions(train)

blended_score_rmsle = rmsle(train_labels, train_blend)
# expm1 is applied to labels and predictions before the MSE/RMSE are computed
blended_score_mse = mse(np.expm1(train_labels), np.expm1(train_blend))
rmse = np.sqrt(blended_score_mse)

# Unlike the other entries in `scores` (cross-validated), this one is a
# training-set score, so no fold standard deviation exists; 0 is a placeholder.
scores['blended'] = (blended_score_rmsle, 0)

print('RMSLE score on train data:')
print(blended_score_rmsle)
print('MSE score on train data:')
print(blended_score_mse)
print('RMSE score on train data:')
print(rmse)

In [None]:
# How well do the blended predictions fit the training targets?
from sklearn import metrics

# Both sides are passed through expm1 before scoring
actual_values = np.expm1(train_labels)
fitted_values = np.expm1(blended_predictions(train))

# Note: despite the variable name, this is the R^2 score
accuracy = metrics.r2_score(actual_values, fitted_values)

print(accuracy)

In [None]:
# Plot the recorded score of every model
sns.set_style("white")
fig = plt.figure(figsize=(24, 12))

model_names = list(scores.keys())
mean_scores = [mean for mean, _ in scores.values()]
ax = sns.pointplot(x=model_names, y=mean_scores, markers=['o'], linestyles=['-'])

# Write each value next to its point, nudged slightly upwards
for pos, value in enumerate(mean_scores):
    ax.text(pos, value + 0.002, '{:.6f}'.format(value), horizontalalignment='left', size='large', color='black', weight='semibold')

ax.set_ylabel('Score (RMSE)', size=20, labelpad=12.5)
ax.set_xlabel('Model', size=20, labelpad=12.5)
ax.tick_params(axis='x', labelsize=13.5)
ax.tick_params(axis='y', labelsize=12.5)

ax.set_title('Scores of Models', size=20)

plt.show()

In [None]:
predicted_prices = blended_predictions(test)
print(predicted_prices)

In [None]:
np.floor(np.expm1(blended_predictions(test)))

In [None]:
submission = np.expm1(blended_predictions(test))

In [None]:
MK = pd.DataFrame({'price': submission})

In [None]:
MK.to_csv('김민구_3차.csv', index=False)