### importing the required packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
# Suppress every warning emitted from here on.
# NOTE(review): this is a blanket filter, so library deprecation and
# convergence warnings are hidden as well - consider narrowing it to
# specific categories once the notebook is stable.
warnings.filterwarnings('ignore')

#### loading the cleaned dataset as pandas dataframe

In [2]:
# Read the cleaned dataset from a path relative to the notebook's working directory.
df = pd.read_csv('data/cleaned_data.csv')

In [3]:
# Preview the first rows to sanity-check the loaded columns and value formats.
df.head()

Unnamed: 0,Company,TypeName,Ram,Gpu,Weight,Price,Touchscreen,IPS_screen,PPI,CPU_Brand,HDD,SSD,OS
0,Apple,Ultrabook,8,Intel,1.37,71378.6832,0,1,272.209528,Intel Core i5,0,128,Mac
1,Apple,Ultrabook,8,Intel,1.34,47895.5232,0,0,153.117859,Intel Core i5,0,0,Mac
2,HP,Notebook,8,Intel,1.86,30636.0,0,0,174.057054,Intel Core i5,0,256,Others/No OS/Linux
3,Apple,Ultrabook,16,AMD,1.83,135195.336,0,1,264.476303,Intel Core i7,0,512,Mac
4,Apple,Ultrabook,8,Intel,1.37,96095.808,0,1,272.209528,Intel Core i5,0,256,Mac


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(1273, 13)

#### Dividing the data into independent and dependent features

In [5]:
# Target: the price column is skewed, so the models are trained on its
# natural logarithm rather than on the raw value.
y = np.log(df['Price'])

# Features: every column except the raw price.
x = df.drop(columns=['Price'])

#### splitting the data into train and test data

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state is fixed so the same
# split is produced on every run.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=2)

In [44]:
# Preview the training features (the target column has been removed).
x_train.head()

Unnamed: 0,Company,TypeName,Ram,Gpu,Weight,Touchscreen,IPS_screen,PPI,CPU_Brand,HDD,SSD,OS
1205,HP,Notebook,8,Intel,1.91,0,0,123.834341,Intel Core i5,0,256,windows
279,Lenovo,Notebook,8,Nvidia,2.8,0,0,156.953181,Intel Core i7,2000,0,Others/No OS/Linux
1204,Dell,Notebook,8,AMD,2.36,0,0,174.057054,Intel Core i5,0,256,windows
1123,Dell,Notebook,4,Intel,2.09,0,0,123.834341,Intel Core i5,500,0,Others/No OS/Linux
391,Asus,Gaming,16,Nvidia,3.0,0,0,156.953181,Intel Core i7,1000,256,windows


In [46]:
# Look up the names of the columns at positions 2, 4, 7, 9 and 10 to confirm
# that they are the numeric features.
x_train.iloc[:,[2,4,7,9,10]].columns

Index(['Ram', 'Weight', 'PPI', 'HDD', 'SSD'], dtype='object')

#### Categorical features are one-hot encoded and numeric features are standardised, both through a single ColumnTransformer

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [10]:
# Show the training features again so the column order is visible while the
# transformer is being defined.
x_train.head()

Unnamed: 0,Company,TypeName,Ram,Gpu,Weight,Touchscreen,IPS_screen,PPI,CPU_Brand,HDD,SSD,OS
1205,HP,Notebook,8,Intel,1.91,0,0,123.834341,Intel Core i5,0,256,windows
279,Lenovo,Notebook,8,Nvidia,2.8,0,0,156.953181,Intel Core i7,2000,0,Others/No OS/Linux
1204,Dell,Notebook,8,AMD,2.36,0,0,174.057054,Intel Core i5,0,256,windows
1123,Dell,Notebook,4,Intel,2.09,0,0,123.834341,Intel Core i5,500,0,Others/No OS/Linux
391,Asus,Gaming,16,Nvidia,3.0,0,0,156.953181,Intel Core i7,1000,256,windows


In [27]:
# Column groups are referenced by name instead of by integer position, so the
# transformer keeps selecting the right columns even if the layout of the
# input frame changes.
CATEGORICAL_COLS = ['Company', 'TypeName', 'Gpu', 'CPU_Brand', 'OS']
NUMERIC_COLS = ['Ram', 'Weight', 'PPI', 'HDD', 'SSD']

trf = ColumnTransformer(
    transformers=[
        # One-hot encode the categorical columns, dropping the first level of
        # each one. handle_unknown='ignore' makes a category that was not seen
        # during fit encode as all zeros instead of raising at transform time,
        # which matters because rare categories can end up only in the test split.
        ('OHE',
         OneHotEncoder(sparse_output=False, drop='first', dtype='int16',
                       handle_unknown='ignore'),
         CATEGORICAL_COLS),
        # Standardise the numeric columns.
        ('scaler', StandardScaler(), NUMERIC_COLS),
    ],
    # Columns not listed above are forwarded unchanged.
    remainder='passthrough',
)

# Return DataFrames (with generated column names) rather than NumPy arrays.
trf.set_output(transform='pandas')

In [28]:
# Fit the transformer on the training rows only, then apply the already-fitted
# transformer to the test rows so that no statistics are learned from them.
tranformed_x_train = trf.fit_transform(x_train)
tranformed_x_test = trf.transform(x_test)

In [29]:
# (rows, columns) of the training matrix after encoding and scaling.
tranformed_x_train.shape

(1018, 38)

#### Importing the Regression models

In [30]:
from sklearn.linear_model import LinearRegression,Lasso,LassoCV,Ridge,RidgeCV,ElasticNet,ElasticNetCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor,RandomForestRegressor,GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,root_mean_squared_error


### model selection


In [26]:
# Baseline: ordinary least-squares regression on the transformed features.
# fit() returns the estimator itself, so construction and fitting are chained.
Lr = LinearRegression().fit(tranformed_x_train, y_train)

# Score the baseline on the held-out rows.
y_pred = Lr.predict(tranformed_x_test)
print(f"r2_Score: {r2_score(y_test,y_pred)}")
print(f"MSE : {mean_squared_error(y_test,y_pred)}")

r2_Score: 0.7951766680058057
MSE : 0.07106382538173439


#### Evaluation function



In [31]:
def evaluate_func(test,pred):
    """Compute the standard regression metrics for one set of predictions.

    Parameters
    ----------
    test : true target values.
    pred : predicted target values, aligned with ``test``.

    Returns
    -------
    tuple
        ``(r2, mse, rmse, mae)`` in that order.
    """
    return (
        r2_score(test, pred),
        mean_squared_error(test, pred),
        root_mean_squared_error(test, pred),
        mean_absolute_error(test, pred),
    )

In [41]:
# Seed for the estimators that use randomness, so the scores below are the
# same on every run.
RANDOM_STATE = 2

# Candidate regressors, all with default hyper-parameters.
# NOTE(review): Lasso and ElasticNet are left at their default regularisation
# strength; if their scores come out near zero, the cross-validated variants
# (LassoCV / ElasticNetCV, already imported) may be worth trying.
models = {
    "LinearRegression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "ElasticNet": ElasticNet(),
    "SupportVectorRegressor": SVR(),
    "KNeighborsRegressor": KNeighborsRegressor(),
    "DecisionTreeRegressor": DecisionTreeRegressor(random_state=RANDOM_STATE),
    "RandomForestRegressor": RandomForestRegressor(random_state=RANDOM_STATE),
    "AdaBoostRegressor": AdaBoostRegressor(random_state=RANDOM_STATE),
    "XGBRegressor": XGBRegressor(),
}

# Parallel lists consumed by the leaderboard cell.
model_list = []
r2_list = []

# Iterate over (name, estimator) pairs directly instead of rebuilding the
# key/value lists on every iteration.
for model_name, model in models.items():
    model.fit(tranformed_x_train, y_train)
    y_pred = model.predict(tranformed_x_test)

    r2, mse, rmse, mae = evaluate_func(y_test, y_pred)

    print(f"Model Name : {model_name}")
    print(f"R2_score : {r2}")
    print(f"Mean Square Error : {mse}")
    print(f"Root Mean square error : {rmse}")
    print(f"Mean Absoulte error : {mae}")
    print("--"*10)

    model_list.append(model_name)
    r2_list.append(r2)




Model Name : LinearRegression
R2_score : 0.7965917731820482
Mean Square Error : 0.07057285208214888
Root Mean square error : 0.2656555139313861
Mean Absoulte error : 0.20718449265910796
--------------------
Model Name : Lasso
R2_score : -0.0035141493859984774
Mean Square Error : 0.34817104860928483
Root Mean square error : 0.5900602076138374
Mean Absoulte error : 0.47119642097632025
--------------------
Model Name : Ridge
R2_score : 0.802565225281838
Mean Square Error : 0.06850035207537403
Root Mean square error : 0.26172571917061194
Mean Absoulte error : 0.20530884937739963
--------------------
Model Name : ElasticNet
R2_score : -0.0035141493859984774
Mean Square Error : 0.34817104860928483
Root Mean square error : 0.5900602076138374
Mean Absoulte error : 0.47119642097632025
--------------------
Model Name : SupportVectorRegressor
R2_score : 0.8785050907531441
Mean Square Error : 0.04215287844127523
Root Mean square error : 0.20531166172742168
Mean Absoulte error : 0.1632554256679636


In [42]:
# Leaderboard: one row per model, best R2 score first.
result = (
    pd.DataFrame(
        list(zip(model_list, r2_list)),
        columns=['Model_Name', 'R2_score'],
    )
    .sort_values(by='R2_score', ascending=False)
)

In [43]:
# Display the model comparison table, sorted by R2 score in descending order.
result

Unnamed: 0,Model_Name,R2_score
4,SupportVectorRegressor,0.878505
9,XGBRegressor,0.861776
7,RandomForestRegressor,0.843798
5,KNeighborsRegressor,0.840188
8,AdaBoostRegressor,0.805649
2,Ridge,0.802565
0,LinearRegression,0.796592
6,DecisionTreeRegressor,0.731621
1,Lasso,-0.003514
3,ElasticNet,-0.003514


### conclusion

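- We will use the Support Vector Regressor (SVR) as our model, since it achieved the highest R² score on the test set (≈ 0.879) among all the models compared.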