# Machine learning - Assignment 2
#### Laptop price prediction with multiple linear regression

#### Importing libraries

In [23]:
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt

#### Data importing

In [25]:
# Load the laptop specification/price table from the local datasets folder.
data_df = pd.read_csv('./datasets/laptop-price-specs.csv')
# Only the last expression of a cell is rendered, so this preview is not shown here.
data_df.head()
# Non-null count per column.
data_df.count()

Company        1302
TypeName       1302
Ram            1302
OpSys          1302
Weight         1302
TouchScreen    1302
IPS            1302
PPI            1302
CPU_name       1302
HDD            1302
SSD            1302
Gpu brand      1302
Price          1302
dtype: int64

#### Visualizing the price distribution and categorical features

In [None]:
# Plot the distribution of the actual laptop prices.
sn.histplot(data=data_df, x='Price', color='green')

In [None]:
# Number of laptops per laptop type.
# The column is passed by keyword: a positional Series is taken as `data`
# (not `x`) by newer seaborn releases, which no longer yields one bar per type.
sn.countplot(data=data_df, x='TypeName')
# The type names are long, so rotate the tick labels to keep them readable.
plt.xticks(rotation = 'vertical')
plt.show()

In [None]:
# Number of laptops per GPU manufacturer.
# The column is passed by keyword: a positional Series is taken as `data`
# (not `x`) by newer seaborn releases, which no longer yields one bar per brand.
sn.countplot(data=data_df, x='Gpu brand')
plt.show()


In [None]:
# Extracting CPU manufacture details from the dataset.
# Split CPU_name once at its first space: column 0 holds the leading word,
# column 1 the remainder of the name.
cpu_manufacture = data_df['CPU_name'].str.split(pat=' ', n=1, expand=True)
# Keep the leading word as the manufacturer column.
data_df['CPU_manufacture'] = cpu_manufacture[0]

In [None]:
# Plotting the CPU manufacture details
# Count the rows per CPU manufacturer.
cpu_value = data_df.value_counts('CPU_manufacture')
# Sort the counts in ascending order and draw them as a pie chart.
cpu_value.sort_values().plot(kind='pie')

In [None]:
# Converting the label of notebook type into numeric identifier
# First list how many rows each TypeName value has.
notebook_type = data_df.value_counts("TypeName")
print(notebook_type)

# Method to create numeric values.
def type_convert_numeric(value):
    """Write an integer code for every laptop type into ``data_df['Type']``.

    The codes follow the order of ``type_labels`` below (Notebook -> 0,
    Gaming -> 1, ... Netbook -> 5). Rows whose ``TypeName`` matches none of
    the labels get 0.

    Args:
        value: Not used by the body; the whole column is rewritten on every
            call regardless of what is passed.

    Returns:
        None. The module-level ``data_df`` is modified in place.
    """
    type_labels = (
        "Notebook",
        "Gaming",
        "Ultrabook",
        "2 in 1 Convertible",
        "Workstation",
        "Netbook",
    )
    # One boolean mask per label; np.select picks the code of the first match.
    label_masks = [data_df['TypeName'] == label for label in type_labels]
    data_df['Type'] = np.select(
        condlist=label_masks,
        choicelist=list(range(len(type_labels))),
        default=0,
    )

# Laptop type labels handled by type_convert_numeric.
listTypes = ["Notebook", "Gaming", "Ultrabook", "2 in 1 Convertible", "Workstation", "Netbook"]
# type_convert_numeric ignores its argument and rewrites the whole 'Type'
# column on each call, so a single call does the same work the previous
# per-label loop repeated six times.
type_convert_numeric(listTypes)

data_df.head()

In [None]:
# Drop columns in Dataframe.
# NOTE(review): the result is not assigned back, so this cell only displays a
# copy without 'Type'; data_df itself still contains the column. Confirm
# whether the drop was meant to be kept.
data_df.drop(['Type'], axis=1)

In [None]:
# Check for NULL values before partitioning.
# Shows the number of missing entries in each column.
data_df.isnull().sum()


In [None]:
# Define X and Y values.
# Features: every column of data_df except the target. The text columns stay
# in the frame here (an earlier variant of this cell dropped them).
train = data_df.drop(columns=['Price'])
# Target: the natural logarithm of the price, not the raw price.
testing = np.log(data_df['Price'])

In [None]:
# Validating the X dataset
# `train` is the feature frame: all columns of data_df except 'Price'.
train

In [None]:
# Validating the Y dataset
# `testing` is the target series: np.log of the 'Price' column.
testing

In [None]:
# Split the data into a training part and a 30% hold-out test part.
# The fixed random_state makes the split repeatable.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    train,
    testing,
    test_size=0.3,
    random_state=1,
)

#### Prepare pipeline and mapper for the data.

In [None]:
# Position -> column name lookup for x_train, used to read the integer
# column indices given to the ColumnTransformer.
mapper = dict(enumerate(x_train.columns))
mapper

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn import metrics

# One-hot encode the columns at the listed positions of x_train and pass the
# remaining columns through unchanged.
# With the column order printed by data_df.count() plus the derived
# CPU_manufacture and Type columns, the positions correspond to Company,
# TypeName, OpSys, CPU_name, Gpu brand, CPU_manufacture and SSD; check them
# against `mapper` if the frame changes.
# Dense output is requested through sparse_threshold=0 on the transformer
# instead of OneHotEncoder(sparse=False): the `sparse` argument was renamed to
# `sparse_output` and later removed, so this spelling works on old and new
# scikit-learn releases alike.
# NOTE(review): the encoder keeps its default handle_unknown='error', so
# predict() fails if x_test holds a category that never occurs in x_train.
selected_columns = ColumnTransformer(
    transformers=[
        ('col_tnf', OneHotEncoder(drop='first'), [0, 1, 3, 8, 11, 12, 10]),
    ],
    remainder='passthrough',
    sparse_threshold=0,
)

# Create the model as another layer in the pipeline.
linear_regression_model = LinearRegression()

# Create the pipeline to connect the layers.
pipe = Pipeline([
    ('step1',selected_columns),
    ('step2',linear_regression_model)
])

pipe.fit(x_train,y_train)

y_pred = pipe.predict(x_test)

# The target is log(Price), so this R2 is measured on the log-price scale.
print('R2 score',metrics.r2_score(y_test,y_pred))

#### Define the machine learning model

In [None]:
# Creating a LinearRegression model from sklearn.
# x_train still contains text columns (Company, TypeName, ...), which a bare
# estimator cannot fit, so the regressor is chained behind a fresh copy of the
# one-hot encoding step (`selected_columns`) defined in the pipeline cell.
from sklearn.base import clone
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
model = Pipeline([
    ('encode', clone(selected_columns)),
    ('regressor', LinearRegression()),
])
model.fit(x_train, y_train)

In [None]:
# Creating a SVM regression model from sklearn.
# Fitting directly on x_train fails because it still contains text columns;
# they are one-hot encoded first with a fresh copy of `selected_columns`
# (defined in the pipeline cell), then everything is standardised for the SVR.
from sklearn.base import clone
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
model = make_pipeline(clone(selected_columns), StandardScaler(), SVR(C=1.0, epsilon=0.2))
model.fit(x_train, y_train)

In [None]:
# Creating an ElasticNet model from sklearn.
# x_train still contains text columns, so the estimator is chained behind a
# fresh copy of the one-hot encoding step (`selected_columns`) defined in the
# pipeline cell; fit() returns the fitted pipeline itself.
from sklearn.base import clone
from sklearn.linear_model import ElasticNet
from sklearn.pipeline import make_pipeline
model = make_pipeline(
    clone(selected_columns),
    ElasticNet(random_state=42, selection='cyclic'),
).fit(x_train, y_train)

In [None]:
# Creating a multi-layer perceptron (neural network) regressor from sklearn.
# x_train still contains text columns, so the estimator is chained behind a
# fresh copy of the one-hot encoding step (`selected_columns`) defined in the
# pipeline cell; fit() returns the fitted pipeline itself.
from sklearn.base import clone
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
model = make_pipeline(
    clone(selected_columns),
    MLPRegressor(random_state=2, max_iter=5000, solver="adam"),
).fit(x_train, y_train)

In [None]:
# Creating a DecisionTree regressor.
# x_train still contains text columns, so the tree is chained behind a fresh
# copy of the one-hot encoding step (`selected_columns`) defined in the
# pipeline cell.
from sklearn.base import clone
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor
model = make_pipeline(clone(selected_columns), DecisionTreeRegressor(max_depth=7))
model.fit(x_train, y_train)

In [None]:
# RandomForest model - Salitha
# x_train still contains text columns, so the forest is chained behind a fresh
# copy of the one-hot encoding step (`selected_columns`) defined in the
# pipeline cell. The fitted forest (and its oob_score_) is the pipeline's
# last step: model[-1].
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.pipeline import make_pipeline
model = make_pipeline(
    clone(selected_columns),
    RandomForestRegressor(random_state=42, max_depth=5,
                          n_estimators=100, oob_score=True),
)
model.fit(x_train, y_train)

In [None]:
# Predict results with the trained model
# `model` is whichever model cell was executed last. The target was built with
# np.log(Price), so the predictions are log-prices as well.
y_pred = model.predict(x_test)
print(y_pred)

In [None]:
# Currently, working dataframe.
# First rows of data_df, including the derived CPU_manufacture and Type columns.
data_df.head()

In [None]:
# Evaluate the accuracy of the model using r2_score
# y_test and y_pred are both log-prices, so the score is on the log scale.
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

#### Visualizing data - 2

In [None]:
# Evaluate the margin of error.
# The target was log-transformed before the split, so y_test and y_pred are
# log-prices. Both are mapped back with np.exp so that the columns below hold
# real prices, as their labels say, and the margin is in price units.
actual_price = np.exp(y_test)
predicted_price = np.exp(y_pred)
df_py = pd.DataFrame({
    'Actual Price': actual_price,
    'Predicted value': predicted_price,
    'Margin of error': actual_price - predicted_price,
})
# Show the first 20 test rows only.
df_py.head(20)