In [1]:
%matplotlib inline

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

pd.set_option("precision", 2) # number precision for pandas
# pd.set_option("display.max_rows", 10)
pd.set_option("display.float_format", "{:20,.2f}".format) # display float values to two decimal places
plt.style.use("seaborn") # pretty matplotlib plots

# To display print all the interactive output, not only the last result.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# 1. Regression

## 1.1 Data Selection and Filtering

In [2]:
# load datasets

Laptop_df = pd.read_csv("../datasets/2022_March_LaptopData_India.csv")
Laptop_df.head(5)
# Laptop_df[Laptop_df['ram_gb']  == "NVIDIA"]

Unnamed: 0,brand,model,processor_brand,processor_name,processor_gnrtn,ram_gb,ram_type,ssd,hdd,os,...,display_size,warranty,Touchscreen,msoffice,latest_price,old_price,discount,star_rating,ratings,reviews
0,ASUS,Celeron,Intel,Celeron Dual,Missing,4,DDR4,0,1024,Windows,...,15.6,1,No,No,23990,26990,11,3.8,15279,1947
1,ASUS,VivoBook,Intel,Core i3,10th,8,DDR4,512,0,Windows,...,15.6,1,No,No,37990,50990,25,4.3,990,108
2,ASUS,Vivobook,Intel,Core i3,10th,8,DDR4,0,1024,Windows,...,14.1,1,No,No,32890,46990,30,3.9,28,4
3,HP,Core,Intel,Core i3,11th,8,DDR4,512,0,Windows,...,15.6,1,No,Yes,42990,57330,25,4.4,158,18
4,HP,Core,Intel,Core i5,11th,8,DDR4,512,0,Windows,...,15.6,0,No,No,54990,70171,21,4.2,116,15


> [reference data link](https://www.kaggle.com/datasets/kuchhbhi/2022-march-laptop-data?select=Cleaned_Laptop_data.csv)
### Each column means
Dataset has a total of 23 columns. But I didn't write columns that could be understood easily.
* brand: Name of Manufacturer company
* model: Model of laptop
* processor_gnrtn: Generation of processor
* ram_gb: RAM installed in laptop
* ram_type: type of RAM
* weight: thinNight, casual, heavy(gaming)
* warranty: (measure: year)
* msoffice: pre-installed or not
* latest_price: (measure: INR), INR is the Indian Rupee, which is The Republic of India's official currency.
* old_price: Actual price in INR
* discount: (measure: %)
* star_rating: out of 5 star

In this analysis, I'll predict the old_price(Actual price), not a latest_price.

In [3]:
# Remove several columns 

Laptop_df = Laptop_df.drop(["processor_gnrtn", "ram_type", "os", "os_bit", "warranty", 
                            "latest_price", "discount"], axis = 1, inplace = False)
Laptop_df.head(100)

Unnamed: 0,brand,model,processor_brand,processor_name,ram_gb,ssd,hdd,graphic_card_gb,weight,display_size,Touchscreen,msoffice,old_price,star_rating,ratings,reviews
0,ASUS,Celeron,Intel,Celeron Dual,4,0,1024,0,Casual,15.6,No,No,26990,3.80,15279,1947
1,ASUS,VivoBook,Intel,Core i3,8,512,0,0,Casual,15.6,No,No,50990,4.30,990,108
2,ASUS,Vivobook,Intel,Core i3,8,0,1024,0,Casual,14.1,No,No,46990,3.90,28,4
3,HP,Core,Intel,Core i3,8,512,0,0,ThinNlight,15.6,No,Yes,57330,4.40,158,18
4,HP,Core,Intel,Core i5,8,512,0,0,ThinNlight,15.6,No,No,70171,4.20,116,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,APPLE,2020,Apple,M1 Processor,8,512,0,0,Casual,13.3,No,No,142900,4.60,399,45
96,ASUS,VivoBook,Intel,Core i3,8,256,0,0,Casual,15.6,No,No,58990,4.50,816,147
97,MSI,GF63,Intel,Core i5,8,512,0,4,Casual,15.6,No,No,83990,4.50,1597,228
98,MSI,Katana,Intel,i7,,0,0,4,Casual,0,No,No,117990,4.50,218,27


In [4]:
# Exchange INR into dollars
# In 2022, April 22th, 1 INR = 0.013 dollars

Laptop_df["old_price"] = Laptop_df["old_price"] * 0.013
Laptop_df["old_price"]

0                   350.87
1                   662.87
2                   610.87
3                   745.29
4                   912.22
              ...         
891                 768.31
892                 714.90
893                   0.00
894                 519.87
895                 571.99
Name: old_price, Length: 896, dtype: float64

In [5]:
# Rename columns 

Laptop_df = Laptop_df.rename(columns = {"ram_gb" : "RAM", "graphic_card_gb": "Graphic_card",
                                       "old_price" : "Actual_price"}, inplace = False)
Laptop_df.head(5)

Unnamed: 0,brand,model,processor_brand,processor_name,RAM,ssd,hdd,Graphic_card,weight,display_size,Touchscreen,msoffice,Actual_price,star_rating,ratings,reviews
0,ASUS,Celeron,Intel,Celeron Dual,4,0,1024,0,Casual,15.6,No,No,350.87,3.8,15279,1947
1,ASUS,VivoBook,Intel,Core i3,8,512,0,0,Casual,15.6,No,No,662.87,4.3,990,108
2,ASUS,Vivobook,Intel,Core i3,8,0,1024,0,Casual,14.1,No,No,610.87,3.9,28,4
3,HP,Core,Intel,Core i3,8,512,0,0,ThinNlight,15.6,No,Yes,745.29,4.4,158,18
4,HP,Core,Intel,Core i5,8,512,0,0,ThinNlight,15.6,No,No,912.22,4.2,116,15


In [6]:
# Reorder columns: Response(old_price) go to farthest to the right

Laptop_df.columns

Laptop_df = Laptop_df[['brand', 'model', 'processor_brand', 'processor_name', 'RAM', 'ssd',
       'hdd', 'Graphic_card', 'weight', 'display_size', 'Touchscreen',
       'msoffice', 'star_rating', 'ratings', 'reviews', 'Actual_price' ]]

Laptop_df.head(3)

Index(['brand', 'model', 'processor_brand', 'processor_name', 'RAM', 'ssd',
       'hdd', 'Graphic_card', 'weight', 'display_size', 'Touchscreen',
       'msoffice', 'Actual_price', 'star_rating', 'ratings', 'reviews'],
      dtype='object')

Unnamed: 0,brand,model,processor_brand,processor_name,RAM,ssd,hdd,Graphic_card,weight,display_size,Touchscreen,msoffice,star_rating,ratings,reviews,Actual_price
0,ASUS,Celeron,Intel,Celeron Dual,4,0,1024,0,Casual,15.6,No,No,3.8,15279,1947,350.87
1,ASUS,VivoBook,Intel,Core i3,8,512,0,0,Casual,15.6,No,No,4.3,990,108,662.87
2,ASUS,Vivobook,Intel,Core i3,8,0,1024,0,Casual,14.1,No,No,3.9,28,4,610.87


In [7]:
Laptop_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 896 entries, 0 to 895
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   brand            896 non-null    object 
 1   model            896 non-null    object 
 2   processor_brand  896 non-null    object 
 3   processor_name   895 non-null    object 
 4   RAM              871 non-null    object 
 5   ssd              896 non-null    int64  
 6   hdd              896 non-null    int64  
 7   Graphic_card     896 non-null    int64  
 8   weight           896 non-null    object 
 9   display_size     896 non-null    object 
 10  Touchscreen      896 non-null    object 
 11  msoffice         896 non-null    object 
 12  star_rating      896 non-null    float64
 13  ratings          896 non-null    int64  
 14  reviews          896 non-null    int64  
 15  Actual_price     896 non-null    float64
dtypes: float64(2), int64(5), object(9)
memory usage: 112.1+ KB


In [8]:
Laptop_df.index[444]

# Remove data that is misplaced 
misplaced_data = np.array([444, 543, 684, 693, 712, 720, 771, 822, 848, 884]) 
misplaced_data -= 2
Laptop_df = Laptop_df.drop(index = misplaced_data)

# Remove NaN values
Laptop_df = Laptop_df.dropna(axis = 0, inplace = False)


# Change column data type
Laptop_df["RAM"] = Laptop_df["RAM"].astype(float)
Laptop_df["display_size"] = Laptop_df["display_size"].astype(float)

print("Laptop_df.describe() : \n")
Laptop_df.describe()

444

Laptop_df.describe() : 



Unnamed: 0,RAM,ssd,hdd,Graphic_card,display_size,star_rating,ratings,reviews,Actual_price
count,861.0,861.0,861.0,861.0,861.0,861.0,861.0,861.0,861.0
mean,10.14,449.15,198.62,1.12,14.25,2.95,356.6,44.17,1137.82
std,4.85,310.53,407.37,2.0,2.88,1.97,1103.71,133.88,707.66
min,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,8.0,256.0,0.0,0.0,14.0,0.0,0.0,0.0,714.99
50%,8.0,512.0,0.0,0.0,14.96,4.1,18.0,3.0,1005.25
75%,16.0,512.0,0.0,2.0,15.6,4.4,165.0,21.0,1421.59
max,32.0,3072.0,2048.0,8.0,17.3,5.0,15279.0,1947.0,4911.37


## 1.2 Simple Linear Regression

In [9]:
from sklearn import linear_model

# Ordinary Least Squares(OLS) model with intercept
# What is OLS? a type of linear least squares method for estimating the unknown parameters in a linear regression model.(Wikipedia)

OLS_sklearn = linear_model.LinearRegression(fit_intercept = True)

Real_Number_Predictors = ['RAM', 'ssd', 'hdd', 'Graphic_card', 
                          'display_size', 'star_rating', 'ratings', 'reviews']

# fitted OLS model (.values.reshape(-1, 1) is required for single predictor) 
for i in range(len(Real_Number_Predictors)):
    X_train = Laptop_df[Real_Number_Predictors[i]].values.reshape(-1, 1)
    y_true = Laptop_df["Actual_price"]
    OLS_sklearn.fit(X_train, y_true)
    
    print("Predictor", Real_Number_Predictors[i], "intercept: ", OLS_sklearn.intercept_)
    print("Predictor", Real_Number_Predictors[i], "coefficient: ", OLS_sklearn.coef_)



LinearRegression()

Predictor RAM intercept:  170.42469826266029
Predictor RAM coefficient:  [95.40304925]


LinearRegression()

Predictor ssd intercept:  580.910240592212
Predictor ssd coefficient:  [1.23990952]


LinearRegression()

Predictor hdd intercept:  1227.1932002909816
Predictor hdd coefficient:  [-0.44998958]


LinearRegression()

Predictor Graphic_card intercept:  947.538608072581
Predictor Graphic_card coefficient:  [169.94894756]


LinearRegression()

Predictor display_size intercept:  1052.838428179336
Predictor display_size coefficient:  [5.96431893]


LinearRegression()

Predictor star_rating intercept:  1226.4650562863885
Predictor star_rating coefficient:  [-30.00310015]


LinearRegression()

Predictor ratings intercept:  1170.9382596577411
Predictor ratings coefficient:  [-0.09287613]


LinearRegression()

Predictor reviews intercept:  1173.5964562687116
Predictor reviews coefficient:  [-0.81001898]
