In [1]:
%matplotlib inline

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

pd.set_option("precision", 2) # number precision for pandas
# pd.set_option("display.max_rows", 10)
pd.set_option("display.float_format", "{:20,.2f}".format) # display float values to two decimal places
plt.style.use("seaborn") # pretty matplotlib plots

# To display print all the interactive output, not only the last result.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# 1. Regression

## 1.1 Data Selection and Filtering

In [2]:
# load datasets

Laptop_df = pd.read_csv("../datasets/2022_March_LaptopData_India.csv")
Laptop_df.head(5)
# Laptop_df[Laptop_df['ram_gb']  == "NVIDIA"]

Unnamed: 0,brand,model,processor_brand,processor_name,processor_gnrtn,ram_gb,ram_type,ssd,hdd,os,...,display_size,warranty,Touchscreen,msoffice,latest_price,old_price,discount,star_rating,ratings,reviews
0,ASUS,Celeron,Intel,Celeron Dual,Missing,4,DDR4,0,1024,Windows,...,15.6,1,No,No,23990,26990,11,3.8,15279,1947
1,ASUS,VivoBook,Intel,Core i3,10th,8,DDR4,512,0,Windows,...,15.6,1,No,No,37990,50990,25,4.3,990,108
2,ASUS,Vivobook,Intel,Core i3,10th,8,DDR4,0,1024,Windows,...,14.1,1,No,No,32890,46990,30,3.9,28,4
3,HP,Core,Intel,Core i3,11th,8,DDR4,512,0,Windows,...,15.6,1,No,Yes,42990,57330,25,4.4,158,18
4,HP,Core,Intel,Core i5,11th,8,DDR4,512,0,Windows,...,15.6,0,No,No,54990,70171,21,4.2,116,15


> [reference data link](https://www.kaggle.com/datasets/kuchhbhi/2022-march-laptop-data?select=Cleaned_Laptop_data.csv)
### Each column means
Dataset has a total of 23 columns. But I didn't write columns that could be understood easily.
* brand: Name of Manufacturer company
* model: Model of laptop
* processor_gnrtn: Generation of processor
* ram_gb: RAM installed in laptop
* ram_type: type of RAM
* weight: thinNight, casual, heavy(gaming)
* warranty: (measure: year)
* msoffice: pre-installed or not
* latest_price: (measure: INR), INR is the Indian Rupee, which is The Republic of India's official currency.
* old_price: Actual price in INR
* discount: (measure: %)
* star_rating: out of 5 star

In this analysis, I'll predict the old_price(Actual price), not a latest_price.

In [3]:
# Remove several columns 

Laptop_df = Laptop_df.drop(["processor_gnrtn", "ram_type", "os", "os_bit", "warranty", 
                            "latest_price", "discount"], axis = 1, inplace = False)
Laptop_df.head(100)

Unnamed: 0,brand,model,processor_brand,processor_name,ram_gb,ssd,hdd,graphic_card_gb,weight,display_size,Touchscreen,msoffice,old_price,star_rating,ratings,reviews
0,ASUS,Celeron,Intel,Celeron Dual,4,0,1024,0,Casual,15.6,No,No,26990,3.80,15279,1947
1,ASUS,VivoBook,Intel,Core i3,8,512,0,0,Casual,15.6,No,No,50990,4.30,990,108
2,ASUS,Vivobook,Intel,Core i3,8,0,1024,0,Casual,14.1,No,No,46990,3.90,28,4
3,HP,Core,Intel,Core i3,8,512,0,0,ThinNlight,15.6,No,Yes,57330,4.40,158,18
4,HP,Core,Intel,Core i5,8,512,0,0,ThinNlight,15.6,No,No,70171,4.20,116,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,APPLE,2020,Apple,M1 Processor,8,512,0,0,Casual,13.3,No,No,142900,4.60,399,45
96,ASUS,VivoBook,Intel,Core i3,8,256,0,0,Casual,15.6,No,No,58990,4.50,816,147
97,MSI,GF63,Intel,Core i5,8,512,0,4,Casual,15.6,No,No,83990,4.50,1597,228
98,MSI,Katana,Intel,i7,,0,0,4,Casual,0,No,No,117990,4.50,218,27


In [4]:
# Exchange INR into dollars
# In 2022, April 22th, 1 INR = 0.013 dollars

# Exchange INR into dollars
Laptop_df["old_price"] = Laptop_df["old_price"] * 0.013

# Revise measure 1 dollar to 100 dollars because, 
# after doing Linear Regression, coefficient and intercept values are too high.
Laptop_df["old_price"] = Laptop_df["old_price"] / 100
Laptop_df["old_price"]

0                     3.51
1                     6.63
2                     6.11
3                     7.45
4                     9.12
              ...         
891                   7.68
892                   7.15
893                   0.00
894                   5.20
895                   5.72
Name: old_price, Length: 896, dtype: float64

In [5]:
# Rename columns 

Laptop_df = Laptop_df.rename(columns = {"ram_gb" : "RAM", "graphic_card_gb": "Graphic_card",
                                       "old_price" : "Actual_price(measure: $100)"}, 
                                         inplace = False)
Laptop_df.head(5)

Unnamed: 0,brand,model,processor_brand,processor_name,RAM,ssd,hdd,Graphic_card,weight,display_size,Touchscreen,msoffice,Actual_price,star_rating,ratings,reviews
0,ASUS,Celeron,Intel,Celeron Dual,4,0,1024,0,Casual,15.6,No,No,3.51,3.8,15279,1947
1,ASUS,VivoBook,Intel,Core i3,8,512,0,0,Casual,15.6,No,No,6.63,4.3,990,108
2,ASUS,Vivobook,Intel,Core i3,8,0,1024,0,Casual,14.1,No,No,6.11,3.9,28,4
3,HP,Core,Intel,Core i3,8,512,0,0,ThinNlight,15.6,No,Yes,7.45,4.4,158,18
4,HP,Core,Intel,Core i5,8,512,0,0,ThinNlight,15.6,No,No,9.12,4.2,116,15


In [6]:
# Reorder columns: Response(old_price) go to farthest to the right

Laptop_df.columns

Laptop_df = Laptop_df[['brand', 'model', 'processor_brand', 'processor_name', 'RAM', 'ssd',
       'hdd', 'Graphic_card', 'weight', 'display_size', 'Touchscreen',
       'msoffice', 'star_rating', 'ratings', 'reviews', 'Actual_price(measure: $100)' ]]

Laptop_df.head(3)

Index(['brand', 'model', 'processor_brand', 'processor_name', 'RAM', 'ssd',
       'hdd', 'Graphic_card', 'weight', 'display_size', 'Touchscreen',
       'msoffice', 'Actual_price', 'star_rating', 'ratings', 'reviews'],
      dtype='object')

Unnamed: 0,brand,model,processor_brand,processor_name,RAM,ssd,hdd,Graphic_card,weight,display_size,Touchscreen,msoffice,star_rating,ratings,reviews,Actual_price
0,ASUS,Celeron,Intel,Celeron Dual,4,0,1024,0,Casual,15.6,No,No,3.8,15279,1947,3.51
1,ASUS,VivoBook,Intel,Core i3,8,512,0,0,Casual,15.6,No,No,4.3,990,108,6.63
2,ASUS,Vivobook,Intel,Core i3,8,0,1024,0,Casual,14.1,No,No,3.9,28,4,6.11


In [7]:
Laptop_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 896 entries, 0 to 895
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   brand            896 non-null    object 
 1   model            896 non-null    object 
 2   processor_brand  896 non-null    object 
 3   processor_name   895 non-null    object 
 4   RAM              871 non-null    object 
 5   ssd              896 non-null    int64  
 6   hdd              896 non-null    int64  
 7   Graphic_card     896 non-null    int64  
 8   weight           896 non-null    object 
 9   display_size     896 non-null    object 
 10  Touchscreen      896 non-null    object 
 11  msoffice         896 non-null    object 
 12  star_rating      896 non-null    float64
 13  ratings          896 non-null    int64  
 14  reviews          896 non-null    int64  
 15  Actual_price     896 non-null    float64
dtypes: float64(2), int64(5), object(9)
memory usage: 112.1+ KB


In [8]:
Laptop_df.index[444]

# Remove data that is misplaced 
misplaced_data = np.array([444, 543, 684, 693, 712, 720, 771, 822, 848, 884]) 
misplaced_data -= 2
Laptop_df = Laptop_df.drop(index = misplaced_data)

# Remove NaN values
Laptop_df = Laptop_df.dropna(axis = 0, inplace = False)


# Change column data type
Laptop_df["RAM"] = Laptop_df["RAM"].astype(float)
Laptop_df["display_size"] = Laptop_df["display_size"].astype(float)

print("Laptop_df.describe() : \n")
Laptop_df.describe()

444

Laptop_df.describe() : 



Unnamed: 0,RAM,ssd,hdd,Graphic_card,display_size,star_rating,ratings,reviews,Actual_price
count,861.0,861.0,861.0,861.0,861.0,861.0,861.0,861.0,861.0
mean,10.14,449.15,198.62,1.12,14.25,2.95,356.6,44.17,11.38
std,4.85,310.53,407.37,2.0,2.88,1.97,1103.71,133.88,7.08
min,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,8.0,256.0,0.0,0.0,14.0,0.0,0.0,0.0,7.15
50%,8.0,512.0,0.0,0.0,14.96,4.1,18.0,3.0,10.05
75%,16.0,512.0,0.0,2.0,15.6,4.4,165.0,21.0,14.22
max,32.0,3072.0,2048.0,8.0,17.3,5.0,15279.0,1947.0,49.11


## 1.2 Simple Linear Regression

In [9]:
from sklearn import linear_model

# Ordinary Least Squares(OLS) model with intercept
# What is OLS? a type of linear least squares method for estimating the unknown parameters in a linear regression model(Wikipedia)

OLS_sklearn = linear_model.LinearRegression(fit_intercept = True)

Real_Number_Predictors = ['RAM', 'ssd', 'hdd', 'Graphic_card', 
                          'display_size', 'star_rating', 'ratings', 'reviews']

# fitted OLS model (.values.reshape(-1, 1) is required for single predictor) 
for i in range(len(Real_Number_Predictors)):
    X_train = Laptop_df[Real_Number_Predictors[i]].values.reshape(-1, 1)
    y_true = Laptop_df["Actual_price(measure: $100)"]
    OLS_sklearn.fit(X_train, y_true)
    
    print("Predictor", Real_Number_Predictors[i], "intercept: ", OLS_sklearn.intercept_)
    print("Predictor", Real_Number_Predictors[i], "coefficient: ", OLS_sklearn.coef_)



LinearRegression()

Predictor RAM intercept:  1.7042469826266036
Predictor RAM coefficient:  [0.95403049]


LinearRegression()

Predictor ssd intercept:  5.8091024059221175
Predictor ssd coefficient:  [0.0123991]


LinearRegression()

Predictor hdd intercept:  12.271932002909814
Predictor hdd coefficient:  [-0.0044999]


LinearRegression()

Predictor Graphic_card intercept:  9.475386080725809
Predictor Graphic_card coefficient:  [1.69948948]


LinearRegression()

Predictor display_size intercept:  10.528384281793358
Predictor display_size coefficient:  [0.05964319]


LinearRegression()

Predictor star_rating intercept:  12.264650562863885
Predictor star_rating coefficient:  [-0.300031]


LinearRegression()

Predictor ratings intercept:  11.70938259657741
Predictor ratings coefficient:  [-0.00092876]


LinearRegression()

Predictor reviews intercept:  11.735964562687114
Predictor reviews coefficient:  [-0.00810019]


In [10]:
from sklearn.metrics import mean_squared_error, explained_variance_score, r2_score
# accuracy_score is used in classification

Real_Number_Predictors = ['RAM', 'ssd', 'hdd', 'Graphic_card', 
                          'display_size', 'star_rating', 'ratings', 'reviews']

for i in Real_Number_Predictors:
    y_pred = OLS_sklearn.predict(Laptop_df[i].values.reshape(-1, 1))
    y_true = Laptop_df["Actual_price(measure: $100)"]
    
    OLS_sklearn_summary = {"MSE": mean_squared_error(y_true, y_pred),
                          "Ex. Var": explained_variance_score(y_true, y_pred),
                          "R2": r2_score(y_true, y_pred)}
    
    print('Predictor is "%s"' % i)
    for k, v in OLS_sklearn_summary.items():
        print(k, ':', v)
    
    print() # make it easier to distinguish

Predictor is "RAM"
MSE : 15.018139029964932
Ex. Var : -0.003576160401595363
R2 : -0.818466395876438

Predictor is "ssd"
MSE : 55.95411007702001
Ex. Var : -1.1950276994106126
R2 : -5.775184906945801

Predictor is "hdd"
MSE : 39.218944696135885
Ex. Var : -1.69257890044682
R2 : -3.7488129434253947

Predictor is "Graphic_card"
MSE : 14.623185809043823
Ex. Var : -0.0010105689891684033
R2 : -0.7706436157866285

Predictor is "display_size"
MSE : 15.296693845785741
Ex. Var : -0.016265992694920905
R2 : -0.8521951135936607

Predictor is "star_rating"
MSE : 14.690430974421803
Ex. Var : -5.135595242866664e-05
R2 : -0.7787859743891841

Predictor is "ratings"
MSE : 110.81401332774774
Ex. Var : -8.886222304120084
R2 : -12.417878142334892

Predictor is "reviews"
MSE : 16.8709719549788
Ex. Var : -0.04555486547213761
R2 : -1.042816057614723



In [11]:
# using statsmodels
import statsmodels.api as sm

# predictor & depentent var
X_train = Laptop_df["display_size"]
y_true = Laptop_df["Actual_price(measure: $100)"]

# ols model with intercept added to predictor
OLS_sm = sm.OLS(y_true, sm.add_constant(X_train))

# fitted model and summary
OLS_sm_results = OLS_sm.fit()
OLS_sm_results.summary()

0,1,2,3
Dep. Variable:,Actual_price,R-squared:,0.001
Model:,OLS,Adj. R-squared:,-0.001
Method:,Least Squares,F-statistic:,0.5048
Date:,"Mon, 25 Apr 2022",Prob (F-statistic):,0.478
Time:,16:02:05,Log-Likelihood:,-2905.8
No. Observations:,861,AIC:,5816.0
Df Residuals:,859,BIC:,5825.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,10.5284,1.220,8.629,0.000,8.134,12.923
display_size,0.0596,0.084,0.711,0.478,-0.105,0.224

0,1,2,3
Omnibus:,316.702,Durbin-Watson:,1.956
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1333.97
Skew:,1.687,Prob(JB):,2.15e-290
Kurtosis:,8.079,Cond. No.,73.8
