# Data preprocessing and exploration

## Importing libraries and dataset

In [148]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [149]:
df = pd.read_csv('Cellphone.csv')
df.head()

Unnamed: 0,Product_id,Sale,Weight,Resolution,Ppi,Cpu_core,Cpu_freq,Storage,Ram,RearCam,FrontCam,Battery,Thickness,Price
0,203,10,135.0,5.2,424,8,1.35,16.0,3.0,13.0,8.0,2610,7.4,2357
1,880,10,125.0,4.0,233,2,1.3,4.0,1.0,3.15,0.0,1700,9.9,1749
2,40,10,110.0,4.7,312,4,1.2,8.0,1.5,13.0,5.0,2000,7.6,1916
3,99,11,118.5,4.0,233,2,1.3,4.0,0.512,3.15,0.0,1400,11.0,1315
4,880,11,125.0,4.0,233,2,1.3,4.0,1.0,3.15,0.0,1700,9.9,1749


In [150]:
df.shape

(161, 14)

In [151]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161 entries, 0 to 160
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Product_id  161 non-null    int64  
 1   Sale        161 non-null    int64  
 2   Weight      161 non-null    float64
 3   Resolution  161 non-null    float64
 4   Ppi         161 non-null    int64  
 5   Cpu_core    161 non-null    int64  
 6   Cpu_freq    161 non-null    float64
 7   Storage     161 non-null    float64
 8   Ram         161 non-null    float64
 9   RearCam     161 non-null    float64
 10  FrontCam    161 non-null    float64
 11  Battery     161 non-null    int64  
 12  Thickness   161 non-null    float64
 13  Price       161 non-null    int64  
dtypes: float64(8), int64(6)
memory usage: 17.7 KB


## Statistical description

In [152]:
df.describe()

Unnamed: 0,Product_id,Sale,Weight,Resolution,Ppi,Cpu_core,Cpu_freq,Storage,Ram,RearCam,FrontCam,Battery,Thickness,Price
count,161.0,161.0,161.0,161.0,161.0,161.0,161.0,161.0,161.0,161.0,161.0,161.0,161.0,161.0
mean,675.559006,621.465839,170.426087,5.209938,335.055901,4.857143,1.502832,24.501714,2.204994,10.378261,4.503106,2842.111801,8.921739,2215.596273
std,410.851583,1546.618517,92.888612,1.509953,134.826659,2.444016,0.599783,28.804773,1.609831,6.181585,4.342053,1366.990838,2.192564,768.187171
min,10.0,10.0,66.0,1.4,121.0,0.0,0.0,0.0,0.0,0.0,0.0,800.0,5.1,614.0
25%,237.0,37.0,134.1,4.8,233.0,4.0,1.2,8.0,1.0,5.0,0.0,2040.0,7.6,1734.0
50%,774.0,106.0,153.0,5.15,294.0,4.0,1.4,16.0,2.0,12.0,5.0,2800.0,8.4,2258.0
75%,1026.0,382.0,170.0,5.5,428.0,8.0,1.875,32.0,3.0,16.0,8.0,3240.0,9.8,2744.0
max,1339.0,9807.0,753.0,12.2,806.0,8.0,2.7,128.0,6.0,23.0,20.0,9500.0,18.5,4361.0


In [153]:
df['Cpu_core'].value_counts()

4    81
8    52
2    14
0    10
6     2
1     2
Name: Cpu_core, dtype: int64

# Checking for missing values

In [155]:
df.isnull().sum()

Product_id    0
Sale          0
Weight        0
Resolution    0
Ppi           0
Cpu_core      0
Cpu_freq      0
Storage       0
Ram           0
RearCam       0
FrontCam      0
Battery       0
Thickness     0
Price         0
dtype: int64

# Checking for duplicated values

In [178]:
df[df.duplicated()]

Unnamed: 0,Product_id,Sale,Weight,Resolution,Ppi,Cpu_core,Cpu_freq,Storage,Ram,RearCam,FrontCam,Battery,Thickness,Price


# Data Preprocessing

In [157]:
X = df.iloc[:,0:12].values
y = df.iloc[:,13].values

In [158]:
X.shape

(161, 12)

In [159]:
y.shape

(161,)

## Splitting the dataset into the Training set and Test set

In [160]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [161]:
print ('X_train Shape=', X_train.shape) # taille de la partie x_train
print ('X_test Shape=', X_test.shape) # taille de la partie x_test
print ('y_train Shape=', y_train.shape) # taille de la partie y_train
print ('y_test Shape=', y_test.shape) # taille de la partie y_test

X_train Shape= (128, 12)
X_test Shape= (33, 12)
y_train Shape= (128,)
y_test Shape= (33,)


## Fitting Multiple linear regression to the training set

## Linear Regression

In [162]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train,y_train)

LinearRegression()

In [163]:
print(f'b = {regressor.intercept_}')
print(f'a = {regressor.coef_}')

b = 463.69915663219626
a = [ 3.80907194e-02 -2.09032831e-02 -3.01197740e+00  1.07609759e+02
  1.10770257e+00  6.11245444e+01  1.25355656e+02  7.27204566e+00
  8.60437946e+01  3.17984526e+00  6.13234319e+00  1.46060502e-01]


In [164]:
y_pred = regressor.predict(X_test)

In [165]:
for i in range(10):
    print([y_test[i],y_pred[i]])

[2975, 2911.6313283592763]
[754, 792.8401534593438]
[1921, 1756.934707522062]
[2137, 2370.0891538708547]
[2859, 3043.9394068705274]
[2744, 2490.459926680929]
[2001, 2029.876922928734]
[2746, 2983.9865751887546]
[1396, 1557.8987336170976]
[791, 801.4444996864945]


## Mean Squared Error, Mean Absolute Error, RMSE and R squared

In [166]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [167]:
def rmse(targets, predictions): 
    return np.sqrt(((predictions - targets) ** 2).mean())

In [168]:
print(f'MSE = {mean_squared_error(y_test, y_pred)}')
print(f'RMSE = {rmse(y_test, y_pred)}')
print(f'MAE = {mean_absolute_error(y_test, y_pred)}')
print(f'R2 = {r2_score(y_test,y_pred)}')

MSE = 24985.228255639646
RMSE = 158.0671637489572
MAE = 123.65270270176529
R2 = 0.9467233823043943


## K-NN

In [169]:
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor()
knn.fit(X_train,y_train)

KNeighborsRegressor()

In [170]:
y_pred_knn = knn.predict(X_test)

In [171]:
for i in range(10):
    print([y_test[i],y_pred_knn[i]])

[2975, 2692.6]
[754, 1317.6]
[1921, 2460.4]
[2137, 2008.2]
[2859, 3461.4]
[2744, 2371.2]
[2001, 3350.6]
[2746, 2515.6]
[1396, 2434.6]
[791, 773.4]


In [172]:
print(f'MSE = {mean_squared_error(y_test, y_pred_knn)}')
print(f'RMSE = {rmse(y_test, y_pred_knn)}')
print(f'MAE = {mean_absolute_error(y_test, y_pred_knn)}')
print(f'R2 = {r2_score(y_test,y_pred_knn)}')

MSE = 282649.7345454545
RMSE = 531.648130388375
MAE = 420.7333333333333
R2 = 0.3972990082352532


## Decision Tree

In [173]:
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
dt.fit(X_train,y_train)

DecisionTreeRegressor()

In [174]:
y_pred_dt = dt.predict(X_test)

In [175]:
for i in range(10):
    print([y_test[i],y_pred_dt[i]])

[2975, 2975.0]
[754, 754.0]
[1921, 1921.0]
[2137, 2137.0]
[2859, 2859.0]
[2744, 2744.0]
[2001, 2508.0]
[2746, 2858.0]
[1396, 1396.0]
[791, 791.0]


In [176]:
print(f'MSE = {mean_squared_error(y_test, y_pred_dt)}')
print(f'RMSE = {rmse(y_test, y_pred_dt)}')
print(f'MAE = {mean_absolute_error(y_test, y_pred_dt)}')
print(f'R2 = {r2_score(y_test,y_pred_dt)}')

MSE = 39333.15151515151
RMSE = 198.32587202670132
MAE = 81.87878787878788
R2 = 0.9161289520914001


## Ridge Regularization

In [179]:
from sklearn.linear_model import Ridge
ridge = Ridge()
ridge.fit(X_train, y_train)

Ridge()

In [180]:
print(ridge.intercept_)
print(ridge.coef_)

465.85570594354726
[ 3.72022963e-02 -2.12207539e-02 -2.99278779e+00  1.06455895e+02
  1.13185597e+00  6.11613092e+01  1.19642383e+02  7.30801956e+00
  8.42788863e+01  3.37267996e+00  6.23360827e+00  1.46852843e-01]


In [181]:
y_pred_r = ridge.predict(X_test)

In [182]:
for i in range(10):
    print([y_test[i],y_pred_r[i]])

[2975, 2916.3580795542766]
[754, 798.2678500135455]
[1921, 1756.057287224193]
[2137, 2367.5465627694093]
[2859, 3044.593852205053]
[2744, 2496.246562504839]
[2001, 2028.7953466641732]
[2746, 2983.094914838078]
[1396, 1556.1591885857197]
[791, 804.5868308362009]


In [183]:
print(f'MSE = {mean_squared_error(y_test, y_pred_r)}')
print(f'RMSE = {rmse(y_test, y_pred_r)}')
print(f'MAE = {mean_absolute_error(y_test, y_pred_r)}')
print(f'R2 = {r2_score(y_test,y_pred_r)}')

MSE = 24886.03506885305
RMSE = 157.75308259699094
MAE = 123.62364559315321
R2 = 0.9469348943801044
