In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error, r2_score

In [2]:
df = pd.read_csv('data/mobile_phone_price_regression.csv', index_col=0)
df

Unnamed: 0,Name,Rating,Spec_score,No_of_sim,Ram,Battery,Display,Camera,External_Memory,Android_version,Price,company,Inbuilt_memory,fast_charging,Screen_resolution,Processor,Processor_name
0,Samsung Galaxy F14 5G,4.65,68,"Dual Sim, 3G, 4G, 5G, VoLTE,",4 GB RAM,6000 mAh Battery,6.6 inches,50 MP + 2 MP Dual Rear &amp; 13 MP Front Camera,"Memory Card Supported, upto 1 TB",13,9999,Samsung,128 GB inbuilt,25W Fast Charging,2408 x 1080 px Display with Water Drop Notch,Octa Core Processor,Exynos 1330
1,Samsung Galaxy A11,4.20,63,"Dual Sim, 3G, 4G, VoLTE,",2 GB RAM,4000 mAh Battery,6.4 inches,13 MP + 5 MP + 2 MP Triple Rear &amp; 8 MP Fro...,"Memory Card Supported, upto 512 GB",10,9990,Samsung,32 GB inbuilt,15W Fast Charging,720 x 1560 px Display with Punch Hole,1.8 GHz Processor,Octa Core
2,Samsung Galaxy A13,4.30,75,"Dual Sim, 3G, 4G, VoLTE,",4 GB RAM,5000 mAh Battery,6.6 inches,50 MP Quad Rear &amp; 8 MP Front Camera,"Memory Card Supported, upto 1 TB",12,11999,Samsung,64 GB inbuilt,25W Fast Charging,1080 x 2408 px Display with Water Drop Notch,2 GHz Processor,Octa Core
3,Samsung Galaxy F23,4.10,73,"Dual Sim, 3G, 4G, VoLTE,",4 GB RAM,6000 mAh Battery,6.4 inches,48 MP Quad Rear &amp; 13 MP Front Camera,"Memory Card Supported, upto 1 TB",12,11999,Samsung,64 GB inbuilt,,720 x 1600 px,Octa Core,Helio G88
4,Samsung Galaxy A03s (4GB RAM + 64GB),4.10,69,"Dual Sim, 3G, 4G, VoLTE,",4 GB RAM,5000 mAh Battery,6.5 inches,13 MP + 2 MP + 2 MP Triple Rear &amp; 5 MP Fro...,"Memory Card Supported, upto 1 TB",11,11999,Samsung,64 GB inbuilt,15W Fast Charging,720 x 1600 px Display with Water Drop Notch,Octa Core,Helio P35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1365,TCL 40R,4.05,75,"Dual Sim, 3G, 4G, 5G, VoLTE,",4 GB RAM,5000 mAh Battery,6.6 inches,50 MP + 2 MP + 2 MP Triple Rear &amp; 8 MP Fro...,Memory Card (Hybrid),12,18999,TCL,64 GB inbuilt,15W Fast Charging,720 x 1612 px,Octa Core,Dimensity 700 5G
1366,TCL 50 XL NxtPaper 5G,4.10,80,"Dual Sim, 3G, 4G, VoLTE,",8 GB RAM,5000 mAh Battery,6.8 inches,50 MP + 2 MP Dual Rear &amp; 16 MP Front Camera,Memory Card (Hybrid),14,24990,TCL,128 GB inbuilt,33W Fast Charging,1200 x 2400 px,Octa Core,Dimensity 7050
1367,TCL 50 XE NxtPaper 5G,4.00,80,"Dual Sim, 3G, 4G, 5G, VoLTE,",6 GB RAM,5000 mAh Battery,6.6 inches,50 MP + 2 MP Dual Rear &amp; 16 MP Front Camera,"Memory Card Supported, upto 1 TB",13,23990,TCL,256 GB inbuilt,18W Fast Charging,720 x 1612 px,Octa Core,Dimensity 6080
1368,TCL 40 NxtPaper 5G,4.50,79,"Dual Sim, 3G, 4G, 5G, VoLTE,",6 GB RAM,5000 mAh Battery,6.6 inches,50 MP + 2 MP + 2 MP Triple Rear &amp; 8 MP Fro...,"Memory Card Supported, upto 1 TB",13,22499,TCL,256 GB inbuilt,15W Fast Charging,720 x 1612 px,Octa Core,Dimensity 6020


In [3]:
df = df[['Rating', 'Spec_score', 'Ram', 'Battery', 'Display', 'Android_version', 'Price', 'Inbuilt_memory']]
df

Unnamed: 0,Rating,Spec_score,Ram,Battery,Display,Android_version,Price,Inbuilt_memory
0,4.65,68,4 GB RAM,6000 mAh Battery,6.6 inches,13,9999,128 GB inbuilt
1,4.20,63,2 GB RAM,4000 mAh Battery,6.4 inches,10,9990,32 GB inbuilt
2,4.30,75,4 GB RAM,5000 mAh Battery,6.6 inches,12,11999,64 GB inbuilt
3,4.10,73,4 GB RAM,6000 mAh Battery,6.4 inches,12,11999,64 GB inbuilt
4,4.10,69,4 GB RAM,5000 mAh Battery,6.5 inches,11,11999,64 GB inbuilt
...,...,...,...,...,...,...,...,...
1365,4.05,75,4 GB RAM,5000 mAh Battery,6.6 inches,12,18999,64 GB inbuilt
1366,4.10,80,8 GB RAM,5000 mAh Battery,6.8 inches,14,24990,128 GB inbuilt
1367,4.00,80,6 GB RAM,5000 mAh Battery,6.6 inches,13,23990,256 GB inbuilt
1368,4.50,79,6 GB RAM,5000 mAh Battery,6.6 inches,13,22499,256 GB inbuilt


In [4]:
def process_data(df):
    def process_rating(v):
        return float(v)

    def process_spec_score(v):
        return int(v)

    def process_ram(v):
        try: return float(v.upper().split(' GB')[0])
        except: return 0

    def process_battery(v):
        try: return float(v.upper().split(' MAH')[0])
        except: return 0

    def process_display(v):
        try: return float(v.split(' ')[0])
        except: return 0

    def process_android_version(v):
        try: return float(v.split(' ')[0].split('.')[0])
        except: return 0

    def process_price(v):
        try: return int(v.replace(',', ''))
        except: return 0

    def process_inbuilt_memory(v):
        try: return float(v.upper().split(' GB')[0].split(' TB')[0])
        except: return 0


    df.loc[:, 'Rating'] = df['Rating'].apply(process_rating)
    df.loc[:, 'Spec_score'] = df['Spec_score'].apply(process_spec_score)
    df.loc[:, 'Ram'] = df['Ram'].apply(process_ram)
    df.loc[:, 'Battery'] = df['Battery'].apply(process_battery)
    df.loc[:, 'Display'] = df['Display'].apply(process_display)
    df.loc[:, 'Android_version'] = df['Android_version'].apply(process_android_version)
    df.loc[:, 'Price'] = df['Price'].apply(process_price)
    df.loc[:, 'Inbuilt_memory'] = df['Inbuilt_memory'].apply(process_inbuilt_memory)
    return df

In [5]:
processed_df = process_data(df)
processed_df

Unnamed: 0,Rating,Spec_score,Ram,Battery,Display,Android_version,Price,Inbuilt_memory
0,4.65,68,4.0,6000.0,6.6,13.0,9999,128.0
1,4.20,63,2.0,4000.0,6.4,10.0,9990,32.0
2,4.30,75,4.0,5000.0,6.6,12.0,11999,64.0
3,4.10,73,4.0,6000.0,6.4,12.0,11999,64.0
4,4.10,69,4.0,5000.0,6.5,11.0,11999,64.0
...,...,...,...,...,...,...,...,...
1365,4.05,75,4.0,5000.0,6.6,12.0,18999,64.0
1366,4.10,80,8.0,5000.0,6.8,14.0,24990,128.0
1367,4.00,80,6.0,5000.0,6.6,13.0,23990,256.0
1368,4.50,79,6.0,5000.0,6.6,13.0,22499,256.0


In [6]:
X = np.array(df[['Rating', 'Spec_score', 'Ram', 'Battery', 'Display', 'Android_version', 'Inbuilt_memory']])
X.shape

(1370, 7)

In [7]:
y = np.array(df[['Price']])
y.shape

(1370, 1)

In [8]:
linear_regression_model = LinearRegression()
linear_regression_model

In [9]:
linear_regression_model.fit(X, y)

In [10]:
coef_result = linear_regression_model.coef_
coef_result

array([[-3570.83828676,  1651.62415927,   110.29424838,    -8.63131149,
          -10.14175835,  -318.18731867,    77.73127691]])

In [11]:
intercept_result = linear_regression_model.intercept_
intercept_result

array([-52519.96910331])

In [12]:
y_pred = linear_regression_model.predict(X)
y_pred

array([[-2414.38357008],
       [ 1470.79507494],
       [14371.47603404],
       ...,
       [38527.65466074],
       [35090.61135809],
       [61757.53195364]])

In [13]:
root_mean_squared_error(y, y_pred)

21539.38198345468

In [14]:
r2_score(y, y_pred)

0.49933170907427327