In [1]:
# importing important packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.svm import SVC
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.ensemble import RandomForestRegressor
from math import sqrt

In [2]:
# reading the dataset
dataset = pd.read_excel("Flipkart_Mobiles.xlsx")
 
# Printing first 5 records of the dataset
print(dataset.head(5))

  Brand Model  Memory  Storage  Selling Price  Original Price
0  OPPO   A53     4.0     64.0          11990           15990
1  OPPO   A53     4.0     64.0          11990           15990
2  OPPO   A53     6.0    128.0          13990           17990
3  OPPO   A53     6.0    128.0          13990           17990
4  OPPO   A53     4.0     64.0          11990           15990


In [3]:
print('Shape of dataset:', dataset.shape)
# Checking null values for training dataset
dataset.isnull().sum()

Shape of dataset: (2936, 6)


Brand              0
Model              0
Memory            38
Storage           39
Selling Price      0
Original Price     0
dtype: int64

In [4]:
#Remove records with null values
dataset = dataset.dropna()
dataset.isnull().sum()


Brand             0
Model             0
Memory            0
Storage           0
Selling Price     0
Original Price    0
dtype: int64

In [5]:
# Changing the Datatype
dataset['Storage'] = dataset['Original Price'].astype('int64')
dataset['Memory'] = dataset['Memory'].astype('int64')
dataset['Selling Price'] = dataset['Selling Price'].astype('int64')
dataset['Original Price'] = dataset['Original Price'].astype('int64')
dataset['Model'] = dataset['Model'].astype('string')
dataset['Brand'] = dataset['Brand'].astype('string')

dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2859 entries, 0 to 2935
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Brand           2859 non-null   string
 1   Model           2859 non-null   string
 2   Memory          2859 non-null   int64 
 3   Storage         2859 non-null   int64 
 4   Selling Price   2859 non-null   int64 
 5   Original Price  2859 non-null   int64 
dtypes: int64(4), string(2)
memory usage: 156.4 KB


In [6]:
# label_encoder object knows how to understand word labels and changes them into numerical values.
label_encoder = preprocessing.LabelEncoder()

# Encode labels in column 'Brand' and Model.
dataset['Brand']= label_encoder.fit_transform(dataset['Brand'])
dataset['Model']= label_encoder.fit_transform(dataset['Model'])

In [7]:
# Split the dataset into training and test set
features =['Brand' ,'Model', 'Memory','Storage','Original Price']
X = dataset.loc[:,features]     
Y = dataset.loc[:,'Selling Price']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.8, test_size=0.2, random_state=0)

In [8]:
# Using SVM Algorithm
model_SVR = svm.SVR()
model_SVR.fit(X_train,Y_train)
Y_pred = model_SVR.predict(X_test)
#performance metrics
print('mean absolute percentage error:',mean_absolute_percentage_error(Y_test, Y_pred)) 

mean absolute percentage error: 0.6829209325684032


In [9]:
#Using Random Forest Regressor 
model_RFR = RandomForestRegressor(n_estimators=10)
model_RFR.fit(X_train, Y_train)
Y_pred = model_RFR.predict(X_test)
#performance metric
print('mean absolute percentage error:',mean_absolute_percentage_error(Y_test, Y_pred))

mean absolute percentage error: 0.05626823041354862
