In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV
from statistics import mean
import pickle

In [2]:
# Load the training set
data = pd.read_csv('train.csv')

# Separate the label column (price_range) from the predictor columns
target = data['price_range']
features = data.drop(columns=['price_range'])

# Stratified 5-fold splitter used as the `cv` argument for cross-validation
SKfolds = StratifiedKFold(n_splits=5)

## Fields and their descriptions

|Columns|Description|
|-------|-----------|
|battery_power|Total energy a battery can store in one time measured in mAh|
|blue|Has bluetooth or not|
|clock_speed|speed at which microprocessor executes instructions|
|dual_sim|has dual sim support or not|
|fc|front camera megapixels|
|four_g|has 4g or not|
|int_memory|Internal memory in gigabytes|
|m_dep|Mobile depth in cm|
|mobile_wt|Weight of mobile phone|
|n_cores|number of cores of processor|
|pc|primary camera mega pixels|
|px_height|Pixel resolution height|
|px_width|Pixel resolution width|
|wifi|has wifi or not|
|ram|ram in megabytes|
|sc_h|screen height in cm|
|sc_w|screen width in cm|
|talk_time|longest time that a single battery charge will last when you are talking on the phone|
|three_g|has 3g or not|
|touch_screen|is touch screen or not|
|price_range|range of price (target feature)|

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column
data.describe()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,1238.5185,0.495,1.52225,0.5095,4.3095,0.5215,32.0465,0.50175,140.249,4.5205,...,645.108,1251.5155,2124.213,12.3065,5.767,11.011,0.7615,0.503,0.507,1.5
std,439.418206,0.5001,0.816004,0.500035,4.341444,0.499662,18.145715,0.288416,35.399655,2.287837,...,443.780811,432.199447,1084.732044,4.213245,4.356398,5.463955,0.426273,0.500116,0.500076,1.118314
min,501.0,0.0,0.5,0.0,0.0,0.0,2.0,0.1,80.0,1.0,...,0.0,500.0,256.0,5.0,0.0,2.0,0.0,0.0,0.0,0.0
25%,851.75,0.0,0.7,0.0,1.0,0.0,16.0,0.2,109.0,3.0,...,282.75,874.75,1207.5,9.0,2.0,6.0,1.0,0.0,0.0,0.75
50%,1226.0,0.0,1.5,1.0,3.0,1.0,32.0,0.5,141.0,4.0,...,564.0,1247.0,2146.5,12.0,5.0,11.0,1.0,1.0,1.0,1.5
75%,1615.25,1.0,2.2,1.0,7.0,1.0,48.0,0.8,170.0,7.0,...,947.25,1633.0,3064.5,16.0,9.0,16.0,1.0,1.0,1.0,2.25
max,1998.0,1.0,3.0,1.0,19.0,1.0,64.0,1.0,200.0,8.0,...,1960.0,1998.0,3998.0,19.0,18.0,20.0,1.0,1.0,1.0,3.0


In [4]:
# Count missing values per feature column; every count is zero, so nothing
# needs imputing, and all columns are already in numerical format
features.isna().sum()

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
dtype: int64

In [5]:
# Classify each feature by its number of distinct values:
# fewer than 3 distinct values -> categorical, everything else -> numerical

cat_cols = features.columns[features.nunique() < 3].tolist()
num_cols = [name for name in features.columns if name not in cat_cols]
print(f'Categorical Features\n{cat_cols}\nNumerical Features\n{num_cols}')

Categorical Features
['blue', 'dual_sim', 'four_g', 'three_g', 'touch_screen', 'wifi']
Numerical Features
['battery_power', 'clock_speed', 'fc', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time']


## We don't require that much preprocessing

In [6]:
# Estimators, all with default hyperparameters.
# The tree-based models draw random numbers while fitting, so they are seeded
# to make the cross-validation scores reproducible across runs.
rf = RandomForestClassifier(random_state=42)
dt = DecisionTreeClassifier(random_state=42)
svc = SVC()
gnb = GaussianNB()
knn = KNeighborsClassifier()

# Display name -> estimator instance
estimator = {'K Neighbors Classification':knn,'Gaussian Naive Bayes':gnb,'Support Vector Classification':svc,'Random Forest Classification':rf,'Decision Tree Classification':dt}

# Display name -> mean cross-validation accuracy over the SKfolds splits
model_with_accuracy = {
    name: mean(cross_val_score(model, features, target, cv=SKfolds, scoring='accuracy'))
    for name, model in estimator.items()
}
  


In [7]:
model_with_accuracy

{'K Neighbors Classification': 0.9195,
 'Gaussian Naive Bayes': 0.8089999999999999,
 'Support Vector Classification': 0.947,
 'Random Forest Classification': 0.884,
 'Decision Tree Classification': 0.8170000000000001}

## Support Vector Classification achieves the highest cross-validation accuracy

In [8]:
## Hyperparameter tuning of Support Vector Classification
# Search space: regularisation strength C, kernel coefficient gamma, and kernel type
params = {'C':[0.1, 1, 10, 100, 1000],'gamma': [1, 0.1, 0.01, 0.001, 0.0001],'kernel': ['linear','sigmoid','rbf']}
# The search samples candidates at random from `params`; fixing random_state
# makes the sampled candidates (and therefore the selected model) reproducible.
tuning = HalvingRandomSearchCV(svc,params,scoring='accuracy',cv=SKfolds,random_state=42)
tuning.fit(features,target)

In [9]:
# Mean cross-validated accuracy of the best candidate found by the search.
# NOTE(review): successive halving can score candidates on a subsample of the rows,
# so this is presumably not directly comparable to the full-data scores of the
# untuned models -- confirm before quoting it as the final accuracy.
tuning.best_score_ # Final score achieved

0.9768518518518519

In [10]:
# Keep the best SVC configuration selected by the search
# (presumably refit on the full training data via the search's default refit -- confirm)
final_estimator = tuning.best_estimator_

In [11]:
# Save the model
filename = 'final_estimator.sav'
# Use a context manager so the file is flushed and closed even if pickling fails
with open(filename, 'wb') as model_file:
    pickle.dump(final_estimator, model_file)

In [12]:
# Load the model
# NOTE: unpickling can execute arbitrary code -- only load files produced by this notebook.
# This rebinds `estimator`, which earlier named the dict of candidate models.
with open(filename, 'rb') as model_file:
    estimator = pickle.load(model_file)

In [13]:
# Predict the price range for each row of the test set.
# The `id` column is dropped because it is not among the training features.
test_data = pd.read_csv('test.csv')
test_inputs = test_data.drop(columns=['id'])
y_pred = estimator.predict(test_inputs)