# Model Development

### Loading the data

In [30]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sklearn.ensemble import RandomForestClassifier, StackingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
import pickle
import plotly.express as px

In [6]:
# Load the dataset
df = pd.read_excel('Dataset/PreprocessedTrain.xlsx')      

print(len(df))

1991


In [7]:
correlation_with_val_eur = np.abs(df.corrwith(df['price_range']))            # absolute to handle -ve values
sorted_correlations = correlation_with_val_eur.sort_values(ascending=False)
print("Chosen features:", sorted_correlations[1:5].index)                    # Choosing the top 4 features (excluding the target column)
# We can't rely on the ram only, as that would make the model prone to overfitting, and easily affected by noise

x_data = df[sorted_correlations[1:5].index]             
y_data = df['price_range']
x_train, x_val, y_train, y_val = train_test_split(x_data, y_data, test_size=0.2, random_state=42)

Chosen features: Index(['ram', 'battery_power', 'px_width', 'px_height'], dtype='object')


### Evaluation Metrics

In [22]:
def ModelEvaluation(golden_output, predicted_output):
    # Accuracy is a good measure as the price_range is balanced
    accuracy = accuracy_score(golden_output, predicted_output)
    print("Accuracy: ", accuracy)

    # We will also look at the precision and recall, as well as f1-score to see if the model is biased towards a particular class
    print("Classification Report: \n", classification_report(golden_output, predicted_output))

    # Finally, we will check the confusion matrix to get even finer insights
    print("Confusion Matrix: \n", confusion_matrix(golden_output, predicted_output))
    return accuracy


###
<a id='4.11'></a>
<p style="font-size: 34px; color: #FFFFFFF; 
  font-family: 'Roboto'; 
  text-align: center; 
  padding: 10px 20px; /* Add padding for spacing */
  background-image: linear-gradient(to right, #9746ff, #000000); 
  border-radius: 5px 5px;"><strong>KNN</strong></p>

Since we will be working with a small number of features, KNN should theoretically be a good estimator. It would converge quickly due to the low dimensionality of the data.

In [23]:
knn = KNeighborsClassifier(n_neighbors=9)
knn.fit(x_train, y_train)
y_pred = knn.predict(x_val)
knn_acc = ModelEvaluation(y_val, y_pred)

Accuracy:  0.9047619047619048
Classification Report: 
               precision    recall  f1-score   support

           0       0.94      0.97      0.96       106
           1       0.88      0.87      0.87        89
           2       0.88      0.85      0.86       107
           3       0.92      0.93      0.92        97

    accuracy                           0.90       399
   macro avg       0.90      0.90      0.90       399
weighted avg       0.90      0.90      0.90       399

Confusion Matrix: 
 [[103   3   0   0]
 [  6  77   6   0]
 [  0   8  91   8]
 [  0   0   7  90]]


### Conclusion
KNN shows promising results for the first experiment, however it's not very capable of predicting the 1 and 2 price ranges.

###
<a id='4.11'></a>
<p style="font-size: 34px; color: #FFFFFFF; 
  font-family: 'Roboto'; 
  text-align: center; 
  padding: 10px 20px; /* Add padding for spacing */
  background-image: linear-gradient(to right, #9746ff, #000000); 
  border-radius: 5px 5px;"><strong>Logistic Regression</strong></p>

The data is rather simple and the price_range is heavily dependent on the ram, as well as the rest of the features. It can be predicted using a simple logistic regression model.

In [24]:
lr = LogisticRegression()
lr.fit(x_train, y_train)
y_pred = lr.predict(x_val)
lr_acc = ModelEvaluation(y_val, y_pred)


Accuracy:  0.9423558897243107
Classification Report: 
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       106
           1       0.91      0.96      0.93        89
           2       0.95      0.85      0.90       107
           3       0.92      0.97      0.94        97

    accuracy                           0.94       399
   macro avg       0.94      0.94      0.94       399
weighted avg       0.94      0.94      0.94       399

Confusion Matrix: 
 [[106   0   0   0]
 [  2  85   2   0]
 [  0   8  91   8]
 [  0   0   3  94]]


### Conclusion
As expected, the logistic model does indeed fare well in this case, as the data is linearly separable to some extent. It shows slighlty superior results to the KNN model.

###
<a id='4.11'></a>
<p style="font-size: 34px; color: #FFFFFFF; 
  font-family: 'Roboto'; 
  text-align: center; 
  padding: 10px 20px; /* Add padding for spacing */
  background-image: linear-gradient(to right, #9746ff, #000000); 
  border-radius: 5px 5px;"><strong>Random Forest</strong></p>

Decision trees are prone to overfitting, however, random forests are a good way to mitigate this issue. Since the data is not very complex, some rules can be deduced such as the ones proposed in the EDA analysis. As such, a random forest model should be able to predict the price range with a reasonably good accuracy.

In [25]:
rf = RandomForestClassifier(n_estimators=300, random_state=42)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_val)
rf_acc = ModelEvaluation(y_val, y_pred)


Accuracy:  0.9323308270676691
Classification Report: 
               precision    recall  f1-score   support

           0       0.95      0.94      0.95       106
           1       0.88      0.91      0.90        89
           2       0.95      0.90      0.92       107
           3       0.94      0.98      0.96        97

    accuracy                           0.93       399
   macro avg       0.93      0.93      0.93       399
weighted avg       0.93      0.93      0.93       399

Confusion Matrix: 
 [[100   6   0   0]
 [  5  81   3   0]
 [  0   5  96   6]
 [  0   0   2  95]]


### Conclusion
While the Random Forest model does show good results, the logistic regression is still superior. The accuracy increases minimally by increasing the number of estimators, but this comes at a heavy cost of computational power, making the trade-off not worth it.

###
<a id='4.11'></a>
<p style="font-size: 34px; color: #FFFFFFF; 
  font-family: 'Roboto'; 
  text-align: center; 
  padding: 10px 20px; /* Add padding for spacing */
  background-image: linear-gradient(to right, #9746ff, #000000); 
  border-radius: 5px 5px;"><strong>SVM</strong></p>

Support Vector Machines are known for their versatility. They can be used for linearly separable data, as well as for non-linearly separable data. Since the logistic regression showed promising results, it is expected that the SVM will perform as well, if not better. This can be attributed to the margin the SVM adds, which helps the model generalize better.

In [26]:
svm = SVC(kernel='linear', C=2)
svm.fit(x_train, y_train)
y_pred = svm.predict(x_val)
svm_acc = ModelEvaluation(y_val, y_pred)

Accuracy:  0.9624060150375939
Classification Report: 
               precision    recall  f1-score   support

           0       0.99      0.98      0.99       106
           1       0.94      0.98      0.96        89
           2       0.96      0.93      0.94       107
           3       0.96      0.97      0.96        97

    accuracy                           0.96       399
   macro avg       0.96      0.96      0.96       399
weighted avg       0.96      0.96      0.96       399

Confusion Matrix: 
 [[104   2   0   0]
 [  1  87   1   0]
 [  0   4  99   4]
 [  0   0   3  94]]


### Conclusion
As expected, the SVM shows the best results so far, it even solves the problem of showing low recall for the price_range of 2 faced by the logistic regression model. The SVM model with a linear kernel shows great results, a poly kernel of degree 3 also shows similar results, but the computational cost is a little higher.

###
<a id='4.11'></a>
<p style="font-size: 34px; color: #FFFFFFF; 
  font-family: 'Roboto'; 
  text-align: center; 
  padding: 10px 20px; /* Add padding for spacing */
  background-image: linear-gradient(to right, #9746ff, #000000); 
  border-radius: 5px 5px;"><strong>XGBoost</strong></p>

In [27]:
xgb = XGBClassifier()
xgb.fit(x_train, y_train)
y_pred = xgb.predict(x_val)
xgb_acc = ModelEvaluation(y_val, y_pred)

Accuracy:  0.9373433583959899
Classification Report: 
               precision    recall  f1-score   support

           0       0.98      0.95      0.97       106
           1       0.88      0.94      0.91        89
           2       0.94      0.89      0.91       107
           3       0.94      0.97      0.95        97

    accuracy                           0.94       399
   macro avg       0.94      0.94      0.94       399
weighted avg       0.94      0.94      0.94       399

Confusion Matrix: 
 [[101   5   0   0]
 [  2  84   3   0]
 [  0   6  95   6]
 [  0   0   3  94]]


###
<a id='4.11'></a>
<p style="font-size: 34px; color: #FFFFFF; /* Set text color to black */ 
  font-family: 'Roboto'; 
  text-align: center; 
  padding: 10px 20px; /* Add padding for spacing */
  background-image: linear-gradient(to right, #9746ff, #000000); 
  border-radius: 5px 5px;"><strong>Voting Classifier</strong></p>


In [28]:
estimators = [('rf', rf), ('knn', knn), ('svm', svm), ('lr', lr), ('xgb', xgb)]
vote = VotingClassifier(estimators=estimators)
vote.fit(x_train, y_train)
y_pred = vote.predict(x_val)
voting_acc = ModelEvaluation(y_val, y_pred)

Accuracy:  0.9548872180451128
Classification Report: 
               precision    recall  f1-score   support

           0       0.99      0.97      0.98       106
           1       0.92      0.97      0.95        89
           2       0.96      0.91      0.93       107
           3       0.94      0.98      0.96        97

    accuracy                           0.95       399
   macro avg       0.95      0.96      0.95       399
weighted avg       0.96      0.95      0.95       399

Confusion Matrix: 
 [[103   3   0   0]
 [  1  86   2   0]
 [  0   4  97   6]
 [  0   0   2  95]]


###
<a id='4.11'></a>
<p style="font-size: 34px; color: #FFFFFF; /* Set text color to black */ 
  font-family: 'Roboto'; 
  text-align: center; 
  padding: 10px 20px; /* Add padding for spacing */
  background-image: linear-gradient(to right, #9746ff, #000000); 
  border-radius: 5px 5px;"><strong>Stacking Classifier</strong></p>

In [29]:
estimators = [('rf', rf), ('knn', knn), ('svm', svm), ('lr', lr), ('xgb', xgb)]
stack = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(max_iter=1000))
stack.fit(x_train, y_train)
y_pred = stack.predict(x_val)
stacking_acc = ModelEvaluation(y_val, y_pred)

Accuracy:  0.9674185463659147
Classification Report: 
               precision    recall  f1-score   support

           0       0.99      0.98      0.99       106
           1       0.96      0.97      0.96        89
           2       0.95      0.95      0.95       107
           3       0.97      0.97      0.97        97

    accuracy                           0.97       399
   macro avg       0.97      0.97      0.97       399
weighted avg       0.97      0.97      0.97       399

Confusion Matrix: 
 [[104   2   0   0]
 [  1  86   2   0]
 [  0   2 102   3]
 [  0   0   3  94]]


###
<a id='4.11'></a>
<p style="font-size: 34px; color: #FFFFFF; /* Set text color to black */ 
  font-family: 'Roboto'; 
  text-align: center; 
  padding: 10px 20px; /* Add padding for spacing */
  background-image: linear-gradient(to right, #9746ff, #000000); 
  border-radius: 5px 5px;"><strong>Model Comparison</strong></p>

In [31]:
# Make a model comparison

models = pd.DataFrame({ 'Model': ['KNN', 'Logistic Regression', 'Random Forest', 'SVM', 'XGBoost', 'Voting Classifier', 'Stacking Classifier'], 
                    'Score': [knn_acc, lr_acc, rf_acc, svm_acc, xgb_acc, voting_acc, stacking_acc]})

models = models.sort_values(by = 'Score', ascending = False)

px.bar(data_frame = models, x = 'Score', y = 'Model', color = 'Score', template = 'plotly_dark', title = 'Models Comparison')

Some models show results close to the SVM, but this is achieved through utilizing a number of different models, making the computational cost higher. In conclusion, the SVM model is the best trade-off between computational cost and accuracy.

### Saving the SVM model

In [34]:
# Saving the SVM model
pickle.dump(svm, open('Model/svm_model.pkl', 'wb'))

# Saving the selected features
pickle.dump(sorted_correlations[1:5].index, open('GeneratedFiles/selected_features.pkl', 'wb'))


# Price Prediction

In [35]:
# Loading the dataset
test = pd.read_excel('Dataset/test.xlsx')

In [37]:
# Loading scaler to normalize the test data
scaler = pickle.load(open('GeneratedFiles/scaler.pkl', 'rb'))


In [41]:
# Loading the model
model = pickle.load(open('Model/svm_model.pkl', 'rb'))

In [38]:
test.iloc[:, 1:] = scaler.fit_transform(test.iloc[:, 1:])       # Normalizing the test data, but skipping the ID column
x_test = test[sorted_correlations[1:5].index]                   # Selecting the top 4 features

In [42]:
# Predicting the test data
y_pred = model.predict(x_test)
test['price_range'] = y_pred
test.head()


Unnamed: 0,id,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,1,0.362241,1,0.52,1,0.736842,0,0.048387,0.0,0.941667,...,0.118511,0.60855,0.862319,0.5,0.388889,0.0,0,1,0,3
1,2,0.227485,1,0.0,1,0.210526,1,0.951613,0.777778,0.925,...,0.39119,0.237809,0.974772,0.071429,0.0,0.277778,1,0,0,3
2,3,0.871915,1,0.92,0,0.052632,0,0.403226,0.888889,0.883333,...,0.665967,0.577822,0.572464,0.857143,0.555556,0.444444,0,1,1,3
3,4,0.697799,0,0.0,1,0.947368,1,0.370968,0.444444,0.133333,...,0.154693,0.835671,0.974235,0.357143,0.0,0.277778,1,1,0,3
4,5,0.623082,0,0.36,0,0.578947,1,0.758065,0.444444,0.233333,...,0.392764,0.206413,0.40526,0.714286,0.444444,0.277778,1,0,1,1


In [43]:
test['price_range'].value_counts()     # Checking the distribution of the price ranges
# It shows a balanced distribution, which is a good sign, as the data was itself balanced

price_range
3    273
0    260
2    239
1    228
Name: count, dtype: int64

# Conclusion
The SVM models shows best results in minimum time. Predictions seem to be quite reasonable.