***Importing Libraries Needed***

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
#Importing the necessary libraries to get the models that we will use to train and predict and measure performance
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix

***Reading Training Data and Pre-Processing***

In [4]:
df = pd.read_excel("train_maids.xlsx")

In [5]:
# Replacing the null values with mean
for i in df.columns:
    df[i].fillna(df[i].mean(), inplace=True)

In [6]:

# X = df['ram'].values.reshape(-1, 1)  # Accuracy was really low compared when using every column in the dataset or chosen columns
# X = df.drop(columns=['price_range'])

X = df[['battery_power', 'px_height', 'px_width', 'ram']] # All other data points are low-correlated with target variable shown in EDA, therefore using only 'battery_power', 'px_height', 'px_width', 'ram' column for training
y = df['price_range']  # Target variable

STDscaler = StandardScaler()
# minmaxscaler = MinMaxScaler()


X = STDscaler.fit_transform(X)
# X = minmaxscaler.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

***Defining Models and setting hyperparameters for the best accuracy***

In [7]:
models = {"Logistic Regression": LogisticRegression(C = 0.001),
          "K-Nearest Neighbors": KNeighborsClassifier(weights = "distance"),
          "Decision Tree": DecisionTreeClassifier(),
          "Neural Network": MLPClassifier(max_iter= 1000, hidden_layer_sizes=(150,)),
          "Random Forest": RandomForestClassifier(n_estimators = 100, n_jobs = 8, max_samples = 75),
          "Gradient Boosting": GradientBoostingClassifier(learning_rate= 0.001, max_depth= 6, n_estimators= 150, subsample= 0.8),
          "Ada Boost" : AdaBoostClassifier(),
         }

***Training ML Models***

In [8]:
model_rf = []
for name, model in models.items():    
    model_rf.append(model.fit(X_train, y_train))
    print(name + " trained.")

Logistic Regression trained.
K-Nearest Neighbors trained.
Decision Tree trained.
Neural Network trained.
Random Forest trained.
Gradient Boosting trained.
Ada Boost trained.


***Seeing Models Evaluation to choose the best ML Model on the dataset***

In [9]:
#test each model accuracy
for name, model in models.items():
    print(name + ": {:.2f}%".format(model.score(X_test, y_test) * 100))
    print()

Logistic Regression: 75.75%

K-Nearest Neighbors: 92.00%

Decision Tree: 87.25%

Neural Network: 96.50%

Random Forest: 86.00%

Gradient Boosting: 87.75%

Ada Boost: 62.75%



In [10]:
for name, model in models.items():
    y_predict = model.predict(X_test)
    y_predict_proba = model.predict_proba(X_test)
    
    print(name + ":")
    print(classification_report(y_test, y_predict))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_predict))
    print("Recall Score:", round(metrics.recall_score(y_test, y_predict, average='weighted') * 100, 3))
    print("Precision Score:", round(metrics.precision_score(y_test, y_predict, average='weighted') * 100, 3))
    print('\n')

Logistic Regression:
              precision    recall  f1-score   support

           0       0.75      1.00      0.86       105
           1       0.80      0.44      0.57        91
           2       0.74      0.50      0.60        92
           3       0.76      1.00      0.86       112

    accuracy                           0.76       400
   macro avg       0.76      0.73      0.72       400
weighted avg       0.76      0.76      0.73       400

Confusion Matrix:
[[105   0   0   0]
 [ 35  40  16   0]
 [  0  10  46  36]
 [  0   0   0 112]]
Recall Score: 75.75
Precision Score: 76.141


K-Nearest Neighbors:
              precision    recall  f1-score   support

           0       0.97      0.96      0.97       105
           1       0.89      0.93      0.91        91
           2       0.85      0.88      0.87        92
           3       0.96      0.90      0.93       112

    accuracy                           0.92       400
   macro avg       0.92      0.92      0.92       400
we

Recall Score: 96.5
Precision Score: 96.517


Random Forest:
              precision    recall  f1-score   support

           0       0.93      0.94      0.94       105
           1       0.85      0.84      0.84        91
           2       0.74      0.83      0.78        92
           3       0.91      0.83      0.87       112

    accuracy                           0.86       400
   macro avg       0.86      0.86      0.86       400
weighted avg       0.86      0.86      0.86       400

Confusion Matrix:
[[99  6  0  0]
 [ 7 76  8  0]
 [ 0  7 76  9]
 [ 0  0 19 93]]
Recall Score: 86.0
Precision Score: 86.444


Gradient Boosting:
              precision    recall  f1-score   support

           0       0.97      0.90      0.93       105
           1       0.82      0.91      0.86        91
           2       0.79      0.85      0.82        92
           3       0.93      0.86      0.89       112

    accuracy                           0.88       400
   macro avg       0.88      0.88   

***Choosing K-Nearest Neighbors to avoid over-fitting from Neural Network***

***Testing on Test Dataset***

In [11]:
# Read and preprocess the data from the test dataset
df_test = pd.read_excel('test_maids.xlsx')

for i in df_test.columns:
    df_test[i].fillna(df_test[i].mean(), inplace=True)

In [12]:
first_10_rows = df_test.head(10).copy()


In [13]:
test_x = first_10_rows.drop(labels='id', axis = 1)
test_x = test_x[['battery_power', 'px_height', 'px_width', 'ram']]

STDscaler = StandardScaler()
test_x = STDscaler.fit_transform(test_x)

In [14]:
df_predict = first_10_rows.copy()
df_predict['price_range_prediction'] = model_rf[1].predict(test_x)

df_predict.to_excel('prediction_maids.xlsx', index=False)