In [802]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import r2_score, root_mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline


In [803]:
import os
import sys

# Put the parent of the current working directory at the front of sys.path
# so that project packages such as `src` can be imported from this notebook.
# (Use os.curdir instead if the notebook lives in the project root.)
project_root = os.path.abspath(os.pardir)
already_importable = project_root in sys.path
if not already_importable:
    sys.path.insert(0, project_root)

In [804]:
%load_ext autoreload
%autoreload 2

import importlib
import src.eda
importlib.reload(src.eda)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


<module 'src.eda' from '/Users/jaswanth/mydocs/myprojects/Mobile Price Prediction/src/eda.py'>

### Read the Input


In [805]:
# Feature matrices stay as DataFrames.
X_train, X_test = (
    pd.read_csv('./../data/processed/X_train.csv'),
    pd.read_csv('./../data/processed/X_test.csv'),
)

# Targets are squeezed; presumably each file holds a single column, in which
# case squeeze() yields a Series -- TODO confirm against the processed files.
y_train, y_test = (
    pd.read_csv('./../data/processed/y_train.csv').squeeze(),
    pd.read_csv('./../data/processed/y_test.csv').squeeze(),
)

In [806]:
from src.utilities import CustomColumnTransformer

- Number of features = 10:

  - Training r2 score = 79
  - Testing r2 score = 64

Random Forest:

- R2 score = 69

- Number of features = 50:
  - R2 score = (not recorded)


### Defining the type of encoding each column needs


In [807]:
# Category order (lowest -> highest) for every ordinal-encoded column.
# The labels must match the values stored in the processed CSVs exactly.
ordinalEncodingColumns = dict(
    [
        ('Sound_3.5mmjack', ['Yes', 'No']),
        ('is_foldable_phone', [False, True]),
        ('has_memory_card', [False, True]),
        ('memory_type_microSDXC', [False, True]),
        ('memory_type_microSDHC', [False, True]),
        ('Battery_Is_removable', ['Removable', 'Non-Removable']),
        ('Battery_Type_Lithium', ['Lithium Ion', 'Lithium Polymer']),
        ('Sound_has_stereo_speakers', ['Yes', 'No']),
        ('Number_of_cores', ['Low_Core', 'Mid_Core', 'High_Core']),
        ('CPU_Transistor_Size', ['> 20nm', '10nm - 20nm', '5nm - 10nm', 'Less than 5nm']),
        ('Is_OS_Upgradable', [False, True]),
        ('Resolution_Width', ['Low', 'Medium', 'High']),
        ('Resolution_Height', ['Low', 'Medium', 'High']),
        ('max_resolution', ['720p', '1080p', '1440p', '4k', '8k']),
        ('ram', ['<= 4', '6 - 8', '8-16', '16+']),
        ('maincamera_mp', ['Low', 'Medium', 'High', 'Ultra']),
        ('selfiecamera_mp', ['Low', 'Medium', 'High', 'Ultra']),
        ('Bluetooth_version', ['Less than 4', '5', '6']),
    ]
)

# Columns handled by target encoding.
_target_encoded = ['Brand', 'GPU_company', 'CPU_Brand', 'USB_Type', 'operating_system']

# Columns handled by the numeric transformation step (Box-Cox, per the
# original notes in this notebook).
_transformed = ['Total_Pixels', 'Battery_Capacity', 'PPI', 'Display_Size']

# Column name -> preprocessing role. Every column that has a category order
# above is ordinal; columns absent from this map have no role assigned.
feature_roles = {
    **dict.fromkeys(ordinalEncodingColumns, 'ordinal'),
    **dict.fromkeys(_target_encoded, 'target'),
    **dict.fromkeys(_transformed, 'transform'),
}

In [808]:
def auto_split_features(selected_features, full_role_map, full_ordinal_map):
    """Group the selected features by their preprocessing role.

    Parameters
    ----------
    selected_features : sequence of str
        Column names chosen for modelling.
    full_role_map : dict
        Maps a column name to 'ordinal', 'target' or 'transform'. Columns that
        are missing from the map (or have any other role) are left out of all
        three groups.
    full_ordinal_map : dict
        Maps each ordinal column name to its ordered list of categories.

    Returns
    -------
    tuple
        ``(ordinal_cols, target_cols, transform_cols)`` where ``ordinal_cols``
        is a dict of column -> category order and the other two are lists of
        column names, all in the order the features were given.

    Raises
    ------
    KeyError
        If a feature has role 'ordinal' but no entry in ``full_ordinal_map``.
    """
    ordinal_cols = {}
    target_cols = []
    transform_cols = []

    for feature in selected_features:
        role = full_role_map.get(feature)
        if role == 'ordinal':
            ordinal_cols[feature] = full_ordinal_map[feature]
        elif role == 'target':
            target_cols.append(feature)
        elif role == 'transform':
            transform_cols.append(feature)

    return ordinal_cols, target_cols, transform_cols


In [809]:
# Feature subsets tried earlier, kept for reference only (not executed):
#
# selected_features = ['latest_wifi_version', 'Display_Size', 'max_resolution', 'Total_Pixels',
#        'ram', 'Brand', 'PPI', 'number_of_3g_bands', 'Resolution_Width',
#        'has_4k', 'selfiecamera_mp', 'Sound_3.5mmjack', 'CPU_Transistor_Size',
#        'has_EIS', 'has_HDR', 'has_8k', 'has_nfc', 'Resolution_Height',
#        'has_5G', 'is_foldable_phone', 'aws_support', 'maincamera_mp',
#        'CPU_Brand', 'GPU_company', 'has_CDMA', 'USB_Type', 'Bluetooth_version',
#        'Battery_Capacity', 'Number_of_cores']
# selected_features = ['latest_wifi_version', 'Display_Size', 'max_resolution', 'Total_Pixels',
#        'ram', 'Brand', 'PPI', 'number_of_3g_bands', 'Resolution_Width',
#        'has_4k', 'selfiecamera_mp']
# selected_features = ['latest_wifi_version', 'Display_Size',
#        'max_resolution', 'Total_Pixels',
#        'ram', 'Brand', 'PPI']
#
# The last of these used to be a live assignment that was overwritten by the
# list below before ever being read; it is now a comment like the others.

# Feature subset actually used for the rest of the notebook.
selected_features = [
    'latest_wifi_version',
    'max_resolution',
    'Brand',
    'ram',
    'selfiecamera_mp',
    'CPU_Transistor_Size',
    'maincamera_mp',
    'CPU_Brand',
    'GPU_company',
    'Battery_Capacity',
    'operating_system',
]

# Feature names copied from a fitted transformer's output carry a
# '<step>__' prefix; strip those so the names match the raw column names.
for prefix in ('remainder__', 'ordinalEncoding__', 'targetEncoding__', 'transformation__'):
    selected_features = [feature.replace(prefix, '') for feature in selected_features]




In [810]:
# Bucket the selected features by the preprocessing each one needs.
ordinal_cols, target_cols, transform_cols = auto_split_features(
    selected_features,
    full_role_map=feature_roles,
    full_ordinal_map=ordinalEncodingColumns,
)

In [811]:
# Keep only the selected feature columns, in the selected order, in both splits.
X_train, X_test = X_train[selected_features], X_test[selected_features]

In [812]:
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [813]:
# Candidate regressors compared under identical preprocessing.
# The tree-based estimators are randomised, so they get a fixed seed; without
# one the comparison scores change on every run. The remaining estimators are
# left at their defaults.
models = {
    'GradientBoosting' : GradientBoostingRegressor(random_state=42),
    'RandomForest' : RandomForestRegressor(random_state=42),
    'Ridge' : Ridge(),
    'Lasso' : Lasso(),
    'Linear' : LinearRegression(),
    'SVR' : SVR(),
    'DTR' : DecisionTreeRegressor(random_state=42)
}

In [814]:
# Score every candidate with cross-validated R^2 on the training set
# (cross_val_score's default CV splitter is used).
# cross_val_score clones the pipeline and fits the clone on each fold, so the
# full-data pipeline.fit(...) that used to precede it only added run time and
# has been removed.
for name, model in models.items():
    pipeline = Pipeline(
        [
            ('transformer', CustomColumnTransformer(ordinal_cols, target_cols, transform_cols)),
            ('model', model),
        ]
    )

    scores = cross_val_score(pipeline, X=X_train, y=y_train, scoring='r2')
    print(f'{name} : {scores.mean()}')

GradientBoosting : 0.5260532686757877
RandomForest : 0.5396909336475832
Ridge : 0.43031215897263786
Lasso : 0.429978897773429
Linear : 0.43031006208534023
SVR : 0.016313774522956105
DTR : 0.2637141400112474


Based on the cross-validated R2 scores above, Random Forest performed best, so I chose it as my final model.

Next, I will fine-tune its hyperparameters to try to improve the R2 score further.


In [815]:
# Baseline pipeline: preprocessing followed by a default Random Forest.
# random_state is fixed so the train/test/CV metrics computed from this
# pipeline are the same on every run.
pipeline = Pipeline(
    [
        ('transformer', CustomColumnTransformer(ordinal_cols, target_cols, transform_cols)),
        ('model', RandomForestRegressor(random_state=42))
    ]
)

In [816]:
# from sklearn.model_selection import GridSearchCV

# param_grid = {
#     'model__n_estimators': [100, 200, 300],           # Number of trees
#     'model__max_depth': [10, 15, 20],           # Tree depth limit
#     'model__min_samples_split': [2, 5, 10],           # Min samples to split an internal node
#     'model__min_samples_leaf': [1, 2, 5],             # Min samples in a leaf
#     'model__max_features': ['sqrt', 'log2', 0.3, 0.5],# Number of features to consider at each split
#     'model__bootstrap': [True],                # Whether to use bootstrap samples
#     'model__max_samples': [None, 0.6, 0.8],           # If bootstrap=True, use subset of rows
#     'model__criterion': ['squared_error', 'absolute_error']  # Loss function (since sklearn 1.0)
# }


# from sklearn.model_selection import RandomizedSearchCV


# search = RandomizedSearchCV(
#     pipeline,
#     param_distributions=param_grid,
#     n_iter=30, 
#     cv=5,
#     scoring='r2',
#     random_state=42,
#     n_jobs=-1  # use all CPU cores
# )
# search.fit(X_train, y_train)

# print(f'Best Params : {search.best_params_}')
# print(f'Best Score : {search.best_score_}')

### Checking the R2 score of both the training and testing sets


In [817]:
# Fit on the training split, then predict both splits so that training and
# testing performance can be compared.
fitted_pipeline = pipeline.fit(X_train, y_train)  # fit() returns the pipeline itself
y_pred_train = fitted_pipeline.predict(X_train)
y_pred_test = fitted_pipeline.predict(X_test)

In [818]:
def _adjusted_r2(r2, n_samples, n_features):
    """Return adjusted R^2: 1 - (1 - R^2) * (n - 1) / (n - p - 1)."""
    return 1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1)


train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)

# The number of predictors is taken as the column count of the raw feature frame.
adj_train_r2 = _adjusted_r2(train_r2, len(y_train), X_train.shape[1])
adj_test_r2 = _adjusted_r2(test_r2, len(y_test), X_test.shape[1])

In [819]:
# Report plain and adjusted R^2 for both splits, one metric per line.
print(
    f'Training r2 Score : {train_r2}',
    f'Testing r2 Score : {test_r2}',
    f'Training Adj r2 score : {adj_train_r2}',
    f'Testing Adj r2 score : {adj_test_r2}',
    sep='\n',
)

Training r2 Score : 0.8801897008035635
Testing r2 Score : 0.5530420402328209
Training Adj r2 score : 0.8798948656153843
Testing Adj r2 score : 0.5429464805666218


### Cross validating the R2 score


In [820]:
from sklearn.model_selection import cross_val_score

In [821]:
# Cross-validated R^2 of the pipeline on the training data; the cell displays
# the mean across folds.
scores = cross_val_score(pipeline, X_train, y_train, scoring='r2')
scores.mean()

np.float64(0.5413718841506576)

### Checking the root mean squared error (RMSE)


In [822]:
# RMSE of the test-set predictions, in the same units as the target.
root_mean_squared_error(y_true=y_test, y_pred=y_pred_test)

197.2977289888448

In [823]:
# selected_features = ['remainder__latest_wifi_version',
#  'transformation__Display_Size',
#  'ordinalEncoding__max_resolution',
#  'targetEncoding__Brand',
#  'transformation__Total_Pixels',
#  'ordinalEncoding__ram',
#  'transformation__PPI',
#  'ordinalEncoding__selfiecamera_mp',
#  'ordinalEncoding__CPU_Transistor_Size',
#  'ordinalEncoding__maincamera_mp',
#  'targetEncoding__CPU_Brand',
#  'targetEncoding__GPU_company',
#  'transformation__Battery_Capacity',
#  'remainder__has_eSIM',
#  'ordinalEncoding__Number_of_cores',
#  'targetEncoding__operating_system',
#  ]

# ordinal_cols, target_cols, transform_cols = auto_split_features(
#     selected_features,
#     feature_roles,
#     ordinalEncodingColumns
# )

In [824]:
# Final pipeline: same preprocessing, Random Forest with explicit
# hyper-parameters (presumably the best configuration from the randomized
# search -- confirm against the search output).
# random_state is fixed so the model that is evaluated and saved below can be
# reproduced exactly.
pipeline = Pipeline(
    [
        ('transformer', CustomColumnTransformer(ordinal_cols, target_cols, transform_cols)),
        (
            'model',
            RandomForestRegressor(
                n_estimators=300,
                min_samples_split=2,
                min_samples_leaf=1,
                max_samples=0.6,
                max_features=0.3,
                max_depth=20,
                criterion='absolute_error',
                bootstrap=True,
                random_state=42,
            ),
        ),
    ]
)

pipeline.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [825]:
# Test-set predictions from the pipeline.
y_pred = pipeline.predict(X=X_test)

In [826]:
# R^2 of the predictions on the test set.
r2_score(y_true=y_test, y_pred=y_pred)

0.6289617121969187

In [827]:
# RMSE of the predictions on the test set.
# root_mean_squared_error is already imported in the notebook's first cell,
# so the duplicate import that used to sit here has been dropped.
root_mean_squared_error(y_test, y_pred)

179.76209131593464

In [828]:
import os

import joblib

# Persist the fitted pipeline (preprocessing + model) as a single artifact.
model_path = './../models/price_prediction_pipeline.pkl'

# joblib.dump does not create missing directories; make sure the target
# folder exists so a fresh checkout does not fail with FileNotFoundError.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

joblib.dump(pipeline, model_path)

['./../models/price_prediction_pipeline.pkl']

In [829]:
# Preview the first five rows of the feature frame the model is trained on.
X_train.head(5)

Unnamed: 0,latest_wifi_version,max_resolution,Brand,ram,selfiecamera_mp,CPU_Transistor_Size,maincamera_mp,CPU_Brand,GPU_company,Battery_Capacity,operating_system
0,1,1080p,Huawei,<= 4,Low,> 20nm,Medium,qualcomm,Qualcomm,3000.0,Android
1,4,8k,Asus,8-16,High,5nm - 10nm,Ultra,qualcomm,Qualcomm,6000.0,Android
2,1,1080p,Xiaomi,<= 4,Low,10nm - 20nm,Medium,qualcomm,Qualcomm,4000.0,Android
3,1,720p,Nokia,<= 4,Low,> 20nm,Low,qualcomm,Qualcomm,2500.0,Android
4,1,1440p,Infinix,<= 4,Low,10nm - 20nm,Medium,mediatek,ARM,5200.0,Android
