In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.impute import KNNImputer, SimpleImputer
from category_encoders import TargetEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import boxcox
from sklearn.preprocessing import PowerTransformer
from sklearn.compose import TransformedTargetRegressor

In [3]:
import sys
import os

# Add the root of your project to sys.path
project_root = os.path.abspath('..')  # or '.' if your notebook is in root
if project_root not in sys.path:
    sys.path.insert(0, project_root)

In [4]:
%load_ext autoreload
%autoreload 2

import importlib
import src.eda
importlib.reload(src.eda)

<module 'src.eda' from '/Users/jaswanth/mydocs/myprojects/Mobile Price Prediction/src/eda.py'>

### Read the input


In [5]:
input_path = './../data/processed/After_05_Fixing_Categories.csv'
X = pd.read_csv('./../data/processed/X_train.csv')
y = pd.read_csv('./../data/processed/y_train.csv').squeeze()

X_test = pd.read_csv('./../data/processed/X_test.csv')
y_test = pd.read_csv('./../data/processed/y_test.csv').squeeze()

### Apply Transformations Using a Pipeline


In [6]:
from src.utilities import CustomColumnTransformer

In [7]:
ordinalEncodingColumns = {
    'Sound_3.5mmjack' : ['Yes', 'No'],
    'is_foldable_phone' : [False, True],
    'has_memory_card' : [False, True],
    'memory_type_microSDXC' : [False, True],
    'memory_type_microSDHC' : [False, True],
    'Battery_Is_removable' : ['Removable', 'Non-Removable'],
    'Battery_Type_Lithium' : ['Lithium Ion', 'Lithium Polymer'],
    'Sound_has_stereo_speakers' : ['Yes', 'No'],
    'Number_of_cores' : ['Low_Core', 'Mid_Core', 'High_Core'],
    'CPU_Transistor_Size' : ['> 20nm', '10nm - 20nm', '5nm - 10nm', 'Less than 5nm'],
    'Is_OS_Upgradable' : [False, True],
    'Resolution_Width' : ['Low', 'Medium', 'High'],
    'Resolution_Height' : ['Low', 'Medium', 'High'],
    'max_resolution' : ['720p', '1080p', '1440p', '4k', '8k'],
    'ram' : ['<= 4', '6 - 8', '8-16', '16+'],
    'maincamera_mp' : ['Low', 'Medium', 'High', 'Ultra'],
    'selfiecamera_mp' : ['Low', 'Medium', 'High', 'Ultra'],
    'Bluetooth_version' : ['Less than 4', '5', '6'],
}

targetEncodingColumns = ['Brand', 'GPU_company', 'CPU_Brand', 'USB_Type', 'operating_system']

transformingColumns = ['Total_Pixels', 'Battery_Capacity', 'PPI',	'Display_Size']

pipeline= Pipeline([ ('preprocessing', CustomColumnTransformer(ordinalEncodingColumns, targetEncodingColumns, transformingColumns))])

NameError: name 'X_train' is not defined

In [29]:
X['remainder__latest_wifi_version'].value_counts()

remainder__latest_wifi_version
2.0    2024
1.0    1388
3.0     651
5.0     178
4.0     149
0.0      92
Name: count, dtype: int64

In [8]:
X = pd.DataFrame(pipeline.fit_transform(X, y), columns = pipeline.get_feature_names_out())

### Technique 1 : Correlation analysis


In [9]:
temp = X.copy()
temp['Price'] = y

In [10]:
fs1 = temp.corr()['Price'].sort_values(ascending = False).reset_index().rename(columns = {'index' : 'feature', 'Price' : 'correlation'})

### Technique 2 : Random Forest Feature Selection


In [11]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()

rf.fit(X, y)

In [12]:
fs2 = pd.DataFrame({'feature' : X.columns ,'RF' : rf.feature_importances_})

### Technique 3 : Gradient Boosting Feature Importance


In [13]:
from sklearn.ensemble import GradientBoostingRegressor

gb = GradientBoostingRegressor()

gb.fit(X, y)

In [14]:
fs3 = pd.DataFrame({'feature' : X.columns ,'GB' : gb.feature_importances_})

### Technique 4 : Permutation Importance


In [15]:
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

X_train_label, X_test_label, y_train_label, y_test_label = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest regressor on label encoded data
rf_label = RandomForestRegressor(n_estimators=100, random_state=42)
rf_label.fit(X_train_label, y_train_label)

# Calculate Permutation Importance
perm_importance = permutation_importance(rf_label, X_test_label, y_test_label, n_repeats=30, random_state=42)

# Organize results into a DataFrame
fs4 = pd.DataFrame({
    'feature': X.columns,
    'permutation_importance': perm_importance.importances_mean
}).sort_values(by='permutation_importance', ascending=False)



In [16]:
fs4

Unnamed: 0,feature,permutation_importance
56,remainder__latest_wifi_version,0.138351
26,transformation__Display_Size,0.089108
25,transformation__PPI,0.057943
13,ordinalEncoding__max_resolution,0.05527
18,targetEncoding__Brand,0.054153
36,remainder__thickness,0.048065
23,transformation__Total_Pixels,0.019629
35,remainder__number_of_3g_bands,0.017265
51,remainder__has_HDR,0.016472
24,transformation__Battery_Capacity,0.013116


### Technique 5 : RFE


In [17]:
from sklearn.feature_selection import RFE

# Initialize the base estimator
estimator = RandomForestRegressor()

# Apply RFE on the label-encoded and standardized training data
selector_label = RFE(estimator, n_features_to_select=X.shape[1], step=1)
selector_label = selector_label.fit(X, y)

# Get the selected features based on RFE
selected_features = X.columns[selector_label.support_]

# Extract the coefficients for the selected features from the underlying linear regression model
selected_coefficients = selector_label.estimator_.feature_importances_

# Organize the results into a DataFrame
fs5 = pd.DataFrame({
    'feature': selected_features,
    'rfe_score': selected_coefficients
}).sort_values(by='rfe_score', ascending=False)

### Merge all the dataframes with different feature importances


In [18]:
final_fi_df = fs1.merge(fs2,on='feature').merge(fs3,on='feature').merge(fs4,on='feature').merge(fs5,on='feature').set_index('feature')

### Sort the features based on their importance


In [19]:
final_fi_df = (final_fi_df.sum(axis = 1) / final_fi_df.shape[1]).sort_values(ascending = False).reset_index()

### Changing the column names


In [20]:
final_fi_df.columns = ['feature', 'importance']

In [21]:
final_fi_df

Unnamed: 0,feature,importance
0,remainder__latest_wifi_version,0.268914
1,transformation__Display_Size,0.184425
2,ordinalEncoding__max_resolution,0.172616
3,transformation__Total_Pixels,0.139858
4,transformation__PPI,0.132133
5,ordinalEncoding__ram,0.127449
6,remainder__number_of_3g_bands,0.11254
7,targetEncoding__Brand,0.10797
8,remainder__has_4k,0.104552
9,ordinalEncoding__Resolution_Width,0.10409


#### Selecting all the features which has a minimum feature importance


In [22]:
selected_features = final_fi_df[final_fi_df['importance'] > -100]['feature'].tolist()

In [23]:
selected_features

['remainder__latest_wifi_version',
 'transformation__Display_Size',
 'ordinalEncoding__max_resolution',
 'transformation__Total_Pixels',
 'transformation__PPI',
 'ordinalEncoding__ram',
 'remainder__number_of_3g_bands',
 'targetEncoding__Brand',
 'remainder__has_4k',
 'ordinalEncoding__Resolution_Width',
 'remainder__has_HDR',
 'ordinalEncoding__selfiecamera_mp',
 'ordinalEncoding__Sound_3.5mmjack',
 'ordinalEncoding__CPU_Transistor_Size',
 'remainder__has_EIS',
 'remainder__has_8k',
 'remainder__has_nfc',
 'ordinalEncoding__is_foldable_phone',
 'ordinalEncoding__Resolution_Height',
 'remainder__has_5G',
 'remainder__aws_support',
 'targetEncoding__CPU_Brand',
 'ordinalEncoding__maincamera_mp',
 'targetEncoding__GPU_company',
 'remainder__has_CDMA',
 'targetEncoding__USB_Type',
 'ordinalEncoding__Bluetooth_version',
 'transformation__Battery_Capacity',
 'remainder__has_eSIM',
 'ordinalEncoding__Number_of_cores',
 'remainder__has_dolby_vision',
 'remainder__has_CDMA2000',
 'targetEncodi

In [24]:
X = X[selected_features]

In [25]:
X.shape

(4482, 58)

In [26]:
X.head().iloc[:, 5:]

Unnamed: 0,ordinalEncoding__ram,remainder__number_of_3g_bands,targetEncoding__Brand,remainder__has_4k,ordinalEncoding__Resolution_Width,remainder__has_HDR,ordinalEncoding__selfiecamera_mp,ordinalEncoding__Sound_3.5mmjack,ordinalEncoding__CPU_Transistor_Size,remainder__has_EIS,...,remainder__has_stereo,remainder__has_720p,remainder__has_1440p,remainder__has_480p,remainder__has_Mini_SIM,remainder__has_Micro_SIM,ordinalEncoding__memory_type_microSDHC,ordinalEncoding__memory_type_microSDXC,ordinalEncoding__Sound_has_stereo_speakers,ordinalEncoding__has_memory_card
0,0.0,1.0,454.113559,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
1,2.0,6.0,500.527459,1.0,0.0,0.0,2.0,0.0,2.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,4.0,283.473936,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
3,0.0,4.0,216.951166,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
4,0.0,1.0,175.615088,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0


In [27]:
X.columns

Index(['remainder__latest_wifi_version', 'transformation__Display_Size',
       'ordinalEncoding__max_resolution', 'transformation__Total_Pixels',
       'transformation__PPI', 'ordinalEncoding__ram',
       'remainder__number_of_3g_bands', 'targetEncoding__Brand',
       'remainder__has_4k', 'ordinalEncoding__Resolution_Width',
       'remainder__has_HDR', 'ordinalEncoding__selfiecamera_mp',
       'ordinalEncoding__Sound_3.5mmjack',
       'ordinalEncoding__CPU_Transistor_Size', 'remainder__has_EIS',
       'remainder__has_8k', 'remainder__has_nfc',
       'ordinalEncoding__is_foldable_phone',
       'ordinalEncoding__Resolution_Height', 'remainder__has_5G',
       'remainder__aws_support', 'targetEncoding__CPU_Brand',
       'ordinalEncoding__maincamera_mp', 'targetEncoding__GPU_company',
       'remainder__has_CDMA', 'targetEncoding__USB_Type',
       'ordinalEncoding__Bluetooth_version',
       'transformation__Battery_Capacity', 'remainder__has_eSIM',
       'ordinalEncoding__Num