In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import sys
import os

# Put the parent directory of the working directory at the front of sys.path
# so project packages can be imported; use '.' instead when the notebook lives
# in the project root itself.
project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path[:0] = [project_root]

In [3]:
# Enable IPython's autoreload extension; mode 2 picks up edits to imported
# modules live, so changes to project code do not need a kernel restart.
%load_ext autoreload
%autoreload 2

import importlib
import src.eda
# Explicitly reload src.eda as well, forcing a fresh copy of the module now.
importlib.reload(src.eda)

<module 'src.eda' from '/Users/jaswanth/mydocs/myprojects/Mobile Price Prediction/src/eda.py'>

In [4]:
# Read the processed CSV into `df`; the path is relative to the current
# working directory.
input_path = './../data/processed/After_05_Fixing_Categories.csv'
df = pd.read_csv(input_path)

### Numerical Columns


Numeric (missing values): thickness, Battery_Capacity, CPU_Transistor_Size

Numeric (transformation): thickness, Battery_Capacity


In [5]:
# Ten columns with the most missing values, largest count first.
df.isnull().sum().sort_values(ascending=False).iloc[:10]

Battery_Type_Lithium    1255
camera_hdr               421
camera_panorama          421
GPU_company              151
CPU_Model                135
CPU_Brand                115
has_nfc                   93
thickness                 66
Battery_Capacity          17
Number_of_cores           14
dtype: int64

In [6]:
# Separate the prediction target from the feature columns.
target_column = 'Price_In_Dollars'
y = df[target_column]
X = df.drop(columns=[target_column])

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing. The seed is fixed so that the split,
# the imputation values learned from X_train, and the CSV files written from
# these frames are identical on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 42)

#### Filling Missing values


In [8]:
import numpy as np

def fillMissingValuesWhole(X_train, X_test, columnToUse, columnToFill, isNumericColumn):
    """Impute missing values of ``columnToFill`` in place, per ``columnToUse`` group.

    Fill values are learned from ``X_train`` only and then applied to the
    missing entries of both ``X_train`` and ``X_test``, so no statistic is
    computed from the test rows.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Frames containing both ``columnToUse`` and ``columnToFill``. Both are
        modified in place.
    columnToUse : str
        Grouping column (e.g. ``'Brand'``) whose value selects the fill value.
    columnToFill : str
        Column whose missing entries are replaced.
    isNumericColumn : bool
        ``True`` fills with the group mean, ``False`` with the group mode.

    Returns
    -------
    None

    Notes
    -----
    A group with no observed values in ``X_train``, a group that only occurs
    in ``X_test``, and rows whose ``columnToUse`` is itself missing all fall
    back to the statistic of the whole training column. If the training
    column has no observed values at all, the entries are left missing
    instead of raising.
    """
    def summarise(values):
        # Mean for numeric columns, most frequent value otherwise; NaN when
        # there is nothing observed to summarise.
        if isNumericColumn:
            return values.mean()
        modes = values.mode()
        return np.nan if modes.empty else modes.iloc[0]

    globalFill = summarise(X_train[columnToFill])

    # One fill value per group, learned from the training rows only.
    mapper = {}
    for group, values in X_train.groupby(columnToUse, sort=False)[columnToFill]:
        groupFill = summarise(values)
        mapper[group] = globalFill if pd.isna(groupFill) else groupFill

    for frame in (X_train, X_test):
        missing = frame[columnToFill].isna()
        if not missing.any():
            continue
        # Vectorised lookup; groups absent from the mapper come back as NaN.
        fills = frame.loc[missing, columnToUse].map(mapper)
        if not pd.isna(globalFill):
            fills = fills.fillna(globalFill)
        # Assign positionally so a non-unique index cannot misalign the values.
        frame.loc[missing, columnToFill] = fills.to_numpy()

In [9]:
# Columns imputed with the most frequent value within each Brand.
for mode_filled_column in [
    'camera_panorama',
    'GPU_company',
    'CPU_Model',
    'CPU_Brand',
    'has_nfc',
    'Number_of_cores',
    'USB_Type',
    'is_foldable_phone',
    'Is_OS_Upgradable',
    'Battery_Type_Lithium',
    'camera_hdr',
    'number_of_2g_bands',
]:
    fillMissingValuesWhole(X_train, X_test, 'Brand', mode_filled_column, False)

In [10]:
# Columns imputed with the mean value within each Brand.
for mean_filled_column in ['thickness', 'Battery_Capacity']:
    fillMissingValuesWhole(X_train, X_test, 'Brand', mean_filled_column, True)

In [11]:
# Preview the first rows of the training features after the imputation calls.
X_train.head()

Unnamed: 0,Name,Sound_3.5mmjack,Brand,Model,has_LTE,has_5G,has_CDMA,has_CDMA2000,has_EVDO,number_of_sims,...,ram,Total_Pixels,number_of_camera_features,maincamera_mp,selfiecamera_mp,latest_wifi_version,Bluetooth_version,has_nfc,USB_Type,operating_system
1615,Huawei G7 Plus,Yes,Huawei,G7 Plus,1.0,0.0,0.0,0.0,0.0,2.0,...,<= 4,2073600.0,1.0,Medium,Low,1,Less than 4,0.0,Type-B,Android
1093,Asus ROG Phone 5 Pro,Yes,Asus,ROG Phone 5 Pro,1.0,1.0,0.0,0.0,0.0,2.0,...,8-16,2643840.0,3.0,Ultra,High,4,5,1.0,Type-C,Android
3637,Xiaomi Mi A2 Lite (Redmi 6 Pro),Yes,Xiaomi,Mi A2 Lite (Redmi 6 Pro),1.0,0.0,1.0,0.0,0.0,2.0,...,<= 4,2462400.0,3.0,Medium,Low,1,Less than 4,0.0,Type-B,Android
2339,Nokia C1,Yes,Nokia,C1,0.0,0.0,0.0,0.0,0.0,2.0,...,<= 4,460800.0,1.0,Low,Low,1,Less than 4,0.0,Type-B,Android
504,Infinix Hot 11,Yes,Infinix,Hot 11,1.0,0.0,0.0,0.0,0.0,2.0,...,<= 4,2600640.0,1.0,Medium,Low,1,5,0.0,Type-C,Android


In [12]:
# Remove the same set of columns from both the training and the test features.
columns_to_drop = ['Name', 'Model', 'CPU_Model']
X_train = X_train.drop(columns=columns_to_drop)
X_test = X_test.drop(columns=columns_to_drop)

In [13]:
# Write each split to the processed-data directory without the index column.
for split_frame, output_path in [
    (X_train, './../data/processed/X_train.csv'),
    (X_test, './../data/processed/X_test.csv'),
    (y_train, './../data/processed/y_train.csv'),
    (y_test, './../data/processed/y_test.csv'),
]:
    split_frame.to_csv(output_path, index=False)