In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('./../data/processed/After_02_Fixing_Datatypes.csv')

#### 1. Display_Size

    Fixing:
        - There are 289 devices whose display size > 8 inches
        - All the devices > 8 inches except the foldable are tabs => Remove the tabs from the data.
        - All devices having display size < 4 are feature phones => Remove them.


In [3]:
# Removing all the feature phones
df = df[df['Display_Size'] >= 4]

# Removing all the tabs
df = df[(df['Display_Size'] < 8) | ((df['Display_Size'] >= 8) & df['is_foldable_phone'] == True)]


#### 2. Battery_Capacity

    - All of the issues are fixed by the Display Size.
    - Feels Great.

    Fixing:
        Phones : Oukitel WP19, Doogee V Max, Doogee S100 Pro, Oukitel WP33 Pro, Blackview BV9300, Doogee V Max Plus
        Doogee V Max Pro, Ulefone Armor 24, Doogee Fire 6 Power, Doogee Fire 6 Max, Ulefone Armor 26 Ultra

        All of the above are the phones and the rest of the devices having battery capacity > 15000 are tablets.

        - Keep the above devices and remove all the other devices whose battery capacity > 15000.


#### 3. Price_In_Dollars

    - Fixing :
        - There is one phone which is costing 9999 dollars, which is an error.
        - There is one phone which is > 5000 dollars, distorting the distribution. Remove these two phones.
        - Log transform the prices which are still looking like outliers.


In [4]:
df = df[df['Price_In_Dollars'] < 5000]

#### 4. Total_Pixels

    Fixing :
         - All the phones which are having > 6000000 pixels and are from sony are discontinued. So remove these particular phones.


In [5]:
df = df[df['Total_Pixels'] < 6000000]

#### 5. number_of_2g_bands

    Fixing:
    - Combine 5 with 4 and 1 with 2 to take care of the rare categories.


In [6]:
df['number_of_2g_bands'] = df['number_of_2g_bands'].replace(1.0, 2.0)

#### 6. thickness

    Fixing:
        - The phones which are having thickness < 2 are having thickness in inches => Convert to Millimeters.
        - The phones which are having thickness == 0, it is wrong, fill these values manually.
        - The thickness which are on the greater side are valid and are not wrong values.


In [7]:
def modifythickness(thickness):
    if thickness == 0.00:
        return 9
    else:
        return 11.43

In [8]:
df.loc[df['thickness'] < 2, 'thickness'] = df[df['thickness'] < 2]['thickness'].apply(modifythickness)

#### 7. PPI

    - Automatically got resolved
    - Feels amazing.

    Fixing :
        - The Phones with PPI > 600 are mostly from Sony and most of the phones got discontinued, so their value has been dropped.
        - These phones will have a negative impact on the ML model.
        - Drop the phones which are having PPI > 600 and price < 500 dollars.


In [9]:
df[df['PPI'] > 600]

Unnamed: 0,Name,Sound_3.5mmjack,Brand,Model,has_LTE,has_5G,has_CDMA,has_CDMA2000,has_EVDO,number_of_sims,...,max_FPS,ram,Total_Pixels,number_of_camera_features,maincamera_mp,selfiecamera_mp,latest_wifi_version,Bluetooth_version,has_nfc,USB_Type


8. Resolution_Width

   - Automatically Got resolved
   - Feels fantastic

   Fixing:

   - Most of the phones which are having Resolution_Width > 2500 are tabs. Remove those.


In [10]:
df[df['Resolution_Width'] > 2500]

Unnamed: 0,Name,Sound_3.5mmjack,Brand,Model,has_LTE,has_5G,has_CDMA,has_CDMA2000,has_EVDO,number_of_sims,...,max_FPS,ram,Total_Pixels,number_of_camera_features,maincamera_mp,selfiecamera_mp,latest_wifi_version,Bluetooth_version,has_nfc,USB_Type


#### 9. Resolution_Height

    - Automatically got resolved.
    - Feels phenomenal.

    Fixing:
        - The phone having Resolution height < 200 is a feature phone => Remove it. - All the phones having resolution height > 3500 are not having actual prices. So remove them.


In [11]:
df[df['Resolution_Height'] < 200]

Unnamed: 0,Name,Sound_3.5mmjack,Brand,Model,has_LTE,has_5G,has_CDMA,has_CDMA2000,has_EVDO,number_of_sims,...,max_FPS,ram,Total_Pixels,number_of_camera_features,maincamera_mp,selfiecamera_mp,latest_wifi_version,Bluetooth_version,has_nfc,USB_Type


In [12]:
df[df['Resolution_Height'] > 3500]

Unnamed: 0,Name,Sound_3.5mmjack,Brand,Model,has_LTE,has_5G,has_CDMA,has_CDMA2000,has_EVDO,number_of_sims,...,max_FPS,ram,Total_Pixels,number_of_camera_features,maincamera_mp,selfiecamera_mp,latest_wifi_version,Bluetooth_version,has_nfc,USB_Type


#### Removing Unnecessary columns


In [13]:

columnsToRemove = ['MainCamera_Cameraspecs', 'SelfieCamera_Cameraspecs', 'SelfieCamera_Features', 'COMMS_WLAN', 'COMMS_Bluetooth', 'Colors', 'camera_has_led_flash', 'COMMS_USB', 'COMMS_NFC', 'COMMS_Radio', 'Parsed_Camera_Features', 'has_GSM', 'has_UMTS', 'has_HSPA', 'memory_type_microSD', 'memory_type_SD', 'memory_type_Nano_Memory']

In [14]:
df = df.drop(columns = columnsToRemove)

KeyError: "['MainCamera_Cameraspecs', 'SelfieCamera_Cameraspecs', 'SelfieCamera_Features', 'COMMS_WLAN', 'COMMS_Bluetooth', 'Colors', 'camera_has_led_flash', 'COMMS_USB', 'COMMS_NFC', 'COMMS_Radio', 'Parsed_Camera_Features', 'has_GSM', 'has_UMTS', 'has_HSPA', 'memory_type_microSD', 'memory_type_SD', 'memory_type_Nano_Memory'] not found in axis"

In [None]:
df.shape

(4981, 64)

In [None]:
output_path = './../data/processed/After_03_Fixing_Outliers.csv'
df.to_csv(output_path, index = False)