# DATA CLEANING 

In [948]:
import pandas as pd
import numpy as np
import re


In [949]:
phone_df = pd.read_csv('phone_price.csv')


In [950]:
phone_df = phone_df.drop(['Name', 'Spec_score', 'Unnamed: 0', 'Rating', 'No_of_sim', 'Processor_name'], axis='columns')


* FUNCTIONS 

In [951]:

def clean(clmn):
    if isinstance(clmn, str) and clmn[0].isdigit():
        try:
            return float(clmn.split(' ')[0])
        except ValueError:
            return 0
    else:
        return 0
    

def clean_2(clmn):
    if isinstance(clmn, str):
        clmn = clmn.replace(" ", "")
        if clmn[0].isdigit():
            try:
                return float(clmn.split('W')[0])
            except ValueError:
                return 0
    return 0

def clean_3(clmn):
    if isinstance(clmn, str):
        clmn = clmn.strip().split(' ')
        if len(clmn) > 1 and clmn[0].replace('.', '', 1).isdigit():
            try:
                memory_size = float(clmn[0])
                if clmn[1].upper() == 'TB':
                    return memory_size * 1000
                elif clmn[1].upper() == 'GB':
                    return memory_size
            except (ValueError, IndexError):
                return 0
    return 0

def clean_4(clmn):
    if isinstance(clmn, str):
        clmn = clmn.strip().split(' ')
        if len(clmn) > 1 and clmn[-2].replace('.', '', 1).isdigit():
            try:
                memory_size = float(clmn[-2])
                if clmn[-1].upper() == 'TB':
                    return memory_size * 1000
                elif clmn[-1].upper() == 'GB':
                    return memory_size
            except (ValueError, IndexError):
                return 0
    return 0

def extract_camera_features(camera_str):
    total_rear_mp, num_rear_cameras, max_rear_mp = 0, 0, 0
    total_front_mp, num_front_cameras, max_front_mp = 0, 0, 0
    try:
        rear_part, front_part = camera_str.split('&')
    except ValueError:
        rear_part = camera_str
        front_part = ''
    rear_cameras = re.findall(r'(\d+) MP', rear_part)
    if rear_cameras:
        rear_cameras = list(map(int, rear_cameras))
        total_rear_mp = sum(rear_cameras)
        num_rear_cameras = len(rear_cameras)
        max_rear_mp = max(rear_cameras)
    front_cameras = re.findall(r'(\d+) MP', front_part)
    if front_cameras:
        front_cameras = list(map(int, front_cameras))
        total_front_mp = sum(front_cameras)
        num_front_cameras = len(front_cameras)
        max_front_mp = max(front_cameras)
    return {
        'total_rear_mp': total_rear_mp,
        'num_rear_cameras': num_rear_cameras,
        'max_rear_mp': max_rear_mp,
        'total_front_mp': total_front_mp,
        'num_front_cameras': num_front_cameras,
        'max_front_mp': max_front_mp
    }

def extract_processor_speed(processor_type):
    match = re.search(r'([\d\.]+) GHz', processor_type)
    if match:
        return float(match.group(1))
    else:
        if 'Octa Core' in processor_type:
            return 2.0
        elif 'Quad Core' in processor_type:
            return 1.5
        elif 'Nine-Cores' in processor_type:
            return 2.5
        else:
            return None


* CLEANING 


In [952]:
phone_df = phone_df.reset_index(drop=True)
phone_df['fast_charging'] = phone_df['fast_charging'].apply(clean_2)


In [953]:
phone_df = phone_df.drop(phone_df[phone_df.fast_charging > 200.0].index)


In [954]:
phone_df = phone_df.reset_index(drop=True)
phone_df['Ram'] = phone_df['Ram'].apply(clean)
phone_df = phone_df.drop(phone_df[phone_df.Ram == 0].index)
phone_df = phone_df.drop(phone_df[phone_df.Ram > 24.0].index)


In [955]:
phone_df = phone_df.reset_index(drop=True)
phone_df['Battery'] = phone_df['Battery'].apply(clean)


In [956]:
phone_df = phone_df.reset_index(drop=True)
phone_df['Display'] = phone_df['Display'].apply(clean)
phone_df = phone_df.drop(phone_df[phone_df.Display < 4].index)
phone_df = phone_df.drop(phone_df[phone_df.Display > 8].index)


In [957]:
phone_df = phone_df.reset_index(drop=True)
phone_df['Inbuilt_memory'] = phone_df['Inbuilt_memory'].apply(clean_3)


In [958]:
phone_df = phone_df.reset_index(drop=True)
phone_df['External_Memory'] = phone_df['External_Memory'].apply(clean_4)


In [959]:
camera_features = phone_df['Camera'].apply(extract_camera_features)
camera_features_df = pd.DataFrame(camera_features.tolist())
phone_df = pd.concat([phone_df, camera_features_df], axis=1)
phone_df.drop('Camera', axis=1, inplace=True)


In [960]:
phone_df['screen_width'] = phone_df['Screen_resolution'].str.extract(r'(\d+) x \d+').astype(float)
phone_df['screen_height'] = phone_df['Screen_resolution'].str.extract(r'\d+ x (\d+)').astype(float)
phone_df.drop('Screen_resolution', axis=1, inplace=True)


In [961]:
phone_df.dropna(subset=['Processor'], inplace=True)
phone_df.dropna(subset=['Android_version'], inplace=True)


In [962]:
phone_df = phone_df.reset_index(drop=True)
phone_df['Android_version'] = phone_df['Android_version'].apply(clean)
phone_df = phone_df.drop(phone_df[phone_df.Android_version > 14].index)
phone_df = phone_df.drop(phone_df[phone_df.Android_version < 6].index)


In [963]:
phone_df = phone_df.drop(phone_df[phone_df.Inbuilt_memory < 16].index)


In [964]:
phone_df['Processor_Speed'] = phone_df['Processor'].apply(extract_processor_speed)
phone_df.drop('Processor', axis=1, inplace=True)


In [965]:
dummies = pd.get_dummies(phone_df.company , drop_first=True)
phone_df = pd.concat([phone_df, dummies], axis=1)


In [966]:
phone_df['Price'] = phone_df['Price'].str.replace(',', '').astype(int)


In [967]:
phone_df['Price_EUR'] = round(phone_df['Price'] / 90, 2)
phone_df.drop('Price', axis=1, inplace=True)
phone_df.drop('company', axis=1, inplace=True)


In [968]:
phone_df = phone_df.fillna(0)


# Training Model

In [969]:
from sklearn.model_selection import train_test_split, cross_val_score
import numpy as np
from sklearn.ensemble import RandomForestRegressor



In [970]:
X = phone_df.drop('Price_EUR', axis='columns')
y = phone_df['Price_EUR']


In [971]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4,random_state=10)


In [972]:
model = RandomForestRegressor(bootstrap=False,
                                max_depth = 20,
                                max_features='log2',
                                min_samples_leaf = 1,
                                min_samples_split = 10,
                                n_estimators = 10).fit(X_train, y_train)


In [973]:
model.score(X_test, y_test)


0.8580374341460292

* Saving Model

In [975]:
import pickle

pickle.dump(model, open('phone_price_model.sav', 'wb'))


* Saving Columns

In [976]:
import json
columns = {
    'data_columns' : [col.lower() for col in X.columns]
}
with open("columns.json", 'w') as f:
    f.write(json.dumps(columns)) 
