In [None]:
!pip install numpy
!pip install pandas

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the mobile-phone price dataset directly from the GitHub repository.
# NOTE(review): decoding as latin-1 appears to turn a UTF-8 byte-order mark into the
# characters 'ï»¿' at the start of the first column name; a later cell renames that column.
url = "https://raw.githubusercontent.com/MKDpahasara/mobile-price-prediction-model/main/Mobile%20phone%20price.csv"
df = pd.read_csv(url, encoding='latin-1')
df.head()

In [None]:
df.shape

In [None]:
df.isnull().sum()

In [None]:
df.info()

## Rename the first column

In [None]:
# The first header comes out of the latin-1 decode with byte-order-mark residue in front
# of it ('ï»¿BRAND'). Renaming by position avoids hard-coding that garbled text and keeps
# the cell safe to re-run (the first column is simply named 'Brand' again).
df = df.rename(columns={df.columns[0]: 'Brand'})
# Preview a few rows instead of rendering the whole frame.
df.head()

## Create a new column with the main camera resolution (MP)

In [None]:
import re

def extract_main_camera(row):
    """Return the resolution of the first parseable camera in ``row['Camera (MP)']``.

    The cell text is split on '+' and the entries are checked left to right.
    An entry is parseable when it is a number with an optional decimal part,
    optionally followed by an "MP" suffix (any case, optional space before it),
    e.g. "48", "12.2", "48MP", "12.2 MP".

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'Camera (MP)'`` (a DataFrame row or a dict).

    Returns
    -------
    int or None
        The resolution truncated to a whole number of megapixels, or ``None``
        when no entry can be parsed (including a missing value).
    """
    # strip() so leading/trailing blanks in the cell do not defeat the match.
    entries = re.split(r'\s*\+\s*', str(row['Camera (MP)']).strip())
    for entry in entries:
        # One match per entry; covers both the plain-number and the "MP"-suffixed form,
        # including decimals with a suffix ("12.2MP"), which used to be skipped.
        match = re.fullmatch(r'(\d+(?:\.\d+)?)\s*(?:MP)?', entry, flags=re.IGNORECASE)
        if match:
            return int(float(match.group(1)))
    return None

# Run the parser once per row (axis=1); the function returns None for rows it cannot parse.
df['main_camera(MP)'] = df.apply(extract_main_camera, axis=1)



In [None]:
# Preview a few rows instead of rendering the whole frame.
df.head()

## Create new column for camera count

In [None]:
def count_cameras(row):
    """Count the '+'-separated entries in ``row['Camera (MP)']``.

    The value is converted to text first, so any value without a '+' counts as one entry.
    """
    camera_spec = str(row['Camera (MP)'])
    entries = re.split(r'\s*\+\s*', camera_spec)
    return len(entries)

# Run the counter once per row (axis=1) and store the result as a new column.
df['camera_count'] = df.apply(count_cameras, axis=1)



In [None]:
# Preview a few rows instead of rendering the whole frame.
df.head()

## Remove the "GB" suffix from the RAM and Storage columns

In [None]:
# Strip the "GB" unit and store RAM as int32.
# Casting to str first keeps the cell re-runnable: after the first run the column is
# numeric and no longer has a .str accessor. regex=False treats 'GB' as plain text.
df['RAM (GB)'] = (
    df['RAM (GB)']
    .astype(str)
    .str.replace('GB', '', regex=False)
    .str.strip()
    .astype('int32')
)


In [None]:
# Strip the "GB" unit and store storage as int32.
# Casting to str first keeps the cell re-runnable: after the first run the column is
# numeric and no longer has a .str accessor. regex=False treats 'GB' as plain text.
df['Storage(GB )'] = (
    df['Storage(GB )']
    .astype(str)
    .str.replace('GB', '', regex=False)
    .str.strip()
    .astype('int32')
)

In [None]:
# Preview a few rows instead of rendering the whole frame.
df.head()

In [None]:
# Call the method: without parentheses the cell only shows the bound-method repr,
# not the summary statistics.
df.describe()

In [None]:
df.info()

In [None]:
# Remove the currency symbol and thousands separators, then store the price as an integer.
# The pattern is a raw string so "\$" reaches the regex engine as written instead of being
# an invalid string escape (which newer Python versions warn about).
df['Price($)'] = df['Price($)'].replace(r'[\$,]', '', regex=True).astype(int)


In [None]:
df.info()

## Check the correlation between the features and the target

In [None]:
# Correlate only the numeric columns with the target. The frame still holds text columns
# here, and DataFrame.corr() raises on those with pandas >= 2.0 instead of skipping them.
df.select_dtypes(include='number').corr()['Price($)']

## Group brands with a value count below 7 into the "other" category

In [None]:
df['Brand'].value_counts()

In [None]:
def other_brands(inpt):
    """Return 'other' for the brands listed below and any other value unchanged.

    The listed brands are the ones the notebook heading describes as having
    fewer than 7 rows; the comparison is exact and case-sensitive.
    """
    grouped_brands = ('Asus', 'LG', 'Blackberry', 'Sony', 'CAT')
    return 'other' if inpt in grouped_brands else inpt
    
# Replace each brand value with the result of other_brands (element-wise).
df['Brand'] = df['Brand'].apply(other_brands)
   



In [None]:
df.head(3)

In [None]:
df['Brand'].value_counts()

In [None]:
len(df['MODEL'].value_counts())

In [None]:
df['SCREEN SIZE (Inch)'].value_counts()

In [None]:
# Drop the rows whose screen size is not a plain number.
# .copy() makes the result an independent frame, so the column assignment that follows
# does not trigger SettingWithCopyWarning.
df = df[~df['SCREEN SIZE (Inch)'].isin(['7.6 (unfolded)', '6.8 + 3.9'])].copy()

In [None]:
# The remaining values are expected to be numeric text, so the column is cast to float.
df['SCREEN SIZE (Inch)'] = df['SCREEN SIZE (Inch)'].astype(float)

In [None]:
df['SCREEN SIZE (Inch)'].value_counts()

## Finalize Dataset

In [None]:
df.head(3)

In [None]:
# Build the modelling frame: the model name and the raw camera text are left out.
cl_df = df.drop(['MODEL', 'Camera (MP)'], axis=1)
cl_df.head(2)

In [None]:
cl_df.info()

In [None]:
# Normalize the column names to snake_case.
# The method is DataFrame.rename; 'recamera_countname' does not exist and raised
# AttributeError. rename() ignores keys that are not present, so the entries for columns
# already dropped from cl_df ('MODEL', 'Camera (MP)') are harmless.
cl_df = cl_df.rename(columns={
    'Brand': 'brand',
    'MODEL': 'model',
    'Storage(GB )': 'storage_gb',
    'RAM (GB)': 'ram_gb',
    'SCREEN SIZE (Inch)': 'screen_size_inch',
    'Camera (MP)': 'camera_mp',
    'battery Capacity (mah)': 'battery_capacity_mah',
    'Price($)': 'price_usd',
    'main_camera(MP)': 'main_camera_mp',
    'camera_count': 'camera_count'
})

## One-hot encoding

In [None]:
# One-hot encode the non-numeric columns; pd.get_dummies leaves numeric columns unchanged.
cl_df = pd.get_dummies(cl_df)

In [None]:
cl_df.head(2)

In [None]:
max(cl_df.camera_count)

In [None]:
cl_df.shape

## Feature variables and target variable

In [None]:
# Target is the price column; the features are every other column.
y = cl_df['price_usd']
X = cl_df.drop(columns=['price_usd'])

In [None]:
!pip install sklearn

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. A fixed random_state makes the split, and every
# score computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
X_train.shape,X_test.shape

## Choose the best model

In [None]:
def model_acc(model):
    """Fit ``model`` on the training split and print its score on the test split.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    The printed number is whatever ``model.score`` returns; for scikit-learn
    regressors that is the R^2 coefficient of determination, not a
    classification accuracy, so the label just says "score".

    Parameters
    ----------
    model : estimator
        Any object with ``fit(X, y)`` and ``score(X, y)`` methods. It is fitted in place.
    """
    model.fit(X_train, y_train)
    acc = model.score(X_test, y_test)
    print(f'{model} model score = {acc}')

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, LinearRegression, Ridge

# Fit each candidate on the same split and print its test score for comparison.
lr = LinearRegression()
model_acc(lr)

ri = Ridge()
model_acc(ri)

la = Lasso()
model_acc(la)

# Seeded so the forest's bootstrap sampling, and therefore its score, is reproducible.
rf = RandomForestRegressor(random_state=42)
model_acc(rf)

## Hyperparameter tuning for LinearRegression model

In [None]:
from sklearn.model_selection import GridSearchCV

# 'normalize' is no longer a LinearRegression parameter (deprecated in scikit-learn 1.0,
# removed in 1.2) and GridSearchCV rejects unknown parameter names, so it is left out
# of the grid.
parameters = {
    'fit_intercept': [True, False],
    'n_jobs': [-1],
}
grid = GridSearchCV(estimator=lr, param_grid=parameters)
grid_fit = grid.fit(X_train, y_train)
# Estimator refitted on the whole training split with the best parameter combination.
best_model = grid_fit.best_estimator_
best_model

In [None]:
best_model.score(X_test,y_test)

In [None]:
import pickle

# Persist the tuned estimator to disk in binary mode.
with open('prediction_model_1.pickle', 'wb') as file:
    pickle.dump(best_model, file)

In [None]:
cl_df.head()