# Building Project Usng Pipline

In [1]:
# Data libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Modeling
from sklearn.ensemble import RandomForestRegressor
# from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_val_score
np.random.seed(7)
import pickle as pc
%matplotlib inline

In [2]:
# Import data and clean data
data =  pd.read_csv('mobile_detail.csv')
data
data = data.drop('imgURL', axis =1)
data = data.drop('corpus', axis =1)
# data = data.drop('mobile_name', axis =1)
data.columns

Index(['ratings', 'price', 'brand', 'mobile_name', 'color', 'storage',
       'system', 'processor_type', 'resolution', 'size'],
      dtype='object')

In [3]:
data.isnull().sum()
data.dropna(subset=['processor_type'], inplace=True)
data.dropna(subset=['size'], inplace=True)

In [4]:
data

Unnamed: 0,ratings,price,brand,mobile_name,color,storage,system,processor_type,resolution,size
0,4.2,23999,redmi,Note 12 Pro 5G,black,128,android,mediatek,Full HD,6.7
1,4.5,20999,oppo,F11 Pro,green,128,android,mediatek,Full HD,6.5
3,4.1,21999,oneplus,Nord CE 5G,blue,256,android,qualcomm,Full HD,6.4
4,4.6,3537,apple,iPhone 13 mini,blue,128,ios,apple,Full HD,5.4
5,4.5,5537,apple,iPhone 6s Plus,gold,64,ios,apple,4K,5.5
...,...,...,...,...,...,...,...,...,...,...
1977,4.3,9490,vivo,Y3s,blue,32,android,mediatek,HD,6.5
1978,4.5,11464,apple,iPhone 8,grey,256,ios,apple,HD,4.7
1980,4.1,15999,motorola,g72,grey,128,android,mediatek,Full HD,6.5
1981,4.3,11999,redmi,Note 9,black,64,android,mediatek,Full HD,6.5


In [5]:
print(data.dtypes)
data.isnull().sum()
data.columns

ratings           float64
price               int64
brand              object
mobile_name        object
color              object
storage             int64
system             object
processor_type     object
resolution         object
size              float64
dtype: object


Index(['ratings', 'price', 'brand', 'mobile_name', 'color', 'storage',
       'system', 'processor_type', 'resolution', 'size'],
      dtype='object')

In [6]:
# dfine features and transformers from piplines
cat_feature = ["brand","mobile_name","color","system","resolution"]
cat_transform = Pipeline(steps=[
    # ("imputer",SimpleImputer(strategy='constant', fill_value='missing')),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])
# # for imputing values which is not present in my data
# sim_feature = ['Sim_Cards']
# sim_feature = pipline(steps = [
#     ("imputer", SimpleImputer(strategy= 'c', fill_value = 4))
# ])
# num_feature = ["Memory(kb)"]
# num_feature = pipline(steps = [
#     ("imputer", SimpleImputer(strategy='mean'))
# ])

In [7]:
# Defining variable for upper part
np.random.seed(7)
preprocessing = ColumnTransformer(
    transformers=[
        ("cat", cat_transform, cat_feature),
        # ('sim', sim_transform, sim_feature),
        # ('num', num_transform, num_features)
    ]
)
# creating new pipline
model_pipeline = Pipeline(steps = [
    ("preprocessing", preprocessing),
    ("model", RandomForestRegressor())
])
# now split data
X = data.drop('size', axis = 1)
y = data['size']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size= 0.2)
# using fit now run and evaluate
model_pipeline.fit(X_train,y_train)
model_pipeline.score(X_test, y_test)

0.8317073684313944

In [8]:
# tunning hyper parametrs
# Dictionary 
hparams3 = {'model__max_depth': [10, 20, 30], # have to fit these params in upper cell "model"
            # 'preprocessing__num__imputer__strategy':["mean","median"]
           'model__n_estimators': [100, 200, 500],
           'model__min_samples_leaf':  [1, 2],
           'model__min_samples_split': [2,4],
           'model__max_features': ['sqrt', 'log2', None]
           }
my_gscv_model = GridSearchCV(model_pipeline, hparams3, cv=5, verbose=2)
my_gscv_model.fit(X_train, y_train)

Fitting 5 folds for each of 144 candidates, totalling 720 fits
[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=10; total time=   0.0s
[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=10; total time=   0.0s
[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=10; total time=   0.0s
[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=10; total time=   0.0s
[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=10; total time=   0.0s
[CV] END model__max_depth=10, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100; total time=   0.1s
[CV] END mod

  _data = np.array(data, dtype=dtype, copy=copy,


In [9]:
my_gscv_model.score(X_test, y_test)

0.8065912329225666