# Data Pre-Processing

Data is taken from: https://www.kaggle.com/knightbearr/pizza-price-prediction

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor


## Importing the dataset

In [None]:
# Load the raw pizza price table. The relative `../input/...` path looks like a
# Kaggle kernel layout — adjust it when running elsewhere.
df = pd.read_csv('../input/pizza-price-prediction/pizza_v1.csv')


In [None]:
# Preview the first rows to see the raw value formats (e.g. prices are strings like "Rp235,000").
df.head()


Unnamed: 0,company,price_rupiah,diameter,topping,variant,size,extra_sauce,extra_cheese
0,A,"Rp235,000",22.0,chicken,double_signature,jumbo,yes,yes
1,A,"Rp198,000",20.0,papperoni,double_signature,jumbo,yes,yes
2,A,"Rp120,000",16.0,mushrooms,double_signature,reguler,yes,yes
3,A,"Rp155,000",14.0,smoked beef,double_signature,reguler,yes,no
4,A,"Rp248,000",18.0,mozzarella,double_signature,jumbo,yes,no


In [None]:
# Column dtypes and non-null counts.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129 entries, 0 to 128
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   company       129 non-null    object 
 1   price_rupiah  129 non-null    object 
 2   diameter      129 non-null    float64
 3   topping       129 non-null    object 
 4   variant       129 non-null    object 
 5   size          129 non-null    object 
 6   extra_sauce   129 non-null    object 
 7   extra_cheese  129 non-null    object 
dtypes: float64(1), object(7)
memory usage: 8.2+ KB


In [None]:
# Count missing values per column.
df.isnull().sum()


company         0
price_rupiah    0
diameter        0
topping         0
variant         0
size            0
extra_sauce     0
extra_cheese    0
dtype: int64

In [None]:
# Summary statistics; with default arguments only numeric columns are summarised.
df.describe()

Unnamed: 0,diameter
count,129.0
mean,12.976744
std,3.272674
min,8.0
25%,12.0
50%,12.0
75%,14.0
max,22.0


In [None]:
# Split the columns by dtype: `object` columns on one side, everything else on the other.
df_categorical = df.select_dtypes(include='object')
df_numerical = df.select_dtypes(exclude='object')


In [None]:
# Per-column dtypes of the raw frame.
df.dtypes


company          object
price_rupiah     object
diameter        float64
topping          object
variant          object
size             object
extra_sauce      object
extra_cheese     object
dtype: object

In [None]:
# A notebook cell only renders its last expression, so the company counts were
# computed and then silently discarded. `display` (provided by the IPython
# kernel) renders them explicitly; the price counts remain the cell's output.
display(df['company'].value_counts())
df["price_rupiah"].value_counts()


Rp72,000     11
Rp90,000      8
Rp46,000      7
Rp75,000      5
Rp49,000      5
Rp83,000      5
Rp96,000      5
Rp149,000     4
Rp99,000      4
Rp110,000     4
Rp70,000      4
Rp140,000     4
Rp39,000      4
Rp23,500      3
Rp115,000     3
Rp93,000      3
Rp98,000      3
Rp35,000      3
Rp44,000      3
Rp69,000      3
Rp31,000      3
Rp78,000      3
Rp60,000      3
Rp114,000     2
Rp105,000     2
Rp76,000      2
Rp123,000     2
Rp33,000      2
Rp126,500     2
Rp230,000     2
Rp188,000     2
Rp119,000     2
Rp54,000      1
Rp198,000     1
Rp120,000     1
Rp155,000     1
Rp32,000      1
Rp248,000     1
Rp51,000      1
Rp84,000      1
Rp28,000      1
Rp92,000      1
Rp235,000     1
Name: price_rupiah, dtype: int64

In [None]:
# Capitalise the column labels and rename `price_rupiah` to `Cost`.
# `topping` is not part of the mapping, so it keeps its lowercase label.
df = df.rename(
    columns={
        'price_rupiah': 'Cost',
        'company': 'Company',
        'diameter': 'Diameter',
        'variant': 'Variant',
        'size': 'Size',
        'extra_sauce': 'Extra_sauce',
        'extra_cheese': 'Extra_cheese',
    }
)


In [None]:
# Categorical columns to one-hot encode, in encoding order.
encoding_columns = [
    'Company',
    'topping',
    'Variant',
    'Size',
    'Extra_sauce',
    'Extra_cheese',
]


In [None]:
def refining_cost(col, df):
    """Strip the ``Rp`` prefix and comma separators from a price column.

    ``df`` is modified in place (``df[col]`` is reassigned) and nothing is
    returned. String values such as ``'Rp235,000'`` become ``'235000'``; the
    numeric cast is left to the caller.

    Non-string values are passed through unchanged, so re-running the cell
    after the column has already been cast to a numeric dtype no longer
    raises ``AttributeError``.

    Parameters
    ----------
    col : str
        Label of the column holding the formatted prices.
    df : pandas.DataFrame
        Frame that contains ``col``.
    """
    def _strip_format(value):
        # Only strings carry the currency formatting; leave anything else alone.
        if isinstance(value, str):
            return value.replace('Rp', '').replace(',', '')
        return value

    df[col] = df[col].map(_strip_format)


In [None]:
def categorical_encoding(value, df):
    """One-hot encode the column(s) named by ``value`` and return the result.

    A fresh ``category_encoders.OneHotEncoder`` is fitted on ``df`` each time
    this is called. ``use_cat_names=True`` makes the generated columns carry
    the category value in their label (e.g. ``Company_A``).

    Parameters
    ----------
    value : str
        Column to encode; forwarded as the encoder's ``cols`` argument.
    df : pandas.DataFrame
        Frame containing the column.

    Returns
    -------
    pandas.DataFrame
        The output of ``fit_transform`` on ``df``.
    """
    encoder = ce.OneHotEncoder(
        cols=value,
        use_cat_names=True,
        return_df=True,
    )
    return encoder.fit_transform(df)


In [None]:
# One-hot encode each categorical column in turn. Iterating over
# `encoding_columns` replaces six copy-pasted calls and keeps the list of
# encoded columns in a single place (same columns, same order).
for column in encoding_columns:
    df = categorical_encoding(column, df)


  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


In [None]:
# Remove the currency formatting from the target, then make it numeric.
refining_cost(col='Cost', df=df)
df = df.astype({'Cost': 'float64'})
df.dtypes


Company_A                      int64
Company_B                      int64
Company_C                      int64
Company_D                      int64
Company_E                      int64
Cost                         float64
Diameter                     float64
topping_chicken                int64
topping_papperoni              int64
topping_mushrooms              int64
topping_smoked beef            int64
topping_mozzarella             int64
topping_black papper           int64
topping_tuna                   int64
topping_meat                   int64
topping_sausage                int64
topping_onion                  int64
topping_vegetables             int64
topping_beef                   int64
Variant_double_signature       int64
Variant_american_favorite      int64
Variant_super_supreme          int64
Variant_meat_lovers            int64
Variant_double_mix             int64
Variant_classic                int64
Variant_crunchy                int64
Variant_new_york               int64
V

In [None]:
# Separate the regression target (`Cost`) from the feature matrix.
y = df['Cost']
X = df.drop(columns=['Cost'])
print(X.shape)


(129, 48)


## Splitting the dataset into the Training set and Test set

In [None]:
# Hold out 20% of the rows for evaluation (fixed seed), then convert every
# split to a plain NumPy array before it is passed to the estimators.
X_train, X_test, y_train, y_test = map(
    np.array,
    train_test_split(X, y, test_size=0.2, random_state=0),
)


# Modeling

## Multiple Linear Regression

In [None]:
# Ordinary least squares on all encoded features.
mul_regressor = LinearRegression()
mul_regressor.fit(X=X_train, y=y_train)

LinearRegression()

## Decision Tree Regression

In [None]:
# Single regression tree; the seed is fixed so the fit is repeatable.
decision_regressor = DecisionTreeRegressor(random_state=0)
decision_regressor.fit(X=X_train, y=y_train)

DecisionTreeRegressor(random_state=0)

## Polynomial Regression

In [None]:
# Expand the training features to all polynomial terms up to degree 4, then
# fit a linear model on the expanded matrix.
# NOTE(review): X has 48 columns, so degree 4 yields C(52, 4) = 270,725 terms —
# far more columns than rows. Consider a lower degree; confirm before changing.
poly_reg = PolynomialFeatures(degree=4)
X_poly = poly_reg.fit_transform(X=X_train)
poly_regressor = LinearRegression()
poly_regressor.fit(X=X_poly, y=y_train)

LinearRegression()

## Random Forest Regression

In [None]:
# Ensemble of 10 trees; the seed is fixed so the fit is repeatable.
forest_regressor = RandomForestRegressor(random_state=0, n_estimators=10)
forest_regressor.fit(X=X_train, y=y_train)


RandomForestRegressor(n_estimators=10, random_state=0)

# R Squared Score

In [None]:

# R^2 of every fitted model on the held-out test set.
mul_score = r2_score(y_test, mul_regressor.predict(X_test))
decision_score = r2_score(y_test, decision_regressor.predict(X_test))
# The polynomial model needs the test features expanded by the same transformer.
poly_score = r2_score(
    y_test, poly_regressor.predict(poly_reg.transform(X_test)))
forest_score = r2_score(y_test, forest_regressor.predict(X_test))

# Map name -> score rather than score -> name: with float scores as keys, two
# models with the same R^2 would overwrite each other and one would silently
# drop out of the comparison.
var = {"mul_score": mul_score, "decision_score": decision_score,
       "poly_score": poly_score, "forest_score": forest_score}
best_model = max(var, key=var.get)
print("The Regression model with the highest score is:", best_model, "=", var[best_model])


The Regression model with the highest score is: forest_score = 0.9412576941774162
