## Library

In [26]:
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, plot_confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

## File "chocolate_bars.csv"

In [27]:
data = pd.read_csv('chocolate_bars.csv')
data.head()

Unnamed: 0,id,manufacturer,company_location,year_reviewed,bean_origin,bar_name,cocoa_percent,num_ingredients,ingredients,review,rating
0,2454,5150,U.S.A.,2019,Tanzania,"Kokoa Kamili, batch 1",76.0,3.0,"B,S,C","rich cocoa, fatty, bready",3.25
1,2458,5150,U.S.A.,2019,Dominican Republic,"Zorzal, batch 1",76.0,3.0,"B,S,C","cocoa, vegetal, savory",3.5
2,2454,5150,U.S.A.,2019,Madagascar,"Bejofo Estate, batch 1",76.0,3.0,"B,S,C","cocoa, blackberry, full body",3.75
3,2542,5150,U.S.A.,2021,Fiji,"Matasawalevu, batch 1",68.0,3.0,"B,S,C","chewy, off, rubbery",3.0
4,2546,5150,U.S.A.,2021,Venezuela,"Sur del Lago, batch 1",72.0,3.0,"B,S,C","fatty, earthy, moss, nutty,chalky",3.0


## INFO DATA

In [28]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2530 entries, 0 to 2529
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                2530 non-null   int64  
 1   manufacturer      2530 non-null   object 
 2   company_location  2530 non-null   object 
 3   year_reviewed     2530 non-null   int64  
 4   bean_origin       2530 non-null   object 
 5   bar_name          2530 non-null   object 
 6   cocoa_percent     2530 non-null   float64
 7   num_ingredients   2443 non-null   float64
 8   ingredients       2443 non-null   object 
 9   review            2530 non-null   object 
 10  rating            2530 non-null   float64
dtypes: float64(3), int64(2), object(6)
memory usage: 217.5+ KB


In [29]:
data.mean()

  data.mean()


id                 1429.800791
year_reviewed      2014.374308
cocoa_percent        71.639723
num_ingredients       3.041343
rating                3.196344
dtype: float64

In [30]:
data = data.fillna(data.mean())

  data = data.fillna(data.mean())


In [46]:
data.isna().sum()

id                   0
manufacturer         0
company_location     0
year_reviewed        0
bean_origin          0
bar_name             0
cocoa_percent        0
num_ingredients      0
ingredients         87
review               0
rating               0
dtype: int64

In [31]:
data['bean_origin'].value_counts()

Venezuela                253
Peru                     244
Dominican Republic       226
Ecuador                  219
Madagascar               177
                        ... 
Sumatra                    1
St.Vincent-Grenadines      1
Martinique                 1
Burma                      1
DR Congo                   1
Name: bean_origin, Length: 62, dtype: int64

In [32]:
data.duplicated().sum()

0

## Data Cleaning Nilai Kategorical

In [33]:
data['bean_origin'].value_counts()

Venezuela                253
Peru                     244
Dominican Republic       226
Ecuador                  219
Madagascar               177
                        ... 
Sumatra                    1
St.Vincent-Grenadines      1
Martinique                 1
Burma                      1
DR Congo                   1
Name: bean_origin, Length: 62, dtype: int64

In [34]:
data.drop(data[data['bean_origin'] == 'Sumatra'].index, inplace=True)
data.drop(data[data['bean_origin'] == 'St.Vincent-Grenadines'].index, inplace=True)
data.drop(data[data['bean_origin'] == 'Martinique'].index, inplace=True)
data.drop(data[data['bean_origin'] == 'Burma'].index, inplace=True)
data.drop(data[data['bean_origin'] == 'DR Congo '].index, inplace=True)

In [35]:
data['bean_origin'].value_counts()

Venezuela              253
Peru                   244
Dominican Republic     226
Ecuador                219
Madagascar             177
Blend                  156
Nicaragua              100
Bolivia                 80
Tanzania                79
Colombia                79
Brazil                  78
Belize                  76
Vietnam                 73
Guatemala               62
Mexico                  55
Papua New Guinea        50
Costa Rica              43
Trinidad                42
Ghana                   41
India                   35
U.S.A.                  33
Haiti                   30
Honduras                25
Jamaica                 24
Philippines             24
Indonesia               20
Grenada                 19
Uganda                  19
Fiji                    16
Sao Tome                14
Vanuatu                 13
Cuba                    12
Congo                   11
Solomon Islands         10
St. Lucia               10
Panama                   9
Malaysia                 8
P

In [36]:
x = data[['manufacturer']]
y = data['id']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3)

In [37]:
x_train.to_csv('x_train.csv')
y_train.to_csv('y_train.csv')
x_test.to_csv('x_test.csv')
y_test.to_csv('y_test.csv')

In [38]:
print(f' x training {len(x_train)}',
      f' y training {len(y_train)}',
      f' x testing {len(x_test)}',
      f' y testing {len(y_test)}',
     sep='\n')

 x training 1768
 y training 1768
 x testing 758
 y testing 758


## DATA TRANSFORMASI

## Normalisasi

In [42]:
dataset_normalisasi = data.copy()

In [43]:
scaler = MinMaxScaler()

normalize_dataset = scaler.fit_transform(dataset_normalisasi[['id', 'year_reviewed']])

normalize_dataset = pd.DataFrame(normalize_dataset)
normalize_dataset.rename(columns={ 0 :'id', 1 : 'year_reviewed'}, inplace=True)

normalize_dataset.head(5)

Unnamed: 0,id,year_reviewed
0,0.904692,0.866667
1,0.906169,0.866667
2,0.904692,0.866667
3,0.9372,1.0
4,0.938678,1.0


## Data Cleaning

In [44]:
dataset_normalisasi = data.copy()

In [41]:
imputer_mean = SimpleImputer(strategy='mean')
imputer_median = SimpleImputer(strategy='median')
imputer_modus = SimpleImputer(strategy='most_frequent')

In [48]:
dataset_normalisasi['id'] = imputer_modus.fit_transform(dataset_normalisasi[['id']])

dataset_normalisasi.isna().sum()

id                   0
manufacturer         0
company_location     0
year_reviewed        0
bean_origin          0
bar_name             0
cocoa_percent        0
num_ingredients      0
ingredients         87
review               0
rating               0
dtype: int64

In [49]:
dataset_normalisasi['year_reviewed'] = imputer_mean.fit_transform(dataset_normalisasi[['year_reviewed']])

dataset_normalisasi.isna().sum()

id                   0
manufacturer         0
company_location     0
year_reviewed        0
bean_origin          0
bar_name             0
cocoa_percent        0
num_ingredients      0
ingredients         87
review               0
rating               0
dtype: int64

In [50]:
dataset_normalisasi.dropna(inplace=True)

dataset_normalisasi.isna().sum()

id                  0
manufacturer        0
company_location    0
year_reviewed       0
bean_origin         0
bar_name            0
cocoa_percent       0
num_ingredients     0
ingredients         0
review              0
rating              0
dtype: int64

In [51]:
len(dataset_normalisasi)

2439

## Encoding

In [52]:
le = LabelEncoder()

data['ingredients'] = le.fit_transform(data['ingredients'])

In [53]:
data.head()

Unnamed: 0,id,manufacturer,company_location,year_reviewed,bean_origin,bar_name,cocoa_percent,num_ingredients,ingredients,review,rating
0,2454,5150,U.S.A.,2019,Tanzania,"Kokoa Kamili, batch 1",76.0,3.0,10,"rich cocoa, fatty, bready",3.25
1,2458,5150,U.S.A.,2019,Dominican Republic,"Zorzal, batch 1",76.0,3.0,10,"cocoa, vegetal, savory",3.5
2,2454,5150,U.S.A.,2019,Madagascar,"Bejofo Estate, batch 1",76.0,3.0,10,"cocoa, blackberry, full body",3.75
3,2542,5150,U.S.A.,2021,Fiji,"Matasawalevu, batch 1",68.0,3.0,10,"chewy, off, rubbery",3.0
4,2546,5150,U.S.A.,2021,Venezuela,"Sur del Lago, batch 1",72.0,3.0,10,"fatty, earthy, moss, nutty,chalky",3.0


In [62]:
data.head()

Unnamed: 0,manufacturer,company_location,year_reviewed,bean_origin,bar_name,cocoa_percent,num_ingredients,ingredients,review,rating
0,5150,U.S.A.,2019,Tanzania,"Kokoa Kamili, batch 1",76.0,3.0,10,"rich cocoa, fatty, bready",3.25
1,5150,U.S.A.,2019,Dominican Republic,"Zorzal, batch 1",76.0,3.0,10,"cocoa, vegetal, savory",3.5
2,5150,U.S.A.,2019,Madagascar,"Bejofo Estate, batch 1",76.0,3.0,10,"cocoa, blackberry, full body",3.75
3,5150,U.S.A.,2021,Fiji,"Matasawalevu, batch 1",68.0,3.0,10,"chewy, off, rubbery",3.0
4,5150,U.S.A.,2021,Venezuela,"Sur del Lago, batch 1",72.0,3.0,10,"fatty, earthy, moss, nutty,chalky",3.0


In [64]:
data['bean_origin'] = le.fit_transform(data['bean_origin'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['bean_origin'] = le.fit_transform(data['bean_origin'])


In [65]:
data.head()

Unnamed: 0,manufacturer,company_location,year_reviewed,bean_origin,bar_name,cocoa_percent,num_ingredients,ingredients,review,rating
0,5150,U.S.A.,2019,48,"Kokoa Kamili, batch 1",76.0,3.0,10,"rich cocoa, fatty, bready",3.25
1,5150,U.S.A.,2019,12,"Zorzal, batch 1",76.0,3.0,10,"cocoa, vegetal, savory",3.5
2,5150,U.S.A.,2019,27,"Bejofo Estate, batch 1",76.0,3.0,10,"cocoa, blackberry, full body",3.75
3,5150,U.S.A.,2021,15,"Matasawalevu, batch 1",68.0,3.0,10,"chewy, off, rubbery",3.0
4,5150,U.S.A.,2021,56,"Sur del Lago, batch 1",72.0,3.0,10,"fatty, earthy, moss, nutty,chalky",3.0


In [66]:
data['company_location'] = le.fit_transform(data['company_location'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['company_location'] = le.fit_transform(data['company_location'])


In [67]:
data.head()

Unnamed: 0,manufacturer,company_location,year_reviewed,bean_origin,bar_name,cocoa_percent,num_ingredients,ingredients,review,rating
0,5150,60,2019,48,"Kokoa Kamili, batch 1",76.0,3.0,10,"rich cocoa, fatty, bready",3.25
1,5150,60,2019,12,"Zorzal, batch 1",76.0,3.0,10,"cocoa, vegetal, savory",3.5
2,5150,60,2019,27,"Bejofo Estate, batch 1",76.0,3.0,10,"cocoa, blackberry, full body",3.75
3,5150,60,2021,15,"Matasawalevu, batch 1",68.0,3.0,10,"chewy, off, rubbery",3.0
4,5150,60,2021,56,"Sur del Lago, batch 1",72.0,3.0,10,"fatty, earthy, moss, nutty,chalky",3.0


In [68]:
data['bar_name'] = le.fit_transform(data['bar_name'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['bar_name'] = le.fit_transform(data['bar_name'])


In [69]:
data.head()

Unnamed: 0,manufacturer,company_location,year_reviewed,bean_origin,bar_name,cocoa_percent,num_ingredients,ingredients,review,rating
0,5150,60,2019,48,748,76.0,3.0,10,"rich cocoa, fatty, bready",3.25
1,5150,60,2019,12,1593,76.0,3.0,10,"cocoa, vegetal, savory",3.5
2,5150,60,2019,27,162,76.0,3.0,10,"cocoa, blackberry, full body",3.75
3,5150,60,2021,15,933,68.0,3.0,10,"chewy, off, rubbery",3.0
4,5150,60,2021,56,1414,72.0,3.0,10,"fatty, earthy, moss, nutty,chalky",3.0


In [70]:
data['review'] = le.fit_transform(data['review'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['review'] = le.fit_transform(data['review'])


In [71]:
data.head()

Unnamed: 0,manufacturer,company_location,year_reviewed,bean_origin,bar_name,cocoa_percent,num_ingredients,ingredients,review,rating
0,5150,60,2019,48,748,76.0,3.0,10,1678,3.25
1,5150,60,2019,12,1593,76.0,3.0,10,319,3.5
2,5150,60,2019,27,162,76.0,3.0,10,288,3.75
3,5150,60,2021,15,933,68.0,3.0,10,229,3.0
4,5150,60,2021,56,1414,72.0,3.0,10,742,3.0
