<a href="https://colab.research.google.com/github/Grashch/Data-science/blob/main/TreeRegressor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Read the ratings table, swap the multi-line CSV headers for single-line
# names, and turn 'Cocoa Percent' into a float column.
data = (
    pd.read_csv('/content/flavors_of_cacao.csv')
    .rename(columns={
        'Company\xa0\n(Maker-if known)': 'Company (Maker-if known)',
        'Specific Bean Origin\nor Bar Name': 'Specific Bean Origin or Bar Name',
        'Review\nDate': 'Review Date',
        'Cocoa\nPercent': 'Cocoa Percent',
        'Company\nLocation': 'Company Location',
        'Bean\nType': 'Bean Type',
        'Broad Bean\nOrigin': 'Broad Bean Origin',
    })
    .assign(**{
        # Remove the '%' character from every entry, then cast to float.
        'Cocoa Percent': lambda frame: frame['Cocoa Percent'].str.replace('%', '').astype(float),
    })
)
data.head()

Unnamed: 0,Company (Maker-if known),Specific Bean Origin or Bar Name,REF,Review Date,Cocoa Percent,Company Location,Rating,Bean Type,Broad Bean Origin
0,A. Morin,Agua Grande,1876,2016,63.0,France,3.75,,Sao Tome
1,A. Morin,Kpime,1676,2015,70.0,France,2.75,,Togo
2,A. Morin,Atsane,1676,2015,70.0,France,3.0,,Togo
3,A. Morin,Akata,1680,2015,70.0,France,3.5,,Togo
4,A. Morin,Quilla,1704,2015,70.0,France,3.5,,Peru


In [4]:
# Fraction of missing entries in each column.
data.isnull().mean()

Unnamed: 0,0
Company (Maker-if known),0.0
Specific Bean Origin or Bar Name,0.0
REF,0.0
Review Date,0.0
Cocoa Percent,0.0
Company Location,0.0
Rating,0.0
Bean Type,0.000557
Broad Bean Origin,0.000557


In [5]:
# Start with empty feature lists; they are filled in by a later cell.
categorical_features, numerical_features = [], []

# Show which columns are stored with the generic "object" dtype.
[name for name, dtype in data.dtypes.items() if dtype == "object"]

['Company (Maker-if known)',
 'Specific Bean Origin or Bar Name',
 'Company Location',
 'Bean Type',
 'Broad Bean Origin']

In [6]:
# Number of rows per chocolate maker.
company_stat = data.loc[:, 'Company (Maker-if known)'].value_counts()
company_stat

Unnamed: 0_level_0,count
Company (Maker-if known),Unnamed: 1_level_1
Soma,47
Bonnat,27
Fresco,26
Pralus,25
A. Morin,23
...,...
Svenska Kakaobolaget,1
Baravelli's,1
Urzi,1
Vietcacao (A. Morin),1


In [7]:
# Keep a maker's name only when it has at least 18 rows in the data;
# every other maker is pooled under the single label 'Other'.
data['company'] = [
    maker if company_stat[maker] >= 18 else 'Other'
    for maker in data['Company (Maker-if known)']
]
data['company'].value_counts()

Unnamed: 0_level_0,count
company,Unnamed: 1_level_1
Other,1505
Soma,47
Bonnat,27
Fresco,26
Pralus,25
A. Morin,23
Arete,22
Guittard,22
Domori,22
Valrhona,21


In [8]:
# Frequency tables for the remaining text columns.
origin_stat, location_stat = (
    data[column_name].value_counts()
    for column_name in ('Specific Bean Origin or Bar Name', 'Company Location')
)
# For these two columns missing values are counted as their own entry.
type_stat, bean_stat = (
    data[column_name].value_counts(dropna=False)
    for column_name in ('Bean Type', 'Broad Bean Origin')
)

In [9]:
def lump_rare(column, counts, min_count):
    """Return `column` with infrequent values replaced by 'Other'.

    Parameters
    ----------
    column : pd.Series
        Values to clean up.
    counts : pd.Series
        Frequency table indexed by value (e.g. the result of
        ``value_counts``).
    min_count : int
        Values whose count in `counts` is below this threshold are replaced.

    Returns
    -------
    pd.Series
        Same index as `column`; every entry that is not one of the frequent
        values (including values absent from `counts`) becomes 'Other'.
    """
    frequent_values = counts[counts >= min_count].index
    return column.where(column.isin(frequent_values), 'Other')


# One rule for every column instead of four hand-copied lambdas.
data['origin'] = lump_rare(data['Specific Bean Origin or Bar Name'], origin_stat, 18)
data['location'] = lump_rare(data['Company Location'], location_stat, 18)
# NOTE(review): this threshold is 20 while the other columns use 18 -
# kept as in the original; confirm that the difference is intentional.
data['bean_type'] = lump_rare(data['Bean Type'], type_stat, 20)
data['bean_origin'] = lump_rare(data['Broad Bean Origin'], bean_stat, 18)

In [10]:
# Assign the lists outright instead of calling .extend(): re-running this
# cell then cannot append the same feature names a second time (which would
# duplicate columns further down).
categorical_features = ['company', 'origin', 'location', 'bean_type', 'bean_origin']
numerical_features = ['REF', 'Review Date', 'Cocoa Percent']

In [11]:
# Total number of distinct values across all categorical features.
int(data[categorical_features].nunique().sum())

70

In [12]:
from sklearn.preprocessing import OneHotEncoder

# Learn the set of categories present in each categorical feature.
encoder = OneHotEncoder().fit(data[categorical_features])
encoder

In [13]:
# Column labels for the encoded matrix, written as "<feature>:<value>" and
# listed feature by feature in the order of encoder.categories_.
categories = [
    f'{feature_name}:{category}'
    for feature_name, feature_categories in zip(categorical_features, encoder.categories_)
    for category in feature_categories
]

categories[:5]

['company:A. Morin',
 'company:Arete',
 'company:Bonnat',
 'company:Coppeneur',
 'company:Domori']

In [14]:
# Store the dense one-hot matrix in `data`, one column per label in `categories`.
data[categories] = encoder.transform(data[categorical_features]).toarray()
data

Unnamed: 0,Company (Maker-if known),Specific Bean Origin or Bar Name,REF,Review Date,Cocoa Percent,Company Location,Rating,Bean Type,Broad Bean Origin,company,...,bean_origin:Mexico,bean_origin:Nicaragua,bean_origin:Other,bean_origin:Papua New Guinea,bean_origin:Peru,bean_origin:Tanzania,bean_origin:Trinidad,bean_origin:Venezuela,bean_origin:Vietnam,bean_origin:
0,A. Morin,Agua Grande,1876,2016,63.0,France,3.75,,Sao Tome,A. Morin,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,A. Morin,Kpime,1676,2015,70.0,France,2.75,,Togo,A. Morin,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,A. Morin,Atsane,1676,2015,70.0,France,3.00,,Togo,A. Morin,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,A. Morin,Akata,1680,2015,70.0,France,3.50,,Togo,A. Morin,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,A. Morin,Quilla,1704,2015,70.0,France,3.50,,Peru,A. Morin,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1790,Zotter,Peru,647,2011,70.0,Austria,3.75,,Peru,Other,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1791,Zotter,Congo,749,2011,65.0,Austria,3.00,Forastero,Congo,Other,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1792,Zotter,Kerala State,749,2011,65.0,Austria,3.50,Forastero,India,Other,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1793,Zotter,Kerala State,781,2011,62.0,Austria,3.25,,India,Other,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
from sklearn.model_selection import train_test_split

# Model inputs: the numeric columns plus the one-hot indicator columns.
features = numerical_features + categories

X = data[features]
y = data['Rating']

# Hold out 30% of the rows for testing, stratified on the rating values.
# random_state is fixed so the split (and every score computed from it) is
# the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

In [28]:
from sklearn.tree import DecisionTreeRegressor

# Fix random_state so that repeated fits on the same training data produce
# the same tree; without it the fitted model can differ between runs.
tree = DecisionTreeRegressor(random_state=42)
tree.fit(X_train, y_train)

In [16]:
from sklearn.

def get_score(model):