## **Class 26: StreamLit Coffee**

In [12]:
import os
import logging
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RepeatedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV, RidgeCV
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score
)
from sklearn.inspection import permutation_importance
logger = logging.getLogger()
logger.setLevel(logging.INFO)

### **1. Data loading**
**Objective**: Obtain the data from source and get a first glimpse of their properties and presentation

In [13]:
df_raw = pd.read_csv('../data/raw/coffee.csv')
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1339 entries, 0 to 1338
Data columns (total 44 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             1339 non-null   int64  
 1   total_cup_points       1339 non-null   float64
 2   species                1339 non-null   object 
 3   owner                  1332 non-null   object 
 4   country_of_origin      1338 non-null   object 
 5   farm_name              980 non-null    object 
 6   lot_number             276 non-null    object 
 7   mill                   1021 non-null   object 
 8   ico_number             1180 non-null   object 
 9   company                1130 non-null   object 
 10  altitude               1113 non-null   object 
 11  region                 1280 non-null   object 
 12  producer               1107 non-null   object 
 13  number_of_bags         1339 non-null   int64  
 14  bag_weight             1339 non-null   object 
 15  in_c

In [14]:
df_raw.sample(10, random_state=2025)

Unnamed: 0.1,Unnamed: 0,total_cup_points,species,owner,country_of_origin,farm_name,lot_number,mill,ico_number,company,...,color,category_two_defects,expiration,certification_body,certification_address,certification_contact,unit_of_measurement,altitude_low_meters,altitude_high_meters,altitude_mean_meters
1089,1089,80.33,Arabica,cqi taiwan icp cqi台灣合作夥伴,Taiwan,tokoffee莊園,,tokoffee莊園,,blossom valley宸嶧國際,...,Green,0,"December 26th, 2014",Blossom Valley International,fc45352eee499d8470cf94c9827922fb745bf815,de73fc9412358b523d3a641501e542f31d2668b0,m,600.0,600.0,600.0
942,942,81.42,Arabica,kurt kappeli,Mexico,various,,cafe gourmet de sierra azul sc,0016-2814-0001,globus coffee,...,Green,1,"May 1st, 2015",Specialty Coffee Association,36d0d00a3724338ba7937c52a378d085f2172daa,0878a7d4b9d35ddbf0fe2ce69a2062cceb45a660,m,1550.0,1550.0,1550.0
1241,1241,78.0,Arabica,alejandro garcia palacios,Mexico,finca la fortuna,,finca la fortuna,1506558883,industrializadora de kaffee andes s.a. de c.v.,...,Green,7,"September 11th, 2013",AMECAFE,59e396ad6e22a1c22b248f958e1da2bd8af85272,0eb4ee5b3f47b20b049548a2fd1e7d4a2b70d0a7,m,850.0,850.0,850.0
1332,1332,80.17,Robusta,andrew hetzel,India,sethuraman estates,,sethuraman estates,,"cafemakers, llc",...,Green,0,"June 20th, 2014",Specialty Coffee Association,ff7c18ad303d4b603ac3f8cff7e611ffc735e720,352d0cf7f3e9be14dad7df644ad65efc27605ae2,m,750.0,750.0,750.0
272,272,83.92,Arabica,alfredo bojalil,Mexico,finca sant rosa,,agroindustrias unidas de mexico,2222,ecomtrading,...,,2,"June 6th, 2013",AMECAFE,59e396ad6e22a1c22b248f958e1da2bd8af85272,0eb4ee5b3f47b20b049548a2fd1e7d4a2b70d0a7,m,1500.0,1500.0,1500.0
738,738,82.25,Arabica,consejo salvadoreño del café,El Salvador,zapato de mico,,tuxpal,391,consejo salvadoreño del café,...,Green,0,"July 7th, 2016",Salvadoran Coffee Council,3d4987e3b91399dbb3938b5bdf53893b6ef45be1,27b21e368fb8291cbea02c60623fe6c98f84524d,m,1500.0,1500.0,1500.0
1106,1106,80.17,Arabica,"exportadora atlantic, s.a.",Nicaragua,los mesones,,planta procesadora sebaco,017-053-0125,exportadora atlantic s.a,...,,2,"April 23rd, 2016",Asociación de Cafés Especiales de Nicaragua,fc561dd3c2eee024b032933e0a97b4aede0dc206,f79a8d4dee92a80ff14025f03ea34fa316b2132f,m,1100.0,1275.0,1187.5
1183,1183,79.25,Arabica,marco virgilio ramirez teliz,Mexico,el aguacate,,cafes de naranjal s.a. de c.v.,1104367469,cafes de naranjal s.a. de c.v,...,Green,10,"September 10th, 2013",AMECAFE,59e396ad6e22a1c22b248f958e1da2bd8af85272,0eb4ee5b3f47b20b049548a2fd1e7d4a2b70d0a7,m,1000.0,1000.0,1000.0
14,14,87.83,Arabica,ethiopia commodity exchange,Ethiopia,aolme,,c.p.w.e,010/0338,,...,,2,"August 31st, 2011",Ethiopia Commodity Exchange,a176532400aebdc345cf3d870f84ed3ecab6249e,61bbaf6a9f341e5782b8e7bd3ebf76aac89fe24b,m,1570.0,1700.0,1635.0
596,596,82.75,Arabica,eileen koyanagi,United States (Hawaii),,,,KP010914,kona pacific farmers cooperative,...,Blue-Green,0,"February 5th, 2015",Specialty Coffee Association,36d0d00a3724338ba7937c52a378d085f2172daa,0878a7d4b9d35ddbf0fe2ce69a2062cceb45a660,ft,,,


### **2. Data preprocessing**
**Objectives**: Perform the data cleaning, data transformation and data reduction steps to avoid data mistmatching, noisy data or data not wrangled

In [25]:
df_baking = df_raw.copy()
df_baking = df_baking[['total_cup_points','species','country_of_origin','aroma','flavor','aftertaste','acidity','body','balance','uniformity','clean_cup','sweetness','cupper_points','cupper_points','moisture']]
df_baking.columns = df_baking.columns.str.replace(' ', '_').str.replace('(', '').str.replace(')', '').str.lower()
df_baking['species'] = df_baking['species'].astype("category")
df_baking['country_of_origin'] = df_baking['country_of_origin'].astype("category")

df_baking = df_baking.loc[df_baking['flavor'] != 0].reset_index(drop=True)

df_baking.info()

#df = df_baking.copy()
#df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   total_cup_points   1338 non-null   float64 
 1   species            1338 non-null   category
 2   country_of_origin  1337 non-null   category
 3   aroma              1338 non-null   float64 
 4   flavor             1338 non-null   float64 
 5   aftertaste         1338 non-null   float64 
 6   acidity            1338 non-null   float64 
 7   body               1338 non-null   float64 
 8   balance            1338 non-null   float64 
 9   uniformity         1338 non-null   float64 
 10  clean_cup          1338 non-null   float64 
 11  sweetness          1338 non-null   float64 
 12  cupper_points      1338 non-null   float64 
 13  cupper_points      1338 non-null   float64 
 14  moisture           1338 non-null   float64 
dtypes: category(2), float64(13)
memory usage: 140.1 KB


### **3. Exploratory Data Analysis**
**Objective**: Summarize the main characteristics of the dataset using descriptive statistics and data visualization methods

In [None]:
df_train, df_test = train_test_split(df,test_size=0.2,random_state=2025)
df_train.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)
df_train.info()

In [None]:
display(df_train.describe().T)

In [None]:
df_train.hist(figsize=(8,8))
plt.tight_layout()
plt.show()

In [None]:
sns.pairplot(df_train, diag_kind='kde')
plt.show()

In [None]:
sns.heatmap(df_train.corr(),vmin=-1,vmax=1,cmap='RdBu',annot=True)
plt.show()

#### Rule 0:
Assume that there is a linear relation between the independent variables and the dependent variables

#### Rule 1:
Variables should not be too correlated (More than 0.75 or less than -0.75)

#### Rule 2:
Residuals of our prediction should have a symetrical distribution

#### Rule 3:
The Residuals should have a homogeneous dispersion

### **4. Machine learning**
**Objective**: Create a model that learns from data to make predictions and generalize to unseen data, and thus perform tasks without explicit instructions

In [None]:
X_train = df_train.drop('compressive_strength', axis=1)
y_train = df_train['compressive_strength']
X_test = df_test.drop('compressive_strength', axis=1)
y_test = df_test['compressive_strength']

In [None]:
num_proc = Pipeline([
    ('scaler', MinMaxScaler())
])
lreg_p = Pipeline([
    ('num_processer', num_proc),
    ('lreg', LinearRegression())
])

In [None]:
lreg_p.fit(X_train, y_train)

In [None]:
y_hat = lreg_p.predict(X_test)
print(f'MSE: {mean_squared_error(y_test, y_hat)}')
print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_hat))}')
print(f'R2: {r2_score(y_test, y_hat)}')

In [None]:
plt.scatter(y_test, y_hat)
plt.plot([0,80],[0,80],color='r')
plt.xlabel('real')
plt.ylabel('predicted')
plt.grid(True)
plt.axis('equal')
plt.show()

In [None]:
resid = y_hat-y_test
plt.scatter(y_hat,resid,c='r')
plt.axhline(0,linestyle='--')
plt.xlabel('Fitted Values')
plt.ylabel('residuals')
plt.show()

In [None]:
plt.hist(resid)
plt.show()

In [None]:
print(f'intercept:{lreg_p['lreg'].intercept_:.2f}') # beta_0
print(f'coeficients:{lreg_p['lreg'].coef_}') # list of beta (beta_1, beta_2, beta_3.....beta_n)

$-32.89 + 0.000122*Cement + 0.105*Slag + 0.094*Ash -0.1296*Water + .322221218*Super Plasticizer + .0193374230*Coarse Aggregate + .0246217848*Fine Aggregate + .113637320*Age $

In [None]:
importance = permutation_importance(lreg_p,X_test,y_test,n_repeats=25,random_state=2025)
importance_df = pd.DataFrame(importance.importances.T,columns=X_test.columns)
ax = importance_df.plot.box(vert=False)
ax.axvline(x=0,color='k',linestyle='--')
ax.set_title('Permutation Importance (Test Set)')
ax.grid(True)
plt.show()

In [None]:
#Lasso
cv = RepeatedKFold(n_splits=15,n_repeats=5,random_state=2025)
lreg_lasso = LassoCV(alphas=np.arange(0,1,0.01),cv=cv)
lreg_lasso.fit(X_train,y_train)

In [None]:
lreg_lasso.alpha_

In [None]:
#Ridge
cv = RepeatedKFold(n_splits=15,n_repeats=5,random_state=2025)
lreg_ridge = RidgeCV(alphas=np.arange(0,1,0.01),cv=cv)
lreg_ridge.fit(X_train,y_train)

In [None]:
lreg_ridge.alpha_

In [None]:
num_proc = Pipeline([
    ('scaler', MinMaxScaler())
])
lreg_ridge_p = Pipeline([
    ('num_processer', num_proc),
    ('lreg_ridge', Ridge(alpha=0.99))
])

In [None]:
lreg_ridge_p.fit(X_train, y_train)

In [None]:
y_hat = lreg_ridge_p.predict(X_test)
print(f'MSE: {mean_squared_error(y_test, y_hat)}')
print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_hat))}')
print(f'R2: {r2_score(y_test, y_hat)}')

In [None]:
importance = permutation_importance(lreg_ridge_p,X_test,y_test,n_repeats=25,random_state=2025)
importance_df = pd.DataFrame(importance.importances.T,columns=X_test.columns)
ax = importance_df.plot.box(vert=False)
ax.axvline(x=0,color='k',linestyle='--')
ax.set_title('Permutation Importance (Test Set)')
ax.grid(True)
plt.show()