In [1]:
#Import dependencies
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import NearestNeighbors as NN
from sklearn.cluster import KMeans
import pydotplus
from IPython.display import Image
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression


In [2]:
#read CSV file into a pandas DataFrame
coffee_data = pd.read_csv("coffee_clean_final.csv", index_col=0)
coffee_data.head()

Unnamed: 0,title,rating,acidity_structure,aftertaste,aroma,body,flavor,blind_assessment,coffee_origin,notes,roast_level,city,state,usd_per_oz
0,Bolivia Manantial Gesha,93.0,9.0,8.0,9.0,8.0,9.0,"Richly aromatic, floral-toned. Magnolia, cocoa...","Caranavi, Bolivia","Produced by Angel Mamani Chambi, entirely of t...",Medium-Light,Floyd,Virginia,2.5
1,Yellow Pacamara Carbonic Maceration Nanolot,92.0,8.0,8.0,9.0,8.0,9.0,"Crisply sweet-savory. White peach, hop flowers...","Matagalpa growing region, Nicaragua",Produced by Benjamin Weiner at Finca Idealista...,Medium-Light,Cleveland,Ohio,13.33
2,Ethiopia Gera Genji Challa,94.0,9.0,8.0,9.0,9.0,9.0,"Delicately aromatic, complex. Lilac, cocoa nib...","Agaro Gera, Jimma Zone, Oromia State, Ethiopia",Ethiopia coffees like this one are largely pro...,Medium-Light,San Diego,California,2.33
3,Yirgacheffe Mengesha Natural,94.0,9.0,8.0,9.0,9.0,9.0,"High-toned, fruit-driven. Boysenberry, pear, c...","Yirgacheffe growing region, southern Ethiopia",Produced at Mengesha Farm from selections of i...,Medium-Light,Glendale,California,1.71
4,Tropical Summer Colombia La Sierra,93.0,9.0,8.0,9.0,8.0,9.0,"Fruit-driven, crisply chocolaty. Goji berry, d...","La Sierra, Cauca Department, Colombia",Produced by smallholding farmers from trees of...,Medium-Light,Harrisonburg,Virginia,2.37


In [3]:
coffee_data.shape

(2103, 14)

In [4]:
coffee_data['roast_level'].value_counts()

Medium-Light    1515
Light            350
Medium           201
Medium-Dark       32
Dark               5
Name: roast_level, dtype: int64

In [5]:
for i in range(len(coffee_data)):
    if coffee_data.loc[i, 'roast_level'] == 'Light':
        coffee_data.loc[i, 'roast_level'] = 1
    if coffee_data.loc[i, 'roast_level'] == 'Medium-Light':
        coffee_data.loc[i, 'roast_level'] = 2
    if coffee_data.loc[i, 'roast_level'] == 'Medium':
        coffee_data.loc[i, 'roast_level'] = 3
    if coffee_data.loc[i, 'roast_level'] == 'Medium-Dark':
        coffee_data.loc[i, 'roast_level'] = 4
    if coffee_data.loc[i, 'roast_level'] == 'Dark':
        coffee_data.loc[i, 'roast_level'] = 5

In [6]:
coffee_data['roast_level'] = coffee_data['roast_level'].astype(int)

In [7]:
coffee_data.head()

Unnamed: 0,title,rating,acidity_structure,aftertaste,aroma,body,flavor,blind_assessment,coffee_origin,notes,roast_level,city,state,usd_per_oz
0,Bolivia Manantial Gesha,93.0,9.0,8.0,9.0,8.0,9.0,"Richly aromatic, floral-toned. Magnolia, cocoa...","Caranavi, Bolivia","Produced by Angel Mamani Chambi, entirely of t...",2,Floyd,Virginia,2.5
1,Yellow Pacamara Carbonic Maceration Nanolot,92.0,8.0,8.0,9.0,8.0,9.0,"Crisply sweet-savory. White peach, hop flowers...","Matagalpa growing region, Nicaragua",Produced by Benjamin Weiner at Finca Idealista...,2,Cleveland,Ohio,13.33
2,Ethiopia Gera Genji Challa,94.0,9.0,8.0,9.0,9.0,9.0,"Delicately aromatic, complex. Lilac, cocoa nib...","Agaro Gera, Jimma Zone, Oromia State, Ethiopia",Ethiopia coffees like this one are largely pro...,2,San Diego,California,2.33
3,Yirgacheffe Mengesha Natural,94.0,9.0,8.0,9.0,9.0,9.0,"High-toned, fruit-driven. Boysenberry, pear, c...","Yirgacheffe growing region, southern Ethiopia",Produced at Mengesha Farm from selections of i...,2,Glendale,California,1.71
4,Tropical Summer Colombia La Sierra,93.0,9.0,8.0,9.0,8.0,9.0,"Fruit-driven, crisply chocolaty. Goji berry, d...","La Sierra, Cauca Department, Colombia",Produced by smallholding farmers from trees of...,2,Harrisonburg,Virginia,2.37


In [8]:
coffee_data = coffee_data.drop(columns=['rating','title', 'blind_assessment', 'coffee_origin', 'notes','city','state'])
coffee_data.head()

Unnamed: 0,acidity_structure,aftertaste,aroma,body,flavor,roast_level,usd_per_oz
0,9.0,8.0,9.0,8.0,9.0,2,2.5
1,8.0,8.0,9.0,8.0,9.0,2,13.33
2,9.0,8.0,9.0,9.0,9.0,2,2.33
3,9.0,8.0,9.0,9.0,9.0,2,1.71
4,9.0,8.0,9.0,8.0,9.0,2,2.37


In [9]:
coffee_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2103 entries, 0 to 2102
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   acidity_structure  2103 non-null   float64
 1   aftertaste         2103 non-null   float64
 2   aroma              2103 non-null   float64
 3   body               2103 non-null   float64
 4   flavor             2103 non-null   float64
 5   roast_level        2103 non-null   int32  
 6   usd_per_oz         2103 non-null   float64
dtypes: float64(6), int32(1)
memory usage: 187.8 KB


In [10]:
y = coffee_data['roast_level']
X = coffee_data.drop(columns=['roast_level'])

## Unbalanced Trials

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)

In [12]:
# Create the StandardScaler instance
scaler = MinMaxScaler()

In [13]:
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

In [14]:
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [15]:
classifier = LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=1)
classifier

LogisticRegression(multi_class='multinomial', random_state=1)

In [16]:
# Train the data
classifier.fit(X_train_scaled, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(multi_class='multinomial', random_state=1)

In [17]:
# Score the model
print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")

Training Data Score: 0.7203551046290425
Testing Data Score: 0.720532319391635


In [18]:
# Predict outcomes for test data set
predictions = classifier.predict(X_test_scaled)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
1311,2,2
1151,2,2
920,2,3
1076,2,1
1287,2,2
...,...,...
594,2,1
507,2,2
733,2,2
1923,2,2


In [19]:
from sklearn.metrics import accuracy_score
# Display the accuracy score for the test dataset.
accuracy_score(y_test, predictions)

0.720532319391635

In [20]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, predictions)

array([[  0,  88,   0,   0,   0],
       [  0, 379,   0,   0,   0],
       [  0,  50,   0,   0,   0],
       [  0,   8,   0,   0,   0],
       [  0,   1,   0,   0,   0]], dtype=int64)

In [21]:
from sklearn.metrics import classification_report
target_names = ['Light','Medium-Light', 'Medium', 'Medium-Dark', 'Dark']
print(classification_report(y_test, predictions, target_names=target_names))

              precision    recall  f1-score   support

       Light       0.00      0.00      0.00        88
Medium-Light       0.72      1.00      0.84       379
      Medium       0.00      0.00      0.00        50
 Medium-Dark       0.00      0.00      0.00         8
        Dark       0.00      0.00      0.00         1

    accuracy                           0.72       526
   macro avg       0.14      0.20      0.17       526
weighted avg       0.52      0.72      0.60       526



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Over Sampled Trials

In [22]:
# Import the RandomOverSampler module form imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

In [23]:
# Instantiate the random oversampler model
# # Assign a random_state parameter of 1 to the model
oversample = RandomOverSampler(random_state=1)
# Fit the original training data to the random_oversampler model
X_over, y_over = oversample.fit_resample(X, y)

In [24]:
# Count the distinct values of the resampled labels data
y_over.value_counts()

2    1515
1    1515
3    1515
4    1515
5    1515
Name: roast_level, dtype: int64

In [25]:
from sklearn.model_selection import train_test_split

X_over_train, X_over_test, y_over_train, y_over_test = train_test_split(X_over, 
                                                    y_over, 
                                                    random_state=1, 
                                                    stratify=y_over)

In [26]:
# Create the StandardScaler instance
scaler = MinMaxScaler()

In [27]:
# Fit the Standard Scaler with the training data
X_over_scaler = scaler.fit(X_over_train)

In [28]:
# Scale the training data
X_over_train_scaled = X_over_scaler.transform(X_over_train)
X_over_test_scaled = X_over_scaler.transform(X_over_test)

In [29]:
logreg_over= LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=1)
logreg_over

LogisticRegression(multi_class='multinomial', random_state=1)

In [30]:
# Train the data
logreg_over.fit(X_over_train_scaled, y_over_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(multi_class='multinomial', random_state=1)

In [31]:
# Score the model
print(f"Training Data Score: {logreg_over.score(X_over_train_scaled, y_over_train)}")
print(f"Testing Data Score: {logreg_over.score(X_over_test_scaled, y_over_test)}")

Training Data Score: 0.4567857771519099
Testing Data Score: 0.45617740232312565


In [32]:
# Predict outcomes for test data set
predictions = logreg_over.predict(X_over_test_scaled)
pd.DataFrame({"Prediction": predictions, "Actual": y_over_test})

Unnamed: 0,Prediction,Actual
1550,3,2
4664,1,4
75,1,2
5778,3,4
6482,5,5
...,...,...
5233,5,4
5671,1,4
1060,2,3
596,1,1


In [33]:
from sklearn.metrics import accuracy_score
# Display the accuracy score for the test dataset.
accuracy_score(y_over_test, predictions)

0.45617740232312565

In [34]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_over_test, predictions)

array([[241,  83,  47,   3,   5],
       [202,  73,  81,  10,  13],
       [ 67,  75, 162,  28,  46],
       [ 55,  26,  93,  84, 121],
       [  0,  75,   0,   0, 304]], dtype=int64)

In [35]:
from sklearn.metrics import classification_report
target_names = ['Light','Medium-Light', 'Medium', 'Medium-Dark', 'Dark']
print(classification_report(y_over_test, predictions, target_names=target_names))

              precision    recall  f1-score   support

       Light       0.43      0.64      0.51       379
Medium-Light       0.22      0.19      0.21       379
      Medium       0.42      0.43      0.43       378
 Medium-Dark       0.67      0.22      0.33       379
        Dark       0.62      0.80      0.70       379

    accuracy                           0.46      1894
   macro avg       0.47      0.46      0.44      1894
weighted avg       0.47      0.46      0.44      1894



In [36]:
def predict_roast(acidity, aftertaste, aroma, body, flavor, price):
    input_data = [[acidity, aftertaste, aroma, body, flavor, price]]
    roast_prediction = logreg_over.predict(input_data)[0]
    if roast_prediction == 1:
        return "Light Roast"
    elif roast_prediction == 2:
        return "Medium-Light Roast"
    elif roast_prediction == 3:
        return "Medium Roast"
    elif roast_prediction == 4:
        return "Medium-Dark Roast"
    elif roast_prediction == 5:
        return "Dark Roast"
    else:
        return "Go drink some tea"


In [37]:
predict_roast(10, 10, 10, 50, 10, 800)

'Medium Roast'

In [38]:
# Import the necessary libraries and functions
import pickle

# Save the trained model
with open('logistic_regression_oversampled.pkl', 'wb') as f:
    pickle.dump(logreg_over, f)