In [97]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from joblib import dump

import warnings
# Suppress every warning for the rest of the session: no category or module filter is given,
# so library warnings (including any raised by scikit-learn or pandas) are hidden as well.
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, hamming_loss, f1_score, classification_report, multilabel_confusion_matrix, roc_auc_score
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [98]:
# Load the tab-separated fruit measurements into a DataFrame.
df = pd.read_csv("fruit.txt", sep="\t")


# EDA and Preprocessing

In [99]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score
0,1,apple,granny_smith,192,8.4,7.3,0.55
1,1,apple,granny_smith,180,8.0,6.8,0.59
2,1,apple,granny_smith,176,7.4,7.2,0.6
3,2,mandarin,mandarin,86,6.2,4.7,0.8
4,2,mandarin,mandarin,84,6.0,4.6,0.79


In [100]:
# Combine fruit name and subtype into one class label per row, then encode that
# label as an integer target with a LabelEncoder (kept so codes can be decoded later).
le_fruit = LabelEncoder()
fruit_labels = df['fruit_name'] + '_' + df['fruit_subtype']

# Build the modelling frame in one chain instead of mutating `df` with inplace=True:
# the encoded target is appended as `fruit` and the raw label columns are dropped.
df = (
    df.assign(fruit=le_fruit.fit_transform(fruit_labels))
      .drop(columns=['fruit_name', 'fruit_subtype', 'fruit_label'])
)

df.head()

Unnamed: 0,mass,width,height,color_score,fruit
0,192,8.4,7.3,0.55,3
1,180,8.0,6.8,0.59,3
2,176,7.4,7.2,0.6,3
3,86,6.2,4.7,0.8,6
4,84,6.0,4.6,0.79,6


In [101]:
# Summarize column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59 entries, 0 to 58
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   mass         59 non-null     int64  
 1   width        59 non-null     float64
 2   height       59 non-null     float64
 3   color_score  59 non-null     float64
 4   fruit        59 non-null     int64  
dtypes: float64(3), int64(2)
memory usage: 2.4 KB


In [102]:
# Descriptive statistics, transposed so that each column of df becomes one row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
mass,59.0,163.118644,55.018832,76.0,140.0,158.0,177.0,362.0
width,59.0,7.105085,0.816938,5.8,6.6,7.2,7.5,9.6
height,59.0,7.69322,1.361017,4.0,7.2,7.6,8.2,10.5
color_score,59.0,0.762881,0.076857,0.55,0.72,0.75,0.81,0.93
fruit,59.0,4.830508,2.93112,0.0,2.0,5.0,7.0,9.0


# Building the Model

In [103]:
# Separate the feature columns from the encoded target column.
X = df.drop(columns=['fruit'])
y = df.loc[:, 'fruit']

In [104]:
# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [105]:
# Baseline random forest: only the seed is set, every other hyperparameter is left at its default.
model = RandomForestClassifier(random_state=42)

In [106]:
# 5-fold cross-validation of the baseline model on the training split.
# `scoring` stays a list so the result is keyed as 'test_accuracy'.
cv_results = cross_validate(
    model,
    X_train,
    y_train,
    cv=5,
    scoring=["accuracy"],
)

In [107]:
# Average accuracy across the cross-validation folds.
np.mean(cv_results['test_accuracy'])

0.7955555555555555

In [108]:
# Hyperparameter grid explored by the random-forest grid search.
rf_params = dict(
    max_depth=[5, 8, None],
    min_samples_split=[2, 5, 8, 15, 20],
    n_estimators=[100, 200, 500],
)

In [109]:
# Exhaustive search over rf_params with 5-fold cross-validation, run in a single process.
rf_best_grid = GridSearchCV(
    estimator=model,
    param_grid=rf_params,
    cv=5,
    n_jobs=1,
    verbose=True,
).fit(X_train, y_train)

Fitting 5 folds for each of 45 candidates, totalling 225 fits


In [110]:
# Train the final model with the best hyperparameters found by the grid search.
# A fresh RandomForestClassifier is built instead of calling model.set_params(...):
# set_params mutates `model` in place, which would make rf_final and model the same
# object and silently change what the earlier cross-validation cell scores on a re-run.
# The seed matches the baseline model's, so the fitted forest is the same as before.
rf_final = RandomForestClassifier(
    **rf_best_grid.best_params_,
    random_state=42,
).fit(X_train, y_train)

In [111]:
# Accuracy of the tuned model on the held-out test split.
accuracy_score(y_test, rf_final.predict(X_test))

0.9166666666666666

In [112]:
# Predicted class codes for every row of the test split.
# NOTE(review): y_pred is not referenced by the later cells visible here.
y_pred = rf_final.predict(X_test)

In [113]:
# Draw five test rows for a manual spot-check; the seed makes the draw repeatable
# so the displayed labels and predictions stay the same across re-runs.
X_sample = X_test.sample(5, random_state=42)
# Look up the matching true labels explicitly by index label.
y_sample = y_test.loc[X_sample.index]

In [114]:
# Display the encoded true labels of the sampled rows.
y_sample

13    2
45    4
25    8
12    0
57    5
Name: fruit, dtype: int64

In [115]:
# Predict the sampled rows, then map the integer codes back to their fruit label strings.
sample_codes = rf_final.predict(X_sample)
fruit_pred_decoded = le_fruit.inverse_transform(sample_codes)
fruit_pred_decoded

array(['apple_golden_delicious', 'lemon_spanish_belsan',
       'orange_spanish_jumbo', 'apple_cripps_pink', 'lemon_unknown'],
      dtype=object)

In [116]:
# Persist the fitted classifier to disk with joblib.
dump(rf_final, 'final_fruit_model.joblib')

['final_fruit_model.joblib']

In [117]:
# Persist the fitted label encoder too; it is what turns predicted codes back into fruit labels.
dump(le_fruit, 'encoder.joblib')

['encoder.joblib']

In [118]:
# Hand-entered measurements for a single fruit, keyed by the feature column names.
my_dict = dict(mass=300, width=5, height=6, color_score=0.99)

In [119]:
# Wrap the single record in a one-row DataFrame whose columns are the dict keys.
# The DataFrame constructor is the supported way to build a frame from a list of records;
# DataFrame.from_dict is documented for dict input and only accepted the list incidentally.
inputs = pd.DataFrame([my_dict])


In [120]:
# Classify the hand-entered record and decode the predicted code into its fruit label.
predicted_code = rf_final.predict(inputs)
le_fruit.inverse_transform(predicted_code)

array(['apple_braeburn'], dtype=object)