In [1]:
from fastai.vision.all import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
## Configuration
## ------------------------------------------------------------------
## Mode switch: True trains a new model; False loads an already exported model from a pkl file.
is_training = False

## Path of the exported model: written after training, read when is_training is False.
file_name = '768x512+8_standardized_mae.pkl'

## Loss function: True selects Mean Absolute Error, False selects Mean Squared Error.
mae_loss = True

## Target scaling: True selects standardization, False selects (min-max) normalization.
use_standardization = True

## Images per batch. Author's note: 28 fits in 16GB of VRAM for 512px images and 12 is fine for
## 768px images; lower the value when less VRAM is available.
batch_size = 12

## Size in pixels that images are scaled to. Larger values need more VRAM and more time per epoch,
## but may let the model converge in fewer epochs and reach a higher accuracy.
img_size = 768

## Learning rate. Leave at 0 (the recommended default) to let lr_find pick one automatically,
## or set a custom positive value.
lr = 0

## Number of training epochs. Author's note: probably needs to be >= ~200 for a usable model.
n_epochs = 512

## Number of epochs trained while the body of the network is frozen.
## Keep this small to avoid overfitting (default = 1).
n_freeze_epochs = 4

In [3]:
## Load the phenotype measurements (the y values); the 'line' column is discarded.
df = pd.read_csv('PhenotypeDataUGA.csv').drop(columns='line')
df

Unnamed: 0,photo_id,perimeter1_cm,width1_cm,heigth1_cm,perimeter2_cm,width2_cm,heigth2_cm,prox_angl_macr,distal_angl_macr,tomat_pericap_area,tomat_pericap_area_ratio,tomat_pericap_thick,tomat_pericap_thick_ratio
0,6742,84.32,16.80,31.41,87.67,17.61,32.41,101.6,112.0,90.94,0.19,1.09,0.2
1,6744,75.96,18.63,25.27,82.09,19.57,26.66,118.3,122.7,73.39,0.18,0.93,0.2
2,6745,69.47,18.00,23.02,73.99,18.98,24.28,124.0,136.9,65.64,0.18,0.92,0.2
3,6746,74.58,14.73,27.44,80.63,15.52,28.92,108.6,101.7,63.50,0.18,0.81,0.2
4,6748,79.19,15.29,30.22,84.72,16.08,31.85,100.5,88.6,73.25,0.18,0.90,0.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
290,7714,79.37,16.03,22.55,85.12,16.89,31.21,96.8,105.8,76.44,0.18,0.93,0.2
291,7722,54.35,15.63,16.64,58.76,16.44,17.33,144.5,139.6,48.75,0.21,0.87,0.2
292,7723,49.77,14.02,15.43,57.10,14.79,16.22,138.5,143.7,37.69,0.20,0.69,0.2
293,7727,67.71,18.25,22.06,73.26,19.23,23.25,145.2,126.5,69.12,0.20,0.98,0.2


In [4]:
## Trait names: every column after the first one (photo_id).
traits = df.columns[1:].tolist()
traits

['perimeter1_cm',
 'width1_cm',
 'heigth1_cm',
 'perimeter2_cm',
 'width2_cm',
 'heigth2_cm',
 'prox_angl_macr',
 'distal_angl_macr',
 'tomat_pericap_area',
 'tomat_pericap_area_ratio',
 'tomat_pericap_thick',
 'tomat_pericap_thick_ratio']

In [5]:
## Feature scaling of the trait columns, using the scaler selected by use_standardization.
## NOTE(review): the scaler is fitted on every row, before the train/test split further down,
## so test-set statistics influence the scaling -- confirm this is acceptable.
scaler = MinMaxScaler() if not use_standardization else StandardScaler()
df[traits] = scaler.fit_transform(df[traits])

df

Unnamed: 0,photo_id,perimeter1_cm,width1_cm,heigth1_cm,perimeter2_cm,width2_cm,heigth2_cm,prox_angl_macr,distal_angl_macr,tomat_pericap_area,tomat_pericap_area_ratio,tomat_pericap_thick,tomat_pericap_thick_ratio
0,6742,0.857054,-0.681010,1.441886,0.812500,-0.719054,1.313903,-1.571232,-0.618085,0.509113,0.183480,0.319679,-0.078353
1,6744,0.020654,0.029755,0.002392,0.242991,0.011434,-0.013293,-0.434452,-0.052156,-0.242664,-0.205919,-0.411015,-0.078353
2,6745,-0.628657,-0.214935,-0.525110,-0.583714,-0.208458,-0.562638,-0.046450,0.698891,-0.574646,-0.205919,-0.456684,-0.078353
3,6746,-0.117412,-1.484990,0.511138,0.093980,-1.497991,0.508353,-1.094737,-1.162858,-0.666316,-0.205919,-0.959036,-0.078353
4,6748,0.343809,-1.267488,1.162896,0.511416,-1.289281,1.184646,-1.646110,-1.855725,-0.248661,-0.205919,-0.548021,-0.078353
...,...,...,...,...,...,...,...,...,...,...,...,...,...
290,7714,0.361817,-0.980075,-0.635299,0.552240,-0.987396,1.036923,-1.897971,-0.946007,-0.112013,-0.205919,-0.411015,-0.078353
291,7722,-2.141380,-1.135434,-2.020870,-2.138124,-1.155110,-2.166814,1.348999,0.841696,-1.298151,0.962278,-0.685026,-0.078353
292,7723,-2.599599,-1.760752,-2.304549,-2.307547,-1.770061,-2.423021,0.940575,1.058547,-1.771921,0.572879,-1.507057,-0.078353
293,7727,-0.804741,-0.117835,-0.750177,-0.658219,-0.115284,-0.800379,1.396648,0.148829,-0.425575,0.572879,-0.182674,-0.078353


In [6]:
## Pack the trait values of each row into a single list, which is what the model is given as target.
df['combined'] = df[traits].to_numpy().tolist()

In [7]:
## Hold out 20% of the rows as the test set; random_state pins the shuffle.
train, test = train_test_split(df, random_state=8, test_size=0.2)

In [8]:
class TitledList(list, ShowTitle):
    "A list that is displayed through `show_title`."
    _show_args = {'label': 'text'}

    def show(self, ctx=None, **kwargs):
        "Show self, passing `_show_args` merged with `kwargs` on to `show_title`."
        show_kwargs = merge(self._show_args, kwargs)
        return show_title(self, ctx=ctx, **show_kwargs)

class ToListTensor(DisplayedTransform):
    "Pass-through transform whose decoded value is wrapped in a `TitledList`."
    _show_args = {'label': 'text'}

    def __init__(self, split_idx=None):
        super().__init__(split_idx=split_idx)

    def encodes(self, o):
        # Identity: the value is forwarded unchanged.
        return o

    def decodes(self, o):
        # Wrap the value in a TitledList.
        return TitledList(o)

In [9]:
## Create the DataBlock: one image input (n_inp=1) and one multi-value regression target per row.
## Images are read from fruits/IMG_<photo_id>.JPG; targets come from the 'combined' column.
## Resizing pads with zeros (ResizeMethod.Pad / PadMode.Zeros) instead of squishing or cropping,
## so the aspect ratio is kept and no part of the image is cut away.
## Author's note: the model gets better results faster with high resolution images.
plant = DataBlock(
    # One regression output per trait, derived from the trait list rather than a hard-coded 12.
    blocks=[ImageBlock, RegressionBlock(n_out=len(traits))],
    get_x=ColReader('photo_id', pref='fruits/IMG_', suff='.JPG'),
    get_y=Pipeline([ColReader('combined'), ToListTensor]),
    # Seeded so the train/validation split is identical on every run
    # (same seed value as the train/test split).
    splitter=RandomSplitter(seed=8),
    item_tfms=Resize(img_size, method=ResizeMethod.Pad, pad_mode=PadMode.Zeros),
    n_inp=1,
)


In [10]:
## Build the DataLoaders from the training rows and move them to CUDA when it is available.
dls = plant.dataloaders(train, bs=batch_size)
if torch.cuda.is_available():
    dls = dls.cuda()

In [11]:
## Create the learner on a resnet50 architecture.
## The loss is L1 (MAE) or MSE depending on mae_loss; MAE, MSE, RMSE and R2 are reported as metrics.
learn = vision_learner(
    dls=dls,
    arch=resnet50,
    # One output per trait, derived from the trait list rather than a hard-coded 12.
    n_out=len(traits),
    loss_func=L1LossFlat() if mae_loss else MSELossFlat(),
    metrics=[mae, mse, rmse, R2Score()],
)



In [12]:
## Only when training with no explicit learning rate (lr <= 0): run lr_find and
## adopt its `valley` suggestion as the learning rate.
if lr <= 0 and is_training:
    lrs = learn.lr_find(suggest_funcs=(minimum, steep, valley, slide))
    lr = lrs.valley

In [13]:
if not is_training:
    ## Load a previously exported model for testing.
    ## NOTE(review): the file is a pkl, so only load files from a trusted source.
    learn = load_learner(file_name)
else:
    ## Train the model, then export it to file_name.
    learn.fine_tune(epochs=n_epochs, freeze_epochs=n_freeze_epochs, base_lr=lr)
    learn.export(file_name)

In [14]:
## Run the model on the held-out test rows; only the predictions (first item returned) are kept.
dl = learn.dls.test_dl(test)
preds = learn.get_preds(dl=dl)[0]

In [15]:
## Photo IDs of the test rows, in test order.
photo_id_col = test['photo_id'].to_list()

## Undo the feature scaling on the predictions and label each row with its photo ID.
preds = (
    pd.DataFrame(scaler.inverse_transform(preds), columns=traits)
      .assign(photo_id=photo_id_col)
)
preds.head(10)

Unnamed: 0,perimeter1_cm,width1_cm,heigth1_cm,perimeter2_cm,width2_cm,heigth2_cm,prox_angl_macr,distal_angl_macr,tomat_pericap_area,tomat_pericap_area_ratio,tomat_pericap_thick,tomat_pericap_thick_ratio,photo_id
0,80.394204,21.231676,25.322381,82.944546,22.328177,26.633121,129.447056,140.461573,94.663912,0.196863,1.192224,0.200008,7373
1,63.568904,16.621856,20.405467,67.706555,17.506442,21.603574,135.49465,129.245143,54.008179,0.178437,0.817279,0.200012,7205
2,64.618115,17.23656,20.498602,69.935571,18.187346,22.256365,130.796796,135.782485,59.799296,0.180869,0.876117,0.200011,6894
3,77.000048,15.799273,28.29758,80.228671,16.608876,29.704879,110.620696,89.976849,70.802709,0.179489,0.919941,0.200006,6974
4,79.508032,15.987644,28.649273,82.728926,16.824471,30.599872,104.767142,95.098712,77.318812,0.187178,0.975562,0.200002,7327
5,66.831773,19.00722,17.36554,75.057561,19.460298,23.655715,134.916216,124.718448,74.208607,0.194008,1.003797,0.200022,7727
6,79.609221,17.156411,28.982454,83.546018,17.904489,30.475692,105.319143,104.56101,84.850596,0.190365,1.060882,0.200012,6981
7,65.762961,17.598488,21.329091,70.217475,18.481745,22.152496,134.467205,125.97298,55.438228,0.170939,0.815291,0.200009,7387
8,64.116505,13.476868,22.913922,68.483048,14.35555,24.3012,108.775016,109.732765,55.852978,0.197828,0.842577,0.20004,7294
9,72.050151,17.167283,24.990153,75.122966,18.012392,26.051311,120.330746,108.879358,70.320938,0.187239,0.976254,0.20001,7005


In [16]:
## Bring the test targets back to original units: keep only the trait columns, undo the scaling
## (sklearn returns a plain numpy array without column names) and wrap the result in a new
## DataFrame, which also gives it a fresh 0..n-1 index.
test = pd.DataFrame(
    scaler.inverse_transform(test.drop(columns=['photo_id', 'combined'], errors='ignore')),
    columns=traits,
)

## Re-attach the photo IDs.
test['photo_id'] = photo_id_col
test.head(10)

Unnamed: 0,perimeter1_cm,width1_cm,heigth1_cm,perimeter2_cm,width2_cm,heigth2_cm,prox_angl_macr,distal_angl_macr,tomat_pericap_area,tomat_pericap_area_ratio,tomat_pericap_thick,tomat_pericap_thick_ratio,photo_id
0,78.31,21.43,25.21,83.16,22.55,26.6,120.3,140.8,92.13,0.19,1.16,0.2,7373
1,64.07,16.83,19.79,65.93,17.61,20.71,133.6,133.7,35.8,0.12,0.56,0.2,7205
2,64.22,17.38,20.79,69.87,18.34,21.97,130.3,139.8,62.73,0.2,0.94,0.2,6894
3,91.12,14.79,26.33,81.49,16.83,29.87,111.0,76.3,76.62,0.19,0.98,0.2,6974
4,61.69,13.59,22.41,81.97,16.58,30.7,99.1,92.1,72.36,0.18,0.92,0.2,7327
5,67.71,18.25,22.06,73.26,19.23,23.25,145.2,126.5,69.12,0.2,0.98,0.2,7727
6,80.15,16.61,29.63,84.77,17.47,31.4,90.6,99.4,84.74,0.19,1.04,0.2,6981
7,62.3,17.14,19.71,68.79,17.92,20.77,143.1,128.2,36.19,0.12,0.54,0.2,7387
8,60.84,12.64,22.09,64.43,13.65,23.33,105.8,100.9,45.6,0.18,0.74,0.2,7294
9,70.99,16.88,25.27,74.37,17.58,26.6,122.5,88.0,63.03,0.18,0.88,0.2,7005


In [17]:
## Interleave test and preds row by row: even rows are test data, the next row is the
## predicted value for the same position.
## Both frames are stacked once and then stably sorted by their (reset) positional index,
## instead of growing a DataFrame with pd.concat inside a loop, which re-copies all
## accumulated data on every iteration.
test_preds_combined = (
    pd.concat([test.reset_index(drop=True), preds.reset_index(drop=True)])
      .sort_index(kind='mergesort')   # stable sort: the test row stays ahead of its prediction
      .astype('float64')              # the row-wise build stored every column (incl. photo_id) as float
      .reset_index(drop=True)
)

## Export dataframe to a csv file named after the model file (last 4 characters, ".pkl", removed).
test_preds_combined.to_csv(file_name[:-4] + '_combined.csv')
test_preds_combined

Unnamed: 0,perimeter1_cm,width1_cm,heigth1_cm,perimeter2_cm,width2_cm,heigth2_cm,prox_angl_macr,distal_angl_macr,tomat_pericap_area,tomat_pericap_area_ratio,tomat_pericap_thick,tomat_pericap_thick_ratio,photo_id
0,78.310000,21.430000,25.210000,83.160000,22.550000,26.600000,120.300000,140.800000,92.130000,0.190000,1.160000,0.200000,7373.0
1,80.394204,21.231676,25.322381,82.944546,22.328177,26.633121,129.447056,140.461573,94.663912,0.196863,1.192224,0.200008,7373.0
2,64.070000,16.830000,19.790000,65.930000,17.610000,20.710000,133.600000,133.700000,35.800000,0.120000,0.560000,0.200000,7205.0
3,63.568904,16.621856,20.405467,67.706555,17.506442,21.603574,135.494650,129.245143,54.008179,0.178437,0.817279,0.200012,7205.0
4,64.220000,17.380000,20.790000,69.870000,18.340000,21.970000,130.300000,139.800000,62.730000,0.200000,0.940000,0.200000,6894.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
113,79.097124,16.282955,28.816448,83.845616,17.087934,30.640850,103.314446,103.873076,71.927868,0.172769,0.901651,0.200006,6797.0
114,66.640000,17.890000,21.430000,70.120000,18.840000,22.550000,128.300000,122.900000,58.390000,0.180000,0.870000,0.200000,6933.0
115,68.784342,18.221505,22.376605,71.512175,19.104020,23.131495,133.458661,132.843030,70.733333,0.198417,1.029811,0.200011,6933.0
116,87.240000,20.900000,28.410000,89.650000,22.020000,29.670000,127.100000,134.100000,103.880000,0.190000,1.210000,0.200000,7189.0


In [18]:
## Regression metrics table: one row per metric, one column per scope.
r2_table = pd.DataFrame(index=['MSE', 'MAE', 'R2'])
score_funcs = {'MSE': mean_squared_error, 'MAE': mean_absolute_error, 'R2': r2_score}

## 'Combined' scores all columns together except photo_id (the last column).
for metric_name, score in score_funcs.items():
    r2_table.loc[metric_name, 'Combined'] = score(test.iloc[:, :-1], preds.iloc[:, :-1])

## One column per trait, scored individually.
for name in traits:
    for metric_name, score in score_funcs.items():
        r2_table.loc[metric_name, name] = score(test[name], preds[name])

r2_table.to_csv(file_name[:-4] + '_metrics.csv')
r2_table

Unnamed: 0,Combined,perimeter1_cm,width1_cm,heigth1_cm,perimeter2_cm,width2_cm,heigth2_cm,prox_angl_macr,distal_angl_macr,tomat_pericap_area,tomat_pericap_area_ratio,tomat_pericap_thick,tomat_pericap_thick_ratio
MSE,11.666187,12.228696,0.358845,2.106881,3.24669,0.233985,0.645959,23.807954,36.346355,61.008189,0.000321,0.010364,2e-06
MAE,1.665736,2.019034,0.468479,0.932242,1.38636,0.407218,0.610864,3.554127,4.686165,5.834144,0.013262,0.076759,0.000178
R2,0.782467,0.890972,0.948608,0.887623,0.968769,0.968219,0.969368,0.888971,0.890553,0.881716,0.358271,0.752203,-0.015663


In [19]:
## valid_ranges maps each margin of error to the prediction rows in which EVERY trait lies
## within that fraction of the corresponding test value.
## test and preds are compared positionally (row i against row i) with one vectorised mask,
## so rows are never re-matched through photo_id and repeated photo IDs cannot misalign
## the comparison.
## NOTE(review): the bounds value*(1 -/+ margin) assume positive test values -- confirm.
valid_ranges = {}
measured_vals = test[traits].to_numpy()
predicted_vals = preds[traits].to_numpy()
for percent in [0.1, 0.05, 0.025, 0.01]:
    within = (predicted_vals >= measured_vals * (1 - percent)) & (predicted_vals <= measured_vals * (1 + percent))
    # Keep a row only when all of its traits are inside the margin.
    valid_ranges[percent] = preds.loc[within.all(axis=1)]

In [20]:
## Report, per margin of error, the share of test rows whose predictions all fall inside it.
for percent, rows_within in valid_ranges.items():
    print(f'Accurate within {percent * 100}% of test: {len(rows_within) / len(test)}')

Accurate within 10.0% of test: 0.5254237288135594
Accurate within 5.0% of test: 0.15254237288135594
Accurate within 2.5% of test: 0.01694915254237288
Accurate within 1.0% of test: 0.0


In [28]:
## Combined regression metrics with the two ratio traits (and photo_id, the last column) left out.
## NOTE: this rebinds r2_table, replacing the per-trait table computed earlier.
r2_table = pd.DataFrame(index=['MSE', 'MAE', 'R2'])

ratio_cols = ['tomat_pericap_area_ratio', 'tomat_pericap_thick_ratio']
test_no_ratios = test.iloc[:, :-1].drop(columns=ratio_cols)
preds_no_ratios = preds.iloc[:, :-1].drop(columns=ratio_cols)

r2_table.loc['MSE', 'Combined'] = mean_squared_error(test_no_ratios, preds_no_ratios)
r2_table.loc['MAE', 'Combined'] = mean_absolute_error(test_no_ratios, preds_no_ratios)
r2_table.loc['R2', 'Combined'] = r2_score(test_no_ratios, preds_no_ratios)

r2_table

Unnamed: 0,Combined
MSE,13.999392
MAE,1.997539
R2,0.9047
