In [1]:
import sklearn
import plotly
import statsmodels
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import PolynomialFeatures
from sklearn import preprocessing

In [18]:
# !ls data

In [3]:
model_model_inference = pd.read_csv("data/regular_no_fp_nt_memory_inference_result.csv")
model_with_params = model_model_inference.copy(deep=True)


# https://huggingface.co/transformers/pretrained_models.html
num_params = {
    "BERT_base": 110,
    "BERT_large": 330,
    "RoBERTa_base": 125,
    "RoBERTa_large": 355,
    "dist_bert": 66, # uncased
    "MiniLM": 33,
    "MiniLM_multilingual": 21, 
    "mBERT_cased": 179,
    "XLM-R_base": 270,
    "XLM-R_large": 550,
    "GPT-2": 117,
    "BART_large": 406,
    "mBART_large": 610,
}

for model, value in num_params.items():
    model_with_params['model'].replace({model: value}, inplace=True)
model_with_params

Unnamed: 0,model,batch_size,sequence_length,result
0,110,1,1,1870
1,110,1,2,1870
2,110,1,4,1870
3,110,1,8,1870
4,110,1,16,1870
...,...,...,...,...
775,610,32,32,5964
776,610,32,64,8200
777,610,32,128,12568
778,610,32,256,21516


In [4]:
le = preprocessing.LabelEncoder()
le.fit(model_model_inference['model'])
model_model_inference['model'] = le.transform(model_model_inference['model'])
le.classes_

array(['BART_large', 'BERT_base', 'BERT_large', 'GPT-2', 'MiniLM',
       'MiniLM_multilingual', 'RoBERTa_base', 'RoBERTa_large',
       'XLM-R_base', 'XLM-R_large', 'dist_bert', 'mBART_large',
       'mBERT_cased'], dtype=object)

## Initial Testing of different parameters for the regression model

#### Lets test with the data as it is

In [5]:
# Test models all first
X = model_model_inference[["sequence_length", "batch_size", "model"]]
y = model_model_inference[["result"]]

model = sm.OLS(y, X)
res = model.fit()
print(res.summary())

                                 OLS Regression Results                                
Dep. Variable:                 result   R-squared (uncentered):                   0.685
Model:                            OLS   Adj. R-squared (uncentered):              0.684
Method:                 Least Squares   F-statistic:                              563.8
Date:                Tue, 03 Aug 2021   Prob (F-statistic):                   1.73e-194
Time:                        11:00:34   Log-Likelihood:                         -7078.0
No. Observations:                 780   AIC:                                  1.416e+04
Df Residuals:                     777   BIC:                                  1.418e+04
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                      coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------

#### Let's test the same model, but changing the names to the num of params
We get a much better $R^2$ score

In [6]:
# Test models all but with params
X = model_with_params[["sequence_length", "batch_size", "model"]]
y = model_with_params[["result"]]

model = sm.OLS(y, X)
res = model.fit()
print(res.summary())

                                 OLS Regression Results                                
Dep. Variable:                 result   R-squared (uncentered):                   0.771
Model:                            OLS   Adj. R-squared (uncentered):              0.770
Method:                 Least Squares   F-statistic:                              873.4
Date:                Tue, 03 Aug 2021   Prob (F-statistic):                   2.42e-248
Time:                        11:00:35   Log-Likelihood:                         -6953.4
No. Observations:                 780   AIC:                                  1.391e+04
Df Residuals:                     777   BIC:                                  1.393e+04
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                      coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------

#### Test removing the model name

In [7]:
X = model_with_params[["sequence_length", "batch_size"]]
y = model_with_params[["result"]]

model = sm.OLS(y, X)
res = model.fit()
print(res.summary())

                                 OLS Regression Results                                
Dep. Variable:                 result   R-squared (uncentered):                   0.580
Model:                            OLS   Adj. R-squared (uncentered):              0.579
Method:                 Least Squares   F-statistic:                              538.2
Date:                Tue, 03 Aug 2021   Prob (F-statistic):                   1.85e-147
Time:                        11:00:35   Log-Likelihood:                         -7190.0
No. Observations:                 780   AIC:                                  1.438e+04
Df Residuals:                     778   BIC:                                  1.439e+04
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                      coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------

#### Test the imbact of batch size and sequence length

In [8]:
X = model_with_params[["sequence_length", "model"]]
y = model_with_params[["result"]]

model = sm.OLS(y, X)
res = model.fit()
print(res.summary())

                                 OLS Regression Results                                
Dep. Variable:                 result   R-squared (uncentered):                   0.733
Model:                            OLS   Adj. R-squared (uncentered):              0.733
Method:                 Least Squares   F-statistic:                              1070.
Date:                Tue, 03 Aug 2021   Prob (F-statistic):                   5.08e-224
Time:                        11:00:35   Log-Likelihood:                         -7013.3
No. Observations:                 780   AIC:                                  1.403e+04
Df Residuals:                     778   BIC:                                  1.404e+04
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                      coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------

In [9]:
X = model_with_params[["batch_size", "model"]]
y = model_with_params[["result"]]

model = sm.OLS(y, X)
res = model.fit()
print(res.summary())

                                 OLS Regression Results                                
Dep. Variable:                 result   R-squared (uncentered):                   0.707
Model:                            OLS   Adj. R-squared (uncentered):              0.706
Method:                 Least Squares   F-statistic:                              939.5
Date:                Tue, 03 Aug 2021   Prob (F-statistic):                   3.17e-208
Time:                        11:00:35   Log-Likelihood:                         -7049.7
No. Observations:                 780   AIC:                                  1.410e+04
Df Residuals:                     778   BIC:                                  1.411e+04
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

# Make regression for the best fitting parameters

#### Import statndard models

In [10]:
import sklearn
import plotly
import statsmodels
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import PolynomialFeatures
from sklearn import preprocessing
from typing import List, Union, Optional


# INFERENCE
# Model size and GPU
reformer_model_inference = pd.read_csv("data/efficient_new_no_fp_nt_memory_inference_result.csv")
efficient_model_inference = pd.read_csv("data/efficient_training_no_fp_nt_memory_inference_result.csv")
model_model_inference = pd.read_csv("data/regular_no_fp_nt_memory_inference_result.csv")


# Inference time
reformer_time_inference = pd.read_csv("data/efficient_new_no_fp_nt_time_inference_result.csv")
efficient_time_inference = pd.read_csv("data/efficient_training_no_fp_nt_time_inference_result.csv")
model_time_inference = pd.read_csv("data/regular_no_fp_nt_time_inference_result.csv")


# TRAINING
# Model size and GPU
efficient_model_train_inference = pd.read_csv("data/efficient_training_no_fp_wt_memory_inference_result.csv")   # inference
efficient_model_train_train = pd.read_csv("data/efficient_training_no_fp_wt_memory_train_result.csv")           # training
model_model_train_inference = pd.read_csv("data/regular_no_fp_wt_memory_inference_result.csv")                  # inference
model_model_train_train = pd.read_csv("data/regular_no_fp_wt_memory_train_result.csv")                          # train


# Inference Time
efficient_time_train_inference = pd.read_csv("data/efficient_training_no_fp_wt_time_inference_result.csv")    # inference
efficient_time_train_train = pd.read_csv("data/efficient_training_no_fp_wt_time_train_result.csv")            # training
model_time_train_inference = pd.read_csv("data/regular_no_fp_wt_time_inference_result.csv")                   # inference
model_time_train_train = pd.read_csv("data/regular_no_fp_wt_time_train_result.csv")                  


#### Import fp16 models

In [11]:
# INFERENCE
# Model size and GPU
reformer_model_inference_fp16 = pd.read_csv("data_fp16/efficient_new_fp_nt_memory_inference_result.csv")
efficient_model_inference_fp16 = pd.read_csv("data_fp16/efficient_training_fp_nt_memory_inference_result.csv")
model_model_inference_fp16 = pd.read_csv("data_fp16/regular_fp_nt_memory_inference_result.csv")


# Inference time
reformer_time_inference_fp16 = pd.read_csv("data_fp16/efficient_new_fp_nt_time_inference_result.csv")
efficient_time_inference_fp16 = pd.read_csv("data_fp16/efficient_training_fp_nt_time_inference_result.csv")
model_time_inference_fp16 = pd.read_csv("data_fp16/regular_fp_nt_time_inference_result.csv")


# TRAINING
# Model size and GPU
efficient_model_train_inference_fp16 = pd.read_csv("data_fp16/efficient_training_fp_wt_memory_inference_result.csv")   # inference
efficient_model_train_train_fp16 = pd.read_csv("data_fp16/efficient_training_fp_wt_memory_train_result.csv")           # training
model_model_train_inference_fp16 = pd.read_csv("data_fp16/regular_fp_wt_memory_inference_result.csv")                  # inference
model_model_train_train_fp16 = pd.read_csv("data_fp16/regular_fp_wt_memory_train_result.csv")                          # train


# Inference Time
efficient_time_train_inference_fp16 = pd.read_csv("data_fp16/efficient_training_fp_wt_time_inference_result.csv")    # inference
efficient_time_train_train_fp16 = pd.read_csv("data_fp16/efficient_training_fp_wt_time_train_result.csv")            # training
model_time_train_inference_fp16 = pd.read_csv("data_fp16/regular_fp_wt_time_inference_result.csv")                   # inference
model_time_train_train_fp16 = pd.read_csv("data_fp16/regular_fp_wt_time_train_result.csv")                           # train


In [12]:
# https://huggingface.co/transformers/pretrained_models.html
num_params = {
    "BERT_base": 110,
    "BERT_large": 330,
    "RoBERTa_base": 125,
    "RoBERTa_large": 355,
    "dist_bert": 66, # uncased
    "MiniLM": 33,
    "MiniLM_multilingual": 21, 
    "mBERT_cased": 179,
    "XLM-R_base": 270,
    "XLM-R_large": 550,
    "GPT-2": 117,
    "BART_large": 406,
    "mBART_large": 610,
    "Reformer": 149,
    'BigBird': 149, 
    'Longformer_base': 149, 
    'Longformer_large': 435, 
    'Transformer-XL': 257, 
}



def change_model_name_to_params(
        df: pd.DataFrame,
    ) -> pd.DataFrame:
        
    df.dropna(inplace=True)
    df = df[df.model != "Transformer-XL"]
    df_with_params = df.copy(deep=True)
    
    for model, value in num_params.items():
        if model in df_with_params.model.unique():
            df_with_params['model'].replace({model: value}, inplace=True)
    return df_with_params
    
def make_ols(
        df: pd.DataFrame, 
        regressors:List[str]=["sequence_length", "batch_size", "model"], 
        text="GPU allocation"
    ) -> None:
    
    """Get OLS regressors from a Huggingface Transformers benchmark."""
    
    _df = change_model_name_to_params(df)
    # Test models all first
    X = _df[regressors]
    y = _df[["result"]]

    model = sm.OLS(y, X)
    res = model.fit()
    print(text)
    print(f"{res.summary()}\n\n")

In [13]:
make_ols(model_model_inference,        text="Standard Models")
make_ols(model_model_inference_fp16,   text="Standard Models with fp16")
make_ols(model_model_train_train,      text="Standard Models Train")
make_ols(model_model_train_train_fp16, text="Standard Models Train fp16")

Standard Models
                                 OLS Regression Results                                
Dep. Variable:                 result   R-squared (uncentered):                   0.771
Model:                            OLS   Adj. R-squared (uncentered):              0.770
Method:                 Least Squares   F-statistic:                              873.4
Date:                Tue, 03 Aug 2021   Prob (F-statistic):                   2.42e-248
Time:                        11:00:37   Log-Likelihood:                         -6953.4
No. Observations:                 780   AIC:                                  1.391e+04
Df Residuals:                     777   BIC:                                  1.393e+04
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                      coef    std err          t      P>|t|      [0.025      0.975]
--------------------

  if model in df_with_params.model.unique():
  if model in df_with_params.model.unique():
  if model in df_with_params.model.unique():
  if model in df_with_params.model.unique():


In [14]:
make_ols(efficient_model_inference,        text="Efficient Models")
make_ols(efficient_model_inference_fp16,   text="Efficient Models with fp16")
make_ols(efficient_model_train_train,      text="Efficient Models Train")
make_ols(efficient_model_train_train_fp16, text="Efficient Models Train fp16")


  if model in df_with_params.model.unique():
  if model in df_with_params.model.unique():


Efficient Models
                                 OLS Regression Results                                
Dep. Variable:                 result   R-squared (uncentered):                   0.797
Model:                            OLS   Adj. R-squared (uncentered):              0.794
Method:                 Least Squares   F-statistic:                              301.7
Date:                Tue, 03 Aug 2021   Prob (F-statistic):                    1.35e-79
Time:                        11:00:37   Log-Likelihood:                         -2167.3
No. Observations:                 234   AIC:                                      4341.
Df Residuals:                     231   BIC:                                      4351.
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                      coef    std err          t      P>|t|      [0.025      0.975]
-------------------

  if model in df_with_params.model.unique():
  if model in df_with_params.model.unique():


In [15]:
make_ols(reformer_model_inference, text="Reformer", regressors=["sequence_length", "batch_size"])
make_ols(reformer_model_inference_fp16, text="Reformer with fp16", regressors=["sequence_length", "batch_size"])

Reformer
                                 OLS Regression Results                                
Dep. Variable:                 result   R-squared (uncentered):                   0.700
Model:                            OLS   Adj. R-squared (uncentered):              0.692
Method:                 Least Squares   F-statistic:                              87.42
Date:                Tue, 03 Aug 2021   Prob (F-statistic):                    2.52e-20
Time:                        11:00:37   Log-Likelihood:                         -737.41
No. Observations:                  77   AIC:                                      1479.
Df Residuals:                      75   BIC:                                      1484.
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------------

  if model in df_with_params.model.unique():
  if model in df_with_params.model.unique():


In [16]:
model_time_inference

Unnamed: 0,model,batch_size,sequence_length,result
0,BERT_base,1,1,0.0060
1,BERT_base,1,2,0.0063
2,BERT_base,1,4,0.0063
3,BERT_base,1,8,0.0066
4,BERT_base,1,16,0.0066
...,...,...,...,...
775,mBART_large,32,32,0.1134
776,mBART_large,32,64,0.2171
777,mBART_large,32,128,0.4312
778,mBART_large,32,256,0.8824


In [17]:
make_ols(model_time_inference,        text="Standard Models")
make_ols(model_time_inference_fp16,   text="Standard Models with fp16")
make_ols(model_time_train_train,      text="Standard Models Train")
make_ols(model_time_train_train_fp16, text="Standard Models Train fp16")

Standard Models
                                 OLS Regression Results                                
Dep. Variable:                 result   R-squared (uncentered):                   0.372
Model:                            OLS   Adj. R-squared (uncentered):              0.369
Method:                 Least Squares   F-statistic:                              153.1
Date:                Tue, 03 Aug 2021   Prob (F-statistic):                    5.77e-78
Time:                        11:00:37   Log-Likelihood:                          539.27
No. Observations:                 780   AIC:                                     -1073.
Df Residuals:                     777   BIC:                                     -1059.
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                      coef    std err          t      P>|t|      [0.025      0.975]
--------------------

  if model in df_with_params.model.unique():
  if model in df_with_params.model.unique():
  if model in df_with_params.model.unique():
  if model in df_with_params.model.unique():


# Test build other models

In [99]:
import numpy as np
from sklearn.linear_model import LinearRegression


# fake data
X = model_with_params[["sequence_length", "model"]]
y = model_with_params[["result"]]

linreg = LinearRegression()
linreg.fit(X, y)

# prediction
y_pred = linreg.predict(X)
#print(y_pred)
print(r2_score(y, y_pred.reshape(-1, 1)))



import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline



polyreg = make_pipeline(
        PolynomialFeatures(degree=3),
        LinearRegression())
polyreg.fit(X, y)

# prediction
y_pred = polyreg.predict(X)
#print(y_pred)

from sklearn.metrics import r2_score
print(r2_score(y, y_pred.reshape(-1, 1)))

0.3633135264334514
0.4965320608231666


In [48]:
seq_lengths = 10
batches = 6
x = model_model_inference.iloc[:seq_lengths*batches]
X = x[["sequence_length", "batch_size"]]
y = x[["result"]]

model = sm.OLS(y, X)

res = model.fit()
print(res.summary())


                                 OLS Regression Results                                
Dep. Variable:                 result   R-squared (uncentered):                   0.709
Model:                            OLS   Adj. R-squared (uncentered):              0.699
Method:                 Least Squares   F-statistic:                              70.67
Date:                Sun, 01 Aug 2021   Prob (F-statistic):                    2.83e-16
Time:                        19:31:10   Log-Likelihood:                         -506.88
No. Observations:                  60   AIC:                                      1018.
Df Residuals:                      58   BIC:                                      1022.
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                      coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------

In [49]:
seq_lengths = 10
batches = 6
x = model_model_inference.iloc[:seq_lengths*batches]
X = x[["sequence_length", "batch_size", "model"]]
y = x[["result"]]

model = sm.OLS(y, X)

res = model.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                 result   R-squared:                       0.568
Model:                            OLS   Adj. R-squared:                  0.552
Method:                 Least Squares   F-statistic:                     37.41
Date:                Sun, 01 Aug 2021   Prob (F-statistic):           4.19e-11
Time:                        19:31:39   Log-Likelihood:                -427.14
No. Observations:                  60   AIC:                             860.3
Df Residuals:                      57   BIC:                             866.6
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
sequence_length     1.8408      0.253     

In [50]:
seq_lengths = 10
batches = 6
x = model_model_inference.iloc[:seq_lengths*batches]
X = x[["sequence_length"]]
y = x[["result"]]

model = sm.OLS(y, X)

res = model.fit()
print(res.summary())

                                 OLS Regression Results                                
Dep. Variable:                 result   R-squared (uncentered):                   0.422
Model:                            OLS   Adj. R-squared (uncentered):              0.412
Method:                 Least Squares   F-statistic:                              43.01
Date:                Sun, 01 Aug 2021   Prob (F-statistic):                    1.51e-08
Time:                        19:32:42   Log-Likelihood:                         -527.50
No. Observations:                  60   AIC:                                      1057.
Df Residuals:                      59   BIC:                                      1059.
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                      coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------

In [51]:
seq_lengths = 10
batches = 6
x = model_model_inference.iloc[:seq_lengths*batches]
X = x[["batch_size"]]
y = x[["result"]]

model = sm.OLS(y, X)

res = model.fit()
print(res.summary())

                                 OLS Regression Results                                
Dep. Variable:                 result   R-squared (uncentered):                   0.552
Model:                            OLS   Adj. R-squared (uncentered):              0.545
Method:                 Least Squares   F-statistic:                              72.73
Date:                Sun, 01 Aug 2021   Prob (F-statistic):                    7.04e-12
Time:                        19:32:52   Log-Likelihood:                         -519.82
No. Observations:                  60   AIC:                                      1042.
Df Residuals:                      59   BIC:                                      1044.
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

## Compare separate models
We will see that comparing a sinlge model, the highest $R^2$ score is obtained when leaving out the model size.   
However, when we introduce mode models we will see how importaint the addition on the number of parameters of the model truely is:


#### A single model

In [73]:
bert = model_model_inference[(model_model_inference.model == "BERT_base") | (model_model_inference.model == "BERT_large")]
make_ols(bert, regressors=["sequence_length", "batch_size"])
make_ols(bert, regressors=["sequence_length", "batch_size", "model"])

GPU allocation
                                 OLS Regression Results                                
Dep. Variable:                 result   R-squared (uncentered):                   0.677
Model:                            OLS   Adj. R-squared (uncentered):              0.671
Method:                 Least Squares   F-statistic:                              123.6
Date:                Mon, 02 Aug 2021   Prob (F-statistic):                    1.13e-29
Time:                        10:39:05   Log-Likelihood:                         -1043.5
No. Observations:                 120   AIC:                                      2091.
Df Residuals:                     118   BIC:                                      2097.
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)
  if model in df_with_params.model.unique():
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)
  if model in df_with_params.model.unique():


#### Now with an additional model
We will see that adding regressor "model" increases the R2 score greatly!

In [72]:
bert = model_model_inference[(model_model_inference.model == "BERT_base") | (model_model_inference.model == "BERT_large")]
make_ols(bert, regressors=["sequence_length", "batch_size"])
make_ols(bert, regressors=["sequence_length", "batch_size", "model"])

GPU allocation
                                 OLS Regression Results                                
Dep. Variable:                 result   R-squared (uncentered):                   0.677
Model:                            OLS   Adj. R-squared (uncentered):              0.671
Method:                 Least Squares   F-statistic:                              123.6
Date:                Mon, 02 Aug 2021   Prob (F-statistic):                    1.13e-29
Time:                        10:38:59   Log-Likelihood:                         -1043.5
No. Observations:                 120   AIC:                                      2091.
Df Residuals:                     118   BIC:                                      2097.
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)
  if model in df_with_params.model.unique():
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)
  if model in df_with_params.model.unique():
