In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
import json
from sklearn.svm import NuSVR
from sklearn.model_selection import train_test_split
import os
from math import sqrt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.ensemble import RandomForestRegressor

def calculate_metrics(y_true, y_pred):    
    r2 = r2_score(y_true, y_pred)
    print(" R2 of the model is ", r2)
    rms = sqrt(mean_squared_error(y_true, y_pred))
    print(" RMSE of the model is ", rms)
    mae = mean_absolute_error(y_true, y_pred)
    print(" MAE of the model is ", mae)
    mape = mean_absolute_percentage_error(y_true, y_pred)
    print(" MAPE of the model is ", mape)
    
def make_plot(y_test, y_pred):
    plt.figure(figsize=(15, 5))
    plt.plot(y_pred)
    plt.plot(y_test.values)
    plt.legend(['Predicted', 'Actual'])
    
def make_scatterplot(y_test, y_pred):
    plt.scatter(y_test, y_pred)
    plt.xlabel('Actual')
    plt.ylabel('Predicted')


DATA_PATH = "./Simulations/"
datapoints = {}

for root, dirs, files in os.walk(DATA_PATH):
    if 'FormFactor.json' in files:
        simul_id = root[len(DATA_PATH):]
        
        f = open(root + '/FormFactor.json')
        data = json.load(f)
        form_factor = pd.DataFrame(data)[1]
        
        if 'thickness.json' in files:
            f = open(root + '/thickness.json')
            thickness = json.load(f)
        else:
            thickness = np.nan
            
        if 'apl.json' in files:
            f = open(root + '/apl.json')
            data = json.load(f)
            
            value_mean = np.mean(list(data.values()))
            
            if np.isnan(value_mean):
                apl = np.nan
                apl_len = np.nan
            else:
                apl = value_mean
                apl_len = len(list(data.values()))
        else:
            apl = np.nan
            apl_len = np.nan
            
        if 'TotalDensity.json' in files:
            f = open(root + '/TotalDensity.json')
            data = json.load(f)
            
            total_density = pd.DataFrame(data)
            total_density_len = total_density.shape[0]
        else:
            total_density = np.nan
            total_density_len = np.nan
            
        result = {
            'form_factor': form_factor,
            'thickness': thickness,
            'apl': apl,
            'apl_len': apl_len,
            'total_density': total_density,
            'total_density_len': total_density_len
        }
        
        datapoints[simul_id] = result

In [3]:
len(datapoints.keys())

715

In [6]:
form_factor = []
apl = []

for key in datapoints:
    if not np.isnan(datapoints[key]['apl']):
        form_factor.append(datapoints[key]['form_factor'])
        apl.append(datapoints[key]['apl'])

In [8]:
len(apl)

714

In [9]:
form_factor = np.array(form_factor)

In [12]:
df = pd.DataFrame(form_factor)
df['apl'] = apl

In [13]:
df_copy=df

In [14]:
x = df[['apl']]

In [15]:
y = df.drop(['apl'],axis=1)

In [16]:
y

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,6.344403,6.320138,6.247351,6.126077,5.956366,5.738296,5.471964,5.157489,4.795020,4.384730,...,0.680424,0.707091,0.735799,0.766320,0.798438,0.831960,0.866705,0.902513,0.939236,0.976743
1,22.490975,22.459825,22.366391,22.210718,21.992883,21.712994,21.371188,20.967635,20.502534,19.976114,...,4.380145,4.377129,4.372789,4.367126,4.360142,4.351838,4.342218,4.331287,4.319049,4.305511
2,13.783229,13.762646,13.700906,13.598035,13.454075,13.269087,13.043148,12.776350,12.468807,12.120644,...,2.249947,2.218988,2.187594,2.155785,2.123578,2.090991,2.058044,2.024753,1.991138,1.957217
3,21.447578,21.483182,21.589971,21.767875,22.016780,22.336524,22.726898,23.187651,23.718482,24.319047,...,2.408326,2.474208,2.538082,2.599919,2.659692,2.717376,2.772949,2.826390,2.877679,2.926800
4,13.997836,13.973720,13.901380,13.780851,13.612188,13.395470,13.130795,12.818287,12.458089,12.050368,...,0.242800,0.235561,0.228223,0.220795,0.213286,0.205709,0.198076,0.190400,0.182700,0.174993
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
709,75.019056,74.984627,74.881354,74.709289,74.468513,74.159143,73.781326,73.335246,72.821115,72.239182,...,1.208284,1.180976,1.154276,1.128200,1.102765,1.077986,1.053877,1.030453,1.007729,0.985716
710,92.289343,92.246671,92.118676,91.905425,91.607026,91.223633,90.755444,90.202699,89.565682,88.844720,...,9.335528,9.133224,8.931639,8.730875,8.531033,8.332216,8.134524,7.938056,7.742911,7.549186
711,51.763699,51.740017,51.668979,51.550611,51.384957,51.172078,50.912051,50.604972,50.250954,49.850127,...,3.539732,3.460534,3.380503,3.299691,3.218150,3.135933,3.053098,2.969702,2.885806,2.801474
712,71.950184,71.915288,71.810614,71.636207,71.392140,71.078515,70.695464,70.243148,69.721756,69.131507,...,1.276403,1.347048,1.417474,1.487645,1.557526,1.627084,1.696287,1.765104,1.833502,1.901452


In [17]:
X_train, X_test, y_train, y_test = train_test_split(y, x, test_size=0.30, random_state=3)

In [18]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
204,48.606275,48.630936,48.704908,48.828153,49.000611,49.222196,49.492799,49.812286,50.180498,50.597253,...,6.214034,6.275971,6.335988,6.394065,6.450180,6.504317,6.556457,6.606583,6.654681,6.700735
686,30.243649,30.222959,30.160896,30.057484,29.912759,29.726775,29.499600,29.231315,28.922019,28.571825,...,1.418072,1.440353,1.461691,1.482089,1.501552,1.520083,1.537689,1.554375,1.570146,1.585010
211,87.837443,87.805530,87.709804,87.550305,87.327100,87.040281,86.689968,86.276306,85.799469,85.259654,...,0.237966,0.240245,0.244531,0.250689,0.258550,0.267928,0.278632,0.290476,0.303287,0.316908
16,38.608844,38.584945,38.513256,38.393803,38.226628,38.011789,37.749364,37.439444,37.082140,36.677577,...,2.717158,2.736302,2.755918,2.775984,2.796480,2.817386,2.838682,2.860347,2.882363,2.904711
8,44.793382,44.775384,44.721398,44.631443,44.505548,44.343760,44.146134,43.912738,43.643656,43.338979,...,0.017372,0.019730,0.022067,0.024383,0.026678,0.028951,0.031203,0.033433,0.035641,0.037828
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
659,22.800373,22.778921,22.714573,22.607355,22.457313,22.264507,22.029019,21.750944,21.430399,21.067516,...,3.779281,3.796960,3.814912,3.833111,3.851532,3.870149,3.888937,3.907869,3.926921,3.946067
256,35.247161,35.274186,35.355249,35.490310,35.679306,35.922148,36.218720,36.568881,36.972466,37.429284,...,8.719486,8.846607,8.971691,9.094691,9.215562,9.334259,9.450737,9.564953,9.676865,9.786431
643,5.089394,5.103772,5.146902,5.218770,5.319352,5.448617,5.606521,5.793014,6.008036,6.251517,...,7.783291,7.866851,7.949108,8.030044,8.109643,8.187891,8.264771,8.340269,8.414371,8.487062
249,62.063384,62.023428,61.903579,61.703902,61.424503,61.065529,60.627171,60.109661,59.513274,58.838325,...,2.372707,2.277806,2.185727,2.096491,2.010118,1.926625,1.846028,1.768341,1.693578,1.621749


In [19]:
rf = RandomForestRegressor(n_jobs=-1, n_estimators=1000)

In [20]:
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

  rf.fit(X_train, y_train)


In [21]:
calculate_metrics(y_test, y_pred)

 R2 of the model is  0.8666303035363394
 RMSE of the model is  2.6960625208783653
 MAE of the model is  1.6038135214502622
 MAPE of the model is  0.027324815985837532


In [29]:
knn = KNeighborsRegressor()

In [30]:
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

In [31]:
calculate_metrics(y_test, y_pred)

 R2 of the model is  0.9208602315748711
 RMSE of the model is  2.076819559453366
 MAE of the model is  1.5356873926388945
 MAPE of the model is  0.02589600260749989


In [25]:
from lazypredict.Supervised import LazyRegressor

In [26]:
reg = LazyRegressor(verbose=0, ignore_warnings=False, custom_metric=None)
models, predictions = reg.fit(X_train, X_test, y_train, y_test)

 55%|███████████████████████▌                   | 23/42 [04:06<03:51, 12.19s/it]

LassoLarsIC model failed to execute
You are using LassoLarsIC in the case where the number of samples is smaller than the number of features. In this setting, getting a good estimate for the variance of the noise is not possible. Provide an estimate of the noise variance in the constructor.


 76%|████████████████████████████████▊          | 32/42 [04:30<00:51,  5.19s/it]

RANSACRegressor model failed to execute
`min_samples` may not be larger than number of samples: n_samples = 499.


100%|███████████████████████████████████████████| 42/42 [05:20<00:00,  7.63s/it]


In [27]:
models

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SGDRegressor,6.599651725474689e+20,-2.423984231879956e+21,363468076742.44,0.12
Lars,5.784952631135992e+16,-2.124753629940603e+17,3402950781.16,0.34
KernelRidge,18.91,-64.78,59.87,0.09
GaussianProcessRegressor,17.06,-57.99,56.7,1.04
LinearRegression,2.04,-2.83,14.44,0.42
TransformedTargetRegressor,2.04,-2.83,14.44,0.34
QuantileRegressor,1.3,-0.12,7.8,9.98
DummyRegressor,1.27,-0.0,7.4,0.05
LarsCV,1.18,0.35,5.95,3.95
DecisionTreeRegressor,1.09,0.69,4.14,0.56
