In [1]:
#import
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the dataset from 'Morgan_desc.csv' (path is relative to the working
# directory) into a DataFrame, then print a summary of its index, columns and
# dtypes.
# NOTE(review): judging by the file name this presumably holds Morgan
# fingerprint descriptors - confirm against the data source.
df=pd.read_csv('Morgan_desc.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68 entries, 0 to 67
Columns: 2049 entries, Yield to 1023.1
dtypes: int64(2049)
memory usage: 1.1 MB


In [3]:
#Preparing

# Delete variables with zero variance.
# np.where returns a 1-tuple; its single array holds the positions of the
# columns whose variance over the rows is exactly 0.
Var0Variable = np.where( df.var(axis=0) == 0 )
if len(Var0Variable[0]) == 0:
    print( "There is no variable with zero variance." )
    print( "" )
    # Nothing to remove, but df_var0 must still be defined: the code that
    # follows reads df_var0 and would otherwise stop with a NameError.
    df_var0 = df.copy()
else:
    print( "There were {0} variables with zero variance".format(len(Var0Variable[0])))
    print( "the number of variable is: {0}".format(Var0Variable[0]) )
    print( "remove them" )
    print( "" )
    # Index the column labels with the position array itself (not the
    # enclosing tuple) and drop those columns; df itself is left untouched.
    df_var0 = df.drop(df.columns[Var0Variable[0]], axis=1)

# Drop every column that consists solely of zeros.
def remove_all_zero_col(df):
    """Return a copy of ``df`` without the columns whose values are all 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame holding only the columns that contain at least one value
        different from 0, in their original order.
    """
    # Collect the labels first, then remove them in a single non-mutating drop.
    all_zero_names = [name for name in df.columns if (df[name] == 0).all()]
    return df.drop(columns=all_zero_names)
# Remove the columns that contain only zeros from df_var0, then print a
# summary of the frame that remains.
df_var0 = remove_all_zero_col(df_var0)
df_var0.info()

There were 1914 variables with zero variance
the number of variable is: [   1    2    3 ... 2046 2047 2048]
remove them

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68 entries, 0 to 67
Columns: 135 entries, Yield to 919.1
dtypes: int64(135)
memory usage: 71.8 KB


In [4]:
# Split the cleaned table into explanatory variables and objective variable.
# X: every column of df_var0 except 'Yield'.
X = df_var0.drop('Yield', axis=1)
# y: the 'Yield' column, kept as a single-column DataFrame (not a Series).
y = df_var0[['Yield']]

print('---Descriptors---')
print(X.head())
print('---Objective---')
print(y.head())

---Descriptors---
   8  9  15  33  58  63  64  68  80  102  ...  623.1  665.1  695.1  726.1  \
0  0  0   0   0   0   0   1   0   0    0  ...      1      1      1      0   
1  0  0   0   0   0   0   1   0   0    0  ...      1      0      0      0   
2  0  0   0   0   0   0   1   0   0    0  ...      1      0      0      1   
3  0  0   0   0   0   0   1   0   0    0  ...      0      0      0      0   
4  0  0   0   1   0   0   0   0   0    0  ...      1      1      1      0   

   755.1  802.1  807.1  841.1  849.1  919.1  
0      0      1      0      1      0      0  
1      1      0      1      0      0      0  
2      0      0      1      0      1      1  
3      0      0      1      0      0      0  
4      0      1      0      1      0      0  

[5 rows x 134 columns]
---Objective---
   Yield
0     80
1      7
2      3
3     48
4     99


In [5]:
#storage
# One entry is appended to each list per random split evaluated below.
data_r2_train =[]
# NOTE: despite the "RMSE" in their names, the next two "RMSE" lists receive
# the plain output of metrics.mean_squared_error (no square root is taken),
# i.e. they hold MSE values.
data_RMSE_train = []
data_MAE_train = []
data_r2_test = []
data_RMSE_test = []
data_MAE_test = []
best_model_parametors = []
data_cv_score = []

#SVM_a_X,log(y)
# For seeds 0..9: split 70/30, autoscale X with the training statistics,
# grid-search an RBF SVR on log(y), then score the back-transformed
# predictions on the training and the test part.
for i in range(10):
    seed=i
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)

    #autoscaling
    # Both parts are scaled with the mean / standard deviation of the
    # training part only.
    train_mean = X_train.mean(axis=0)
    train_std = X_train.std(axis=0, ddof=1)
    # Columns that are constant in the training part divide 0 by 0 and become
    # NaN; they are dropped.
    a_X_train = ((X_train - train_mean) / train_std).dropna(axis=1)
    # Keep exactly the training columns in the test part. Filtering the test
    # part with its own dropna could keep a column the training part lost
    # (x / 0 gives +-inf, not NaN, when x differs from the training mean),
    # leaving the two feature sets misaligned.
    a_X_test = ((X_test - train_mean) / train_std)[a_X_train.columns]
    # The model is fitted on log(y); predictions are mapped back with exp.
    log_y_train = np.log(y_train)

    param_svm={'C':[0.1,1,5,10,50,100,200,300,400,500,750,1000],
               'gamma':[100,10,1,0.1,0.01,0.001,0.0001,0.00001],
               'epsilon':[0.3]}
    reg_svm = GridSearchCV(SVR(kernel='rbf'), param_grid=param_svm, cv=5, n_jobs=2)
    reg_svm.fit(a_X_train,log_y_train['Yield'])
    reg_best = reg_svm.best_estimator_
    y_pred1 = np.exp(reg_best.predict(a_X_train))
    y_pred2 = np.exp(reg_best.predict(a_X_test))

    #train
    r2_train = metrics.r2_score(y_train, y_pred1)
    # Stored as returned by mean_squared_error (MSE, see the note at the top).
    RMSE_train = metrics.mean_squared_error(y_train, y_pred1)
    MAE_train =  metrics.mean_absolute_error(y_train, y_pred1)
    #test
    r2_test = metrics.r2_score(y_test, y_pred2)
    RMSE_test = metrics.mean_squared_error(y_test, y_pred2)
    MAE_test = metrics.mean_absolute_error(y_test, y_pred2)

    data_r2_train.append(r2_train)
    data_RMSE_train.append(RMSE_train)
    data_MAE_train.append(MAE_train)
    data_r2_test.append(r2_test)
    data_RMSE_test.append(RMSE_test)
    data_MAE_test.append(MAE_test)
    data_cv_score.append(reg_svm.best_score_)
    # Record the selected hyper-parameters of this split as well.
    best_model_parametors.append(reg_svm.best_params_)

    print('----------------------')
    print('seed:', seed)
    print("Best Model Parameter:",reg_svm.best_params_)
    print("Best Model Score:",reg_svm.best_score_)
    print('R2_test:', r2_test)
# Average over however many splits were actually evaluated.
print('R2_train_means:', sum(data_r2_train)/len(data_r2_train))
print('CV_score_means:', sum(data_cv_score)/len(data_cv_score))
print('R2_test_means:', sum(data_r2_test)/len(data_r2_test))

----------------------
seed: 0
Best Model Parameter: {'C': 400, 'epsilon': 0.3, 'gamma': 0.0001}
Best Model Score: 0.9264598178940388
R2_test: 0.7170090777747915
----------------------
seed: 1
Best Model Parameter: {'C': 750, 'epsilon': 0.3, 'gamma': 0.0001}
Best Model Score: 0.9048125437920589
R2_test: 0.81671841967098
----------------------
seed: 2
Best Model Parameter: {'C': 500, 'epsilon': 0.3, 'gamma': 0.0001}
Best Model Score: 0.9379501314880925
R2_test: 0.7792480806551921
----------------------
seed: 3
Best Model Parameter: {'C': 400, 'epsilon': 0.3, 'gamma': 0.0001}
Best Model Score: 0.91224338493578
R2_test: 0.8136230653814907
----------------------
seed: 4
Best Model Parameter: {'C': 1000, 'epsilon': 0.3, 'gamma': 0.0001}
Best Model Score: 0.9101076823558467
R2_test: 0.8133639245255738
----------------------
seed: 5
Best Model Parameter: {'C': 1000, 'epsilon': 0.3, 'gamma': 1e-05}
Best Model Score: 0.8987256523453109
R2_test: 0.5633267988407349
----------------------
seed: 6


In [7]:
# Turn each per-split score list into a single-column frame.
data_r2_train_pd = pd.DataFrame(data= data_r2_train, columns=['r2_train'])
# The two error lists named "RMSE" were filled with
# metrics.mean_squared_error values (no square root), so both columns are
# labelled MSE.
data_RMSE_train_pd = pd.DataFrame(data=data_RMSE_train, columns=['MSE_train'])
data_MAE_train_pd = pd.DataFrame(data=data_MAE_train, columns=['MAE_train'])
data_r2_test_pd = pd.DataFrame(data=data_r2_test, columns=['r2_test'])
data_RMSE_test_pd = pd.DataFrame(data=data_RMSE_test, columns=['MSE_test'])
data_MAE_test_pd = pd.DataFrame(data=data_MAE_test, columns=['MAE_test'])
data_cv_score_pd = pd.DataFrame(data=data_cv_score, columns=['cv_score'])
# One row per stored parameter set (an empty frame when none were stored).
data_parametors = pd.DataFrame(best_model_parametors)

# Put the metric columns side by side (rows are aligned on the split index),
# then append a final row labelled 'mean' with the column averages.
data_all = pd.concat([data_r2_train_pd, data_RMSE_train_pd, data_MAE_train_pd, data_r2_test_pd, data_RMSE_test_pd, data_MAE_test_pd, data_cv_score_pd], 
                     axis=1, join='inner')
data_all.loc['mean'] = data_all.mean()
print(data_all)

# Written relative to the working directory; the target folder must exist.
data_all.to_csv('../../score/MorganF/SVM_score.csv')

      r2_train   MSE_train  MAE_train   r2_test   RMSE_test   MAE_test  \
0     0.910675   65.856383   5.091945  0.717009  370.138009  11.876006   
1     0.879309  109.141822   6.840061  0.816718  177.572006   7.751250   
2     0.892922   99.057790   6.389224  0.779248  213.480621   8.800067   
3     0.881913  110.141458   6.779067  0.813623  178.874523   7.960875   
4     0.900176   86.057923   5.911058  0.813364  198.454667   8.719839   
5     0.788403  171.338518   7.028079  0.563327  531.083730  14.205720   
6     0.886763   92.930947   6.584412  0.633342  441.258842  13.449676   
7     0.897484   98.056075   6.585389  0.762779  215.042805   9.045482   
8     0.894704   92.672408   5.992927  0.661528  334.490584  12.526893   
9     0.850261  151.752374   8.336058  0.871936   90.241044   5.807575   
mean  0.878261  107.700570   6.553822  0.743287  275.063683  10.014338   

      cv_score  
0     0.926460  
1     0.904813  
2     0.937950  
3     0.912243  
4     0.910108  
5     0.8