In [1]:
#import
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')

In [2]:
#Reading dataset
df=pd.read_csv('MF.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68 entries, 0 to 67
Columns: 150 entries, Cate_name to Yield
dtypes: int64(148), object(2)
memory usage: 79.8+ KB


In [3]:
#Building descriptors
X = df.drop(columns=['Yield', 'Cate_name', 'P_name' ])
print('---Descriptors---')
print(X.head())

y = pd.DataFrame(df['Yield'],columns=['Yield'])
print('---Objective---')
print(y.head())

---Descriptors---
   Cate_MF_9  Cate_MF_33  Cate_MF_58  Cate_MF_68  Cate_MF_80  Cate_MF_102  \
0          0           0           0           0           0            0   
1          0           0           0           0           0            0   
2          0           0           0           0           0            0   
3          0           0           0           0           0            0   
4          0           0           0           0           0            0   

   Cate_MF_105  Cate_MF_114  Cate_MF_125  Cate_MF_142  ...  P_MF_1154  \
0            0            0            0            0  ...          1   
1            0            0            0            0  ...          0   
2            0            0            0            0  ...          0   
3            0            0            0            0  ...          0   
4            0            0            0            1  ...          1   

   P_MF_1199  P_MF_1272  P_MF_1380  P_MF_1570  P_MF_1689  P_MF_1750  \
0        

In [4]:
#storage 
data_r2_train =[]
data_RMSE_train = []
data_MAE_train = []
data_r2_test = []
data_RMSE_test = []
data_MAE_test = []

#LR
for i in range(10):
    seed=i
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)
    
    a_X_train = (X_train - X_train.mean(axis=0)) / X_train.std(axis=0, ddof=1)
    a_X_test = (X_test - X_train.mean(axis=0)) / X_train.std(axis=0, ddof=1)
    a_X_train = a_X_train.dropna(how='any', axis=1)
    a_X_test = a_X_test[a_X_train.columns]
    
    model = LinearRegression()
    model.fit(a_X_train, y_train['Yield'])
    
    y_pred1 = model.predict(a_X_train)
    y_pred2 = model.predict(a_X_test)
    
    #train
    r2_train = metrics.r2_score(y_train, y_pred1)
    RMSE_train = metrics.root_mean_squared_error(y_train, y_pred1)
    MAE_train =  metrics.mean_absolute_error(y_train, y_pred1)
    #test
    r2_test = metrics.r2_score(y_test, y_pred2)
    RMSE_test = metrics.root_mean_squared_error(y_test, y_pred2)
    MAE_test = metrics.mean_absolute_error(y_test, y_pred2)
    
    data_r2_train.append(r2_train)
    data_RMSE_train.append(RMSE_train)
    data_MAE_train.append(MAE_train)
    data_r2_test.append(r2_test)
    data_RMSE_test.append(RMSE_test)
    data_MAE_test.append(MAE_test)
    
    print('----------------------')
    print('seed:', seed)
    print('R2_train:', r2_train)
    print('R2_test:', r2_test)
print('R2_train_means:', sum(data_r2_train)/10)
print('R2_test_means:', sum(data_r2_test)/10)

----------------------
seed: 0
R2_train: 0.9031426676042149
R2_test: 0.6982755873604696
----------------------
seed: 1
R2_train: 0.8393419859639167
R2_test: 0.758694575823311
----------------------
seed: 2
R2_train: 0.8083528505364648
R2_test: 0.6245128100046202
----------------------
seed: 3
R2_train: 0.8797232027574308
R2_test: 0.6913585101350004
----------------------
seed: 4
R2_train: 0.8654077724970741
R2_test: -3.5792140433995518e+25
----------------------
seed: 5
R2_train: 0.8650093806388526
R2_test: 0.7016317294677079
----------------------
seed: 6
R2_train: 0.8708164444496075
R2_test: -2.68165892267318e+27
----------------------
seed: 7
R2_train: 0.8761357802802903
R2_test: 0.6214627630407883
----------------------
seed: 8
R2_train: 0.8913650304501388
R2_test: 0.6028309014823907
----------------------
seed: 9
R2_train: 0.7434097796859211
R2_test: 0.013051988795198755
R2_train_means: 0.8542704894863912
R2_test_means: -2.7174510631071756e+26


In [5]:
data_r2_train_pd = pd.DataFrame(data= data_r2_train, columns=['r2_train'])
data_RMSE_train_pd = pd.DataFrame(data=data_RMSE_train, columns=['RMSE_train'])
data_MAE_train_pd = pd.DataFrame(data=data_MAE_train, columns=['MAE_train'])
data_r2_test_pd = pd.DataFrame(data=data_r2_test, columns=['r2_test'])
data_RMSE_test_pd = pd.DataFrame(data=data_RMSE_test, columns=['RMSE_test'])
data_MAE_test_pd = pd.DataFrame(data=data_MAE_test, columns=['MAE_test'])

data_all = pd.concat([data_r2_train_pd, data_RMSE_train_pd, data_MAE_train_pd, data_r2_test_pd, 
                      data_RMSE_test_pd, data_MAE_test_pd],
                     axis=1, join='inner')

data_all.loc['mean'] = data_all.mean()
data_all.loc['std'] = data_all.std(ddof=0)
print(data_all)

data_all.to_csv('../../score/MF/LR_score.csv')

      r2_train  RMSE_train  MAE_train       r2_test     RMSE_test  \
0     0.903143    8.450420   6.146456  6.982756e-01  1.986556e+01   
1     0.839342   12.053406   9.186029  7.586946e-01  1.529014e+01   
2     0.808353   13.315127  10.647447  6.245128e-01  1.905568e+01   
3     0.879723   10.591669   7.697833  6.913585e-01  1.721097e+01   
4     0.865408   10.771794   7.980537 -3.579214e+25  1.950863e+14   
5     0.865009   10.455003   7.191675  7.016317e-01  1.904932e+01   
6     0.870816   10.296482   6.914883 -2.681659e+27  1.796460e+15   
7     0.876136   10.884662   8.442657  6.214628e-01  1.852423e+01   
8     0.891365    9.778111   7.126876  6.028309e-01  1.981153e+01   
9     0.743410   16.125744  12.765779  1.305199e-02  2.637158e+01   
mean  0.854270   11.272242   8.410017 -2.717451e+26  1.991547e+14   
std   0.042667    1.931062   1.803119  7.659885e+26  5.106772e+14   

          MAE_test  
0     1.389382e+01  
1     1.187524e+01  
2     1.409318e+01  
3     1.281760e+01