# Train/test split at different ratios

In [None]:
#Importing essentials
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import norm
from sklearn.model_selection import RandomizedSearchCV


In [None]:
# Initial look at the data:
# the difference between experimental binding energies and liquid drop model binding energies.
# The random forest is trained to predict this difference in order to 'fix' the liquid drop model.

# Parse the file a single time (it used to be re-read once per column);
# unpack=True hands back one array per requested column.
#   N: number of neutrons
#   Z: number of protons
#   y: difference between the experimental binding energy and the liquid drop model
N, Z, y = np.genfromtxt('expthediff.dat', usecols=(0, 1, 2), unpack=True)
A = N + Z  # total number of nucleons
X = np.column_stack((N, Z))  # feature matrix: one (N, Z) row per line of the data file


In [None]:
# Target (experiment vs. liquid drop model difference) against the proton number.
plt.plot(Z, y, 'o', markersize=3)
plt.xlabel('Z')
plt.ylabel('binding energy difference (exp. vs. LDM)')
plt.show()

In [None]:
# Target (experiment vs. liquid drop model difference) against the neutron number.
plt.plot(N, y, 'o', markersize=3)
plt.xlabel('N')
plt.ylabel('binding energy difference (exp. vs. LDM)')
plt.show()

In [None]:
# Target (experiment vs. liquid drop model difference) against the total nucleon number.
plt.plot(A, y, 'o', markersize=3)
plt.xlabel('A')
plt.ylabel('binding energy difference (exp. vs. LDM)')
plt.show()

In [None]:
# Symmetric colour limits (+/- ext) so the diverging colormap is centred on y == 0.
# The builtin int replaces np.int, which was only an alias for it and was removed in NumPy 1.24.
ext = int(np.ceil(max(-np.min(y), np.max(y))))
fig = plt.figure(figsize=(18, 8), dpi=80, facecolor='w', edgecolor='k')
sc = plt.scatter(N, Z, c=y, cmap='PiYG', vmin=-ext, vmax=ext, s=10)
plt.xlabel('N')
plt.ylabel('Z')
plt.colorbar(sc)
plt.show()

In [None]:
# Random forest regressor (1000 trees) that is re-fitted for every train/test split below.
rf = RandomForestRegressor(
    n_estimators=1000,
)  # random_state=42 is left disabled here

In [None]:
# Per-split results: entry i of every list belongs to the i-th train/test ratio of the sweep below.
nl_train = []  # training share of the data, in percent
nl_test = []   # test share of the data, in percent

x_tr = []  # feature rows (columns of X) in the training split
x_te = []  # feature rows (columns of X) in the test split

pred_tr = []  # predictions on the training split
pred_te = []  # predictions on the test split

err_tr = []  # residuals (target - prediction), training split
err_te = []  # residuals (target - prediction), test split

sco_tr = []  # rf.score on the training split
sco_te = []  # rf.score on the test split

nerr_tr = []  # mean of the residuals
nerr_te = []

sderr_tr = []  # standard deviation of the residuals
sderr_te = []

derr_tr = []  # median of the residuals
derr_te = []

num_bins = 80  # bin count shared by all residual histograms

# Sweep the held-out share from 5% to 95% in 5% steps to see how the train/test
# ratio affects bias and variance.  The loop variable is deliberately not called
# `n`: the histogram calls used to rebind that name inside the loop body.
for test_pct in range(5, 96, 5):
    train_pct = 100 - test_pct

    # Split with a fixed seed so every ratio is drawn reproducibly.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_pct / 100, random_state=42)

    # Re-fit the forest on this split's training rows.
    rf.fit(X_train, y_train)

    nl_test.append(test_pct)
    nl_train.append(train_pct)

    # Features that went into each split.
    x_tr.append(X_train)
    x_te.append(X_test)

    # Predictions on both splits.
    pred_train = rf.predict(X_train)
    pred_test = rf.predict(X_test)
    pred_tr.append(pred_train)
    pred_te.append(pred_test)

    # Residuals on both splits.
    err_train = y_train - pred_train
    err_test = y_test - pred_test
    err_tr.append(err_train)
    err_te.append(err_test)

    # Score each split once and reuse the value for the list and the printout
    # (scoring runs a full prediction pass over the forest).
    score_train = rf.score(X_train, y_train)
    score_test = rf.score(X_test, y_test)
    sco_tr.append(score_train)
    sco_te.append(score_test)

    # Summary statistics of the residuals, likewise computed once.
    mean_train, mean_test = np.mean(err_train), np.mean(err_test)
    std_train, std_test = np.std(err_train), np.std(err_test)
    median_train, median_test = np.median(err_train), np.median(err_test)
    nerr_tr.append(mean_train)
    nerr_te.append(mean_test)
    sderr_tr.append(std_train)
    sderr_te.append(std_test)
    derr_tr.append(median_train)
    derr_te.append(median_test)

    # Report for this ratio.
    print('train size: ', train_pct, '%')
    print('test size:  ', test_pct, '%')

    print('train score: ', round(score_train, 5))
    print('test  score: ', round(score_test, 5))

    print('mean error of train:', round(mean_train, 5))
    print('mean error of test: ', round(mean_test, 5))

    print('Standard Deviation error of train:', round(std_train, 5))
    print('Standard Deviation of test: ', round(std_test, 5))

    print('median error of train:', round(median_train, 5))
    print('median error of test: ', round(median_test, 5))

    # Residual histograms: both splits side by side, then each split on its own.
    # The x axis carries the error values and the y axis the number of samples per bin.
    plt.hist([err_train, err_test], num_bins, alpha=0.5,
             label=['train error', 'test error'])
    plt.legend()
    plt.xlabel('error')
    plt.ylabel('count')
    plt.show()

    plt.hist(err_train, num_bins, facecolor='blue', alpha=0.5)
    plt.xlabel('training error')
    plt.ylabel('count')
    plt.show()

    plt.hist(err_test, num_bins, facecolor='blue', alpha=0.5)
    plt.xlabel('testing error')
    plt.ylabel('count')
    plt.show()

    # Residuals against A, i.e. the sum of the two feature columns of X.
    plt.plot(X_train.sum(axis=1), err_train, 'o', markersize=1, label='train Error')
    plt.plot(X_test.sum(axis=1), err_test, 'o', markersize=1, label='test  Error')
    plt.legend()
    plt.ylabel('Error')
    plt.xlabel('A')
    plt.show()

    print('\n')
    
    

In [None]:
#arranging data

def _as_table(*columns):
    """Return a 2-D object array with one row per split and one column per input list.

    The per-split arrays (features, predictions, residuals) have a different
    length for every ratio, so np.stack cannot combine them; each item is
    stored as a single object cell instead.
    """
    n_rows = len(columns[0])
    table = np.empty((n_rows, len(columns)), dtype=object)
    for col_idx, column in enumerate(columns):
        for row_idx, item in enumerate(column):
            table[row_idx, col_idx] = item
    return table


# Tables whose cells are whole arrays: columns are
# (train %, test %, train value, test value).
actual = _as_table(nl_train, nl_test, x_tr, x_te)
Predicted = _as_table(nl_train, nl_test, pred_tr, pred_te)
Error = _as_table(nl_train, nl_test, err_tr, err_te)

# Purely scalar tables, same column layout, one row per ratio.
Mean = np.column_stack((nl_train, nl_test, nerr_tr, nerr_te))
Stdev = np.column_stack((nl_train, nl_test, sderr_tr, sderr_te))
Score = np.column_stack((nl_train, nl_test, sco_tr, sco_te))
Median = np.column_stack((nl_train, nl_test, derr_tr, derr_te))

# Every recorded quantity side by side, one row per ratio.
everything = _as_table(
    nl_train,
    nl_test,
    x_tr,
    x_te,
    pred_tr,
    pred_te,
    err_tr,
    err_te,
    nerr_tr,
    nerr_te,
    sderr_tr,
    sderr_te,
    sco_tr,
    sco_te,
    derr_tr,
    derr_te,
)

"""
plt.plot(np.transpose(Score)[0],np.transpose(Score)[2],'o',markersize=3,label='training  score')
plt.xlabel('% portion Training')
plt.ylabel('R^2')
plt.legend()
plt.show()

plt.plot(np.transpose(Score)[1],np.transpose(Score)[3],'o',markersize=3,label='testing  score')
plt.xlabel('% portion Testing')
plt.ylabel('R^2')
plt.legend()
plt.show()
"""
                                 
plt.plot(np.transpose(Score)[0],np.transpose(Score)[3],'o',markersize=3,label='test  score')
plt.plot(np.transpose(Score)[0],np.transpose(Score)[2],'o',markersize=3,label='train score')
plt.xlabel('% portion Training')
plt.ylabel('R^2')
plt.legend()
plt.show()

"""                                 
plt.plot(np.transpose(Score)[1],np.transpose(Score)[3],'o',markersize=3,label='test  score')
plt.plot(np.transpose(Score)[1],np.transpose(Score)[2],'o',markersize=3,label='train score')
plt.xlabel('% portion Testing')
plt.ylabel('R^2')
plt.legend()
plt.show()
"""

In [None]:
"""
plt.plot(np.transpose(Mean)[0],np.transpose(Mean)[2],'o',markersize=3,label='training  Mean error')
plt.xlabel('% portion Training')
plt.ylabel('Mean error')
plt.legend()
plt.show()
plt.plot(np.transpose(Mean)[1],np.transpose(Mean)[3],'o',markersize=3,label='testing  Mean error')
plt.xlabel('% portion Testing')
plt.ylabel('Mean error')
plt.legend()
plt.show()
"""
plt.plot(np.transpose(Mean)[0],np.transpose(Mean)[3],'o',markersize=3,label='test  Mean error')
plt.plot(np.transpose(Mean)[0],np.transpose(Mean)[2],'o',markersize=3,label='train Mean error')
plt.xlabel('% portion Training')
plt.ylabel('absolute Mean error')
plt.legend()
plt.show()

plt.plot(np.transpose(Mean)[0],abs(np.transpose(Mean)[3]),'o',markersize=3,label='abs test  Mean error')
plt.plot(np.transpose(Mean)[0],abs(np.transpose(Mean)[2]),'o',markersize=3,label='abs train Mean error')
plt.xlabel('% portion Training')
plt.ylabel('absolute Mean error')
plt.legend()
plt.show()

plt.plot(np.transpose(Stdev)[0],np.transpose(Stdev)[3],'o',markersize=3,label='test  STD error')
plt.plot(np.transpose(Stdev)[0],np.transpose(Stdev)[2],'o',markersize=3,label='train STD error')
plt.xlabel('% portion Training')
plt.ylabel('Standard Deviation of error')
plt.legend()
plt.show()

print(abs(np.transpose(Mean)[3]))

print(abs(np.transpose(Mean)[2]))
"""
plt.plot(np.transpose(Mean)[1],np.transpose(Mean)[3],'o',markersize=3,label='test  Mean error')
plt.plot(np.transpose(Mean)[1],np.transpose(Mean)[2],'o',markersize=3,label='train Mean error')
plt.xlabel('% portion Testing')
plt.ylabel('Mean error')
plt.legend()
plt.show()
"""


In [None]:
"""
plt.plot(np.transpose(Median)[0],np.transpose(Median)[2],'o',markersize=3,label='training  Median error')
plt.xlabel('% portion Training')
plt.ylabel('Median error')
plt.legend()
plt.show()

plt.plot(np.transpose(Median)[1],np.transpose(Median)[3],'o',markersize=3,label='testing  Median error')
plt.xlabel('% portion Testing')
plt.ylabel('Median error')
plt.legend()
plt.show()
"""

plt.plot(np.transpose(Median)[0],abs(np.transpose(Median)[3]),'o',markersize=3,label='test  Median error')
plt.plot(np.transpose(Median)[0],abs(np.transpose(Median)[2]),'o',markersize=3,label='train Median error')
plt.xlabel('% portion Training')
plt.ylabel('Median error')
plt.legend()
plt.show()


plt.plot(np.transpose(Median)[0],np.transpose(Median)[3],'o',markersize=3,label='test  Median error')
plt.plot(np.transpose(Median)[0],np.transpose(Median)[2],'o',markersize=3,label='train Median error')
plt.xlabel('% portion Training')
plt.ylabel('Median error')
plt.legend()
plt.show()
"""
plt.plot(np.transpose(Median)[1],np.transpose(Median)[3],'o',markersize=3,label='test  Median error')
plt.plot(np.transpose(Median)[1],np.transpose(Median)[2],'o',markersize=3,label='train Median error')
plt.xlabel('% portion Testing')
plt.ylabel('Median error')
plt.legend()
plt.show()
"""


In [None]:
# Summary: the four headline curves against the training share.
# Every table has the columns (train %, test %, train value, test value).
# The x values are the percentages 5..95 recorded by the sweep, so no abs() is
# needed on that axis; abs() is applied to the mean and median errors only, and
# those two plots are labelled accordingly.

#Score plot
plt.plot(Score[:, 0], Score[:, 3], 'o', markersize=3, label='test  score')
plt.plot(Score[:, 0], Score[:, 2], 'o', markersize=3, label='train score')
plt.xlabel('% portion Training')
plt.ylabel('R^2')
plt.legend()
plt.show()

#mean plot (magnitude)
plt.plot(Mean[:, 0], np.abs(Mean[:, 3]), 'o', markersize=3, label='abs test  Mean error')
plt.plot(Mean[:, 0], np.abs(Mean[:, 2]), 'o', markersize=3, label='abs train Mean error')
plt.xlabel('% portion Training')
plt.ylabel('absolute Mean error')
plt.legend()
plt.show()

#Standard Deviation Plot
plt.plot(Stdev[:, 0], Stdev[:, 3], 'o', markersize=3, label='test  STD error')
plt.plot(Stdev[:, 0], Stdev[:, 2], 'o', markersize=3, label='train STD error')
plt.xlabel('% portion Training')
plt.ylabel('Standard Deviation of error')
plt.legend()
plt.show()

#Median error plot (magnitude)
plt.plot(Median[:, 0], np.abs(Median[:, 3]), 'o', markersize=3, label='abs test  Median error')
plt.plot(Median[:, 0], np.abs(Median[:, 2]), 'o', markersize=3, label='abs train Median error')
plt.xlabel('% portion Training')
plt.ylabel('absolute Median error')
plt.legend()
plt.show()


In [None]:
# Empty trailing cell (a lone ';' is not a valid Python statement).