In [12]:
import h5py
import pandas
import numpy as np

def load_data(name):
    """Read one dataset from an HDF5 file into a float64 DataFrame.

    Parameters
    ----------
    name : str
        Path of the file without its ``.h5`` extension. The file
        ``f'{name}.h5'`` is opened read-only.

    Returns
    -------
    pandas.DataFrame
        The full contents of the dataset stored under the key
        ``name.split('/')[0]``, converted to ``np.float64``.

    Notes
    -----
    NOTE(review): the dataset key is the *first* '/'-separated component of
    ``name``, so for a nested path such as ``'dir/train'`` the key is
    ``'dir'``, not ``'train'`` - confirm this is intended before passing
    nested paths.
    """
    h5_path = f'{name}.h5'
    with h5py.File(h5_path, 'r') as h5_file:
        dataset_key = name.split('/')[0]
        # `[:]` copies the whole dataset into memory, so the frame can be
        # built after the file has been closed.
        records = h5_file[dataset_key][:]
    return pandas.DataFrame(records, dtype=np.float64)

# Load both datasets. Each call opens '<name>.h5' read-only and returns its
# contents as a float64 DataFrame (see load_data above).
train = load_data('train')
test  = load_data('test')

In [13]:
# Names of every candidate input column. This list is used to select the
# feature columns from `train`, and the feature-selection step indexes back
# into it by position, so its order must not be changed independently.
all_variables = ['actualInteractionsPerCrossing', 'averageInteractionsPerCrossing', 'correctedActualMu', 'correctedAverageMu', 'correctedScaledActualMu', 'correctedScaledAverageMu', 'NvtxReco', 'p_nTracks', 'p_pt_track', 'p_eta', 'p_phi', 'p_charge', 'p_qOverP', 'p_z0', 'p_d0', 'p_sigmad0', 'p_d0Sig', 'p_EptRatio', 'p_dPOverP', 'p_z0theta', 'p_etaCluster', 'p_phiCluster', 'p_eCluster', 'p_rawEtaCluster', 'p_rawPhiCluster', 'p_rawECluster', 'p_eClusterLr0', 'p_eClusterLr1', 'p_eClusterLr2', 'p_eClusterLr3', 'p_etaClusterLr1', 'p_etaClusterLr2', 'p_phiClusterLr2', 'p_eAccCluster', 'p_f0Cluster', 'p_etaCalo', 'p_phiCalo', 'p_eTileGap3Cluster', 'p_cellIndexCluster', 'p_phiModCalo', 'p_etaModCalo', 'p_dPhiTH3', 'p_R12', 'p_fTG3', 'p_weta2', 'p_Reta', 'p_Rphi', 'p_Eratio', 'p_f1', 'p_f3', 'p_Rhad', 'p_Rhad1', 'p_deltaEta1', 'p_deltaPhiRescaled2', 'p_TRTPID', 'p_TRTTrackOccupancy', 'p_numberOfInnermostPixelHits', 'p_numberOfPixelHits', 'p_numberOfSCTHits', 'p_numberOfTRTHits', 'p_numberOfTRTXenonHits', 'p_chi2', 'p_ndof', 'p_SharedMuonTrack', 'p_E7x7_Lr2', 'p_E7x7_Lr3', 'p_E_Lr0_HiG', 'p_E_Lr0_LowG', 'p_E_Lr0_MedG', 'p_E_Lr1_HiG', 'p_E_Lr1_LowG', 'p_E_Lr1_MedG', 'p_E_Lr2_HiG', 'p_E_Lr2_LowG', 'p_E_Lr2_MedG', 'p_E_Lr3_HiG', 'p_E_Lr3_LowG', 'p_E_Lr3_MedG', 'p_ambiguityType', 'p_asy1', 'p_author', 'p_barys1', 'p_core57cellsEnergyCorrection', 'p_deltaEta0', 'p_deltaEta2', 'p_deltaEta3', 'p_deltaPhi0', 'p_deltaPhi1', 'p_deltaPhi2', 'p_deltaPhi3', 'p_deltaPhiFromLastMeasurement', 'p_deltaPhiRescaled0', 'p_deltaPhiRescaled1', 'p_deltaPhiRescaled3', 'p_e1152', 'p_e132', 'p_e235', 'p_e255', 'p_e2ts1', 'p_ecore', 'p_emins1', 'p_etconeCorrBitset', 'p_ethad', 'p_ethad1', 'p_f1core', 'p_f3core', 'p_maxEcell_energy', 'p_maxEcell_gain', 'p_maxEcell_time', 'p_maxEcell_x', 'p_maxEcell_y', 'p_maxEcell_z', 'p_nCells_Lr0_HiG', 'p_nCells_Lr0_LowG', 'p_nCells_Lr0_MedG', 'p_nCells_Lr1_HiG', 'p_nCells_Lr1_LowG', 'p_nCells_Lr1_MedG', 'p_nCells_Lr2_HiG', 'p_nCells_Lr2_LowG', 'p_nCells_Lr2_MedG', 
'p_nCells_Lr3_HiG', 'p_nCells_Lr3_LowG', 'p_nCells_Lr3_MedG', 'p_pos', 'p_pos7', 'p_poscs1', 'p_poscs2', 'p_ptconeCorrBitset', 'p_ptconecoreTrackPtrCorrection', 'p_r33over37allcalo', 'p_topoetconeCorrBitset', 'p_topoetconecoreConeEnergyCorrection', 'p_topoetconecoreConeSCEnergyCorrection', 'p_weta1', 'p_widths1', 'p_widths2', 'p_wtots1', 'p_e233', 'p_e237', 'p_e277', 'p_e2tsts1', 'p_ehad1', 'p_emaxs1', 'p_fracs1', 'p_DeltaE', 'p_E3x5_Lr0', 'p_E3x5_Lr1', 'p_E3x5_Lr2', 'p_E3x5_Lr3', 'p_E5x7_Lr0', 'p_E5x7_Lr1', 'p_E5x7_Lr2', 'p_E5x7_Lr3', 'p_E7x11_Lr0', 'p_E7x11_Lr1', 'p_E7x11_Lr2', 'p_E7x11_Lr3', 'p_E7x7_Lr0', 'p_E7x7_Lr1' ]

In [14]:
# Feature matrix: the candidate input columns, in the order given by all_variables.
X = train[all_variables]
# The 'Truth' column; later cells keep only the rows where it equals 1.
y = train['Truth']
# The 'p_truth_E' column, used later as the regression target.
z = train['p_truth_E']

In [4]:
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Feature selection: fit an extra-trees regressor on every candidate variable,
# rank the variables by the importance the fitted ensemble assigns them, keep
# the 20 highest-ranked names and save them to a text file.

# Restrict the features to the rows whose 'Truth' value equals 1.
filtered_dataset = X[y == 1]

# Training inputs (all candidate variables) and the matching regression target.
X_train = filtered_dataset
y_train = z[y == 1]

# random_state is fixed so that the resulting ranking is reproducible.
feature_selector = ExtraTreesRegressor(n_estimators=20, random_state=42)
feature_selector.fit(X_train, y_train)

# Per-column importances, and the column positions sorted from the most to the
# least important (argsort is ascending, hence the reversal).
feature_importances = feature_selector.feature_importances_
sorted_indices = feature_importances.argsort()[::-1]

# X was built as train[all_variables], so a column position maps directly to
# the name at the same position in all_variables.
top_20_indices = sorted_indices[:20]
top_20_variables = [all_variables[i] for i in top_20_indices]

print("Selected Variables:")
print(top_20_variables)

# Write one variable name per line. The context manager guarantees the file is
# flushed and closed; the previous bare open() left the handle open, so the
# names were not guaranteed to reach disk while the kernel stayed alive.
with open("Regression_MiaoShang_LinearRegression_VariableList.txt", 'w') as file:
    file.writelines(str(variable) + '\n' for variable in top_20_variables)

Selected Variables:
['p_eAccCluster', 'p_eCluster', 'p_ecore', 'p_e277', 'p_rawECluster', 'p_E7x7_Lr2', 'p_numberOfTRTXenonHits', 'p_E5x7_Lr2', 'p_e237', 'p_E7x11_Lr2', 'p_E3x5_Lr2', 'p_e255', 'p_e235', 'p_deltaPhiRescaled0', 'p_E_Lr1_MedG', 'p_deltaEta1', 'p_e2tsts1', 'p_deltaEta2', 'p_E3x5_Lr1', 'p_E7x11_Lr1']


In [15]:
# Hard-coded copy of the list printed by the feature-selection cell, so the
# cells below can run without repeating the ExtraTreesRegressor fit.
top_20_variables=['p_eAccCluster', 'p_eCluster', 'p_ecore', 'p_e277', 'p_rawECluster', 'p_E7x7_Lr2', 'p_numberOfTRTXenonHits', 'p_E5x7_Lr2', 'p_e237', 'p_E7x11_Lr2', 'p_E3x5_Lr2', 'p_e255', 'p_e235', 'p_deltaPhiRescaled0', 'p_E_Lr1_MedG', 'p_deltaEta1', 'p_e2tsts1', 'p_deltaEta2', 'p_E3x5_Lr1', 'p_E7x11_Lr1']

In [17]:
# Imports used to split the dataset, fit the regression model and evaluate it
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
# Keep only the rows whose 'Truth' value equals 1; everything below works on
# this subset.
filtered_dataset = X[y == 1]

# All-variable training frame and its target. In this cell only the
# commented-out alternative model below refers to these two names.
X_train = filtered_dataset
y_train = z[y == 1]

# Hold out 20% of the selected rows for evaluation, keeping just the top-20
# columns. The fixed random_state makes the split repeatable.
# NOTE(review): the top-20 list was derived from all Truth == 1 rows, which
# include the rows held out here - confirm that is acceptable for the score.
train_data, test_data, train_target, test_target = train_test_split(
    filtered_dataset[top_20_variables],
    z[y == 1],
    test_size=0.2,
    random_state=42,
)

# Model: ordinary least-squares linear regression on the training split.
regression_model = LinearRegression()
regression_model.fit(train_data, train_target)

# Alternative model, left disabled:
# regression_model = RandomForestRegressor(n_estimators=100, random_state=42)
# regression_model.fit(X_train, y_train)
# NOTE(review): as written the alternative is not a drop-in replacement -
# RandomForestRegressor is not imported in this cell, and it would be fitted on
# X_train (all variables) while predictions below use only the top-20 columns.

# Evaluate on the held-out rows (test_data already holds only the top-20
# columns; the re-selection just makes the column set explicit).
X_test = test_data[top_20_variables]
y_pred = regression_model.predict(X_test)

# Signed deviation of each prediction, as a fraction of the true value.
relative_accuracy = (y_pred - test_target) / test_target

# Mean of the absolute relative deviations. Despite the printed label this is
# a mean absolute *relative* error, not an error in the target's own scale.
mae = relative_accuracy.abs().mean()

print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 0.07277070862355514


In [7]:
# Predict with the fitted model on the top-20 columns of the `test` frame and
# wrap the predictions in a single-column DataFrame named 'p_truth_E'.
energy = pandas.DataFrame(
    regression_model.predict(test[top_20_variables]),
    columns=['p_truth_E'],
)

NameError: name 'regression_model' is not defined

In [6]:
# Save the predictions as comma-separated text: the row index is written as
# the first field of each line (index=True) and no header row is emitted.
energy.to_csv('Regression_MiaoShang_LinearRegression.txt', index=True, header=False)

NameError: name 'energy' is not defined

In [5]:
# Save the selected variable names, one per line. The context manager
# guarantees the file is flushed and closed; the previous bare open() never
# closed the handle, so the contents were not guaranteed to reach disk.
with open("Regression_MiaoShang_LinearRegression_VariableList.txt", 'w') as file:
    file.writelines(str(variable) + '\n' for variable in top_20_variables)