In [1]:
import h5py
import pandas
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def load_data(name):
    with h5py.File(f'{name}.h5', 'r') as f:
        filename = name.split('/')[-1]
        return pandas.DataFrame(f[filename][:], dtype=np.float64)

train = load_data('C:/Users/Chris Bhysicisd/Desktop/Applied machine learning/Project 1/train')
test  = load_data('C:/Users/Chris Bhysicisd/Desktop/Applied machine learning/Project 1/test')

In [2]:
print (f'Shape of training data set: {train.shape}')
print (f'Shape of test data set: {test.shape}')

Shape of training data set: (162500, 166)
Shape of test data set: (160651, 164)


In [3]:
all_variables = ['actualInteractionsPerCrossing', 'averageInteractionsPerCrossing', 'correctedActualMu', 'correctedAverageMu', 'correctedScaledActualMu', 'correctedScaledAverageMu', 'NvtxReco', 'p_nTracks', 'p_pt_track', 'p_eta', 'p_phi', 'p_charge', 'p_qOverP', 'p_z0', 'p_d0', 'p_sigmad0', 'p_d0Sig', 'p_EptRatio', 'p_dPOverP', 'p_z0theta', 'p_etaCluster', 'p_phiCluster', 'p_eCluster', 'p_rawEtaCluster', 'p_rawPhiCluster', 'p_rawECluster', 'p_eClusterLr0', 'p_eClusterLr1', 'p_eClusterLr2', 'p_eClusterLr3', 'p_etaClusterLr1', 'p_etaClusterLr2', 'p_phiClusterLr2', 'p_eAccCluster', 'p_f0Cluster', 'p_etaCalo', 'p_phiCalo', 'p_eTileGap3Cluster', 'p_cellIndexCluster', 'p_phiModCalo', 'p_etaModCalo', 'p_dPhiTH3', 'p_R12', 'p_fTG3', 'p_weta2', 'p_Reta', 'p_Rphi', 'p_Eratio', 'p_f1', 'p_f3', 'p_Rhad', 'p_Rhad1', 'p_deltaEta1', 'p_deltaPhiRescaled2', 'p_TRTPID', 'p_TRTTrackOccupancy', 'p_numberOfInnermostPixelHits', 'p_numberOfPixelHits', 'p_numberOfSCTHits', 'p_numberOfTRTHits', 'p_numberOfTRTXenonHits', 'p_chi2', 'p_ndof', 'p_SharedMuonTrack', 'p_E7x7_Lr2', 'p_E7x7_Lr3', 'p_E_Lr0_HiG', 'p_E_Lr0_LowG', 'p_E_Lr0_MedG', 'p_E_Lr1_HiG', 'p_E_Lr1_LowG', 'p_E_Lr1_MedG', 'p_E_Lr2_HiG', 'p_E_Lr2_LowG', 'p_E_Lr2_MedG', 'p_E_Lr3_HiG', 'p_E_Lr3_LowG', 'p_E_Lr3_MedG', 'p_ambiguityType', 'p_asy1', 'p_author', 'p_barys1', 'p_core57cellsEnergyCorrection', 'p_deltaEta0', 'p_deltaEta2', 'p_deltaEta3', 'p_deltaPhi0', 'p_deltaPhi1', 'p_deltaPhi2', 'p_deltaPhi3', 'p_deltaPhiFromLastMeasurement', 'p_deltaPhiRescaled0', 'p_deltaPhiRescaled1', 'p_deltaPhiRescaled3', 'p_e1152', 'p_e132', 'p_e235', 'p_e255', 'p_e2ts1', 'p_ecore', 'p_emins1', 'p_etconeCorrBitset', 'p_ethad', 'p_ethad1', 'p_f1core', 'p_f3core', 'p_maxEcell_energy', 'p_maxEcell_gain', 'p_maxEcell_time', 'p_maxEcell_x', 'p_maxEcell_y', 'p_maxEcell_z', 'p_nCells_Lr0_HiG', 'p_nCells_Lr0_LowG', 'p_nCells_Lr0_MedG', 'p_nCells_Lr1_HiG', 'p_nCells_Lr1_LowG', 'p_nCells_Lr1_MedG', 'p_nCells_Lr2_HiG', 'p_nCells_Lr2_LowG', 'p_nCells_Lr2_MedG', 'p_nCells_Lr3_HiG', 'p_nCells_Lr3_LowG', 'p_nCells_Lr3_MedG', 'p_pos', 'p_pos7', 'p_poscs1', 'p_poscs2', 'p_ptconeCorrBitset', 'p_ptconecoreTrackPtrCorrection', 'p_r33over37allcalo', 'p_topoetconeCorrBitset', 'p_topoetconecoreConeEnergyCorrection', 'p_topoetconecoreConeSCEnergyCorrection', 'p_weta1', 'p_widths1', 'p_widths2', 'p_wtots1', 'p_e233', 'p_e237', 'p_e277', 'p_e2tsts1', 'p_ehad1', 'p_emaxs1', 'p_fracs1', 'p_DeltaE', 'p_E3x5_Lr0', 'p_E3x5_Lr1', 'p_E3x5_Lr2', 'p_E3x5_Lr3', 'p_E5x7_Lr0', 'p_E5x7_Lr1', 'p_E5x7_Lr2', 'p_E5x7_Lr3', 'p_E7x11_Lr0', 'p_E7x11_Lr1', 'p_E7x11_Lr2', 'p_E7x11_Lr3', 'p_E7x7_Lr0', 'p_E7x7_Lr1' ]


In [4]:
new_data_set=train[train["Truth"]==1]

In [5]:
new_data_set["Truth"]

0         1.0
3         1.0
4         1.0
7         1.0
11        1.0
         ... 
162493    1.0
162494    1.0
162495    1.0
162496    1.0
162497    1.0
Name: Truth, Length: 121495, dtype: float64

In [6]:
# Separate the features (X) and target variable (y)
final_data_set = new_data_set.drop('Truth', axis=1)
X = final_data_set.drop('p_truth_E', axis=1)
y = final_data_set['p_truth_E']

# Split the dataset into training and test sets
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
'''
20-best feature selection
'''

KBest = SelectKBest(score_func=f_regression, k=20).fit(X, y)
short_feature_names = KBest.get_support(1)
X_new = X[X.columns[short_feature_names]] # final features

  corr /= X_norms
  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom


In [8]:
X_new

Unnamed: 0,p_eCluster,p_rawECluster,p_eClusterLr2,p_eAccCluster,p_cellIndexCluster,p_numberOfTRTHits,p_numberOfTRTXenonHits,p_ndof,p_E7x7_Lr2,p_E_Lr2_MedG,p_e235,p_e255,p_ecore,p_maxEcell_energy,p_e233,p_e237,p_e277,p_E3x5_Lr2,p_E5x7_Lr2,p_E7x11_Lr2
0,129279.335938,118228.468750,73859.039062,118228.468750,73.0,28.0,28.0,53.0,74796.945312,0.000000,70749.429688,73239.281250,116902.500000,31828.267578,68536.257812,71439.609375,74807.515625,70738.921875,74045.820312,74447.539062
3,140121.546875,132577.937500,104611.984375,132577.937500,80.0,17.0,17.0,36.0,105142.125000,41780.808594,99356.359375,102771.585938,130480.195312,41780.808594,95602.218750,100507.500000,104868.156250,99237.500000,104002.000000,106995.789062
4,141874.187500,131914.531250,84169.898438,131914.531250,82.0,0.0,0.0,23.0,89036.359375,0.000000,79455.710938,83249.382812,125893.218750,23741.416016,72990.429688,84057.000000,89240.484375,75901.640625,87924.406250,93710.968750
7,121228.468750,113393.125000,73788.343750,113393.125000,79.0,17.0,17.0,34.0,76943.218750,0.000000,66433.625000,70019.070312,107875.890625,25011.597656,58647.375000,70844.875000,75896.695312,70161.070312,75593.671875,77926.218750
11,123975.093750,116238.164062,81619.078125,116238.164062,81.0,11.0,11.0,32.0,82565.617188,37154.539062,77596.882812,80911.367188,114340.398438,37154.539062,75564.929688,78316.843750,82557.671875,76690.414062,81557.367188,83021.085938
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162493,152751.812500,146789.062500,86140.375000,137611.812500,71.0,36.0,36.0,51.0,89004.304688,0.000000,74915.773438,80131.835938,135404.468750,32786.199219,67796.062500,77267.476562,89006.203125,74910.070312,85073.117188,90556.890625
162494,170185.265625,157808.062500,85682.078125,157808.062500,89.0,0.0,0.0,25.0,85528.132812,39484.984375,78871.023438,82986.039062,153299.000000,39484.984375,75400.015625,80394.039062,87000.046875,73704.515625,83686.460938,86513.257812
162495,132337.046875,124776.109375,88057.046875,123652.023438,73.0,32.0,26.0,51.0,88353.765625,0.000000,82345.000000,86262.015625,120256.007812,26879.332031,79224.000000,83015.000000,88932.015625,81626.820312,86889.070312,88872.992188
162496,123980.445312,116509.882812,79047.640625,116509.882812,78.0,18.0,18.0,39.0,80584.453125,0.000000,75251.000000,78529.992188,115338.546875,22023.763672,67145.000000,76370.007812,80591.000000,75242.835938,79933.289062,83049.414062


In [9]:

'''
make a list of 15 features used
'''
lista = X_new.columns.tolist()
X_test=test[lista]
lista

['p_eCluster',
 'p_rawECluster',
 'p_eClusterLr2',
 'p_eAccCluster',
 'p_cellIndexCluster',
 'p_numberOfTRTHits',
 'p_numberOfTRTXenonHits',
 'p_ndof',
 'p_E7x7_Lr2',
 'p_E_Lr2_MedG',
 'p_e235',
 'p_e255',
 'p_ecore',
 'p_maxEcell_energy',
 'p_e233',
 'p_e237',
 'p_e277',
 'p_E3x5_Lr2',
 'p_E5x7_Lr2',
 'p_E7x11_Lr2']

In [10]:
X = X_new
#y = train['Truth']

# Split the data into training and test sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

In [11]:
X_test = test[lista]
X_test

Unnamed: 0,p_eCluster,p_rawECluster,p_eClusterLr2,p_eAccCluster,p_cellIndexCluster,p_numberOfTRTHits,p_numberOfTRTXenonHits,p_ndof,p_E7x7_Lr2,p_E_Lr2_MedG,p_e235,p_e255,p_ecore,p_maxEcell_energy,p_e233,p_e237,p_e277,p_E3x5_Lr2,p_E5x7_Lr2,p_E7x11_Lr2
0,82187.609375,71278.000000,34399.031250,64975.027344,63.0,41.0,25.0,65.0,34925.179688,0.000000,32689.027344,34224.019531,71690.007812,12214.185547,31336.017578,32771.039062,34919.027344,32694.718750,34428.593750,35057.257812
1,85043.468750,48086.570312,18479.544922,29639.380859,59.0,43.0,27.0,60.0,19340.070312,0.000000,16570.472656,17692.191406,17692.191406,7016.348145,15674.483398,17007.460938,19011.896484,16792.835938,18553.693359,19125.595703
2,233958.984375,213211.953125,172773.468750,213211.953125,97.0,0.0,0.0,19.0,171499.015625,116148.203125,162108.609375,168776.218750,170233.765625,63084.750000,154417.562500,164848.656250,173223.359375,155060.593750,168862.281250,173636.875000
3,109344.992188,100085.601562,48286.660156,87736.617188,67.0,34.0,34.0,53.0,48823.789062,0.000000,45098.023438,47360.031250,95592.000000,23430.736328,43231.015625,45690.031250,49668.078125,44183.761719,47991.351562,49673.574219
4,105603.453125,101453.898438,66239.726562,98877.570312,68.0,30.0,30.0,41.0,68432.492188,0.000000,55938.042969,59245.050781,87641.648438,14649.957031,41718.027344,60917.062500,67544.070312,60173.855469,66441.101562,70718.359375
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160646,223671.625000,206877.984375,148754.328125,206877.984375,93.0,0.0,0.0,25.0,147162.484375,92521.781250,134565.828125,140014.062500,197012.640625,52154.242188,125429.695312,139946.953125,148044.593750,123559.570312,144228.375000,150338.828125
160647,147922.343750,139902.281250,109973.726562,139902.281250,84.0,0.0,0.0,25.0,113984.078125,0.000000,100186.179688,104373.640625,134197.062500,30492.431641,69390.507812,104427.851562,113990.500000,100180.257812,109270.625000,121041.343750
160648,126295.046875,120722.125000,100732.601562,120722.125000,78.0,19.0,19.0,38.0,102612.726562,52456.148438,94102.882812,98614.375000,116440.171875,52456.148438,90001.328125,95337.429688,102617.765625,94097.039062,100385.117188,102540.796875
160649,112429.585938,94286.265625,42285.320312,76980.710938,66.0,30.0,30.0,43.0,42645.628906,0.000000,33083.347656,39170.511719,68503.914062,12575.457031,29488.207031,34295.484375,42645.882812,33080.488281,41132.343750,44453.101562


In [21]:
from sklearn.linear_model import LinearRegression

# Assuming you have training data X_train and corresponding target values y_train

# Create a linear regression model
model_reg = LinearRegression()

# Train the model using the training data
model_reg.fit(X_train, y_train)

# Assuming you have val data X_val
y_pred_val = model_reg.predict(X_val)

# Get the number of parameters (coefficients)
num_parameters = len(model_reg.coef_) + 1  # Include the intercept term

# Print the number of parameters
print("Number of parameters:", num_parameters)

# Make predictions on the test data
y_pred = model_reg.predict(X_test)

Number of parameters: 21


In [17]:
# # # Calculate the mean squared error (MSE)
# mse = mean_squared_error(y_val, y_pred_val)

In [19]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Calculate mean absolute error (MAE)
mae = mean_absolute_error(y_val, y_pred_val)

# Calculate root mean squared error (RMSE)
rmse = mean_squared_error(y_val, y_pred_val, squared=False)

# Calculate R-squared (R2) score
r2 = r2_score(y_val, y_pred_val)

# Print the metrics
print("Mean Absolute Error (MAE):", mae)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared (R2) Score:", r2)


Mean Absolute Error (MAE): 6376.624807378088
Root Mean Squared Error (RMSE): 13741.821336721981
R-squared (R2) Score: 0.9091461940749641


In [18]:
mse

188837653.6503875

In [13]:
#print(y.values)
#y_val.shape

In [14]:
print(y_pred)
y_pred.shape

[ 83216.48682238  81999.20577612 230157.34147638 ... 123846.65668066
 100100.05998797 154100.33665534]


(160651,)

In [15]:
import numpy as np


# Generate indices for the index column
indices = np.arange(len(y_pred))




# Convert indices to a one-dimensional array
indices = np.ravel(indices)

# Round y_pred to 8 significant digits
y_pred_rounded = np.around(y_pred, decimals=8)

# Concatenate indices and cluster labels horizontally
data_with_index = np.column_stack((indices, y_pred))

# Save the data with index to a text file
np.savetxt("y_pred_Linear_reg.txt", data_with_index, fmt=["%d"] + ["%.8f"], delimiter=",", header="", comments="")
