import pandas as pd

#### Load train and test datasets
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

#### Check data structure and columns
print(train_data.head())
print(test_data.head())
print(train_data.info())
print(test_data.info()) 

In [2]:
from sklearn.impute import SimpleImputer

# Handling missing values in train and test datasets
imputer = SimpleImputer(strategy='mean')

# Fill missing values in sensor data (A-P) and spectral reflectance data (350nm-2500nm)
sensor_columns = train_data.columns[7:]  # Columns from 'A' to '2500nm'
train_data[sensor_columns] = imputer.fit_transform(train_data[sensor_columns])
test_data[sensor_columns] = imputer.transform(test_data[sensor_columns])

In [3]:
missing_values = train_data.isnull().sum()
print(missing_values)

Id            0
Property_A    0
Property_B    0
Property_C    0
Property_D    0
             ..
2496 nm       0
2497 nm       0
2498 nm       0
2499 nm       0
2500 nm       0
Length: 2174, dtype: int64


In [4]:
sensor1_cols = train_data.columns[7:23]  # Columns 'A' to 'P'

X_train = train_data[sensor1_cols].copy()
X_test = test_data[sensor1_cols].copy()

X_train = pd.concat([X_train, train_data.iloc[:, 23:]], axis=1)
X_test = pd.concat([X_test, test_data.iloc[:, 23:]], axis=1)

In [5]:
print(train_data.columns)

Index(['Id', 'Property_A', 'Property_B', 'Property_C', 'Property_D',
       'Property_E', 'Property_F', 'A ', 'B', 'C',
       ...
       '2491 nm', '2492 nm', '2493 nm', '2494 nm', '2495 nm', '2496 nm',
       '2497 nm', '2498 nm', '2499 nm', '2500 nm'],
      dtype='object', length=2174)


In [6]:
print(X_test.columns)

Index(['A ', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
       ...
       '2491 nm', '2492 nm', '2493 nm', '2494 nm', '2495 nm', '2496 nm',
       '2497 nm', '2498 nm', '2499 nm', '2500 nm'],
      dtype='object', length=2161)


In [7]:
sensor1_cols

Index(['A ', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
       'O', 'P'],
      dtype='object')

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

# Splitting the train dataset for model evaluation
y_train = train_data.iloc[:, 1:7]  # Target columns

X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=42)

# Model initialization and training
model = XGBRegressor()  # Initialize XGBoost regressor
model.fit(X_train_split, y_train_split)

# Model evaluation
predictions = model.predict(X_val)
rmse = mean_squared_error(y_val, predictions, squared=False)  # Calculate RMSE
print(f"RMSE: {rmse}")

RMSE: 4.401839050366957


In [9]:
#pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [9]:
predictions = model.predict(X_train_split)
rmse = mean_squared_error(y_train_split, predictions, squared=False)  # Calculate RMSE
print(f"RMSE: {rmse}")

RMSE: 0.0006237929787845799


In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

# Load sample dataset (replace this with your data)

X = X_train
y = y_train

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define the parameter grid to search through
param_grid = {
    'max_depth':[3,4,5],
    'learning_rate': [0.1,0.01,0.001],
    'n_estimators': [100,300,500],
    'gamma': [0, 0.1,0.3],
    'subsample': [0.8,0.9,1],
    'colsample_bytree': [0.8,0.9,1]
}

# Initialize XGBRegressor
xgb = XGBRegressor()

# GridSearchCV with the defined parameter grid
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, scoring='neg_mean_squared_error', cv=3, verbose=1, n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters and best negative mean squared error
best_params = grid_search.best_params_
best_neg_mse = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Negative MSE:", best_neg_mse)