In [23]:
import pandas as pd
import numpy as np

In [24]:
df=pd.read_csv('D:/Competitions/SOI/AI-Clowns/datasets/data_1997_1998.csv')
df.head()

Unnamed: 0,Index,year,month,day,latitude,longitude,zon.winds,mer.winds,humidity,air temp.
0,0,1997,1,1,0.07,-110.0,-3.1,1.4,89.6,22.66
1,1,1997,1,2,0.07,-110.0,-2.8,3.0,86.8,22.45
2,2,1997,1,3,0.07,-110.0,-5.2,3.3,85.2,22.14
3,3,1997,1,4,0.07,-110.0,-5.7,1.6,86.8,22.19
4,4,1997,1,5,0.07,-110.0,-5.1,1.5,83.6,22.34


In [25]:
df['date']=pd.to_datetime(df[['year','month','day']])
df=df.sort_values(by='date').reset_index(drop=True)
df.drop(columns=['date','day'],inplace=True)
df.head()

Unnamed: 0,Index,year,month,latitude,longitude,zon.winds,mer.winds,humidity,air temp.
0,0,1997,1,0.07,-110.0,-3.1,1.4,89.6,22.66
1,14389,1997,1,5.04,-109.94,,,86.4,25.05
2,14802,1997,1,5.12,-124.9,-1.4,1.7,,24.19
3,15338,1997,1,4.91,-139.9,-6.5,3.4,80.5,26.2
4,15838,1997,1,4.95,147.0,,,75.9,28.64


In [26]:
print(f"Before imputing:\n {df.isnull().sum()}")

# List of features to interpolate
features_to_interpolate = ['air temp.', 'zon.winds', 'mer.winds']

# Apply spline interpolation to each feature
for feature in features_to_interpolate:
    df[feature] = df[feature].interpolate(method='spline', order=3)

# Check if there are any remaining missing values
missing_values_count = df[features_to_interpolate].isnull().sum()
print("Remaining missing values after interpolation:\n", missing_values_count)

from sklearn.model_selection import train_test_split,GridSearchCV
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Split data into two sets: one with NaN values and one without
df_known = df.dropna(subset=['humidity'])  # DataFrame without NaN values
df_nan = df[df['humidity'].isna()]         # DataFrame with NaN values

# Features and target variable
X = df_known[['month', 'year','latitude','longitude','air temp.']]
y = df_known['humidity']

# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

param_grid = {
    'n_estimators': [400, 500, 600],  # Number of trees
    'max_depth': [7, 9, 11],           # Maximum depth of each tree
    'learning_rate': [0.5, 0.1, 0.5], # Learning rate
    'subsample': [0.7, 0.9, 1.1],      # Subsample ratio of the training instances
    'colsample_bytree': [0.9, 1.0, 2.0],# Subsample ratio of columns when constructing each tree
}

# Create the XGBoost regressor
model = XGBRegressor()

# Instantiate the grid search with cross-validation
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')

# Perform grid search
grid_search.fit(X_train, y_train)

# Best parameters found during grid search
best_params = grid_search.best_params_

# Predict on the test set with the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

imputed_humidity = best_model.predict(df_nan[['month', 'year','latitude','longitude','air temp.']])
df.loc[df['humidity'].isna(), 'humidity'] = imputed_humidity

print(f"Finally null values :\n {df.isnull().sum()}")


Before imputing:
 Index           0
year            0
month           0
latitude        0
longitude       0
zon.winds    4897
mer.winds    4897
humidity     4878
air temp.    2803
dtype: int64
Remaining missing values after interpolation:
 air temp.    0
zon.winds    0
mer.winds    0
dtype: int64
Fitting 5 folds for each of 243 candidates, totalling 1215 fits


675 fits failed out of a total of 1215.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
270 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Dell\AppData\Roaming\Python\Python310\site-packages\sklearn\model_selection\_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Dell\anaconda3_original\envs\py310\lib\site-packages\xgboost\core.py", line 730, in inner_f
    return func(**kwargs)
  File "c:\Users\Dell\anaconda3_original\envs\py310\lib\site-packages\xgboost\sklearn.py", line 1090, in fit
    self._Booster = train(
  File "c:\Users\Dell\anaconda3_original\envs\py310\lib\site-packages\xgboost\core.py", line 730, in inner_f
    return fun

Finally null values :
 Index        0
year         0
month        0
latitude     0
longitude    0
zon.winds    0
mer.winds    0
humidity     0
air temp.    0
dtype: int64


In [27]:
# Tranformers

from sklearn.preprocessing import StandardScaler,FunctionTransformer,PowerTransformer

X=df
X.drop(columns=['Index'],inplace=True)

yeojohnson_transformer = PowerTransformer(method='yeo-johnson')
X['zon.winds'] = yeojohnson_transformer.fit_transform(X[['zon.winds']])
X['air temp.']=yeojohnson_transformer.fit_transform(X[['air temp.']])

scaler=StandardScaler()
X_scaled=scaler.fit_transform(X)

In [28]:
import pickle
import numpy as np

# Assuming X and X_scaled are defined
# X = ... (original data)
# X_scaled = ... (scaled data used for predictions)

# Load the saved model
with open('xgb_best_model.pkl', 'rb') as f:
    xgb_best_model = pickle.load(f)

# Use the model to predict values on X_scaled
y_pred = xgb_best_model.predict(X_scaled)

# Ensure y_pred is a column vector
y_pred = y_pred.reshape(-1, 1)

y_pred_df = pd.DataFrame(y_pred, columns=['s.s.temp'])

# Save y_pred as CSV using pandas.DataFrame.to_csv
y_pred_df.to_csv('prediction_1997_1998.csv', index=False)

print("Predictions (s.s.temp) saved to 'y_pred.csv'.")

Predictions (s.s.temp) saved to 'y_pred.csv'.


In [29]:
print(df.shape,y_pred.shape)

(32314, 8) (32314, 1)
