In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [2]:
# Load the dataset
file_path = 'daily_data.csv'  # Replace with the correct path to your file
submission_path = 'submission.csv'  # Replace with the correct path to your submission file
data = pd.read_csv(file_path)
submission_df = pd.read_csv(submission_path)

In [3]:
data.head()

Unnamed: 0,day_id,city_id,temperature_celsius,condition_text,wind_kph,wind_degree,pressure_mb,precip_mm,humidity,cloud,feels_like_celsius,visibility_km,uv_index,gust_kph,air_quality_us-epa-index,sunrise,sunset
0,D0001,C001,27.0,,6.1,210,1006.0,0.0,54,75,28.0,10.0,6.0,11.9,2,06:04 AM,07:19 PM
1,D0002,C001,22.0,,6.1,170,1006.0,0.0,73,75,24.5,10.0,1.0,23.4,1,06:05 AM,07:18 PM
2,D0003,C001,20.0,Light Rain with Thunder,3.6,10,1011.0,4.5,100,75,20.0,10.0,1.0,12.6,1,06:05 AM,07:18 PM
3,D0004,C001,17.0,Clear and Sunny,6.1,150,1018.0,0.0,88,0,17.0,10.0,1.0,11.2,1,06:06 AM,07:16 PM
4,D0005,C001,18.0,,3.6,92,1019.0,0.0,94,0,18.0,10.0,1.0,9.0,1,06:07 AM,07:15 PM


In [4]:
columns_to_drop = ['day_id', 'sunrise', 'sunset','wind_degree','air_quality_us-epa-index']
data =data.drop(columns=columns_to_drop, axis =1)


In [5]:
data.head()

Unnamed: 0,city_id,temperature_celsius,condition_text,wind_kph,pressure_mb,precip_mm,humidity,cloud,feels_like_celsius,visibility_km,uv_index,gust_kph
0,C001,27.0,,6.1,1006.0,0.0,54,75,28.0,10.0,6.0,11.9
1,C001,22.0,,6.1,1006.0,0.0,73,75,24.5,10.0,1.0,23.4
2,C001,20.0,Light Rain with Thunder,3.6,1011.0,4.5,100,75,20.0,10.0,1.0,12.6
3,C001,17.0,Clear and Sunny,6.1,1018.0,0.0,88,0,17.0,10.0,1.0,11.2
4,C001,18.0,,3.6,1019.0,0.0,94,0,18.0,10.0,1.0,9.0


In [6]:
# Drop rows with missing target values

data = data.dropna(subset=['condition_text'])
#columns_to_drop = ['feels_like_celsius']
#data.drop(columns=columns_to_drop, axis =1)
# Encode the target variable
le = LabelEncoder()
data['condition_text_encoded'] = le.fit_transform(data['condition_text'])
# data_clean.loc[:, 'condition_text_encoded'] = le.fit_transform(data_clean['condition_text'])

#columns_to_drop = ['condition_text_predicted']
#data =data.drop(columns=columns_to_drop, axis =1)
# Define features and target
features = [col for col in data.columns if col not in [ 'city_id', 'condition_text', 'condition_text_encoded']]
X = data[features]

y = data['condition_text_encoded']

data.head()

Unnamed: 0,city_id,temperature_celsius,condition_text,wind_kph,pressure_mb,precip_mm,humidity,cloud,feels_like_celsius,visibility_km,uv_index,gust_kph,condition_text_encoded
2,C001,20.0,Light Rain with Thunder,3.6,1011.0,4.5,100,75,20.0,10.0,1.0,12.6,3
3,C001,17.0,Clear and Sunny,6.1,1018.0,0.0,88,0,17.0,10.0,1.0,11.2,0
6,C001,21.0,Partly Cloudy,4.0,1015.0,0.0,100,50,21.0,10.0,1.0,15.1,6
18,C001,19.0,Clear and Sunny,3.6,1017.0,0.0,88,0,19.0,10.0,1.0,8.3,0
27,C002,19.0,Partly Cloudy,3.6,1010.0,0.0,73,25,19.0,10.0,1.0,8.3,6


In [7]:
# Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# Polynomial Features
poly = PolynomialFeatures(degree=2, interaction_only=True)

# Preprocessing Pipeline
numeric_features = features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('poly', poly)])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)])

In [8]:
from xgboost import XGBClassifier  # Import XGBoost classifier

# Update the pipeline to use XGBoost classifier instead of RandomForestClassifier
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('clf', XGBClassifier(random_state=42))])  # Use XGBClassifier

# Define the parameter grid for GridSearchCV
param_grid = {
    'clf__n_estimators': [100],  # Number of boosting rounds
    'clf__max_depth': [5],        # Maximum depth of a tree
    'clf__learning_rate': [0.01],  # Step size shrinkage used to prevent overfitting
    'clf__subsample': [0.8],      # Subsample ratio of the training instances
    'clf__colsample_bytree': [0.8],  # Subsample ratio of columns when constructing each tree
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=2, n_jobs=-1, verbose=2)

# Fit GridSearchCV to the resampled data
grid_search.fit(X_res, y_res)

# Get the best parameters from GridSearchCV
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# Train the XGBoost Classifier with the best parameters on the full dataset
pipeline.set_params(**best_params)
pipeline.fit(X_res, y_res)

# Use the full dataset to train the model
X_full = data[features]
y_full = data['condition_text_encoded']

# Train the XGBoost Classifier with the best parameters on the full dataset
pipeline.fit(X_full, y_full)

# Predict the condition_text for all entries, including those with missing values
data['condition_text'] = data['condition_text'].fillna('')
X_all = data[features]
y_all_pred = pipeline.predict(X_all)

# Decode the predicted labels back to the original condition_text
data['condition_text_predicted'] = le.inverse_transform(y_all_pred)

# Update the original condition_text column with the predicted values for missing entries
data['condition_text'] = data.apply(
    lambda row: row['condition_text_predicted'] if row['condition_text'] == '' else row['condition_text'],
    axis=1
)

# Prepare the submission file
submission_df['condition_text'] = data['condition_text']
submission_df.to_csv(submission_path, index=False)

# Display the first few rows of the updated submission file
print(submission_df.head())

# Get the predicted labels for X_full
y_full_pred = pipeline.predict(X_full)

# Calculate the accuracy score
accuracy = accuracy_score(y_full, y_full_pred)
print(f"Accuracy score: {accuracy}")


Fitting 2 folds for each of 1 candidates, totalling 2 fits
Best parameters: {'clf__colsample_bytree': 0.8, 'clf__learning_rate': 0.01, 'clf__max_depth': 5, 'clf__n_estimators': 100, 'clf__subsample': 0.8}
  day_id           condition_text
0  D0001                      NaN
1  D0002                      NaN
2  D0003  Light Rain with Thunder
3  D0004          Clear and Sunny
4  D0005                      NaN
Accuracy score: 0.918580375782881


In [9]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# Initialize XGBoost classifier with custom hyperparameters
xgb_clf = XGBClassifier(
    n_estimators=100,        # Number of boosting rounds
    max_depth=5,             # Maximum depth of a tree
    learning_rate=0.1,       # Step size shrinkage used to prevent overfitting
    subsample=0.8,           # Subsample ratio of the training instances
    colsample_bytree=0.8,    # Subsample ratio of columns when constructing each tree
    random_state=42
)

# Train the classifier on the training data
xgb_clf.fit(X_train, y_train)

# Predictions on the testing set
y_pred = xgb_clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on the testing set: {accuracy}")


Accuracy on the testing set: 0.8818181818181818
