In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
import xgboost as xgb

In [2]:
# Load the datasets
daily_data = pd.read_csv('daily_data.csv')
submission_template = pd.read_csv('submission.csv')

daily_data.head()

Unnamed: 0,day_id,city_id,temperature_celsius,condition_text,wind_kph,wind_degree,pressure_mb,precip_mm,humidity,cloud,feels_like_celsius,visibility_km,uv_index,gust_kph,air_quality_us-epa-index,sunrise,sunset
0,D0001,C001,27.0,,6.1,210,1006.0,0.0,54,75,28.0,10.0,6.0,11.9,2,06:04 AM,07:19 PM
1,D0002,C001,22.0,,6.1,170,1006.0,0.0,73,75,24.5,10.0,1.0,23.4,1,06:05 AM,07:18 PM
2,D0003,C001,20.0,Light Rain with Thunder,3.6,10,1011.0,4.5,100,75,20.0,10.0,1.0,12.6,1,06:05 AM,07:18 PM
3,D0004,C001,17.0,Clear and Sunny,6.1,150,1018.0,0.0,88,0,17.0,10.0,1.0,11.2,1,06:06 AM,07:16 PM
4,D0005,C001,18.0,,3.6,92,1019.0,0.0,94,0,18.0,10.0,1.0,9.0,1,06:07 AM,07:15 PM


In [3]:
# Drop rows with missing target values
data_clean = daily_data.dropna(subset=['condition_text'])

# Fill missing values for other columns
numerical_cols = data_clean.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = data_clean.select_dtypes(include=['object']).columns
data_clean[numerical_cols] = data_clean[numerical_cols].fillna(data_clean[numerical_cols].mean())
data_clean[categorical_cols] = data_clean[categorical_cols].fillna(data_clean[categorical_cols].mode().iloc[0])

# Encode the target variable
le = LabelEncoder()
data_clean['condition_text_encoded'] = le.fit_transform(data_clean['condition_text'])

# Define features and target
features = [col for col in data_clean.columns if col not in ['day_id', 'city_id', 'wind_degree', 'sunrise', 'sunset', 'condition_text', 'condition_text_encoded']]
X = data_clean[features]
y = data_clean['condition_text_encoded']

# Encode categorical variables and standardize numerical features
X = pd.get_dummies(X, drop_first=True)
scaler = StandardScaler()
X = scaler.fit_transform(X)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean[numerical_cols] = data_clean[numerical_cols].fillna(data_clean[numerical_cols].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean[categorical_cols] = data_clean[categorical_cols].fillna(data_clean[categorical_cols].mode().iloc[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-co

In [4]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Reshape input data for CNN
X_train_reshaped = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_val_reshaped = X_val.reshape((X_val.shape[0], X_val.shape[1], 1))


In [6]:
# Define the CNN model
def create_cnn(input_shape):
    model = tf.keras.Sequential([
        tf.keras.layers.Conv1D(filters=128, kernel_size=5, activation='relu', input_shape=input_shape, padding='same'), # Add padding to the first layer
        tf.keras.layers.Conv1D(filters=128, kernel_size=5, activation='relu', padding='same'), # Add padding to the second layer
        tf.keras.layers.MaxPooling1D(pool_size=2),
        tf.keras.layers.Conv1D(filters=64, kernel_size=3, activation='relu', padding='same'), # Add padding
        tf.keras.layers.Conv1D(filters=64, kernel_size=3, activation='relu', padding='same'), # Add padding
        tf.keras.layers.MaxPooling1D(pool_size=2),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(10, activation='softmax') # Add a final output layer with the number of classes
    ])
    return model

# Create the CNN model
cnn_model = create_cnn((X_train_reshaped.shape[1], 1))

# Compile the CNN model
cnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the CNN model for more epochs
cnn_model.fit(X_train_reshaped, y_train, epochs=30, batch_size=32, validation_data=(X_val_reshaped, y_val))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7dc059b393f0>

In [7]:
# Extract features using the CNN model
train_features = cnn_model.predict(X_train_reshaped)
val_features = cnn_model.predict(X_val_reshaped)

# Hyperparameter tuning for XGBoost
param_grid = {
    'n_estimators': [100],
    'max_depth': [7],
    'learning_rate': [0.2]
}

xgb_classifier = xgb.XGBClassifier()
grid_search = GridSearchCV(xgb_classifier, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(train_features, y_train)

# Use the best estimator from grid search for prediction
best_xgb = grid_search.best_estimator_
val_predictions = best_xgb.predict(val_features)



  pid = os.fork()
  pid = os.fork()


In [9]:
# Convert predictions back to original labels
predicted_labels = le.inverse_transform(val_predictions)
true_labels = le.inverse_transform(y_val)

# Calculate the accuracy
accuracy = accuracy_score(true_labels, predicted_labels)
print(f'Validation Accuracy: {accuracy * 100:.2f}%')

# Prepare the submission file

submission = pd.DataFrame({'id': range(len(predicted_labels)), 'condition_text': predicted_labels}) # Use range() to generate indices matching the length of predicted labels
submission.to_csv('submission_final.csv', index=False)

Validation Accuracy: 59.38%
