In [4]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [5]:
# Load the dataset
file_path = 'daily_data.csv'  # Replace with the correct path to your file
submission_path = 'submission.csv'  # Replace with the correct path to your submission file
data = pd.read_csv(file_path)
submission_df = pd.read_csv(submission_path)

In [6]:
# Drop rows with missing target values
data_clean = data.dropna(subset=['condition_text'])

# Encode the target variable
le = LabelEncoder()
data_clean['condition_text_encoded'] = le.fit_transform(data_clean['condition_text'])
# data_clean.loc[:, 'condition_text_encoded'] = le.fit_transform(data_clean['condition_text'])

# Define features and target
features = [col for col in data_clean.columns if col not in ['day_id', 'city_id', 'wind_degree', 'sunrise', 'sunset', 'condition_text', 'condition_text_encoded']]
X = data_clean[features]
y = data_clean['condition_text_encoded']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean['condition_text_encoded'] = le.fit_transform(data_clean['condition_text'])


In [7]:
# Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# Polynomial Features
poly = PolynomialFeatures(degree=2, interaction_only=True)

# Preprocessing Pipeline
numeric_features = features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('poly', poly)])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)])

In [13]:
# Define the parameter grid for GridSearchCV
param_grid = {
    'clf__n_estimators': [200],
    'clf__max_features': ['sqrt'],
    'clf__max_depth': [10],
    'clf__criterion': ['gini']
}

# Pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('clf', RandomForestClassifier(random_state=42))])

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_res, y_res)

# Best parameters from GridSearchCV
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# Train the Random Forest Classifier with the best parameters on the full dataset
pipeline.set_params(**best_params)
pipeline.fit(X_res, y_res)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best parameters: {'clf__criterion': 'gini', 'clf__max_depth': 10, 'clf__max_features': 'sqrt', 'clf__n_estimators': 200}


In [14]:
# Use the full dataset to train the model
X_full = data_clean[features]
y_full = data_clean['condition_text_encoded']

# Train the Random Forest Classifier with the best parameters on the full dataset
pipeline.fit(X_full, y_full)

In [15]:
# Predict the condition_text for all entries, including those with missing values
data['condition_text'] = data['condition_text'].fillna('')

# Use the features to predict the missing condition_text
X_all = data[features]
y_all_pred = pipeline.predict(X_all)

# Decode the predicted labels back to the original condition_text
data['condition_text_predicted'] = le.inverse_transform(y_all_pred)

# Update the original condition_text column with the predicted values for missing entries
data['condition_text'] = data.apply(
    lambda row: row['condition_text_predicted'] if row['condition_text'] == '' else row['condition_text'],
    axis=1
)

In [16]:
from sklearn.metrics import accuracy_score

# Prepare the submission file
submission_df['condition_text'] = data['condition_text']
submission_df.to_csv(submission_path, index=False)

# Display the first few rows of the updated submission file
print(submission_df.head())

# Get the predicted labels for X_full
y_full_pred = pipeline.predict(X_full)

# Calculate the accuracy score
accuracy = accuracy_score(y_full, y_full_pred)
print(f"Accuracy score: {accuracy}")

  day_id           condition_text
0  D0001            Partly Cloudy
1  D0002            Partly Cloudy
2  D0003  Light Rain with Thunder
3  D0004          Clear and Sunny
4  D0005          Clear and Sunny
Accuracy score: 0.9686847599164927


In [18]:
# Load the original submission file
original_submission_path = 'submission_original.csv'
original_submission_df = pd.read_csv(original_submission_path)

# Ensure condition_text from the previous predictions is available in the dataframe
predicted_condition_texts = data['condition_text']

# Update the original submission file with predicted classification_texts where needed
original_submission_df['condition_text'] = original_submission_df.apply(
    lambda row: row['condition_text'] if pd.notna(row['condition_text']) else predicted_condition_texts[row.name],
    axis=1
)

# Save the updated submission file
original_submission_df.to_csv(original_submission_path, index=False)

# Display the first few rows of the updated submission file to verify
original_submission_df.head()

Unnamed: 0,day_id,condition_text
0,D0001,Partly Cloudy
1,D0002,Partly Cloudy
2,D0003,Light Rain with Thunder
3,D0004,Clear and Sunny
4,D0005,Clear and Sunny
