In [None]:
# Step 1: Install necessary libraries
!pip install kaggle seaborn scikit-learn

# Step 2: Upload kaggle.json
from google.colab import files
files.upload()  # This will prompt you to upload the kaggle.json file

# Step 3: Move kaggle.json to the appropriate directory
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json



Saving kaggle.json to kaggle.json


In [None]:
!kaggle competitions download -c predicta-1-0-predict-the-unpredictable-part-2
!unzip predicta-1-0-predict-the-unpredictable-part-2

Downloading predicta-1-0-predict-the-unpredictable-part-2.zip to /content
  0% 0.00/65.0k [00:00<?, ?B/s]
100% 65.0k/65.0k [00:00<00:00, 55.1MB/s]
Archive:  predicta-1-0-predict-the-unpredictable-part-2.zip
  inflating: daily_data.csv          
  inflating: submission.csv          


In [None]:
#TUNED
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Read the daily observations and derive the daylight-duration columns.
# Each assign() keyword is evaluated in order, so later entries can use the
# columns produced by earlier ones.
data = pd.read_csv('daily_data.csv').assign(
    sunrise=lambda frame: pd.to_datetime(frame['sunrise']),
    sunset=lambda frame: pd.to_datetime(frame['sunset']),
    day_length=lambda frame: frame['sunset'] - frame['sunrise'],
    day_length_hours=lambda frame: frame['day_length'].dt.total_seconds() / 3600,
)

# Predictor columns fed to the classifier
features = [
    'temperature_celsius',
    'wind_kph',
    'pressure_mb',
    'precip_mm',
    'humidity',
    'cloud',
    'feels_like_celsius',
    'visibility_km',
    'uv_index',
    'gust_kph',
    'air_quality_us-epa-index',
    'day_length_hours',
]

# Rows with a known condition_text are used for modelling; rows without one
# are the ones whose label has to be predicted.
label_known = data['condition_text'].notna()
data_complete = data[label_known]
data_missing = data[~label_known]

X_complete = data_complete.loc[:, features]
y_complete = data_complete.loc[:, 'condition_text']

X_missing = data_missing.loc[:, features]
# Split the labelled rows into a training set and a held-out test set.
# The test rows must stay unseen by every fit/tuning step until evaluation,
# otherwise the reported test metrics are really training-set metrics.
X_train, X_test, y_train, y_test = train_test_split(X_complete, y_complete, test_size=0.2, random_state=42)

# Initialize Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Baseline: 5-fold cross-validation of the untuned forest on the training rows
cv_scores = cross_val_score(rf_classifier, X_train, y_train, cv=5)
print(f"Cross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean CV Accuracy: {cv_scores.mean()}")

# Hyperparameter tuning using Grid Search CV (training rows only)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print("Best Parameters Found:")
print(grid_search.best_params_)
print("Best Cross-Validation Score:")
print(grid_search.best_score_)

# With the default refit=True, best_estimator_ has already been refitted on
# the data passed to fit() above, i.e. on X_train only.
best_rf_classifier = grid_search.best_estimator_

# Evaluate on the held-out test set BEFORE the model sees those rows
y_pred = best_rf_classifier.predict(X_test)

print("\nBest Random Forest Classification Report:")
print(classification_report(y_test, y_pred))
print("Best Random Forest Accuracy:", accuracy_score(y_test, y_pred))

# Now that evaluation is done, refit on every labelled row so the model used
# for imputation benefits from all available data.
best_rf_classifier.fit(X_complete, y_complete)

# Predict and fill the missing condition_text values. Skipped when nothing is
# missing, because predict() rejects an empty feature matrix. Assigning via
# X_missing's index (rather than a fresh isnull() mask) keeps this step safe
# to re-run after the values have already been filled.
if len(X_missing) > 0:
    predicted_condition_text = best_rf_classifier.predict(X_missing)
    data.loc[X_missing.index, 'condition_text'] = predicted_condition_text

# Verify the filling
print("\nMissing values filled count:", data['condition_text'].isnull().sum())  # Should be 0 if all missing values are filled


  data['sunrise'] = pd.to_datetime(data['sunrise'])
  data['sunset'] = pd.to_datetime(data['sunset'])


Cross-Validation Accuracy Scores: [0.69791667 0.64583333 0.65625    0.75       0.71578947]
Mean CV Accuracy: 0.6931578947368421
Best Parameters Found:
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best Cross-Validation Score:
0.707763157894737

Best Random Forest Classification Report:
                         precision    recall  f1-score   support

        Clear and Sunny       1.00      1.00      1.00        22
    Cloudy and Overcast       1.00      0.92      0.96        12
    Light Precipitation       1.00      1.00      1.00        12
Light Rain with Thunder       1.00      0.80      0.89         5
            Mist or Fog       1.00      1.00      1.00         8
 Moderate to Heavy Rain       1.00      1.00      1.00         4
          Partly Cloudy       0.90      1.00      0.95        26
           Rain Showers       1.00      0.80      0.89         5
          Thunderstorms       1.00      1.00      1.00         2

               accura

In [None]:
# Preview the first five rows, including the filled-in condition_text column
data.head(n=5)

Unnamed: 0,day_id,city_id,temperature_celsius,condition_text,wind_kph,wind_degree,pressure_mb,precip_mm,humidity,cloud,feels_like_celsius,visibility_km,uv_index,gust_kph,air_quality_us-epa-index,sunrise,sunset,day_length,day_length_hours
0,D0001,C001,27.0,Partly Cloudy,6.1,210,1006.0,0.0,54,75,28.0,10.0,6.0,11.9,2,2024-06-22 06:04:00,2024-06-22 19:19:00,0 days 13:15:00,13.25
1,D0002,C001,22.0,Partly Cloudy,6.1,170,1006.0,0.0,73,75,24.5,10.0,1.0,23.4,1,2024-06-22 06:05:00,2024-06-22 19:18:00,0 days 13:13:00,13.216667
2,D0003,C001,20.0,Light Rain with Thunder,3.6,10,1011.0,4.5,100,75,20.0,10.0,1.0,12.6,1,2024-06-22 06:05:00,2024-06-22 19:18:00,0 days 13:13:00,13.216667
3,D0004,C001,17.0,Clear and Sunny,6.1,150,1018.0,0.0,88,0,17.0,10.0,1.0,11.2,1,2024-06-22 06:06:00,2024-06-22 19:16:00,0 days 13:10:00,13.166667
4,D0005,C001,18.0,Clear and Sunny,3.6,92,1019.0,0.0,94,0,18.0,10.0,1.0,9.0,1,2024-06-22 06:07:00,2024-06-22 19:15:00,0 days 13:08:00,13.133333


In [None]:
# Columns that were only needed for modelling and are removed before export.
drop = ['city_id', 'temperature_celsius',
       'wind_kph', 'wind_degree', 'pressure_mb', 'precip_mm', 'humidity',
       'cloud', 'feels_like_celsius', 'visibility_km', 'uv_index', 'gust_kph',
       'air_quality_us-epa-index', 'sunrise', 'sunset', 'day_length',
       'day_length_hours']

# Rebind instead of mutating in place, and ignore columns that are already
# gone, so running this cell a second time does not raise KeyError.
data = data.drop(columns=drop, errors='ignore')

In [None]:
from google.colab import files

# Write the remaining columns to submission.csv (no index column), then
# trigger a browser download of that file.
data.to_csv('submission.csv', index=False)
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>