# INTELLIHACK 5.0 - TEAM HYPER TUNERS
## Task 1 - Part 1: Weather Prediction Model - Training file

In [None]:
# --- Install Required Libraries ---
!pip install lightgbm optuna

# --- Import Libraries ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, confusion_matrix
from lightgbm import LGBMClassifier
import optuna
from datetime import timedelta
from google.colab import drive

# --- Mount Google Drive ---
# Colab-only step: exposes the user's Drive under /content/drive.
drive.mount('/content/drive')

# --- Step 1: Load the Dataset ---
# Seed NumPy's global RNG so the np.random draws made later in this script repeat.
np.random.seed(42)
file_path = '/content/weather_data.csv'  # Update this path
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print("Error: File not found. Please provide the correct path to 'weather_data.csv'.")
    # Abort with a failing status. The previous bare `exit()` is a site-module
    # helper meant for interactive shells and signalled success (status 0).
    raise SystemExit(1)
else:
    # Only reached when the CSV was read; keeps the try body to the one call that can fail.
    print("Dataset Loaded Successfully. Shape:", df.shape)
    print("First 5 Rows:\n", df.head())

# --- Step 2: Preprocessing ---
numeric_cols = ['avg_temperature', 'humidity', 'avg_wind_speed', 'cloud_cover', 'pressure']
# Clamp out-of-range readings; NaNs pass through clip() untouched.
# NOTE(review): the lower bound of 0 also flattens sub-zero temperatures --
# confirm that is acceptable for this dataset's temperature unit/range.
df[numeric_cols] = df[numeric_cols].clip(lower=0)
df['humidity'] = df['humidity'].clip(upper=100)
df['cloud_cover'] = df['cloud_cover'].clip(upper=100)
# Any label other than 'Rain' / 'No Rain' (including a missing one) maps to NaN.
df['rain_or_not'] = df['rain_or_not'].map({'Rain': 1, 'No Rain': 0})

print("\nMissing Values Before Preprocessing:\n", df.isnull().sum())
df = df.infer_objects()

# Record unlabeled rows *before* any filling so that a label is never invented.
label_missing = df['rain_or_not'].isnull()

# Interpolate the measured numeric features only. Interpolating the whole frame
# would turn a missing 0/1 label into a fractional "class" and would also touch
# the object-dtype 'date' column, which recent pandas versions deprecate.
df[numeric_cols] = df[numeric_cols].interpolate(method='linear')

# Forward/backward fill covers leading gaps that interpolation cannot reach and
# any gaps in the remaining non-label columns; the label column is left alone.
non_label_cols = [col for col in df.columns if col != 'rain_or_not']
df[non_label_cols] = df[non_label_cols].ffill().bfill()

# Rows without a usable label cannot be used for supervised training.
if label_missing.any():
    print(f"Dropping {int(label_missing.sum())} row(s) with a missing or unrecognised 'rain_or_not' label.")
    df = df.loc[~label_missing].copy()
df['rain_or_not'] = df['rain_or_not'].astype(int)
print("\nAfter Interpolation and Fill:\n", df.isnull().sum())

# After interpolate + ffill + bfill a numeric column can still contain NaN only
# if it had no observed value at all; no imputer can recover such a feature, so
# fail loudly instead of continuing with an unusable column.
empty_numeric_cols = [col for col in numeric_cols if df[col].isnull().any()]
if empty_numeric_cols:
    raise ValueError(
        f"Numeric column(s) without any observed values: {empty_numeric_cols}"
    )

df.reset_index(drop=True, inplace=True)

# --- Step 3: Feature Engineering ---
# Parse the date strings so calendar parts can be extracted.
df['date'] = pd.to_datetime(df['date'])

# Derived predictors: a temperature x humidity product, and cloud cover relative
# to pressure (the small epsilon guards against division by zero).
df['temp_humidity_interaction'] = df['avg_temperature'].mul(df['humidity'])
df['cloud_pressure_ratio'] = df['cloud_cover'].div(df['pressure'].add(1e-6))
df['month'] = df['date'].dt.month

# Model inputs: the five measured columns followed by the three engineered ones.
measured_feature_names = [
    'avg_temperature', 'humidity', 'avg_wind_speed', 'cloud_cover', 'pressure',
]
engineered_feature_names = [
    'temp_humidity_interaction', 'cloud_pressure_ratio', 'month',
]
features = measured_feature_names + engineered_feature_names

X = df.loc[:, features]
y = df.loc[:, 'rain_or_not']

# --- Step 4: Train-Test Split and Scaling ---
# Hold out 20% of the rows, keeping the class balance of y in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=100,
    stratify=y,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Step 5: Define Optuna Objective Function ---
def objective(trial):
    """Score one Optuna trial by cross-validated accuracy on the training split.

    Args:
        trial: The ``optuna`` trial used to sample LightGBM hyper-parameters.

    Returns:
        Mean accuracy over a stratified 5-fold cross-validation of
        ``X_train_scaled`` / ``y_train``.

    The held-out test split is deliberately not used here: selecting
    hyper-parameters on the test rows would leak them into the model choice and
    make the test metrics reported afterwards optimistically biased.
    """
    # Function-scope import keeps this block self-contained; the module is
    # already loaded by the file's top-level scikit-learn imports.
    from sklearn.model_selection import StratifiedKFold, cross_val_score

    # NOTE(review): LightGBM applies `subsample` only when bagging is enabled
    # (`subsample_freq` > 0); with the default frequency this search dimension
    # may have no effect -- confirm against the LightGBM parameter docs.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 50),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 30),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 1),
    }

    lgb_model = LGBMClassifier(**params, random_state=100, verbose=-1)
    # Fixed, shuffled folds so every trial is compared on the same partitions.
    cv_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=100)
    fold_scores = cross_val_score(
        lgb_model, X_train_scaled, y_train, cv=cv_folds, scoring='accuracy'
    )
    return fold_scores.mean()

# --- Step 6: Run Optuna Optimization ---
# Seed the sampler so repeated runs explore the same sequence of trials.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

print("\nBest Parameters Found by Optuna for LightGBM:")
print(study.best_params)
# The objective value is a cross-validation score, so it is labelled as such.
print("Best CV Accuracy:", study.best_value)

# --- Step 7: Train Final Model with Best Parameters ---
# Copy exactly the tuned hyper-parameters out of the finished study.
tuned_param_names = (
    'n_estimators',
    'max_depth',
    'learning_rate',
    'num_leaves',
    'min_child_samples',
    'subsample',
    'colsample_bytree',
    'reg_alpha',
    'reg_lambda',
)
tuned_values = study.best_params
final_model_kwargs = {name: tuned_values[name] for name in tuned_param_names}

best_lgb_model = LGBMClassifier(**final_model_kwargs, random_state=100, verbose=-1)
best_lgb_model.fit(X_train_scaled, y_train)

# Hard class predictions for both splits.
train_pred = best_lgb_model.predict(X_train_scaled)
test_pred = best_lgb_model.predict(X_test_scaled)

print("\nTuned LightGBM Results:")
print("Train Accuracy:", accuracy_score(y_train, train_pred))
print("Test Accuracy:", accuracy_score(y_test, test_pred))
print("Classification Report:\n", classification_report(y_test, test_pred))

# ROC AUC is computed from the predicted probability of class 1 (second column).
test_positive_proba = best_lgb_model.predict_proba(X_test_scaled)[:, 1]
print("ROC AUC Score:", roc_auc_score(y_test, test_positive_proba))

# --- Step 8: Feature Importance ---
# Pair each importance value with its feature name, largest first.
feature_importance = pd.Series(
    best_lgb_model.feature_importances_,
    index=features,
).sort_values(ascending=False)

# Bar chart drawn through the object-oriented matplotlib interface.
fig, ax = plt.subplots(figsize=(10, 6))
feature_importance.plot(kind='bar', ax=ax)
ax.set_title("Feature Importance in LightGBM Model")
ax.set_xlabel("Features")
ax.set_ylabel("Importance")
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

# --- Step 9: Probability Output for Next 21 Days ---
# The inputs below are synthetic: each measured feature is the mean of its most
# recent values plus small Gaussian noise. They are placeholders, not forecasts.
forecast_horizon = 21
last_date = df['date'].max()
future_dates = [last_date + timedelta(days=i) for i in range(1, forecast_horizon + 1)]

# Only the measured features are sampled; everything derived from them (or from
# the calendar) is recomputed afterwards so each row is internally consistent.
derived_features = {'temp_humidity_interaction', 'cloud_pressure_ratio', 'month'}
measured_features = [col for col in features if col not in derived_features]

future_data = pd.DataFrame({
    col: df[col].tail(forecast_horizon).mean()
    + np.random.normal(0, df[col].std() / 10, forecast_horizon)
    for col in measured_features
})

# Apply the same bounds that preprocessing enforces on the training data.
future_data = future_data.clip(lower=0)
future_data['humidity'] = future_data['humidity'].clip(upper=100)
future_data['cloud_cover'] = future_data['cloud_cover'].clip(upper=100)

# Recompute engineered features with the same formulas used for training, and
# take the month from the actual forecast dates rather than sampling it.
future_data['temp_humidity_interaction'] = future_data['avg_temperature'] * future_data['humidity']
future_data['cloud_pressure_ratio'] = future_data['cloud_cover'] / (future_data['pressure'] + 1e-6)
future_data['month'] = pd.DatetimeIndex(future_dates).month

# Restore the exact column order the scaler and model were fitted with.
future_data = future_data[features]
future_data_scaled = scaler.transform(future_data)

future_probabilities = best_lgb_model.predict_proba(future_data_scaled)[:, 1]
future_predictions = pd.DataFrame({
    'Date': future_dates,
    'Rain_Probability': future_probabilities
})
# A probability of at least 0.5 is reported as rain.
future_predictions['Rain_or_No_Rain'] = np.where(
    future_predictions['Rain_Probability'] >= 0.5, 'Rain', 'No Rain'
)

print("\nRain Probabilities for Next 21 Days:\n", future_predictions)

# Visualize predictions
# Line chart of the predicted rain probability per date, with the 0.5 decision
# threshold drawn as a dashed reference line.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    future_predictions['Date'],
    future_predictions['Rain_Probability'],
    marker='o',
)
ax.axhline(0.5, color='red', linestyle='--', label='Threshold (0.5)')
ax.set_title("Rain Probability for Next 21 Days (LightGBM Model)")
ax.set_xlabel("Date")
ax.set_ylabel("Probability of Rain")
plt.setp(ax.get_xticklabels(), rotation=45)
ax.legend()
fig.tight_layout()
plt.show()