In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,mean_absolute_error,mean_squared_error,confusion_matrix,classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [3]:
# Load the raw weather table. The path is relative, so the CSV must sit in the
# kernel's working directory (a missing file raises FileNotFoundError here).
df = pd.read_csv("weather_prediction_dataset.csv")
# Show (rows, columns) as the cell output.
df.shape

FileNotFoundError: [Errno 2] No such file or directory: 'weather_prediction_dataset.csv'

In [None]:
# Preview the first rows of the raw table.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# List the column names (the code below expects "<CITY>_<feature>" names).
df.columns

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Preview the first rows again after the null check.
df.head()

In [None]:
#corr_data = df.drop('outlook',axis=1)
#corr_data.corr().head()

In [None]:
#sns.heatmap(corr_data.corr().head(),annot=True)

In [None]:
# Class balance of the derived "outlook" label.
# The column is only added to `df` by the labelling cell further down, so on a
# fresh kernel it does not exist yet; report that instead of raising KeyError.
if "outlook" in df.columns:
    # Pass the variable by keyword: seaborn >= 0.12 treats the first positional
    # argument as `data`, not as the x variable.
    sns.countplot(x="outlook", data=df)
else:
    print("Column 'outlook' not found - run the labelling cell first, then re-run this cell.")

In [None]:
# Basel humidity against pressure, coloured by precipitation.
sns.scatterplot(
    data=df,
    x="BASEL_humidity",
    y="BASEL_pressure",
    hue="BASEL_precipitation",
)

In [None]:
# Pairwise relationships between a handful of Basel measurements.
df_BASEL = df.loc[
    :,
    [
        "BASEL_humidity",
        "BASEL_pressure",
        "BASEL_precipitation",
        "BASEL_cloud_cover",
        "BASEL_sunshine",
    ],
]
sns.pairplot(df_BASEL)

In [None]:
# Un-prefixed measurement names; each city's columns are "<CITY>_<name>".
features = [
    "cloud_cover",
    "humidity",
    "pressure",
    "global_radiation",
    "precipitation",
    "sunshine",
    "temp_mean",
    "temp_max",
]
# Fully qualified column names for two of the cities.
base_features = ["BASEL_" + name for name in features]
stockholm_features = ["STOCKHOLM_" + name for name in features]

def outlook(row, city_prefix, noise_prob = 0.1):
    """Derive a coarse weather label for one row of the wide weather table.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series)
        Must provide "<city_prefix>_precipitation", "<city_prefix>_cloud_cover",
        "<city_prefix>_global_radiation" and "<city_prefix>_sunshine".
    city_prefix : str
        Column prefix of the city to label, e.g. "BASEL".
    noise_prob : float, default 0.1
        Probability of replacing the rule-based label with one drawn uniformly
        from the four classes (label noise). Use 0 to disable.

    Returns
    -------
    str
        One of "Rainy", "Cloudy", "Clear" or "Uncertain".

    Notes
    -----
    Randomness comes from NumPy's global RNG; seed it (``np.random.seed``)
    before labelling if reproducible labels are required.
    """
    # Rescale the stored values with the factors this notebook assumes.
    # NOTE(review): the factors are taken as-is from the original code; confirm
    # them against the dataset's unit documentation.
    precip_mm = row[f"{city_prefix}_precipitation"] * 10       # mm
    cloud_cover_pct = (row[f"{city_prefix}_cloud_cover"] / 8) * 100  # %
    global_rad_wm2 = row[f"{city_prefix}_global_radiation"] * 100     # W/m²
    sunshine_hours = row[f"{city_prefix}_sunshine"] * 0.1             # hours

    # Label noise: previously the random pick was computed and thrown away, so
    # noise_prob had no effect. Return it so the noise is actually applied.
    if np.random.rand() < noise_prob:
        return str(np.random.choice(["Rainy", "Cloudy", "Clear", "Uncertain"]))

    # Thresholds based on converted values; first matching rule wins.
    if precip_mm > 1.0:
        return "Rainy"
    elif cloud_cover_pct > 70 and global_rad_wm2 < 120:
        return "Cloudy"
    elif sunshine_hours > 5 and cloud_cover_pct < 30 and global_rad_wm2 > 150:
        return "Clear"
    else:
        return "Uncertain"
from sklearn.preprocessing import LabelEncoder

# Generate labels for BASEL
city = "BASEL"
# outlook() draws from NumPy's global RNG for its label-noise step; seed it so
# re-running this cell consumes the same random sequence every time.
np.random.seed(42)
df["outlook"] = df.apply(lambda row: outlook(row, city), axis=1)

# Model inputs: only columns that outlook() does not read when deriving the label.
safe_features = ["humidity", "pressure", "temp_mean", "temp_max", "temp_min"]
X = df[[f"{city}_{f}" for f in safe_features]]
y = df["outlook"]

# Integer-encode the string labels for scikit-learn.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Shuffled 80/20 train/test split of the Basel data with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, shuffle = True)

In [None]:
# Number of training rows.
len(X_train)

In [None]:
# Number of held-out test rows.
len(X_test)

In [None]:
# Confirm which columns ended up in the feature matrix.
X.columns

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

In [None]:
def run_model(grid, X_train, X_test, y_train, y_test):
    """Fit a search object, report test-set metrics, and return the winner.

    `grid` is fitted on the training split; its `best_estimator_` is then
    scored on the test split. The best parameters, the error rate
    (1 - accuracy), the classification report and the accuracy are printed,
    in that order.

    Returns the fitted best estimator.
    """
    grid.fit(X_train, y_train)

    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)

    # Accuracy is needed for both the error line and the accuracy line.
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Best Parameters: {grid.best_params_}")
    print(f"Error: {1 - accuracy}")
    print(classification_report(y_test, y_pred))
    print(f"Accuracy: {accuracy}")

    return best_model

In [None]:
import pandas as pd

# City prefixes to pull out of the wide table. Each is combined with the
# feature names below as "<city>_<feature>".
# NOTE(review): verify these match the CSV's column prefixes exactly (spelling,
# space vs. underscore); cities whose columns are not found are skipped by the
# loop below with a printed message rather than raising.
cities = ["BASEL", "BUDAPEST", "DE BILT", "DUSSELDORF", "DRESDEN", "HEATHROW",
          "KASSEL", "MAASTRICHT", "MALMO", "MONTELIMAR", "MUNCHEN", "OSLO",
          "PERPIGNAN", "ROMA", "STOCKHOLM", "SONNBLICK", "TOURS", "LJUBLJANA"]

# Per-city measurements used as model inputs (this rebinds the earlier `features` list).
features = ["humidity", "pressure", "temp_mean", "temp_max", "temp_min"]

def outlook_simple(row):
    """Label one observation using only humidity and the daily temperature span.

    `row` must support `.get`; missing keys fall back to humidity 0,
    temp_max 16 and temp_min 7.7. Humidity is treated as a fraction and is
    scaled to percent internally. Proxy values for precipitation, cloud cover,
    radiation and sunshine are derived from it and thresholded.

    Returns "Rainy", "Cloudy", "Clear" or "Uncertain".
    """
    rel_humidity = row.get("humidity", 0)
    t_high = row.get("temp_max", 16)
    t_low = row.get("temp_min", 7.7)

    # Derived proxy quantities (all clamped at zero, cloud cover also at 100).
    humidity_pct = rel_humidity * 100
    rain_proxy = max(0, 15 - humidity_pct / 5)
    temp_span = t_high - t_low
    cloud_pct = min(100, max(0, humidity_pct + (10 - temp_span) * 4))
    radiation = max(0, 250 - cloud_pct * 2)
    sun_hours = max(0, 10 - cloud_pct / 10)

    # Rules are checked in priority order; the first hit decides the label.
    if rain_proxy > 3:
        return "Rainy"
    if cloud_pct > 60 and radiation < 150:
        return "Cloudy"
    # Relaxed Clear condition
    if sun_hours > 4 and cloud_pct < 50 and radiation > 150:
        return "Clear"
    return "Uncertain"

# Collect one frame per city, then stack them into a single long table.
all_rows = []

for city in cities:
    city_cols = [f"{city}_{feature}" for feature in features]

    # Cities lacking any expected column are reported and skipped.
    if not all(col in df.columns for col in city_cols):
        print(f"Skipping {city} due to missing features.")
        missing_cols = [col for col in city_cols if col not in df.columns]
        print(f"Missing columns for {city}: {missing_cols}")
        continue

    # Strip the city prefix so every city shares the same column names,
    # and remember which city each row came from.
    city_data = df[city_cols].set_axis(features, axis=1).assign(city=city)

    # Rule-based label for every row of this city.
    city_data["outlook"] = city_data.apply(outlook_simple, axis=1)

    all_rows.append(city_data)


# Stack the per-city frames with a fresh 0..n-1 index.
df_long = pd.concat(all_rows, ignore_index=True)

print(df_long.head())

In [None]:
# Rows contributed by each city.
df_long['city'].value_counts()

In [None]:
# Class balance of the generated labels.
df_long['outlook'].value_counts()

In [None]:
# First ten rows of the long-format table.
df_long.head(10)

In [None]:
# Summary statistics of the pooled features.
df_long.describe()

In [None]:
# Missing values per column in the long-format table.
df_long.isnull().sum()

In [None]:
# Keep only complete rows (row labels are left as they were).
df_long = df_long.dropna()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.preprocessing import LabelEncoder
# Use one encoder per column. Previously a single encoder was refit for both
# columns, so the city mapping was lost and `le` only happened to describe the
# outlook classes because that column was encoded last.
city_le = LabelEncoder()
df_long['city'] = city_le.fit_transform(df_long['city'])

# `le` is the outlook encoder that gets persisted and used for inverse_transform later.
le = LabelEncoder()
df_long['outlook'] = le.fit_transform(df_long['outlook'])

In [None]:
# Target is the encoded outlook; inputs are every remaining column except the city id.
y = df_long["outlook"]
X = df_long.drop(columns=["outlook", "city"])

In [None]:
# Shuffled 80/20 split of the pooled multi-city data; this replaces the earlier Basel-only split.
X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.2,random_state=42,shuffle=True)

In [None]:
# Sizes of the train and test partitions.
len(X_train) , len(X_test)

In [None]:
# Decision tree with class re-weighting; search split criterion and depth (5-fold CV).
model = DecisionTreeClassifier(random_state=42, class_weight="balanced")
param_grid = {
    "criterion": ["gini", "entropy", "log_loss"],
    "max_depth": [2, 3, 5, 10],
}
grid = GridSearchCV(model, param_grid, cv=5)
run_model(grid, X_train, X_test, y_train, y_test)

In [None]:
# Random forest (100 trees, class re-weighting); search criterion, depth and leaf budget.
model_rf = RandomForestClassifier(
    n_estimators=100,
    class_weight="balanced",
    random_state=42,
)
param_grid = {
    "criterion": ["gini", "entropy", "log_loss"],
    "max_depth": [1, 5, 8, 10],
    "max_leaf_nodes": [2, 3, 4],
}
grid_rf = GridSearchCV(model_rf, param_grid)
run_model(grid_rf, X_train, X_test, y_train, y_test)

In [None]:
# AdaBoost; only the ensemble size is tuned.
model_boost = AdaBoostClassifier(random_state=42)
param_grid = {
    "n_estimators": [1, 5, 8, 10, 50, 80, 100],
}
grid_boost = GridSearchCV(model_boost, param_grid)
run_model(grid_boost, X_train, X_test, y_train, y_test)

In [None]:
# Gradient boosting; search ensemble size and split-quality criterion.
model_gboost = GradientBoostingClassifier(random_state=42)
param_grid = {
    "loss": ["log_loss"],
    "n_estimators": [1, 5, 8, 10, 50, 80, 100],
    "criterion": ["friedman_mse", "squared_error"],
}
grid_gboost = GridSearchCV(model_gboost, param_grid)
run_model(grid_gboost, X_train, X_test, y_train, y_test)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,mean_absolute_error,mean_squared_error,confusion_matrix,classification_report

# Standardize features: statistics come from the training split only and are
# then applied unchanged to the test split.
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

# Logistic regression on the standardized features; search regularization strength and solver.
model_lr = LogisticRegression(
    max_iter=10000,
    class_weight="balanced",
    random_state=42,
)
param_grid = {
    "C": [1, 5, 8, 10],
    "solver": ["lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga"],
}
grid_lr = GridSearchCV(model_lr, param_grid)
run_model(grid_lr, scaled_X_train, scaled_X_test, y_train, y_test)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-NN (k=5) on the standardized features; search vote weighting and neighbour-search algorithm.
model_knn = KNeighborsClassifier(n_neighbors=5, leaf_size=5)
param_grid = {
    "weights": ["uniform", "distance"],
    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
}
grid_knn = GridSearchCV(model_knn, param_grid)
run_model(grid_knn, scaled_X_train, scaled_X_test, y_train, y_test)

In [None]:
from sklearn.svm import SVC
# Support vector classifier on the standardized features; search C (1..9) and kernel.
model_svc = SVC()
param_grid = {
    "C": [1, 2, 3, 4, 5, 6, 7, 8, 9],
    "kernel": ["linear", "poly", "rbf"],
}
grid_svc = GridSearchCV(model_svc, param_grid)
run_model(grid_svc, scaled_X_train, scaled_X_test, y_train, y_test)

In [None]:
import joblib
from sklearn.pipeline import make_pipeline

# The SVC grid was fitted on standardized inputs (scaled_X_train), but the
# serving code calls model.predict() on raw feature values. Persist the fitted
# scaler together with the classifier so the same standardization is applied
# at prediction time.
# Reuse the estimator selected by the already-fitted grid instead of running
# the whole grid search a second time.
model = make_pipeline(scaler, grid_svc.best_estimator_)

joblib.dump(model, 'model.pkl')
# `le` maps the integer predictions back to outlook names.
joblib.dump(le, 'label_encoder.pkl')

In [None]:
import numpy as np
import requests
import joblib

# Load the persisted classifier and label encoder from the working directory.
# NOTE: joblib files are pickles - only load artifacts produced by this notebook.
svc_model = joblib.load("model.pkl")     # written by the cell that dumps the model
le = joblib.load("label_encoder.pkl")             # maps integer predictions back to label strings

# Order of the features in the row passed to predict().
safe_features = ["humidity", "pressure", "temp_mean", "temp_max", "temp_min"]

def fetch_weather_data(lat, lon):
    """Fetch current conditions for a coordinate from WeatherAPI.

    Parameters
    ----------
    lat, lon : float
        Latitude and longitude of the location.

    Returns
    -------
    dict
        Keys "temp_max", "temp_min", "temp_mean", "humidity", "pressure".
        The endpoint reports only the current temperature, so it is used as
        temp_max and temp_min is approximated as 2 degrees lower. Humidity is
        passed through as returned by the API; pressure is the API's millibar
        value divided by 10.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts, or a non-2xx HTTP response.
    """
    import os  # local import keeps this cell self-contained

    # Prefer a key supplied through the environment.
    # NOTE(security): the literal fallback is kept only so existing runs keep
    # working; a key committed to a notebook should be rotated and removed.
    API_KEY = os.environ.get("WEATHERAPI_KEY", "17b2877bc8114065a27174823251805")

    # HTTPS keeps the key out of clear text; the timeout stops a stalled
    # connection from hanging the caller indefinitely.
    response = requests.get(
        "https://api.weatherapi.com/v1/current.json",
        params={"key": API_KEY, "q": f"{lat},{lon}"},
        timeout=10,
    )
    # Surface API errors (bad key, quota, bad location) as an HTTP error
    # instead of a confusing KeyError on the payload below.
    response.raise_for_status()
    res = response.json()

    temp_max = res['current']['temp_c']
    temp_min = temp_max - 2
    temp_mean = (temp_max + temp_min) / 2

    weather_data = {
        "temp_max": temp_max,
        "temp_min": temp_min,
        "temp_mean": temp_mean,
        "humidity": res['current']['humidity'],
        "pressure": res['current']['pressure_mb'] / 10,
    }
    return weather_data

def xyz(mode, temp_min, temp_max, temp_mean, humidity, pressure, lat, lon):
    """Predict the outlook label from manual inputs or live weather data.

    Parameters
    ----------
    mode : str
        "Auto" fetches current conditions for (lat, lon); any other value uses
        the manually supplied measurements.
    temp_min, temp_max, temp_mean : float
        Temperatures (manual mode).
    humidity : float
        Relative humidity in percent (manual mode).
    pressure : float
        Pressure (manual mode).
    lat, lon : float or None
        Coordinates, required in "Auto" mode.

    Returns
    -------
    str
        The predicted label, or a short message when inputs are missing.
    """
    if mode == "Auto":
        if lat is None or lon is None:
            return "Please provide lat and lon"
        data = fetch_weather_data(lat, lon)
    else:
        data = {
            "temp_max": temp_max,
            "temp_min": temp_min,
            "temp_mean": temp_mean,
            "humidity": humidity,
            "pressure": pressure,
        }

    for feature in safe_features:
        if feature not in data or data[feature] is None:
            return "Missing data in input."

    # Humidity arrives here in percent, while the labelling code used for
    # training treats the humidity feature as a fraction (it multiplies by 100
    # to obtain percent). Bring it onto the training scale before predicting.
    model_inputs = dict(data)
    model_inputs["humidity"] = data["humidity"] / 100
    # NOTE(review): pressure is passed through unchanged; confirm that its
    # units match the pressure columns the model was trained on.

    X = np.array([[model_inputs[f] for f in safe_features]])
    y_pred = svc_model.predict(X)
    return le.inverse_transform(y_pred)[0]

In [None]:
# Smoke test of "Auto" mode: manual inputs are None, so the values are fetched for the given coordinates.
xyz("Auto", None, None, None, None, None, 28.6139, 77.2090)

In [None]:
import os
# Create the app folder; exist_ok makes re-running this cell harmless.
os.makedirs("Weather_Predictor",exist_ok=True)

In [None]:
%%writefile Weather_Predictor/app.py
import gradio as gr
import numpy as np
import pandas as pd
import joblib
import requests

# Load the persisted classifier and label encoder.
# The paths are relative, so they resolve against the process's working
# directory - presumably the app is started from inside Weather_Predictor/,
# where the notebook copies both files; verify for the deployment target.
model = joblib.load("model.pkl")
le = joblib.load("label_encoder.pkl")
# Order of the features in the row passed to predict().
safe_features = ["humidity", "pressure", "temp_mean", "temp_max", "temp_min"]

def fetch_weather_data(lat, lon):
    """Fetch current conditions for a coordinate from WeatherAPI.

    Parameters
    ----------
    lat, lon : float
        Latitude and longitude of the location.

    Returns
    -------
    dict
        Keys "temp_max", "temp_min", "temp_mean", "humidity", "pressure".
        The endpoint reports only the current temperature, so it is used as
        temp_max and temp_min is approximated as 2 degrees lower. Humidity is
        passed through as returned by the API; pressure is the API's millibar
        value divided by 10.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts, or a non-2xx HTTP response.
    """
    import os  # local import so the function does not rely on module-level imports

    # Prefer a key supplied through the environment.
    # NOTE(security): the literal fallback is kept only so existing deployments
    # keep working; a key committed to source should be rotated and removed.
    API_KEY = os.environ.get("WEATHERAPI_KEY", "17b2877bc8114065a27174823251805")

    # HTTPS keeps the key out of clear text; the timeout stops a stalled
    # connection from hanging the request handler indefinitely.
    response = requests.get(
        "https://api.weatherapi.com/v1/current.json",
        params={"key": API_KEY, "q": f"{lat},{lon}"},
        timeout=10,
    )
    # Surface API errors (bad key, quota, bad location) as an HTTP error
    # instead of a confusing KeyError on the payload below.
    response.raise_for_status()
    res = response.json()

    temp_max = res['current']['temp_c']
    temp_min = temp_max - 2
    temp_mean = (temp_max + temp_min) / 2

    weather_data = {
        "temp_max": temp_max,
        "temp_min": temp_min,
        "temp_mean": temp_mean,
        "humidity": res['current']['humidity'],
        "pressure": res['current']['pressure_mb'] / 10,
    }
    return weather_data

def xyz(mode, temp_min, temp_max, temp_mean, humidity, pressure, lat, lon):
    """Predict the outlook label from manual inputs or live weather data.

    Parameters
    ----------
    mode : str
        "Auto" fetches current conditions for (lat, lon); any other value uses
        the manually supplied measurements.
    temp_min, temp_max, temp_mean : float
        Temperatures (manual mode).
    humidity : float
        Relative humidity in percent, as labelled in the UI (manual mode).
    pressure : float
        Pressure (manual mode).
    lat, lon : float or None
        Coordinates, required in "Auto" mode.

    Returns
    -------
    str
        The predicted label, or a short message when inputs are missing.
    """
    if mode == "Auto":
        if lat is None or lon is None:
            return "Please provide lat and lon"
        data = fetch_weather_data(lat, lon)
    else:
        data = {
            "temp_max": temp_max,
            "temp_min": temp_min,
            "temp_mean": temp_mean,
            "humidity": humidity,
            "pressure": pressure,
        }

    for feature in safe_features:
        if feature not in data or data[feature] is None:
            return "Missing data in input."

    # Humidity arrives here in percent (UI field and API value), while the
    # training notebook's labelling code treats the humidity feature as a
    # fraction. Bring it onto the training scale before predicting.
    model_inputs = dict(data)
    model_inputs["humidity"] = data["humidity"] / 100
    # NOTE(review): pressure is passed through unchanged; confirm that its
    # units match the pressure columns the model was trained on.

    X = np.array([[model_inputs[f] for f in safe_features]])
    y_pred = model.predict(X)
    return le.inverse_transform(y_pred)[0]

# Gradio form wired to xyz(); the input widgets are listed in the same order
# as xyz's positional parameters (mode, temp_min, temp_max, temp_mean,
# humidity, pressure, lat, lon).
interface = gr.Interface(
    fn=xyz,
    inputs=[
        gr.Radio(["Manual", "Auto"], label="Mode"),
        gr.Number(label="Temp Min (°C)", value=20),
        gr.Number(label="Temp Max (°C)", value=25),
        gr.Number(label="Temp Mean (°C)", value=22.5),
        gr.Number(label="Humidity (%)", value=60),
        gr.Number(label="Pressure (kPa)", value=101.3),
        # Coordinates start empty; xyz() only requires them in "Auto" mode.
        gr.Number(label="Latitude", value=None),
        gr.Number(label="Longitude", value=None),
    ],
    outputs="text",
    title="Weather Condition Predictor",
    description="Predicts weather condition using either manual or automatic weather data input."
)
# Runs at import time (no __main__ guard). share=True asks Gradio for a public
# share link in addition to the local server.
interface.launch(share=True)

In [None]:
%%writefile Weather_Predictor/requirements.txt
gradio
requests
joblib
numpy
pandas
scikit-learn

In [None]:
import joblib
# Write the artifacts next to app.py, which loads them by bare file name.
# `model` and `le` are whatever those notebook variables hold when this cell runs.
joblib.dump(model, 'Weather_Predictor/model.pkl')
joblib.dump(le, 'Weather_Predictor/label_encoder.pkl')

In [None]:
%%writefile Weather_Predictor/model.py