In [1]:
# Imports
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

print("✅ Libraries loaded successfully!")


✅ Libraries loaded successfully!


In [4]:
import pandas as pd
from sklearn.linear_model import LogisticRegression

# Training
X_train = pd.DataFrame({
    "wave_height": [1, 2, 3, 4, 5],
    "wind_speed": [10, 20, 30, 40, 50]
})
y_train = [0, 0, 1, 1, 1]

model = LogisticRegression()
model.fit(X_train, y_train)

# Prediction (with same column names)
X_new = pd.DataFrame({
    "wave_height": [6],
    "wind_speed": [60]
})
print(model.predict(X_new))


[1]


In [5]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import joblib

# Example dataset
data = pd.DataFrame({
    "wave_height": [1, 2, 3, 6, 7, 8],
    "wind_speed": [10, 20, 25, 55, 65, 70],
    "alert": [0, 0, 0, 1, 1, 1]
})

X = data[["wave_height", "wind_speed"]]
y = data["alert"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)

# Save model
joblib.dump(model, "alert_model.pkl")


['alert_model.pkl']

In [3]:
import pandas as pd
import os

# Load all available datasets
dfs = []

# 1. Load station/environmental data
if os.path.exists("Dataset_for_chatbot.csv"):
    station = pd.read_csv("Dataset_for_chatbot.csv")
    station["date"] = pd.to_datetime(station["date"], errors="coerce").dt.date
    dfs.append(station)

# 2. Load weather data with timestamp
if os.path.exists("weather_data_clean.csv"):
    weather = pd.read_csv("weather_data_clean.csv")
    if "timestamp" in weather.columns:
        weather["date"] = pd.to_datetime(weather["timestamp"], errors="coerce").dt.date
    dfs.append(weather)

# 3. Load rainfall data if available
if os.path.exists("weather_data_with_rainfall.csv"):
    rain = pd.read_csv("weather_data_with_rainfall.csv")
    if "timestamp" in rain.columns:
        rain["date"] = pd.to_datetime(rain["timestamp"], errors="coerce").dt.date
    dfs.append(rain)

# 4. Merge all on 'date' and 'city' where possible
from functools import reduce

def merge_dfs(left, right):
    common_cols = [col for col in ["date", "city"] if col in left.columns and col in right.columns]
    if common_cols:
        return pd.merge(left, right, on=common_cols, how="outer")
    else:
        return pd.merge(left, right, left_index=True, right_index=True, how="outer")

final = reduce(merge_dfs, dfs)

# Remove duplicate columns if any (from merges)
final = final.loc[:,~final.columns.duplicated()]

# Save the combined dataset
final.to_csv("final_training_dataset.csv", index=False)
print("Combined dataset saved as final_training_dataset.csv")

Combined dataset saved as final_training_dataset.csv


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load your dataset
df = pd.read_csv("final_training_dataset.csv")

# Drop columns not useful for training (IDs, timestamps, text, city, etc.)
drop_cols = [col for col in df.columns if
             'timestamp' in col or
             'weather' in col or
             'station_id' in col or
             'city' in col or
             'date' in col or
             'longitude' in col or
             'latitude' in col]
df = df.drop(columns=drop_cols, errors='ignore')

# Drop rows with missing target
df = df.dropna(subset=['anomaly'])

# Fill missing values in features
df = df.fillna(0)

# Keep only numeric columns for features
X = df.drop(columns=['anomaly'])
X = X.select_dtypes(include=['number'])
y = df['anomaly']

# Split and train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       337
         1.0       1.00      1.00      1.00         4

    accuracy                           1.00       341
   macro avg       1.00      1.00      1.00       341
weighted avg       1.00      1.00      1.00       341



In [None]:
from flask import Flask, request, jsonify
import pandas as pd
from geopy.distance import geodesic

app = Flask(__name__)
file_path = 'final_training_dataset.csv'
# Load your dataset once (or query your DB)
df = pd.read_csv(file_path)

@app.route('/check_anomaly', methods=['POST'])
def check_anomaly():
    user_lat = float(request.json['latitude'])
    user_lon = float(request.json['longitude'])
    radius_km = 10  # Alert radius

    # Filter for anomalies
    anomalies = df[df['anomaly'] == 1]

    # Check for nearby anomalies
    for _, row in anomalies.iterrows():
        anomaly_loc = (row['latitude'], row['longitude'])
        user_loc = (user_lat, user_lon)
        if geodesic(anomaly_loc, user_loc).km <= radius_km:
            return jsonify({"alert": True, "message": "Anomaly detected nearby!"})

    return jsonify({"alert": False, "message": "No anomalies nearby."})

if __name__ == '__main__':
    app.run()

In [2]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Set the path to the file you'd like to load
file_path = "weatherHistory.csv"

# Load the latest version
df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "muthuj7/weather-dataset",
  file_path,
  # Provide any additional arguments like 
  # sql_query or pandas_kwargs. See the 
  # documenation for more information:
  # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
)

print("First 5 records:", df.head())

ModuleNotFoundError: No module named 'kagglehub'

In [3]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Set the path to the file you'd like to load
file_path = "weatherHistory.csv"

# Load the latest version
df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "muthuj7/weather-dataset",
  file_path,
  # Provide any additional arguments like 
  # sql_query or pandas_kwargs. See the 
  # documenation for more information:
  # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
)

print("First 5 records:", df.head())

ModuleNotFoundError: No module named 'kagglehub'

In [1]:
!pip install kagglehub


Collecting kagglehub
  Using cached kagglehub-0.3.13-py3-none-any.whl.metadata (38 kB)
Using cached kagglehub-0.3.13-py3-none-any.whl (68 kB)
Installing collected packages: kagglehub
Successfully installed kagglehub-0.3.13



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
import pandas as pd
import joblib

# Download and load the dataset with encoding fix
file_path = "weatherHistory.csv"
df = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "muthuj7/weather-dataset",
    file_path,
    pandas_kwargs={
        "encoding": "latin1",      # or "cp1252"
        "on_bad_lines": "skip"     # skip problematic lines
    }
)
print("First 5 records:", df.head())

# Load your trained model
model = joblib.load("alert_model.pkl")

# Prepare the features for prediction (adjust column names as needed)
features = [
    'water_level_m',
    'wind_speed_m_s',
    'air_pressure_hpa',
    'chlorophyll_mg_m3',
    'rainfall'
]

# If your dataset uses different column names, rename or select accordingly
# Example: df = df.rename(columns={"Wind Speed (km/h)": "wind_speed_m_s", ...})

# Fill missing values with mean (or as appropriate)
X = df[features].fillna(df[features].mean())

# Predict anomalies
predictions = model.predict(X)
probabilities = model.predict_proba(X)[:, 1]  # Probability of anomaly

# Add predictions to DataFrame
df['anomaly_pred'] = predictions
df['anomaly_prob'] = probabilities

print(df[['anomaly_pred', 'anomaly_prob']].head())

ValueError: Error reading file: Error tokenizing data. C error: Buffer overflow caught - possible malformed input file.


In [None]:
import pandas as pd
import numpy as np
import random
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Load the dataset (robust options for messy CSVs)
df = pd.read_csv(
    "weatherHistory.csv",
    encoding="latin1",
    engine="python",
    on_bad_lines="skip"
)

# Add a synthetic 'region' column with random Indian cities
indian_cities = [
    'Mumbai', 'Delhi', 'Bangalore', 'Chennai', 'Kolkata',
    'Hyderabad', 'Pune', 'Ahmedabad', 'Jaipur', 'Lucknow'
]
df['region'] = np.random.choice(indian_cities, size=len(df))

# One-hot encode the 'region' column
df = pd.get_dummies(df, columns=['region'])

# Choose a target variable to predict (e.g., 'Temperature (C)')
target = 'Temperature (C)'

# Select only numeric columns for features (plus one-hot region columns)
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if target in numeric_cols:
    numeric_cols.remove(target)
features = numeric_cols

# Drop rows with missing values in features/target
df = df.dropna(subset=features + [target])

# Prepare data for training
X = df[features]
y = df[target]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print("MAE:", mean_absolute_error(y_test, y_pred))

# Predict for a random region in India
random_region = random.choice(indian_cities)
# Create a sample input (use the first row of X_test and set region columns)
sample = X_test.iloc[[0]].copy()
for city in indian_cities:
    col_name = f'region_{city}'
    if col_name in sample.columns:
        sample[col_name] = 1 if city == random_region else 0

sample_pred = model.predict(sample)
print(f"Predicted temperature for {random_region}: {sample_pred[0]:.2f} °C")

In [None]:
import pandas as pd
import numpy as np
import random
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Load the dataset (robust options for messy CSVs)
df = pd.read_csv(
    "weatherHistory.csv",
    encoding="latin1",
    engine="python",
    on_bad_lines="skip"
)

# Add a synthetic 'region' column with random Indian cities
indian_cities = [
    'Mumbai', 'Delhi', 'Bangalore', 'Chennai', 'Kolkata',
    'Hyderabad', 'Pune', 'Ahmedabad', 'Jaipur', 'Lucknow'
]
df['region'] = np.random.choice(indian_cities, size=len(df))

# One-hot encode the 'region' column
df = pd.get_dummies(df, columns=['region'])

# Choose a target variable to predict (e.g., 'Temperature (C)')
target = 'Temperature (C)'

# Select only numeric columns for features (plus one-hot region columns)
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if target in numeric_cols:
    numeric_cols.remove(target)
features = numeric_cols

# Drop rows with missing values in features/target
df = df.dropna(subset=features + [target])

# Prepare data for training
X = df[features]
y = df[target]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print("MAE:", mean_absolute_error(y_test, y_pred))

# Predict for a random region in India
random_region = random.choice(indian_cities)
# Create a sample input (use the first row of X_test and set region columns)
sample = X_test.iloc[[0]].copy()
for city in indian_cities:
    col_name = f'region_{city}'
    if col_name in sample.columns:
        sample[col_name] = 1 if city == random_region else 0

sample_pred = model.predict(sample)
print(f"Predicted temperature for {random_region}: {sample_pred[0]:.2f} °C")

In [None]:
import pandas as pd
import numpy as np
import random
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Load the dataset (robust options for messy CSVs)
df = pd.read_csv(
    "weatherHistory.csv",
    encoding="latin1",
    engine="python",
    on_bad_lines="skip"
)

# Add a synthetic 'region' column with random Indian cities
indian_cities = [
    'Mumbai', 'Delhi', 'Bangalore', 'Chennai', 'Kolkata',
    'Hyderabad', 'Pune', 'Ahmedabad', 'Jaipur', 'Lucknow'
]
df['region'] = np.random.choice(indian_cities, size=len(df))

# One-hot encode the 'region' column
df = pd.get_dummies(df, columns=['region'])

# Choose a target variable to predict (e.g., 'Temperature (C)')
target = 'Temperature (C)'

# Select only numeric columns for features (plus one-hot region columns)
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if target in numeric_cols:
    numeric_cols.remove(target)
features = numeric_cols

# Drop rows with missing values in features/target
df = df.dropna(subset=features + [target])

# Prepare data for training
X = df[features]
y = df[target]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print("MAE:", mean_absolute_error(y_test, y_pred))

# Predict for a random region in India
random_region = random.choice(indian_cities)
# Create a sample input (use the first row of X_test and set region columns)
sample = X_test.iloc[[0]].copy()
for city in indian_cities:
    col_name = f'region_{city}'
    if col_name in sample.columns:
        sample[col_name] = 1 if city == random_region else 0

sample_pred = model.predict(sample)
print(f"Predicted temperature for {random_region}: {sample_pred[0]:.2f} °C")

In [None]:
# Region-specific weather forecast and event probability
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, accuracy_score

# Load data
df = pd.read_csv(
    "weatherHistory.csv",
    encoding="latin1",
    engine="python",
    on_bad_lines="skip"
)

# Add synthetic region column if not present
if 'region' not in df.columns:
    indian_cities = [
        'Mumbai', 'Delhi', 'Bangalore', 'Chennai', 'Kolkata',
        'Hyderabad', 'Pune', 'Ahmedabad', 'Jaipur', 'Lucknow'
    ]
    df['region'] = np.random.choice(indian_cities, size=len(df))

# Choose region and target variable
region = 'Mumbai'  # Change as needed
target = 'Temperature (C)'  # Change to 'Precipitation (mm)' or other variable as needed

# Filter for the selected region
df_region = df[df['region'] == region]

# One-hot encode region (if needed)
if 'region' in df_region.columns:
    df_region = pd.get_dummies(df_region, columns=['region'])

# Select numeric features
numeric_cols = df_region.select_dtypes(include=[np.number]).columns.tolist()
if target in numeric_cols:
    numeric_cols.remove(target)
features = numeric_cols

# Drop missing values
df_region = df_region.dropna(subset=features + [target])

# Prepare data
X = df_region[features]
y = df_region[target]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Regression: Predict weather variable (e.g., temperature)
reg = RandomForestRegressor(n_estimators=20, random_state=42)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
print(f"{region} {target} MAE:", mean_absolute_error(y_test, y_pred))
print(f"Predicted {target} for {region}: {y_pred[0]:.2f}")

# Classification: Probability of a weather event (e.g., rain)
# Example: Predict if 'Precipitation (mm)' > 0 (chance of rain)
event = 'Precipitation (mm)'
if event in df_region.columns:
    df_region['rain'] = (df_region[event] > 0).astype(int)
    Xc = df_region[features]
    yc = df_region['rain']
    Xc_train, Xc_test, yc_train, yc_test = train_test_split(Xc, yc, test_size=0.2, random_state=42)
    clf = RandomForestClassifier(n_estimators=20, random_state=42)
    clf.fit(Xc_train, yc_train)
    rain_prob = clf.predict_proba(Xc_test)[0][1]
    print(f"Chance of rain in {region}: {rain_prob*100:.1f}%")
    print("Rain prediction accuracy:", accuracy_score(yc_test, clf.predict(Xc_test)))
else:
    print(f"Column '{event}' not found for rain probability.")


In [None]:
# Predict next likely weather record for a region (all columns)
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from datetime import datetime, timedelta

# Load data
df = pd.read_csv(
    "weatherHistory.csv",
    encoding="latin1",
    engine="python",
    on_bad_lines="skip"
)

# Add synthetic region if not present
if 'region' not in df.columns:
    indian_cities = [
        'Mumbai', 'Delhi', 'Bangalore', 'Chennai', 'Kolkata',
        'Hyderabad', 'Pune', 'Ahmedabad', 'Jaipur', 'Lucknow'
    ]
    df['region'] = np.random.choice(indian_cities, size=len(df))

region = 'Mumbai'  # Change as needed
df_region = df[df['region'] == region].copy()

# Predict next date
last_date = pd.to_datetime(df_region['Formatted Date']).max()
next_date = last_date + timedelta(days=1)
next_date_str = next_date.strftime('%Y-%m-%d %H:%M:%S')

# Prepare features and targets
numeric_cols = [
    'Temperature (C)', 'Apparent Temperature (C)', 'Humidity',
    'Wind Speed (km/h)', 'Wind Bearing (degrees)', 'Visibility (km)',
    'Loud Cover', 'Pressure (millibars)'
]
cat_cols = ['Summary', 'Precip Type', 'Daily Summary']

# Predict numeric columns using regression
predicted = {}
for col in numeric_cols:
    if col in df_region.columns:
        X = np.arange(len(df_region)).reshape(-1, 1)
        y = df_region[col].values
        model = RandomForestRegressor(n_estimators=20, random_state=42)
        model.fit(X, y)
        predicted[col] = model.predict([[len(df_region)]])[0]
    else:
        predicted[col] = np.nan

# Predict categorical columns using most frequent value
for col in cat_cols:
    if col in df_region.columns and not df_region[col].isnull().all():
        predicted[col] = df_region[col].mode()[0]
    else:
        predicted[col] = ""

# Predict Precip Type as most frequent non-null value
if 'Precip Type' in df_region.columns and not df_region['Precip Type'].isnull().all():
    predicted['Precip Type'] = df_region['Precip Type'].dropna().mode()[0]

# Format output
forecast = {
    "Formatted Date": next_date_str,
    "Summary": predicted['Summary'],
    "Precip Type": predicted['Precip Type'],
    "Temperature (C)": round(predicted['Temperature (C)'], 2),
    "Apparent Temperature (C)": round(predicted['Apparent Temperature (C)'], 2),
    "Humidity": round(predicted['Humidity'], 2),
    "Wind Speed (km/h)": round(predicted['Wind Speed (km/h)'], 2),
    "Wind Bearing (degrees)": round(predicted['Wind Bearing (degrees)'], 2),
    "Visibility (km)": round(predicted['Visibility (km)'], 2),
    "Loud Cover": round(predicted['Loud Cover'], 2),
    "Pressure (millibars)": round(predicted['Pressure (millibars)'], 2),
    "Daily Summary": predicted['Daily Summary']
}

print("Next likely weather record for", region)
for k, v in forecast.items():
    print(f"{k}: {v}")


In [None]:
# Predict weather for user at a specific location (lat/lon)
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from datetime import datetime, timedelta
from geopy.distance import geodesic

# User's location (replace with actual user input)
user_lat = 19.0760   # Example: Mumbai latitude
user_lon = 72.8777   # Example: Mumbai longitude

# City coordinates (add more as needed)
city_coords = {
    'Mumbai': (19.0760, 72.8777),
    'Delhi': (28.7041, 77.1025),
    'Bangalore': (12.9716, 77.5946),
    'Chennai': (13.0827, 80.2707),
    'Kolkata': (22.5726, 88.3639),
    'Hyderabad': (17.3850, 78.4867),
    'Pune': (18.5204, 73.8567),
    'Ahmedabad': (23.0225, 72.5714),
    'Jaipur': (26.9124, 75.7873),
    'Lucknow': (26.8467, 80.9462)
}

# Find nearest city
min_dist = float('inf')
nearest_city = None
for city, coords in city_coords.items():
    dist = geodesic((user_lat, user_lon), coords).km
    if dist < min_dist:
        min_dist = dist
        nearest_city = city

print(f"Nearest city to user: {nearest_city} (distance: {min_dist:.2f} km)")

# Load data
df = pd.read_csv(
    "weatherHistory.csv",
    encoding="latin1",
    engine="python",
    on_bad_lines="skip"
)

# Add synthetic region if not present
if 'region' not in df.columns:
    df['region'] = np.random.choice(list(city_coords.keys()), size=len(df))

df_region = df[df['region'] == nearest_city].copy()

# Predict next date
last_date = pd.to_datetime(df_region['Formatted Date'], utc=True).max()
next_date = last_date + timedelta(days=1)
next_date_str = next_date.strftime('%Y-%m-%d %H:%M:%S')

# Prepare features and targets
numeric_cols = [
    'Temperature (C)', 'Apparent Temperature (C)', 'Humidity',
    'Wind Speed (km/h)', 'Wind Bearing (degrees)', 'Visibility (km)',
    'Loud Cover', 'Pressure (millibars)'
]
cat_cols = ['Summary', 'Precip Type', 'Daily Summary']

# Predict numeric columns using regression
predicted = {}
for col in numeric_cols:
    if col in df_region.columns:
        X = np.arange(len(df_region)).reshape(-1, 1)
        y = df_region[col].values
        model = RandomForestRegressor(n_estimators=20, random_state=42)
        model.fit(X, y)
        predicted[col] = model.predict([[len(df_region)]])[0]
    else:
        predicted[col] = np.nan

# Predict categorical columns using most frequent value
for col in cat_cols:
    if col in df_region.columns and not df_region[col].isnull().all():
        predicted[col] = df_region[col].mode()[0]
    else:
        predicted[col] = ""

# Predict Precip Type as most frequent non-null value
if 'Precip Type' in df_region.columns and not df_region['Precip Type'].isnull().all():
    predicted['Precip Type'] = df_region['Precip Type'].dropna().mode()[0]

# Format output
forecast = {
    "Formatted Date": next_date_str,
    "Summary": predicted['Summary'],
    "Precip Type": predicted['Precip Type'],
    "Temperature (C)": round(predicted['Temperature (C)'], 2),
    "Apparent Temperature (C)": round(predicted['Apparent Temperature (C)'], 2),
    "Humidity": round(predicted['Humidity'], 2),
    "Wind Speed (km/h)": round(predicted['Wind Speed (km/h)'], 2),
    "Wind Bearing (degrees)": round(predicted['Wind Bearing (degrees)'], 2),
    "Visibility (km)": round(predicted['Visibility (km)'], 2),
    "Loud Cover": round(predicted['Loud Cover'], 2),
    "Pressure (millibars)": round(predicted['Pressure (millibars)'], 2),
    "Daily Summary": predicted['Daily Summary']
}

print(f"Weather forecast for user at ({user_lat}, {user_lon}) - {nearest_city}:")
for k, v in forecast.items():
    print(f"{k}: {v}")
