In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, cross_val_score

# Read the combined dataset from disk
data_path = 'Total.csv'
data = pd.read_csv(data_path)

# Predictor columns (model inputs)
features = ['WVR', 'CHLA_RESULT', 'PIP_PT', 'TIP_PT', 'EVI']

# Response columns; one model is fitted per entry
targets = [
    'AMMONIA_N_RESULT',
    'CHLORIDE_RESULT',
    'COND_RESULT',
    'DOC_RESULT',
    'NITRATE_NITRITE_N_RESULT',
    'NTL_RESULT',
    'PH_RESULT',
    'PTL_RESULT',
    'SULFATE_RESULT',
    'TKN_RESULT',
    'TURB_RESULT',
]

# Keep only rows that are complete across every predictor and response column,
# so that all targets are modelled on exactly the same set of rows
filtered_data = data.dropna(subset=[*features, *targets])

# Convert to NumPy arrays: X has one column per feature, y one column per target
X = filtered_data[features].values
y = filtered_data[targets].values

# Cross-validation splitter. With an integer random_state every split() call
# yields the same shuffled folds, so one instance is shared by all targets.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Fit and evaluate one model per target
for target_idx, target in enumerate(targets):
    print(f"Training model for {target}...\n")

    # Column of y holding the current target
    y_target = y[:, target_idx]

    # Create a pipeline with scaling, polynomial features, and Ridge regression model
    pipeline = Pipeline([
        ('scaler', StandardScaler()),  # Standardize the features
        ('poly', PolynomialFeatures(degree=3, include_bias=False)),  # Polynomial features (degree 3)
        ('model', Ridge(alpha=1.0))  # Ridge regression for regularization
    ])

    # Out-of-sample R^2 per fold. cross_val_score fits clones of the pipeline,
    # so the scaler and polynomial expansion are re-fitted inside each training fold.
    cv_scores = cross_val_score(pipeline, X, y_target, cv=kf, scoring='r2')

    # Refit on the whole dataset and predict the same rows (in-sample fit)
    pipeline.fit(X, y_target)
    y_pred = pipeline.predict(X)

    # In-sample error metrics: these are computed on the training rows and are
    # therefore optimistic compared with the cross-validated score below.
    mse = mean_squared_error(y_target, y_pred)
    r2 = r2_score(y_target, y_pred)

    # Display results for the current target
    print(f"Results for {target}:")
    print(f"  Mean Squared Error (MSE): {mse:.4f}")
    print(f"  R²: {r2:.4f}")
    # Previously the CV scores were computed but never shown
    print(f"  Cross-validated R² (5-fold): {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
    print("\nModel trained for", target)


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import scipy.stats as stats
from matplotlib.backends.backend_pdf import PdfPages

# Read the combined dataset from disk
data_path = 'Total.csv'
data = pd.read_csv(data_path)

# Predictor columns (model inputs)
features = ['WVR', 'CHLA_RESULT', 'PIP_PT', 'TIP_PT', 'EVI']

# Response columns; one model and one plot is produced per entry
targets = [
    'AMMONIA_N_RESULT',
    'CHLORIDE_RESULT',
    'COND_RESULT',
    'DOC_RESULT',
    'NITRATE_NITRITE_N_RESULT',
    'NTL_RESULT',
    'PH_RESULT',
    'PTL_RESULT',
    'SULFATE_RESULT',
    'TKN_RESULT',
    'TURB_RESULT',
]

# Keep only rows that are complete across every predictor and response column,
# so that all targets are modelled on exactly the same set of rows
filtered_data = data.dropna(subset=[*features, *targets])

# Convert to NumPy arrays: X has one column per feature, y one column per target
X = filtered_data[features].values
y = filtered_data[targets].values

def _overall_f_test(y_true, y_pred, n_terms):
    """Return the overall regression F-statistic and its p-value.

    Parameters
    ----------
    y_true : np.ndarray
        Observed target values (1-D).
    y_pred : np.ndarray
        Fitted values from the model, same length as ``y_true``.
    n_terms : int
        Number of regression terms in the fitted model, excluding the intercept.

    Returns
    -------
    tuple of float
        ``(f_stat, p_value)``. Both are NaN when no residual degrees of
        freedom remain (``len(y_true) - n_terms - 1 <= 0``).
    """
    df_resid = y_true.shape[0] - n_terms - 1
    if df_resid <= 0:
        # More terms than observations: the F distribution is undefined here
        return np.nan, np.nan

    rss = np.sum((y_true - y_pred) ** 2)            # residual sum of squares
    tss = np.sum((y_true - np.mean(y_true)) ** 2)   # total sum of squares
    f_stat = ((tss - rss) / n_terms) / (rss / df_resid)
    # sf() is the upper-tail probability; it keeps precision for tiny p-values
    # where 1 - cdf() would round to exactly 0.
    p_value = stats.f.sf(f_stat, n_terms, df_resid)
    return f_stat, p_value


# Create a PDF to save the plots
pdf_filename = 'model_plots.pdf'

# Cross-validation splitter. With an integer random_state every split() call
# yields the same shuffled folds, so one instance is shared by all targets.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

with PdfPages(pdf_filename) as pdf:

    # Fit, evaluate and plot one model per target
    for target_idx, target in enumerate(targets):
        print(f"Training model for {target}...\n")

        # Column of y holding the current target
        y_target = y[:, target_idx]

        # Create a pipeline with scaling, polynomial features, and Ridge regression model
        pipeline = Pipeline([
            ('scaler', StandardScaler()),  # Standardize the features
            ('poly', PolynomialFeatures(degree=3, include_bias=False)),  # Polynomial features (degree 3)
            ('model', Ridge(alpha=1.0))  # Ridge regression for regularization
        ])

        # Out-of-sample R² per fold; fit() re-fits every pipeline step on the
        # training fold, so nothing leaks from the held-out rows.
        fold_r2_scores = []
        for train_idx, test_idx in kf.split(X):
            pipeline.fit(X[train_idx], y_target[train_idx])
            fold_pred = pipeline.predict(X[test_idx])
            fold_r2_scores.append(r2_score(y_target[test_idx], fold_pred))

        # Refit on the whole dataset and predict the same rows (in-sample fit)
        pipeline.fit(X, y_target)
        y_pred = pipeline.predict(X)

        # In-sample error metrics (optimistic compared with the CV score)
        mse = mean_squared_error(y_target, y_pred)
        r2 = r2_score(y_target, y_pred)

        # Overall F-test. The number of regression terms is the size of the
        # polynomial expansion actually fed to Ridge, not the raw feature count.
        # NOTE(review): Ridge shrinkage makes the effective degrees of freedom
        # smaller than the term count, so treat this p-value as approximate.
        n_terms = pipeline.named_steps['poly'].n_output_features_
        f_stat, p_value = _overall_f_test(y_target, y_pred, n_terms)

        # Create a new figure for the plot
        fig, ax = plt.subplots(figsize=(12, 6))

        # Plot actual vs predicted
        ax.scatter(y_target, y_pred, color='blue', alpha=0.6, label='Actual vs Predicted')

        # Plot a line for perfect predictions (where predicted = actual)
        lo, hi = y_target.min(), y_target.max()
        ax.plot([lo, hi], [lo, hi], color='red', linestyle='--', label='Perfect Prediction')

        # Display the R² value and p-value on the plot
        ax.text(0.1, 0.9, f'R² = {r2:.2f}', transform=ax.transAxes, fontsize=12, verticalalignment='top')

        # Very small p-values are shown as a bound rather than as 0.0000
        if p_value < 0.001:
            p_value_text = 'P-value < 0.001'
        else:
            p_value_text = f'P-value = {p_value:.4f}'

        ax.text(0.1, 0.85, p_value_text, transform=ax.transAxes, fontsize=12, verticalalignment='top')

        # Add labels and title
        ax.set_xlabel('Actual Values')
        ax.set_ylabel('Predicted Values')
        ax.set_title(f'{target} - Actual vs Predicted')

        # Add legend
        ax.legend()

        # Save the plot to the PDF and release the figure's memory
        pdf.savefig(fig)
        plt.close(fig)

        # Print out results
        print(f"Results for {target}:")
        print(f"  R² on full dataset: {r2:.4f}")
        # Previously the MSE and the per-fold scores were computed but never shown
        print(f"  MSE on full dataset: {mse:.4f}")
        print(f"  Mean cross-validated R² (5-fold): {np.mean(fold_r2_scores):.4f}")

print(f"All plots have been saved to {pdf_filename}.")


In [None]:
import pandas as pd
import folium

# Load the CSV data
file_path = 'NWCA2011S.csv'  # Replace with the path to your CSV file
data = pd.read_csv(file_path)

# Keep only sites that have both coordinates. folium raises ValueError for a
# NaN location, so a single incomplete row would otherwise abort the whole map.
coords = data[['LAT_ANALYS', 'LON_ANALYS']].dropna()
latitude = coords['LAT_ANALYS']
longitude = coords['LON_ANALYS']

# Initialize the map centered on the U.S. (around latitude 37.0902, longitude -95.7129)
us_map = folium.Map(location=[37.0902, -95.7129], zoom_start=5)

# Add each site to the map as a small red dot
for lat, lon in zip(latitude, longitude):
    folium.CircleMarker(
        location=[lat, lon],
        radius=2,  # Small radius for the dot
        color='red',  # Outline color
        fill=True,
        fill_color='red',  # Fill color
        fill_opacity=1.0
    ).add_to(us_map)

# Save the map to an HTML file
us_map.save('us_map_2011.html')

# Display the map (must stay the last expression of the cell to be rendered)
us_map


In [None]:
import pandas as pd
import folium

# Load the CSV data
file_path = 'NWCA2016S.csv'  # Replace with the path to your CSV file
data = pd.read_csv(file_path)

# Keep only sites that have both coordinates. folium raises ValueError for a
# NaN location, so a single incomplete row would otherwise abort the whole map.
coords = data[['LAT_ANALYS', 'LON_ANALYS']].dropna()
latitude = coords['LAT_ANALYS']
longitude = coords['LON_ANALYS']

# Initialize the map centered on the U.S. (around latitude 37.0902, longitude -95.7129)
us_map = folium.Map(location=[37.0902, -95.7129], zoom_start=5)

# Add each site to the map as a small red dot
for lat, lon in zip(latitude, longitude):
    folium.CircleMarker(
        location=[lat, lon],
        radius=2,  # Small radius for the dot
        color='red',  # Outline color
        fill=True,
        fill_color='red',  # Fill color
        fill_opacity=1.0
    ).add_to(us_map)

# Save the map to an HTML file
us_map.save('us_map_2016.html')

# Display the map (must stay the last expression of the cell to be rendered)
us_map


In [None]:
import pandas as pd
import folium

# Load the CSV data
file_path = 'NWCA2021S.csv'  # Replace with the path to your CSV file
data = pd.read_csv(file_path)

# Keep only sites that have both coordinates. folium raises ValueError for a
# NaN location, so a single incomplete row would otherwise abort the whole map.
coords = data[['LAT_ANALYS', 'LON_ANALYS']].dropna()
latitude = coords['LAT_ANALYS']
longitude = coords['LON_ANALYS']

# Initialize the map centered on the U.S. (around latitude 37.0902, longitude -95.7129)
us_map = folium.Map(location=[37.0902, -95.7129], zoom_start=5)

# Add each site to the map as a small red dot
for lat, lon in zip(latitude, longitude):
    folium.CircleMarker(
        location=[lat, lon],
        radius=2,  # Small radius for the dot
        color='red',  # Outline color
        fill=True,
        fill_color='red',  # Fill color
        fill_opacity=1.0
    ).add_to(us_map)

# Save the map to an HTML file
us_map.save('us_map_2021.html')

# Display the map (must stay the last expression of the cell to be rendered)
us_map
