In [None]:
# Support Vector Machine (SVR) model

In [None]:
# Import necessary libraries
import os

import joblib
import matplotlib.pyplot as plt
import mysql.connector
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, cross_val_score, learning_curve, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

In [4]:
# Load dataset
dataset = pd.read_csv("SoilMoistureData.csv")

# Display basic information about the dataset.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
dataset.info()

# Display summary statistics
print(dataset.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20166 entries, 0 to 20165
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   pressure              20166 non-null  float64
 1   particulate matter1   20166 non-null  float64
 2   particulate matter2   20166 non-null  float64
 3   particulate matter3   20166 non-null  float64
 4   atmospheric moisture  20166 non-null  float64
 5   moisture              20166 non-null  int64  
 6   luminosity            20166 non-null  int64  
 7   temperature           20166 non-null  float64
 8   humidity              20166 non-null  float64
dtypes: float64(7), int64(2)
memory usage: 1.4 MB
None
           pressure  particulate matter1  particulate matter2  \
count  20166.000000         20166.000000         20166.000000   
mean   93154.619405             1.585495             2.581223   
std      290.963485             1.045641             1.288723   
min    92352.450

In [None]:
# Connect to MySQL database
# Connection settings are read from environment variables so credentials do not
# have to be stored in the notebook; the fallbacks are the previous hard-coded
# local-development values, so behaviour is unchanged when nothing is set.
conn = mysql.connector.connect(
    host=os.environ.get("MYSQL_HOST", "localhost"),
    user=os.environ.get("MYSQL_USER", "root"),
    password=os.environ.get("MYSQL_PASSWORD", ""),
    database=os.environ.get("MYSQL_DATABASE", "data"),
)

In [None]:
# Read the temperature, moisture and humidity columns of the `soilmoisture`
# table into a DataFrame through the open MySQL connection
query = "SELECT temperature, moisture, humidity FROM soilmoisture"
dataset_db = pd.read_sql(sql=query, con=conn)

In [None]:
# Put the database columns in a fixed (temperature, humidity, moisture) order.
# The query returns the columns as (temperature, moisture, humidity) and they
# already carry the right names, so they are re-ordered by name here. Assigning
# a new name list positionally would label the moisture readings as humidity
# and the humidity readings as moisture.
dataset_db = dataset_db[['temperature', 'humidity', 'moisture']]

In [None]:
# Concatenate datasets
# The database frame only holds a subset of the CSV columns. A default (outer)
# concat would fill every other column of the database rows with NaN, and those
# rows would later make the SVR fit fail. join="inner" keeps only the columns
# present in both sources. CSV headers are stripped of surrounding whitespace
# first so that the column names can match.
combined_dataset = pd.concat(
    [dataset.rename(columns=str.strip), dataset_db],
    join="inner",
    ignore_index=True,
)

In [None]:
# Normalise the headers: drop any whitespace around each column name
combined_dataset.columns = [name.strip() for name in combined_dataset.columns]

In [None]:

# Print summary statistics of the combined (CSV + database) dataset
print(combined_dataset.describe())

In [None]:
# Z-score based outlier detection
def detect_outliers_zscore(data, threshold=3):
    """Return a boolean mask marking values whose absolute z-score exceeds `threshold`.

    The z-score is computed per column as (value - column mean) / column std,
    using the pandas defaults for `mean()` and `std()`.

    Parameters:
        data: pandas DataFrame (or Series) of numeric values.
        threshold: cut-off for the absolute z-score; defaults to 3.

    Returns:
        An object of the same shape as `data` holding True where the value is
        considered an outlier.
    """
    centered = data - data.mean()
    abs_z = (centered / data.std()).abs()
    return abs_z > threshold

# Flag outlying feature values; the target column is left out of the check
outliers = detect_outliers_zscore(combined_dataset.drop('moisture', axis=1))

# Keep only the rows in which no feature was flagged
dataset_no_outliers = combined_dataset.loc[~outliers.any(axis=1)]


In [None]:
# Split the cleaned frame into the target column and the feature matrix
y = dataset_no_outliers['moisture']  # Target variable
X = dataset_no_outliers.drop('moisture', axis=1)  # Features

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Standardise the features: the scaler learns its statistics from the training
# split only and the same transform is then applied to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Cast the training target to floating point
y_train_float = y_train.astype('float64')


In [None]:
# Define parameter grid for GridSearchCV
# A list of grids is used so that 'gamma' is only searched for the kernels that
# use it. The linear kernel ignores gamma, so crossing it with every gamma
# value would only repeat identical (and slow) fits.
param_grid = [
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10],  # Regularization parameter
    },
    {
        'kernel': ['rbf', 'poly'],  # Add more kernels if needed
        'C': [0.1, 1, 10],  # Regularization parameter
        'gamma': ['scale', 'auto'],  # Kernel coefficient for 'rbf' and 'poly' kernels
    },
]

In [None]:
# Initialize SVR model
# No arguments are passed, so the estimator starts from scikit-learn's default
# SVR settings; this instance is the base estimator given to GridSearchCV.
svm_model = SVR()

In [None]:
# Set up a 5-fold grid search over `param_grid`, ranking candidates by
# negative mean squared error
grid_search = GridSearchCV(
    estimator=svm_model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)

In [None]:
# Run the search on the scaled training features and the float target
grid_search.fit(X=X_train_scaled, y=y_train_float)

In [None]:
# Pull the winning estimator and its hyper-parameters out of the finished search
best_svm_model = grid_search.best_estimator_
best_params = grid_search.best_params_

In [None]:
# Report the hyper-parameters selected by the grid search
print(f"Best Parameters: {best_params}")


In [None]:
# Predict the held-out test rows with the tuned estimator
y_pred_best = best_svm_model.predict(X=X_test_scaled)


In [None]:
# Evaluate the best model on the held-out test set
mse_best = mean_squared_error(y_test, y_pred_best)
# RMSE is derived from the MSE directly. The `squared=False` argument of
# mean_squared_error is deprecated and has been removed from recent
# scikit-learn releases, where it raises a TypeError.
rmse_best = mse_best ** 0.5
r2_best = r2_score(y_test, y_pred_best)

# Print evaluation metrics
print("Best Model - Mean Squared Error (MSE):", mse_best)
print("Best Model - Root Mean Squared Error (RMSE):", rmse_best)
print("Best Model - R-squared (R2) Score:", r2_best)


In [None]:
# Cross-validation for model evaluation
# The tuned estimator is cross-validated (cross_val_score clones it for every
# fold) so the reported error belongs to the model that is evaluated and saved,
# not to an SVR with default settings.
cv_scores = cross_val_score(
    best_svm_model,
    X_train_scaled,
    y_train_float,
    cv=5,
    scoring='neg_mean_squared_error',
)
# The scorer returns negated MSE values, so the sign is flipped for reporting
print("Cross-Validation Mean Squared Error:", -cv_scores.mean())

In [None]:

# Save the trained model
joblib.dump(best_svm_model, 'svm_model.pkl')
# The model was fitted on standardised features, so the fitted scaler is saved
# alongside it; without it new inputs cannot be transformed the same way.
joblib.dump(scaler, 'svm_scaler.pkl')

In [None]:
# Scatter of actual against predicted soil moisture, with a dashed diagonal
# spanning the range of the actual values as the "perfect prediction" reference
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred_best, color='blue')
diagonal = [min(y_test), max(y_test)]
ax.plot(diagonal, diagonal, 'k--', lw=2)
ax.set_xlabel('Actual Soil Moisture')
ax.set_ylabel('Predicted Soil Moisture')
ax.set_title('Actual vs. Predicted Soil Moisture')
plt.show()



In [None]:
# Histogram (30 bins) of the soil moisture target values
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(y, bins=30, color='green', alpha=0.7)
ax.set_xlabel('Soil Moisture')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Soil Moisture Values')
plt.show()

In [None]:
# Residuals (predicted minus actual) plotted against the predictions, with a
# horizontal zero line across the predicted range
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_pred_best, y_pred_best - y_test, color='red')
ax.hlines(y=0, xmin=min(y_pred_best), xmax=max(y_pred_best), colors='black', lw=2)
ax.set_xlabel('Predicted Soil Moisture')
ax.set_ylabel('Residuals')
ax.set_title('Residual Plot')
plt.show()

In [None]:

# Feature Distribution Plots
# The subplot grid is sized from the number of feature columns (three plots
# per row) instead of a fixed 3x3 layout, which would raise an error as soon
# as there are more than nine features.
n_plot_cols = 3
n_plot_rows = max(1, -(-len(X.columns) // n_plot_cols))  # ceiling division
plt.figure(figsize=(12, 8))
for i, col in enumerate(X.columns):
    plt.subplot(n_plot_rows, n_plot_cols, i + 1)
    plt.hist(X[col], bins=30, color='blue', alpha=0.7)
    plt.title(col)
plt.tight_layout()
plt.show()


In [None]:
# Learning Curve
# `learning_curve` and `numpy` come from the import cell at the top of the
# notebook, so this cell also works after Restart & Run All. The curve is
# computed for the tuned estimator (it is cloned for every fit) so that it
# describes the model that is actually evaluated and saved. No `scoring` is
# given, so the estimator's own score method is used.
train_sizes, train_scores, test_scores = learning_curve(best_svm_model, X_train_scaled, y_train_float, cv=5)

# Mean and spread of the scores across the CV folds for every training-set size
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.figure(figsize=(10, 6))
# Shaded bands show +/- one standard deviation around each mean curve
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.1,
                 color="r")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.1, color="g")
plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
plt.xlabel("Training examples")
plt.ylabel("Score")
plt.title("Learning Curve")
plt.legend(loc="best")
plt.show()


In [None]:
# Feature Importance Plot
# The fitted, tuned estimator is inspected here. `svm_model` itself is never
# fitted (GridSearchCV works on clones), so it keeps the default kernel and has
# no `coef_` attribute. Coefficients only exist for the linear kernel.
if best_svm_model.kernel == 'linear':
    feature_importance = pd.Series(best_svm_model.coef_.flatten(), index=X.columns)
    feature_importance.plot(kind='bar', figsize=(10, 6))
    plt.title('Feature Importance')
    plt.xlabel('Feature')
    plt.ylabel('Coefficient')
    plt.show()
else:
    print(f"Feature coefficients are only available for the linear kernel "
          f"(best kernel: {best_svm_model.kernel}).")


In [None]:
# Close the database connection
# Releases the MySQL connection created with mysql.connector.connect(); in this
# notebook it is only used by the pd.read_sql query.
conn.close()

In [None]:
# Random Forest Regressor model

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import joblib

In [None]:
# Load the soil-moisture dataset from the CSV file
data = pd.read_csv('SoilMoistureData.csv')

# Display basic information about the dataset.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
data.info()

# Display summary statistics
print(data.describe())

In [None]:
# Preview the top rows of the loaded frame under a short heading
print("First few rows of the dataset:", data.head(), sep="\n")

In [None]:
# Count the missing entries per column
print("\nMissing values in each column:", data.isna().sum(), sep="\n")

In [None]:
# Impute missing values (if any) with the mean of their column.
# A simple strategy, chosen for brevity.
data = data.fillna(data.mean())

In [None]:
# Summary statistics of the dataset
print("\nSummary statistics of the dataset:")
print(data.describe())
# DataFrame.info() prints its own report and returns None, so it is not
# wrapped in print() (that would emit an extra "None" line).
data.info()

In [None]:
# One 50-bin histogram per column of the dataset
print("\nData distribution for each feature:")
data.hist(figsize=(20, 15), bins=50)
plt.show()

In [None]:
# Pairwise scatter matrix of all columns (seaborn pairplot)
print("\nPairplot of the dataset:")
sns.pairplot(data=data)
plt.show()

In [None]:
# Pairwise correlations between all columns, printed under a heading
corr_matrix = data.corr()
print("\nCorrelation matrix:", corr_matrix, sep="\n")

In [None]:
# Annotated heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Matrix Heatmap")
plt.show()

In [None]:
# One scatter figure per feature, plotting it against the target column
target = 'moisture'
for column in data.columns:
    if column == target:
        # The target is not plotted against itself
        continue
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(data[column], data[target])
    ax.set_xlabel(column)
    ax.set_ylabel(target)
    ax.set_title(f'{column} vs {target}')
    plt.show()

In [None]:
# 'moisture' is the target column; every other column is used as a feature
y = data['moisture']  # Target variable
X = data.drop(columns=['moisture'])  # Features

In [None]:
# Fit a StandardScaler on the feature matrix and produce the scaled copy
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Split the data into training and testing sets
# The split is done on the unscaled features so that the scaler can be fitted
# on the training rows only; fitting it on the full dataset would leak
# test-set statistics into training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split and apply the same transform to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Initialize the Random Forest model
rf_model = RandomForestRegressor(random_state=42)

# Tune the forest with a small grid search and keep the best estimator.
# `best_rf_model` is what the prediction, feature-importance and saving steps
# use, so it has to be defined here for the notebook to run from a fresh kernel.
# The grid values are a modest starting point and can be widened as needed.
rf_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
}
rf_grid_search = GridSearchCV(
    rf_model,
    rf_param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,  # use all available cores for the search
)
rf_grid_search.fit(X_train, y_train)
best_rf_model = rf_grid_search.best_estimator_
print("Best Parameters:", rf_grid_search.best_params_)

In [None]:
# Predict the held-out test rows with the tuned forest
y_pred = best_rf_model.predict(X=X_test)

In [None]:
# Score the predictions: R^2, plus RMSE derived from the mean squared error
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("\nModel evaluation results with optimized parameters:")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R^2 Score: {r2}")

In [None]:
# Importance of every feature according to the forest, plus the feature
# positions ordered from most to least important
importances = best_rf_model.feature_importances_
ascending_order = np.argsort(importances)
indices = ascending_order[::-1]

In [None]:
# List the features from most to least important with their importance value
print("\nFeature ranking:")
for rank, feature_idx in enumerate(indices[:X_train.shape[1]], start=1):
    print(f"{rank}. feature {X.columns[feature_idx]} ({importances[feature_idx]})")

In [None]:
# Bar chart of the feature importances, in descending order
n_features = X_train.shape[1]
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title("Feature Importances")
ax.bar(range(n_features), importances[indices], align="center")
ax.set_xticks(range(n_features))
ax.set_xticklabels(X.columns[indices], rotation=90)
ax.set_xlim([-1, n_features])
plt.show()

In [None]:
# Persist the tuned forest and the fitted scaler with joblib
for artifact, path in (
    (best_rf_model, 'optimized_soil_moisture_model.pkl'),
    (scaler, 'scaler.pkl'),
):
    joblib.dump(artifact, path)

print("\nModel and scaler saved successfully!")