<a href="https://www.kaggle.com/code/shantanurajmane/soil-moisture-prediction?scriptVersionId=165133407" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import numpy as np
import cv2
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import os
import pandas as pd
from joblib import load,dump

from sklearn.metrics import ConfusionMatrixDisplay,accuracy_score,classification_report

In [2]:
def list_data(root_folder="/kaggle/input/soilmoisture/soilmoisture"):
    try:
        classes = os.listdir(root_folder)
        all_dataframes = []

        for soil_class in classes:
            folder_path = os.path.join(root_folder, soil_class)

            # Get the list of files in the folder
            files = os.listdir(folder_path)


            # Create a DataFrame for the current class
            data = {'File': files, 'Class': [soil_class] * len(files)}
            df = pd.DataFrame(data)
            all_dataframes.append(df)

        # Concatenate all DataFrames into one
        result_df = pd.concat(all_dataframes, ignore_index=True)

        return result_df

    except FileNotFoundError:
        print(f"The root folder '{root_folder}' does not exist.")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

# Replace 'your_root_folder' with the actual path of the root folder containing the classes
root_folder = '/kaggle/input/soilmoisture/soilmoisture'
df = list_data(root_folder)

if df is not None:
    print("\nDataFrame:")
    print(df)



DataFrame:
         File Class
0    158.jpeg    50
1    166.jpeg    50
2    155.jpeg    50
3    179.jpeg    50
4    170.jpeg    50
..        ...   ...
267  258.jpeg    80
268  267.jpeg    80
269  247.jpeg    80
270  260.jpeg    80
271  261.jpeg    80

[272 rows x 2 columns]


In [3]:
import pandas as pd
import cv2
import numpy as np

def extract_features(image_paths):
    features = []
    for path in image_paths:
        img = cv2.imread(path)
        if img is not None:
            rgb_median = np.median(img, axis=(0, 1))
            hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
            hsv_median = np.median(hsv_img, axis=(0, 1))

            # Calculate Digital Number (DN)
            DN = 0.2989 * rgb_median[0] + 0.5870 * rgb_median[1] + 0.1140 * rgb_median[2]

            features.append(np.concatenate([rgb_median, hsv_median, [DN]]))  # Add DN to features
        else:
            print(f"Error loading image: {path}")

    return np.array(features)

# Replace 'your_root_folder' with the actual path
root_folder = '/kaggle/input/soilmoisture/soilmoisture'
df = list_data(root_folder) # Assuming `list_data` function is defined elsewhere

if df is not None:
  image_paths = [os.path.join(root_folder, row['Class'], row['File']) for _, row in df.iterrows()]
  features = extract_features(image_paths)

# Create the DataFrame with features and class names (updated for DN)
df_features = pd.DataFrame(features, columns=['feature1', 'feature2', 'feature3', 'feature4', 'feature5', 'feature6', 'DN'])  # Add 'DN' column
df_features['class_name'] = df['Class']

print(df_features)


     feature1  feature2  feature3  feature4  feature5  feature6       DN  \
0        74.0      87.0     105.0      12.0      74.0     106.0  85.1576   
1        63.0      76.0      94.0      13.0      83.0      94.0  74.1587   
2        63.0      75.0      92.0      12.0      77.0      92.0  73.3437   
3        81.0      92.0     108.0      12.0      67.0     108.0  90.5269   
4        79.0      90.0     106.0      12.0      64.0     106.0  88.5271   
..        ...       ...       ...       ...       ...       ...      ...   
267      61.0      76.0      98.0      12.0     100.0      98.0  74.0169   
268      83.0      93.0     110.0      11.0      79.0     111.0  91.9397   
269      83.0      98.0     123.0      11.0      83.0     124.0  96.3567   
270      70.0      86.0     109.0      11.0      97.0     110.0  83.8310   
271      80.0      96.0     120.0      11.0      94.0     121.0  93.9440   

    class_name  
0           50  
1           50  
2           50  
3           50  
4 

In [4]:

def extract_features1(image_paths):
    """Extracts features from images and creates a DataFrame.

    Args:
        image_paths (list): List of paths to image files.

    Returns:
        pandas.DataFrame: DataFrame containing extracted features.
    """

    features = []
    for path in image_paths:
        img = cv2.imread(path)
        if img is not None:
            rgb_median = np.median(img, axis=(0, 1))
            hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
            hsv_median = np.median(hsv_img, axis=(0, 1))

            # Calculate Digital Number (DN)
            DN = 0.2989 * rgb_median[0] + 0.5870 * rgb_median[1] + 0.1140 * rgb_median[2]

            features.append([*rgb_median, *hsv_median, DN])  # Use unpacking for clarity
        else:
            print(f"Error loading image: {path}")

    return pd.DataFrame(features, columns=['R_median', 'G_median', 'B_median', 'H_median', 'S_median', 'V_median', 'DN'])


In [5]:
X = df_features[["feature1", "feature2", "feature3", "feature4", "feature5", "feature6","DN"]]
y = df_features['class_name']

In [6]:

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

# Train a Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)


In [7]:

# Predict and evaluate
predictions = model.predict(X_test)
print(predictions)
print("RMSE:", np.sqrt(mean_squared_error(y_test, predictions)))
print("R²:", r2_score(y_test, predictions))

[27.23180389 65.0323992  22.86949158 25.22710294 27.78037563 26.00704584
 24.29097688 13.16332499 52.67043681 44.36695507 22.61296756 45.45789916
 39.68910795 28.48319198 99.33466462 17.27286429 36.19970919 25.70867015
 68.59301123 22.31419731 32.70325964 63.65792994 35.39691902 23.567574
 22.43886509 34.70992033 43.5494341  24.60059539 65.96112412 38.59581063
 58.03707657 35.59731429 20.29803831 41.22411549 50.60352866 42.25149336
 65.86205382 21.38128485 62.45688454 76.5514471  36.23884943 20.78450501
 46.93113443 41.3675522  29.08568418 31.52467142  4.7634999  22.29727059
 70.82841647 89.93077112 41.40860732 78.78243541 34.48611342 47.02110959
 66.77251357]
RMSE: 16.037662018799352
R²: 0.5714639930802771


In [8]:
print(X_test, y_test)

     feature1  feature2  feature3  feature4  feature5  feature6        DN
30       76.0      85.0      98.0      12.0      58.0      98.0   83.7834
116      66.0      82.0     104.0      12.0      94.0     104.0   79.7174
79       83.0      91.0     108.0      10.0      61.0     108.0   90.5377
127      91.0     100.0     116.0      10.0      58.0     116.0   99.1239
196      78.0      87.0     100.0      11.0      53.0     100.0   85.7832
137     118.0     132.0     156.0      11.0      65.0     157.0  130.5382
209      73.0      81.0      94.0      11.0      52.0      94.0   80.0827
45       67.0      73.0      85.0      11.0      58.0      86.0   72.5673
158      79.0      95.0     114.0      13.0      86.0     115.0   92.3741
247      69.0      80.0      97.0      11.0     102.0      98.0   78.6421
183      70.0      77.0      89.0      10.0      47.0      89.0   76.2680
269      83.0      98.0     123.0      11.0      83.0     124.0   96.3567
227      59.0      69.0      85.0     

In [9]:
dump(model, 'soilmoisture.pkl')

['soilmoisture.pkl']