# Integrating Big Data Analytics and Convolutional Neural Networks for Pest and Disease Detection and Classification

In [1]:
#Install the necessary libraries
#!pip install torch
#!pip install torchvision
#!pip install opencv-python

In [2]:
#Import all the necessary libraries
import warnings
warnings.filterwarnings("ignore")
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
import torch.nn.functional as F
import torchvision
from torchvision import transforms
from PIL import Image
from pyspark.sql import SparkSession
from pyspark.ml.image import ImageSchema
from pyspark.sql.functions import lit
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
import os
from PIL import Image
import numpy as np
import shutil

In [3]:
#Create a spark session

# Initialize Spark session
spark = SparkSession.builder.appName("Load Image Data").getOrCreate()
warnings.filterwarnings("ignore")
spark

24/03/26 12:59:11 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [14]:
import warnings
import matplotlib.pyplot as plt
import cv2

# Suppress warnings
warnings.filterwarnings("ignore")

# Load image data.
# The images sit in one sub-folder per class, so recursive lookup is enabled;
# without it Spark only reads files placed directly in the given directory.
image_df = (
    spark.read.format("image")
    .option("recursiveFileLookup", "true")
    .load("hdfs://localhost:9000/crop_pest_disease_dataset/Test")
)

# Collect 2 images as Pandas DataFrame
images_pd = image_df.limit(2).toPandas()

# Display the images.
# The pandas frame has a single "image" column holding the Spark image struct
# (origin, height, width, nChannels, mode, data). Pixels are rebuilt from that
# struct because cv2.imread cannot open hdfs:// URIs.
for _, row in images_pd.iterrows():
    img = row["image"]
    if img is None or img["height"] <= 0:
        # Spark marks files it could not decode with a height/width of -1.
        continue
    pixels = np.frombuffer(img["data"], dtype=np.uint8).reshape(
        img["height"], img["width"], img["nChannels"]
    )
    if img["nChannels"] == 1:
        plt.imshow(pixels[:, :, 0], cmap="gray")
    else:
        # Spark stores colour channels as BGR(A); matplotlib expects RGB(A).
        rgb_order = [2, 1, 0] + list(range(3, img["nChannels"]))
        plt.imshow(pixels[:, :, rgb_order])
    plt.axis('off')
    plt.show()

In [9]:
# Images are stored in one sub-folder per class. Recursive lookup is required
# for Spark to pick them up; the plain load() only reads files that sit
# directly in the directory.
image_df = (
    spark.read.format("image")
    .option("recursiveFileLookup", "true")
    .load("hdfs://localhost:9000/crop_pest_disease_dataset/Test")
)
warnings.filterwarnings("ignore")

In [10]:
image_df.printSchema()
image_df.show(2)

root
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- height: integer (nullable = true)
 |    |-- width: integer (nullable = true)
 |    |-- nChannels: integer (nullable = true)
 |    |-- mode: integer (nullable = true)
 |    |-- data: binary (nullable = true)

+-----+
|image|
+-----+
+-----+



In [8]:
EPOCHS = 10
IMAGE_SIZE = (128, 128)
INPUT_SHAPE = (128, 128, 3)
BATCH_SIZE = 64
LEARNING_RATE = 0.001
MAX_IMAGES_PER_CATEGORY = 5000
CLASS_NAMES = ['Cashew', 'Cassava', 'Maize', 'Tomato']

### Get the plant and disease category from the file path

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def get_disease_category(file_path):
    """Return the class-folder name embedded in an image URI.

    The category is the sixth ``/``-separated component (index 5) of the URI,
    e.g. ``"Cashew anthracnose"`` in
    ``hdfs://localhost:9000/crop_pest_disease_dataset/Test/Cashew anthracnose/x.jpg``.

    Returns:
        The category string, or ``None`` when ``file_path`` is missing or has
        fewer than six components. Returning ``None`` gives a null column
        value instead of failing the whole Spark job on one bad row.
    """
    if not file_path:
        return None
    parts = file_path.split("/")
    return parts[5] if len(parts) > 5 else None


def get_plant_category(file_path):
    """Return the plant name: the first space-separated word of the category.

    Returns ``None`` when no disease category can be extracted from
    ``file_path``.
    """
    disease_category = get_disease_category(file_path)
    if disease_category is None:
        return None
    return disease_category.split(" ")[0]

# Wrap the two path-parsing helpers as string-returning Spark UDFs
get_disease_category_udf = udf(get_disease_category, StringType())
get_plant_category_udf = udf(get_plant_category, StringType())

# Derive both category columns from each image's origin URI
dataset_path_with_categories = (
    image_df
    .withColumn("disease_category", get_disease_category_udf(image_df["image.origin"]))
    .withColumn("plant_category", get_plant_category_udf(image_df["image.origin"]))
)

# Inspect the resulting schema and one sample row
dataset_path_with_categories.printSchema()
dataset_path_with_categories.show(1)


## Convert the dataset of images to tensors

In [None]:
#!pip install tensorflow

In [None]:
#import tensorflow as tf

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

# Define the image processing function
def process_images(images, image_size=(224, 224)):
    """Load, normalise and resize one or more JPEG files.

    Args:
        images: A single file path or an iterable of file paths. A bare
            string is treated as ONE path: a Spark UDF applied to a string
            column passes exactly that, and iterating it would try to open
            one file per character.
        image_size: Target ``(height, width)`` handed to ``tf.image.resize``.

    Returns:
        A list with one nested ``height x width x channels`` list of floats
        per input path; ``[]`` when ``images`` is ``None`` or empty.
    """
    if not images:
        return []
    if isinstance(images, (str, bytes)):
        images = [images]

    def process_single_image(filename):
        # Imported lazily: the notebook-level TensorFlow import is commented
        # out, and a function-scope import also resolves on Spark executors.
        import tensorflow as tf

        img = tf.io.read_file(filename)
        img = tf.io.decode_jpeg(img)
        # Scale uint8 [0, 255] to float32 [0, 1] BEFORE resizing. resize()
        # already returns float32, so converting afterwards changes nothing
        # and leaves the pixels in the 0-255 range.
        img = tf.image.convert_image_dtype(img, tf.float32)
        img = tf.image.resize(img, image_size)
        return img.numpy().tolist()

    return [process_single_image(filename) for filename in images]

# Define UDF for image processing.
# process_images returns a list of images, each a nested
# height x width x channels list of numbers, so the declared return type is a
# four-level array of floats rather than an array of strings.
from pyspark.sql.types import FloatType

process_images_udf = udf(process_images, ArrayType(ArrayType(ArrayType(ArrayType(FloatType())))))

# Apply UDF to DataFrame
processed_images_df = dataset_path_with_categories.withColumn("processed_images", process_images_udf(dataset_path_with_categories["image.origin"]))

# Show the schema and one row. Cell values are left truncated on purpose:
# every row carries a full pixel tensor, which would flood the output.
processed_images_df.printSchema()
processed_images_df.show(1)


In [None]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, BooleanType, FloatType

# Define the image processing function
def process_images(images, image_size=(224, 224)):
    """Load, normalise and resize one or more JPEG files.

    ``images`` may be a single path (what a UDF on a string column receives)
    or an iterable of paths. Returns one nested height x width x channels list
    of floats per path, or ``[]`` for ``None``/empty input.
    """
    if not images:
        return []
    if isinstance(images, (str, bytes)):
        # A bare string is one path; iterating it would yield characters.
        images = [images]

    def process_single_image(filename):
        # Lazy import: the notebook-level TensorFlow import is commented out.
        import tensorflow as tf

        img = tf.io.read_file(filename)
        img = tf.io.decode_jpeg(img)
        # Convert to float32 in [0, 1] before resizing; resize() returns
        # float32, so converting afterwards would not rescale the pixels.
        img = tf.image.convert_image_dtype(img, tf.float32)
        img = tf.image.resize(img, image_size)
        return img.numpy().tolist()

    return [process_single_image(filename) for filename in images]

# Define UDF for image processing (list of height x width x channels floats)
process_images_udf = udf(process_images, ArrayType(ArrayType(ArrayType(ArrayType(FloatType())))))

# Apply UDF to DataFrame
processed_images_df = dataset_path_with_categories.withColumn("processed_images", process_images_udf(dataset_path_with_categories["image.origin"]))

# Define a UDF to check if an image is valid
def is_valid_image(img):
    """Return True when ``img`` holds usable image data.

    Raw encoded bytes are checked by decoding them as JPEG. Already-decoded
    pixel data (the nested lists produced by ``process_images``) is checked
    for being a non-empty, rectangular numeric array.
    """
    if img is None:
        return False
    if isinstance(img, (bytes, bytearray)):
        import tensorflow as tf
        try:
            tf.io.decode_jpeg(bytes(img))
        except (tf.errors.InvalidArgumentError, ValueError):
            return False
        return True
    try:
        return np.asarray(img, dtype=np.float32).size > 0
    except (TypeError, ValueError):
        return False

# Define UDF for image validity; without an explicit BooleanType the column
# would be typed as a string.
is_valid_image_udf = udf(is_valid_image, BooleanType())

# Add a new column indicating whether each image is valid or not
processed_images_df = processed_images_df.withColumn("is_valid_image", is_valid_image_udf(processed_images_df["processed_images"]))

# Show the schema and one (truncated) row with its validity flag
processed_images_df.printSchema()
processed_images_df.show(1)


In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType, BooleanType

# Define the image processing function
def process_images(images, image_size=(224, 224)):
    """Load, normalise and resize one or more JPEG files.

    Args:
        images: A single file path or an iterable of file paths. A bare
            string counts as one path, because that is what a Spark UDF on a
            string column passes in.
        image_size: Target ``(height, width)`` for ``tf.image.resize``.

    Returns:
        One nested ``height x width x channels`` list of floats per path;
        ``[]`` when ``images`` is ``None`` or empty.
    """
    if not images:
        return []
    if isinstance(images, (str, bytes)):
        images = [images]

    def process_single_image(filename):
        # Lazy import: the notebook-level TensorFlow import is commented out,
        # and a function-scope import also resolves on Spark executors.
        import tensorflow as tf

        img = tf.io.read_file(filename)
        img = tf.io.decode_jpeg(img)
        # Rescale to float32 [0, 1] first; resize() already yields float32,
        # so a conversion placed after it would leave values in 0-255.
        img = tf.image.convert_image_dtype(img, tf.float32)
        img = tf.image.resize(img, image_size)
        return img.numpy().tolist()

    return [process_single_image(filename) for filename in images]

# Define UDF for image processing.
# The function returns a list of nested height x width x channels numeric
# lists, so the return type is a four-level float array, not array<string>.
from pyspark.sql.types import FloatType

process_images_udf = udf(process_images, ArrayType(ArrayType(ArrayType(ArrayType(FloatType())))))

# Apply UDF to DataFrame
processed_images_df = dataset_path_with_categories.withColumn("processed_images", process_images_udf(dataset_path_with_categories["image.origin"]))

# Define a UDF to check if an image is valid
def is_valid_image(img):
    """Return True when ``img`` holds usable image data.

    Two input forms are accepted:

    * raw encoded bytes, checked by decoding them as JPEG;
    * already-decoded pixel data such as nested lists of numbers, checked for
      being a non-empty, rectangular numeric array.

    Treating decoded pixel data as JPEG bytes always fails, which would mark
    every row invalid and make the validity filter drop the whole dataset.
    """
    if img is None:
        return False
    if isinstance(img, (bytes, bytearray)):
        import tensorflow as tf
        try:
            tf.io.decode_jpeg(bytes(img))
        except (tf.errors.InvalidArgumentError, ValueError):
            return False
        return True
    try:
        return np.asarray(img, dtype=np.float32).size > 0
    except (TypeError, ValueError):
        # Ragged or non-numeric data cannot be a pixel array.
        return False

# Register the validity check as a boolean-returning UDF
is_valid_image_udf = udf(is_valid_image, BooleanType())

# Flag each row with the outcome of the validity check
processed_images_df = processed_images_df.withColumn(
    "is_valid_image",
    is_valid_image_udf(processed_images_df["processed_images"]),
)

# Keep only the rows whose flag is true
processed_images_df = processed_images_df.filter(
    processed_images_df["is_valid_image"]
)

# Inspect the schema and the first remaining row, untruncated
processed_images_df.printSchema()
processed_images_df.show(1, truncate=False)

## Encode the target label 

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

# Define the label encoding function
def encode_label(label):
    """Map a plant name to its integer class id.

    'Cashew' -> 0, 'Cassava' -> 1, 'Maize' -> 2, 'Tomato' -> 3; any other
    value (including ``None``) maps to -1.
    """
    known_plants = ('Cashew', 'Cassava', 'Maize', 'Tomato')
    for class_id, plant_name in enumerate(known_plants):
        if label == plant_name:
            return class_id
    return -1

# Define UDF for label encoding
encode_label_udf = udf(encode_label, IntegerType())

# Apply UDF to DataFrame to create a new column for encoded labels
processed_images_df = processed_images_df.withColumn("encoded_label", encode_label_udf(processed_images_df["plant_category"]))

# Show the schema and display first few rows with encoded labels
processed_images_df.printSchema()
processed_images_df.show(5)

In [None]:
# Collect filenames, plant categories, and disease categories into lists.
# A single select + collect runs the UDF-backed query once (not three times)
# and takes all three lists from the same rows, so they stay row-aligned.
category_rows = processed_images_df.select(
    processed_images_df["image.origin"].alias("filename"),
    "plant_category",
    "disease_category",
).collect()
filenames = [row["filename"] for row in category_rows]
plant_category = [row["plant_category"] for row in category_rows]
disease_category = [row["disease_category"] for row in category_rows]

# Show the first few elements of each list for verification
print("Filenames:", filenames[:5])
print("Plant categories:", plant_category[:5])
print("Disease categories:", disease_category[:5])

In [None]:
import pandas as pd

# Create a Pandas dataframe using the lists
data = {"filename": filenames, "plant_category": plant_category, "disease_category": disease_category}

# Build and shuffle in one step. The fixed random_state makes the row order
# reproducible across kernel restarts, and re-running the cell is idempotent.
df = pd.DataFrame(data).sample(frac=1, random_state=42)

# Print the first 10 rows of the shuffled dataframe
print(df.head(10))

In [None]:
category_sizes = df.groupby("plant_category").size()
print(category_sizes)

In [None]:
import matplotlib.pyplot as plt

# Alphabetically ordered category names, matching the groupby-ordered counts
categories = df['plant_category'].sort_values().unique()

# Draw the bar chart through the explicit figure/axes interface
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(categories, category_sizes, color='blue', width=0.4)
ax.set_xlabel("Plant Categories")
ax.set_ylabel("No. of Images")
ax.set_title("Number of Images Per Plant Category")
plt.show()

## Training Data

In [None]:
# Define the path to the directory containing the image files on your local machine
local_image_path = "file:///home/hduser/crop_pest_disease_dataset/Train"

# Read binary files into DataFrame.
# DataFrameReader has no binaryFiles() method; the "binaryFile" data source is
# the DataFrame equivalent and yields path / modificationTime / length /
# content columns. Recursive lookup is enabled on the assumption that Train,
# like Test, keeps its images in class sub-folders — TODO confirm.
image_df = (
    spark.read.format("binaryFile")
    .option("recursiveFileLookup", "true")
    .load(local_image_path)
)

# Show a few rows of the DataFrame
image_df.show(5)

# The Spark session is intentionally NOT stopped here: the following cells
# keep reading data through `spark`.


In [None]:
# Directory listing of the dataset on HDFS (this appears to be pasted command
# output; it is kept as comments so the cell remains valid Python):
#
# /crop_pest_disease_data/Cassava/test_set
# /crop_pest_disease_data/Cassava/train_set
# Files in /crop_pest_disease_data/Cashew directory:
# /crop_pest_disease_data/Cashew/test_set
# /crop_pest_disease_data/Cashew/train_set
# Files in /crop_pest_disease_data/Maize directory:
# /crop_pest_disease_data/Maize/test_set
# /crop_pest_disease_data/Maize/train_set

In [None]:
cassava_train = "/crop_pest_disease_data/Cassava/train_set"
# train_set holds one sub-folder per disease class, so recursive lookup is
# needed for Spark to read the images inside those folders.
cassavatrain_df = (
    spark.read.format("image")
    .option("recursiveFileLookup", "true")
    .load(cassava_train)
)

In [None]:
cassavatrain_df.show()

In [None]:
from pyspark import SparkContext
from PIL import Image
import io

# Use the context that backs the existing session; `sc` is not defined by any
# earlier cell.
sc = spark.sparkContext

# Read images from HDFS as (path, raw bytes) pairs
image_rdd = sc.binaryFiles("/crop_pest_disease_data/Cassava/train_set")

# Process images
def process_image(image_data):
    """Decode raw image bytes into a PIL image.

    Args:
        image_data: The encoded image file content as bytes.

    Returns:
        The decoded ``PIL.Image.Image``.
    """
    img = Image.open(io.BytesIO(image_data))
    # Perform image processing tasks (e.g., resize, filter)
    # Example: resized_img = img.resize((new_width, new_height))
    return img

def _encode_png(img):
    """Serialise a PIL image to PNG bytes so it can be written by Spark."""
    buffer = io.BytesIO()
    img.save(buffer, format="PNG")
    return bytearray(buffer.getvalue())

# saveAsSequenceFile needs (key, value) pairs of serialisable values, so the
# path is kept as the key and the processed image is re-encoded to bytes; PIL
# image objects themselves cannot be written.
processed_images_rdd = image_rdd.mapValues(lambda raw: _encode_png(process_image(raw)))

# Save processed images back to HDFS
# NOTE(review): the destination below is still a placeholder URI — replace it
# with a real output directory before running.
processed_images_rdd.saveAsSequenceFile("hdfs://path/to/processed_images")

# The SparkContext is intentionally left running: stopping it would also
# invalidate `spark`, which later cells still use.


In [None]:
from typing import Iterator

from pyspark.sql.types import ArrayType, IntegerType, StructField, StructType

# Output schema: the original image struct fields plus two flattened pixel arrays
schema = StructType(cassavatrain_df.select("image.*").schema.fields + [
    StructField("data_as_resized_array", ArrayType(IntegerType()), True),
    StructField("data_as_array", ArrayType(IntegerType()), True)
])

def resize_img(img_data, resize=True):
    """Turn one Spark image record into a flat RGB pixel array.

    Args:
        img_data: A record exposing ``nChannels``, ``width``, ``height`` and
            ``data`` (raw pixel bytes) as attributes.
        resize: When true the image is resampled to 224x224 (bicubic);
            otherwise the original dimensions are kept.

    Returns:
        A 1-D int32 array of length ``height * width * 3``.
    """
    mode = {1: 'L', 4: 'RGBA'}.get(img_data.nChannels, 'RGB')
    img = Image.frombytes(mode=mode, data=bytes(img_data.data), size=(img_data.width, img_data.height))
    if mode != 'RGB':
        img = img.convert('RGB')
    if resize:
        img = img.resize((224, 224), resample=Image.Resampling.BICUBIC)
    # Spark stores colour channels in BGR order, so the channel axis is
    # reversed to obtain RGB. This replaces a call to
    # convert_bgr_array_to_rgb_array, which no cell defines.
    # NOTE(review): confirm that helper did nothing beyond the channel swap.
    arr = np.asarray(img)[:, :, ::-1]
    # int32 matches the ArrayType(IntegerType()) columns declared in `schema`.
    return arr.reshape(-1).astype(np.int32)

def resize_image_udf(dataframe_batch_iterator: Iterator[pd.DataFrame]) -> Iterator[pd.DataFrame]:
    """mapInPandas worker: add resized and original pixel arrays to each batch."""
    for dataframe_batch in dataframe_batch_iterator:
        # Both columns are computed from the untouched input rows before
        # either one is attached to the frame.
        records = list(dataframe_batch.itertuples(index=False))
        dataframe_batch["data_as_resized_array"] = [resize_img(rec, True) for rec in records]
        dataframe_batch["data_as_array"] = [resize_img(rec, False) for rec in records]
        yield dataframe_batch

# The schema above is derived from cassavatrain_df, so the same frame is
# transformed; `image_df` holds a different dataset at this point.
resized_df = cassavatrain_df.select("image.*").mapInPandas(resize_image_udf, schema)

In [None]:
# Index of the sample image to inspect. It is assigned here so the cell runs
# on a fresh kernel instead of relying on a later cell having run first.
image_row = 40

# Check the length of the collected list
collected_images = cassavatrain_df.select("image").collect()
num_images = len(collected_images)
if num_images == 0:
    raise ValueError("cassavatrain_df contains no images to inspect")

# Adjust the index if it's out of range
if image_row >= num_images:
    print(f"Index {image_row} is out of range. Adjusting to {num_images - 1}.")
    image_row = num_images - 1

# Access the image at the adjusted index
spark_single_img = collected_images[image_row]
mode = 'RGBA' if (spark_single_img.image.nChannels == 4) else 'RGB'

# Last expression of the cell, so the notebook displays it
(spark_single_img.image.origin, spark_single_img.image.mode, spark_single_img.image.nChannels)


In [None]:
# Index of the sample image to inspect. It is assigned here so the cell runs
# on a fresh kernel instead of relying on a later cell having run first.
image_row = 40

# Check the length of the collected list
collected_images = cassavatrain_df.select("image").collect()
if image_row < len(collected_images):
    spark_single_img = collected_images[image_row]
    mode = 'RGBA' if (spark_single_img.image.nChannels == 4) else 'RGB'
    # Printed explicitly: a bare expression inside an `if` block is not
    # displayed by the notebook.
    print((spark_single_img.image.origin, spark_single_img.image.mode, spark_single_img.image.nChannels))
else:
    print("Error: Index out of range")

In [None]:
# Index of the sample image to inspect. It is assigned here so the cell runs
# on a fresh kernel instead of relying on a later cell having run first.
image_row = 40

# Check the length of the collected list
collected_images = cassavatrain_df.select("image").collect()
num_images = len(collected_images)
if num_images == 0:
    raise ValueError("cassavatrain_df contains no images to inspect")

# Adjust the index if it's out of range
if image_row >= num_images:
    print(f"Index {image_row} is out of range. Adjusting to {num_images - 1}.")
    image_row = num_images - 1

# Access the image at the adjusted index
spark_single_img = collected_images[image_row]
mode = 'RGBA' if (spark_single_img.image.nChannels == 4) else 'RGB'

# Last expression of the cell, so the notebook displays it
(spark_single_img.image.origin, spark_single_img.image.mode, spark_single_img.image.nChannels)


In [None]:
image_row = 40
collected_images = cassavatrain_df.select("image").collect()
if not collected_images:
    raise ValueError("cassavatrain_df contains no images to display")
# Clamp the index so a dataset with fewer than 41 images does not raise IndexError.
image_row = min(image_row, len(collected_images) - 1)
spark_single_img = collected_images[image_row]
print((spark_single_img.image.origin, spark_single_img.image.mode, spark_single_img.image.nChannels))

channels = spark_single_img.image.nChannels
mode = {1: 'L', 4: 'RGBA'}.get(channels, 'RGB')
sample_image = Image.frombytes(mode=mode, data=bytes(spark_single_img.image.data), size=(spark_single_img.image.width, spark_single_img.image.height))
if channels >= 3:
    # Spark stores colour channels as BGR(A); swap the first and third bands
    # so the picture is shown with correct colours.
    blue, green, red, *alpha = sample_image.split()
    sample_image = Image.merge(mode, (red, green, blue, *alpha))

# Last expression: rendered inline by the notebook (Image.show() would open
# an external viewer instead).
sample_image

In [None]:
def load_images_from_directory(directory):
    """Collect image file paths and class labels from a local folder tree.

    ``directory`` is expected to contain one sub-folder per class. Every entry
    inside a non-empty class folder is recorded. Only paths are gathered: the
    image files are not opened, so listing a large dataset stays cheap.

    Args:
        directory: Path of the folder whose children are class folders.

    Returns:
        ``(images, labels)``: two parallel lists holding each file's full path
        and the name of the class folder it was found in. Classes and files
        are visited in sorted order so the result is deterministic.
    """
    images = []
    labels = []
    for class_name in sorted(os.listdir(directory)):
        class_path = os.path.join(directory, class_name)
        filenames = sorted(os.listdir(class_path)) if os.path.isdir(class_path) else []
        if not filenames:
            print(f"Warning: Directory {class_path} is empty or does not exist.")
            continue
        for filename in filenames:
            images.append(os.path.join(class_path, filename))  # Append image file path
            labels.append(class_name)
    return images, labels

# Class folder names per crop, exactly as they are looked up under
# <crop>/train_set/ on HDFS.
# NOTE(review): several names carry what look like image counts
# ("anthracnose3102") and Cassava lists both "bacterial blight3241" and
# "bacterial_blight" — confirm them against the actual directory names.
crops_classes = {
    "Cashew": ["anthracnose3102", "gumosis1714", "healthy5877", "leaf miner3466", "red rust4751"],
    "Cassava": ["bacterial blight3241","bacterial_blight", "brown_spot", "green_mite", "healthy", "mosaic"],
    "Maize": ["fall armyworm", "grasshopper", "healthy", "leaf beetle", "leaf blight", "leaf_spot", "streak virus"],
    "Tomato": ["healthy", "leaf blight", "leaf curl", "septoria leaf spot", "verticillium wilt"]
}

data_directory = "hdfs://localhost:9000/crop_pest_disease_data"

# One list per crop, holding one RDD per class folder.
# The folders live on HDFS, which os.listdir cannot read, and the
# feature-extraction cell calls .name() and .collect() on every element, so
# each class folder is loaded as an RDD through SparkContext.binaryFiles.
train_data_rdds = []

for crop, classes in crops_classes.items():
    crop_train_data_rdd = []
    for class_name in classes:
        hdfs_path = f"{data_directory}/{crop}/train_set/{class_name}"
        crop_train_data_rdd.append(spark.sparkContext.binaryFiles(hdfs_path))
    train_data_rdds.append(crop_train_data_rdd)

In [None]:
# Accumulators for the raw file contents and their class labels
features_list = []
labels_list = []

# Walk every per-class RDD of every crop
for crop_rdd_list in train_data_rdds:
    for rdd in crop_rdd_list:
        # The RDD name is treated as a path: .../<crop>/<split>/<class>
        name_parts = rdd.name().split('/')
        crop = name_parts[-3]
        class_name = name_parts[-1]
        # Bring the (path, content) pairs to the driver
        features_with_paths = rdd.collect()
        if not features_with_paths:
            # Nothing to add for an empty RDD
            continue
        features = [content for _, content in features_with_paths]
        labels = [class_name] * len(features)
        features_list.extend(features)
        labels_list.extend(labels)

## Testing Data

In [None]:
def load_images_from_directory(directory):
    """List image paths and class labels found under ``directory``.

    Each child of ``directory`` is treated as a class folder. For every
    non-empty class folder, the full path of each entry is recorded together
    with the folder name. A warning is printed for children that are empty or
    are not directories.

    Returns:
        ``(images, labels)``: parallel lists of file paths and class names.
    """
    images, labels = [], []
    for class_name in os.listdir(directory):
        class_path = os.path.join(directory, class_name)
        if not (os.path.isdir(class_path) and os.listdir(class_path)):
            print(f"Warning: Directory {class_path} is empty or does not exist.")
            continue
        members = [os.path.join(class_path, filename) for filename in os.listdir(class_path)]
        images.extend(members)
        labels.extend([class_name] * len(members))
    return images, labels

# Class folder names per crop, as looked up under <crop>/test_set/ on HDFS
crops_classes = {
    "Cashew": ["anthracnose", "gumosis", "healthy", "leaf miner", "red rust"],
    "Cassava": ["bacterial_blight", "brown_spot", "green_mite", "healthy", "mosaic"],
    "Maize": ["fall armyworm", "grasshopper", "healthy", "leaf beetle", "leaf blight", "leaf_spot", "streak virus"],
    "Tomato": ["healthy", "leaf blight", "leaf curl", "septoria leaf spot", "verticillium wilt"]
}

data_directory = "hdfs://localhost:9000/crop_pest_disease_data"

# One list per crop, holding one RDD per class folder.
# The data lives on HDFS, which os.listdir cannot read, so every class folder
# is loaded with SparkContext.binaryFiles instead of a local directory walk.
test_data_rdds = []

for crop, classes in crops_classes.items():
    crop_test_data_rdd = []
    for class_name in classes:
        hdfs_path = f"{data_directory}/{crop}/test_set/{class_name}"
        crop_test_data_rdd.append(spark.sparkContext.binaryFiles(hdfs_path))
    test_data_rdds.append(crop_test_data_rdd)


In [None]:
os.path.exists(hdfs_path)

In [None]:
hdfs_path = "hdfs://localhost:9000/crop_pest_disease_data"

In [None]:
from pathlib import Path
import os.path
import pandas as pd

In [None]:
def convert_path_to_df(hdfs_path, extensions=(".jpg", ".jpeg")):
    """Build a DataFrame of image file paths and their folder-derived labels.

    The tree below ``hdfs_path`` is searched recursively. A file's label is
    the name of the folder that directly contains it.

    Args:
        hdfs_path: Root directory to search. Despite the name, ``pathlib``
            only walks a locally accessible filesystem path.
        extensions: File suffixes to include, compared case-insensitively so
            that ``.JPG`` and ``.jpg`` files are both found.

    Returns:
        A DataFrame with string columns ``Filepath`` and ``Label``, ordered by
        path; empty when nothing matches.
    """
    image_dir = Path(hdfs_path)
    wanted_suffixes = {ext.lower() for ext in extensions}

    # Get filepaths and labels
    filepaths = sorted(
        p for p in image_dir.rglob("*")
        if p.is_file() and p.suffix.lower() in wanted_suffixes
    )
    labels = [p.parent.name for p in filepaths]

    # Combine filepaths and labels
    image_df = pd.DataFrame({
        "Filepath": [str(p) for p in filepaths],
        "Label": labels,
    })
    return image_df

image_df = convert_path_to_df(hdfs_path)

In [None]:
image_df.head()

In [None]:
# Check for corrupted images within the dataset
import PIL
from pathlib import Path
from PIL import UnidentifiedImageError

# Image.open only parses the header, so verify() is called as well to catch
# files that are truncated or damaged. Each file is opened in a `with` block
# so its handle is released, and suffixes are matched case-insensitively so
# .JPG files are checked too.
corrupted_images = []
path = Path(hdfs_path).rglob("*")
for img_p in path:
    if img_p.suffix.lower() not in {".jpg", ".jpeg"}:
        continue
    try:
        with PIL.Image.open(img_p) as img:
            img.verify()
    except (UnidentifiedImageError, OSError, SyntaxError, ValueError):
        corrupted_images.append(img_p)
        print(img_p)


In [None]:
import seaborn as sns

In [None]:
# Number of images per label, most frequent first
label_counts = image_df['Label'].value_counts()

# Draw the distribution through the explicit figure/axes interface
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=label_counts.index, y=label_counts.values, alpha=0.8, palette='rocket', ax=ax)
ax.set_title('Distribution of Labels in Image Dataset', fontsize=16)
ax.set_xlabel('Label', fontsize=14)
ax.set_ylabel('Count', fontsize=14)
ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:
# Accumulators for the raw file contents and their class labels
features_list = []
labels_list = []

# Walk every per-class RDD of every crop
for crop_rdd_list in train_data_rdds:
    for rdd in crop_rdd_list:
        # The RDD name is treated as a path: .../<crop>/<split>/<class>
        name_parts = rdd.name().split('/')
        crop = name_parts[-3]
        class_name = name_parts[-1]
        # Bring the (path, content) pairs to the driver
        features_with_paths = rdd.collect()
        if not features_with_paths:
            # Nothing to add for an empty RDD
            continue
        features = [content for _, content in features_with_paths]
        labels = [class_name] * len(features)
        features_list.extend(features)
        labels_list.extend(labels)

In [None]:
def load_images_from_directory(directory):
    """Gather file paths and class labels from a folder of class sub-folders.

    Children of ``directory`` that are non-empty directories contribute one
    (path, class name) pair per entry. Any other child triggers a printed
    warning and is skipped.

    Returns:
        ``(images, labels)``: parallel lists of full file paths and the name
        of the class folder each path came from.
    """
    images, labels = [], []
    for class_name in os.listdir(directory):
        class_path = os.path.join(directory, class_name)
        usable = os.path.isdir(class_path) and os.listdir(class_path)
        if not usable:
            print(f"Warning: Directory {class_path} is empty or does not exist.")
            continue
        for filename in os.listdir(class_path):
            images += [os.path.join(class_path, filename)]
            labels += [class_name]
    return images, labels

# Class folder names per crop, as looked up under <crop>/test_set/ on HDFS
crops_classes = {
    "Cashew": ["anthracnose", "gumosis", "healthy", "leaf miner", "red rust"],
    "Cassava": ["bacterial_blight", "brown_spot", "green_mite", "healthy", "mosaic"],
    "Maize": ["fall armyworm", "grasshopper", "healthy", "leaf beetle", "leaf blight", "leaf_spot", "streak virus"],
    "Tomato": ["healthy", "leaf blight", "leaf curl", "septoria leaf spot", "verticillium wilt"]
}

data_directory = "hdfs://localhost:9000/crop_pest_disease_data"

# One list per crop, holding one RDD per class folder.
# The data lives on HDFS, which os.listdir cannot read, so every class folder
# is loaded with SparkContext.binaryFiles instead of a local directory walk.
test_data_rdds = []

for crop, classes in crops_classes.items():
    crop_test_data_rdd = []
    for class_name in classes:
        hdfs_path = f"{data_directory}/{crop}/test_set/{class_name}"
        crop_test_data_rdd.append(spark.sparkContext.binaryFiles(hdfs_path))
    test_data_rdds.append(crop_test_data_rdd)


In [None]:
# (Re)create the Spark session. getOrCreate() returns the active session when
# one exists and builds a new one when an earlier cell stopped it.
spark = SparkSession.builder.appName("Load Image Data").getOrCreate()

In [None]:
# Hadoop directory path containing image files
hadoop_dir_path = "hdfs://localhost:9000/crop_pest_disease_data"

# Read image files from Hadoop directory.
# The images are nested below <crop>/<split>/<class>/ folders, so recursive
# lookup is required for Spark to find them under the dataset root.
image_df = (
    spark.read.format("image")
    .option("recursiveFileLookup", "true")
    .load(hadoop_dir_path)
)


In [None]:
# Show a snippet of the DataFrame
image_df.show(5)

In [None]:
# View DataFrame schema
image_df.printSchema()

In [None]:
crop_df = datasets.ImageFolder(root=hadoop_data_path, transform=transform)

## Perform Image processing


The dataset was already preprocessed before it was loaded here. The following preprocessing steps had been applied:
- All images were captured, separated, and saved in folders according to plant type.
- The images were annotated and labelled.
- The images were cropped and reduced in size.

## Data Processing and Cleaning

### Checking the image sizes of each crop

In [None]:
import os
from PIL import Image

# Assuming you have already defined and loaded your dataset
# dataset = datasets.ImageFolder(root=hadoop_data_path, transform=transform)
# NOTE: `dataset` must exist before this cell runs; in this notebook it is
# created by the cell that builds the ImageFolder and the data loaders.

# Get the list of classes (crops) in the dataset
classes = dataset.classes

# Create a dictionary to store image sizes for each crop
crop_image_sizes = {crop: [] for crop in classes}

# Iterate through the dataset
for image_path, label in dataset.samples:
    # Open the image only long enough to read its size; the `with` block
    # closes the file handle so a large dataset does not exhaust descriptors.
    with Image.open(image_path) as image:
        width, height = image.size
    # Get the crop name using the label
    crop_name = classes[label]
    # Append the image size to the corresponding crop in the dictionary
    crop_image_sizes[crop_name].append((width, height))

# Print the image sizes for each crop
for crop, sizes in crop_image_sizes.items():
    print(f"Crop: {crop}")
    print(f"Total images: {len(sizes)}")
    if sizes:
        mean_width = sum(w for w, h in sizes) / len(sizes)
        mean_height = sum(h for w, h in sizes) / len(sizes)
        print(f"Average image size: {mean_width} x {mean_height}")
    else:
        # Avoid dividing by zero for a class without images.
        print("Average image size: n/a (no images)")
    print()

### Check whether the pixel values have been normalized to a range suitable for training neural networks


In [None]:
import math
import os
import torch
from torchvision import datasets, transforms
from sklearn.model_selection import train_test_split

# Define transformations for preprocessing
transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Load image data using PyTorch.
# NOTE(review): the root below is a placeholder hdfs:// URI; ImageFolder walks
# the local filesystem, so point it at a locally accessible copy of the data.
hadoop_data_path = "hdfs://path/to/crop_pest_disease_data"
dataset = datasets.ImageFolder(root=hadoop_data_path, transform=transform)

# Split data into training and testing datasets (80/20, seeded).
# random_split only partitions indices and decodes images lazily, whereas
# sklearn's train_test_split would load every image into memory up front.
test_count = math.ceil(0.2 * len(dataset))
train_data, test_data = torch.utils.data.random_split(
    dataset,
    [len(dataset) - test_count, test_count],
    generator=torch.Generator().manual_seed(42),
)

# Create data loaders for batch processing
batch_size = 32
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=False)

# Inspect the pixel value range of a few sample images. Only one batch is
# drawn instead of iterating over the whole dataset. The values reflect the
# Normalize step above, so they are standardised rather than confined to [0, 1].
sample_loader = torch.utils.data.DataLoader(dataset, batch_size=5, shuffle=True)
images, labels = next(iter(sample_loader))
for image in images:
    # Check pixel value range
    min_pixel_value = torch.min(image)
    max_pixel_value = torch.max(image)
    print(f"Min pixel value: {min_pixel_value}, Max pixel value: {max_pixel_value}")


## Check if there are any defective images

In [None]:
# NOTE(review): os.listdir and PIL read the local filesystem; an hdfs:// URI
# is not reachable this way, so data_dir should be a locally accessible copy.
data_dir='hdfs://localhost:9000/crop_pest_disease_data'
bad_img_list=[]
total=0
good=0
bad=0
classes=sorted(os.listdir(data_dir))
for klass in classes:
    good_class=0
    bad_class=0
    total_class=0
    msg=f'processing class {klass}'
    print(msg, '\r', end= '')
    classpath=os.path.join(data_dir, klass)
    if not os.path.isdir(classpath):
        # Stray files next to the class folders are not classes.
        continue
    flist=sorted(os.listdir(classpath))
    for f in flist:
        total +=1
        total_class +=1
        fpath=os.path.join(classpath,f)
        try:
            # The `with` block releases the file handle; np.asarray forces a
            # full decode so truncated files are detected as well.
            with Image.open(fpath) as img:
                array=np.asarray(img)
            good +=1
            good_class +=1
        except (OSError, ValueError, SyntaxError):
            # Only image-decoding failures count as defective; anything else
            # (e.g. KeyboardInterrupt) is allowed to propagate.
            bad_img_list.append(fpath)
            bad +=1
            bad_class +=1

    msg=f'class {klass} contains {total_class} files, {good_class} are valid image files and {bad_class} defective image files'
    print (msg)
msg=f'the dataset contains {total} image files, {good} are valid image files and {bad} are defective image files'
print (msg)
# List the defective files directly: an input() prompt would block
# "Restart & Run All" and make the output depend on manual interaction.
for f in bad_img_list:
    print(f)

In [None]:
# Create a corrected dataset with the defective image files removed

In [None]:
# you can use this dataset to create a model.
working_dir=r'/kaggle/working/'
corrected_dir=os.path.join(working_dir, 'corrected dataset') # where the corrected dataset will be stored
# Set membership is O(1); testing against the list re-scans it for every file.
bad_img_set=set(bad_img_list)
copied_count = 0
if os.path.isdir(corrected_dir):
    shutil.rmtree(corrected_dir) # make sure the corrected_dir is empty
# makedirs also creates working_dir when it does not exist yet
os.makedirs(corrected_dir)
for klass in classes:
    classpath=os.path.join(data_dir, klass)
    if not os.path.isdir(classpath):
        # Skip stray files that sit next to the class folders.
        continue
    dest_classpath=os.path.join(corrected_dir, klass)
    os.mkdir(dest_classpath)
    flist= os.listdir(classpath)
    for f in flist:
        fpath=os.path.join(classpath,f)
        dest_fpath=os.path.join(dest_classpath,f)
        if fpath not in bad_img_set:
            shutil.copy(fpath, dest_fpath)
            copied_count +=1
msg=f'{copied_count} valid image files were stored in {corrected_dir}'
print(msg)

In [None]:
#Split the training data into training and validation datasets

### Training Data

## Validation Data

### Test Data

In [None]:
# Class folder names per crop, used to build the HDFS test-set paths below.
# NOTE(review): "grasshoper" and "verticulium wilt" are spelled differently
# from the other crops_classes definitions in this notebook ("grasshopper",
# "verticillium wilt") — confirm which spelling matches the HDFS folders.
crops_classes = {
    "Cashew": ["anthracnose", "gumosis", "healthy", "leaf miner", "red rust"],
    "Cassava": ["bacterial_blight", "brown_spot", "green_mite", "healthy","mosaic"],
    "Maize": ["fall armyworm", "grasshoper", "healthy", "leaf beetle", "leaf blight", "leaf_spot","streak virus"],
    "Tomato": ["healthy", "leaf blight", "leaf curl", "septoria leaf spot","verticulium wilt"]
}

# Outer list: one entry per crop; each entry is a list with one RDD per class.
test_data_rdds = []

for crop, classes in crops_classes.items():
    # RDDs for the classes of the current crop
    crop_test_data_rdd = []
    for class_name in classes:


        # Test-set folder of this crop/class on HDFS
        hdfs_path = f"hdfs://localhost:9000/crop_pest_disease_data/{crop}/test_set/{class_name}"
        # Load the folder's files through the SparkContext as an RDD
        test_images_rdd = spark.sparkContext.binaryFiles(hdfs_path)
        crop_test_data_rdd.append(test_images_rdd)
    test_data_rdds.append(crop_test_data_rdd)

In [None]:
hdfs_path = f"hdfs://localhost:9000/crop_pest_disease_data/Maize/test_set/grasshopper/0maize_valid_grasshoper.JPG"
test_images = spark.sparkContext.binaryFiles(hdfs_path)

In [None]:
type(test_images)

In [None]:
data=test_images.collect()
print(data)

In [None]:
# spark is from the previous example.
sc = spark.sparkContext

# Path to your image data
cropdata = "file:///home/hduser/Big data and advanced analytics data"

# Read image files into a DataFrame
crop_dataframe = spark.read.format("image").load(cropdata)

# Show the DataFrame schema and first few rows
crop_dataframe.printSchema()
crop_dataframe.show()

# Stop SparkSession
spark.stop()

In [None]:
!git add .
!git commit -m "comment"
!git push origin main

In [None]:
file = sc.textFile

In [None]:
#Import all libraries required for EDA, Preprocessing, Model building, Model Testing, Model EValuation and Visualization
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import pickle
import cv2
from os import listdir
from sklearn.preprocessing import LabelBinarizer
from keras.layers.convolutional import MaxPooling2D
from keras.layers.core import Activation, Flatten, Dropout, Dense
from keras import backend as K
from keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import Adam
from keras.preprocessing import image
from tensorflow.keras.preprocessing.image import img_to_array
from keras.models import Sequential
from tensorflow.compat.v1.keras.layers import BatchNormalization
from keras.layers.convolutional import Conv2D

In [None]:
#Default size for input images
width=256
height=256
depth=3
# Separate statement: it had been fused onto the `depth` line, which made the
# whole cell a syntax error.
epoch_ = 25
BS = 32
default_image_size = (256, 256)
image_size = 0
root_dir = '/content/New Plant Diseases Dataset(Augmented)/New Plant Diseases Dataset(Augmented)/'
INIT_LR = 1e-3

In [None]:
#Convert Image into NUmPy array
def convert_image_to_array(image_dir):
    """Read an image file and return it as an array resized to the default size.

    Args:
        image_dir: Path of the image file handed to ``cv2.imread``.

    Returns:
        The result of ``img_to_array`` on the image resized to
        ``default_image_size``; an empty NumPy array when ``cv2.imread``
        yields ``None``; ``None`` when any exception is raised (the error is
        printed).
    """
    try:
        image = cv2.imread(image_dir)
        if image is None:
            return np.array([])
        resized = cv2.resize(image, default_image_size)
        return img_to_array(resized)
    except Exception as e:
        print(f"Error : {e}")
        return None