# Integrating Big Data Analytics and Convolutional Neural Networks for Pest and Disease Detection and Classification

In [1]:
#Install the necessary libraries
!pip install torch
!pip install torchvision

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [4]:
#Import all the necessary libraries
import warnings
warnings.filterwarnings("ignore")
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
import torch.nn.functional as F
import torchvision
from torchvision import transforms
from PIL import Image
from pyspark.sql import SparkSession
from pyspark.ml.image import ImageSchema
from pyspark.sql.functions import lit
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
import os
from PIL import Image
import numpy as np
import shutil

In [5]:
#List the contents of the root directory in HDFS
!hdfs dfs -ls /

Found 4 items
drwxr-xr-x   - hduser supergroup          0 2024-03-20 10:55 /crop_pest_disease_data
drwxr-xr-x   - hduser supergroup          0 2024-03-07 10:06 /output1
drwxr-xr-x   - hduser supergroup          0 2024-03-07 13:14 /output2
drwxr-xr-x   - hduser supergroup          0 2024-03-14 14:03 /user1


In [2]:
#sc master - running  locally
sc.master

'local[*]'

In [2]:
#Initialize Spark session
spark = SparkSession.builder \
    .appName("Crop Pest Disease Detection") \
    .getOrCreate()
warnings.filterwarnings("ignore")

## processing the data using Apache Spark-PySpark

In [None]:
from pyspark.sql.functions import lit

#Read image data from Hadoop using PySpark
hadoop_data_path = "hdfs://crop_pest_disease_data"
image_df = ImageSchema.readImages(hadoop_data_path)

#Extract labels from folder names
image_df = image_df.withColumn("label", 
                                split(split(image_df["image"]["origin"], "/")[size(split(image_df["image"]["origin"], "/"))-2], "_")[size(split(split(image_df["image"]["origin"], "/")[size(split(image_df["image"]["origin"], "/"))-2], "_"))-1])

#Perform Exploratory Data Analysis (EDA)
class_distribution = image_df.groupBy("label").count().orderBy("count", ascending=False)
class_distribution.show()

#Display sample images
image_df.show()

#Stop Spark session
spark.stop()


## processing the data using pyTorch

In [None]:
hadoop_data_path = "hdfs://path/to/crop_pest_disease_data" 
crop_df = datasets.ImageFolder(root=hadoop_data_path, transform=transform)

## Perform Image processing


The dataset has already been processed. Here is the image preprocessing that was done to the data.
- All images were captured, separated,and saved in their respective folders according to the plant type.
- The images were annotated and labelled
- Image Cropping and size reduction

## Data Processing and Cleaning

### Checking the image sizes of each crop

In [None]:
import os
from PIL import Image

# Assuming you have already defined and loaded your dataset
# dataset = datasets.ImageFolder(root=hadoop_data_path, transform=transform)

# Get the list of classes (crops) in the dataset
classes = dataset.classes

# Create a dictionary to store image sizes for each crop
crop_image_sizes = {crop: [] for crop in classes}

# Iterate through the dataset
for image_path, label in dataset.samples:
    # Open the image using PIL
    image = Image.open(image_path)
    # Get the size of the image
    width, height = image.size
    # Get the crop name using the label
    crop_name = classes[label]
    # Append the image size to the corresponding crop in the dictionary
    crop_image_sizes[crop_name].append((width, height))

# Print the image sizes for each crop
for crop, sizes in crop_image_sizes.items():
    print(f"Crop: {crop}")
    print(f"Total images: {len(sizes)}")
    print(f"Average image size: {sum([w for w, h in sizes]) / len(sizes)} x {sum([h for w, h in sizes]) / len(sizes)}")
    print()

### check if the pixel value have been normalized to a range suitable for training neural networks


In [None]:
import torch
from torchvision import datasets, transforms

# Assuming you have already defined and loaded your dataset
# dataset = datasets.ImageFolder(root=hadoop_data_path, transform=transform)

# Define a transformation to convert PIL images to PyTorch tensors
transform = transforms.ToTensor()

# Load a few sample images from the dataset
sample_loader = torch.utils.data.DataLoader(dataset, batch_size=5, shuffle=True)
for images, labels in sample_loader:
    # Convert images to numpy arrays and print pixel value range
    for image in images:
        # Check pixel value range
        min_pixel_value = torch.min(image)
        max_pixel_value = torch.max(image)
        print(f"Min pixel value: {min_pixel_value}, Max pixel value: {max_pixel_value}")

import os
import torch
from torchvision import datasets, transforms
from sklearn.model_selection import train_test_split

# Define transformations for preprocessing
transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Load image data using PyTorch
hadoop_data_path = "hdfs://path/to/crop_pest_disease_data"
dataset = datasets.ImageFolder(root=hadoop_data_path, transform=transform)

# Split data into training and testing datasets
train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=42)

# Create data loaders for batch processing
batch_size = 32
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=False)


## Check if there are any defective images

In [None]:
data_dir=r'/kaggle/input/crop-pest-and-disease-detection'
bad_img_list=[]
total=0
good=0
bad=0
classes=sorted(os.listdir(data_dir))
for klass in classes:
    good_class=0
    bad_class=0
    total_class=0
    msg=f'processing class {klass}'
    print(msg, '\r', end= '')
    classpath=os.path.join(data_dir, klass)
    flist=sorted(os.listdir(classpath))
    for f in flist:
        total +=1
        total_class +=1
        fpath=os.path.join(classpath,f)
        try:
            img= Image.open(fpath) 
            array=np.asarray(img)
            good +=1
            good_class +=1
        except:
            bad_img_list.append(fpath)
            bad +=1
            bad_class +=1
    
    msg=f'class {klass} contains {total_class} files, {good_class} are valid image files and {bad_class} defective image files'
    print (msg)
msg=f'the dataset contains {total} image files, {good} are valid image files and {bad} are defective image files'
print (msg)
if bad>0:
    ans=input('to print a list of defective image files enter P, to not print press Enter')
    if ans == 'P' or ans == 'p':
        for f in bad_img_list:
            print(f)

In [None]:
Create a corrected dataset with the defective image files removed

In [None]:
# you can use this dataset to create a model.
working_dir=r'/kaggle/working/'
corrected_dir=os.path.join(working_dir, 'corrected dataset') # where the corrected dataset will be stored
copied_count = 0
if os.path.isdir(corrected_dir):
    shutil.rmtree(corrected_dir) # make sure the corrected_dir is empty
os.mkdir(corrected_dir)
for klass in classes:
    classpath=os.path.join(data_dir, klass)
    dest_classpath=os.path.join(corrected_dir, klass)
    os.mkdir(dest_classpath)
    flist= os.listdir(classpath)
    for f in flist:
        fpath=os.path.join(classpath,f)
        dest_fpath=os.path.join(dest_classpath,f)
        if fpath not in bad_img_list:
            shutil.copy(fpath, dest_fpath)
            copied_count +=1
msg=f'{copied_count} valid image files were stored in {corrected_dir}'
print(msg)

In [None]:
#Split the training data into training and validation datasets

### Training Data

In [None]:
crops_classes = {
    "Cashew": ["anthracnose3102", "	gumosis1714", "healthy5877", "leaf miner3466", "red rust4751"],
    "Cassava": ["bacterial blight", "bacterial blight3241", "brown spot", "green mite", "healthy","mosaic"],
    "Maize": ["class1", "class2"],
    "Tomato": ["class1", "class2", "class3", "class4", "class5"]
}

train_data_rdds = []

for crop, classes in crops_classes.items():
    crop_train_data_rdd = []

    for class_name in classes:
        train_images_rdd = spark.sparkContext.binaryFiles(f"hdfs://Cropdiseasedata/CCMT-Dataset-Augmented/{crop}/train_set/{class_name}/*")
        crop_train_data_rdd.append(train_images_rdd)
        
    train_data_rdds.append(crop_train_data_rdd)

## Validation Data

### Test Data

In [None]:
crops_classes = {
    "Cashew": ["anthracnose", "gumosis", "healthy", "leaf miner", "red rust"],
    "Cassava": ["class1", "class2", "class3", "class4"],
    "Maize": ["class1", "class2"],
    "Tomato": ["class1", "class2", "class3", "class4", "class5"]
}

test_data_rdds = []

for crop, classes in crops_classes.items():
    crop_test_data_rdd = []

    for class_name in classes:
        test_images_rdd = spark.sparkContext.binaryFiles(f"hdfs://Cropdiseasedata/CCMT-Dataset-Augmented/{crop}/train_set/{class_name}/*")
        crop_test_data_rdd.append(test_images_rdd)

    testing_data_rdds.append(crop_testing_data_rdd)

In [17]:
# spark is from the previous example.
sc = spark.sparkContext

# Path to your image data
cropdata = "file:///home/hduser/Big data and advanced analytics data"

# Read image files into a DataFrame
crop_dataframe = spark.read.format("image").load(cropdata)

# Show the DataFrame schema and first few rows
crop_dataframe.printSchema()
crop_dataframe.show()

# Stop SparkSession
spark.stop()

24/03/16 20:17:57 WARN SharedInMemoryCache: Evicting cached table partition metadata from memory due to size constraints (spark.sql.hive.filesourcePartitionFileCacheSize = 262144000 bytes). This may impact query planning performance.


root
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- height: integer (nullable = true)
 |    |-- width: integer (nullable = true)
 |    |-- nChannels: integer (nullable = true)
 |    |-- mode: integer (nullable = true)
 |    |-- data: binary (nullable = true)

+-----+
|image|
+-----+
+-----+



In [None]:
!git add .
!git commit -m "comment"
!git push origin main

[master ac4686d] comment
 3 files changed, 240 insertions(+)
 create mode 100644 .ipynb_checkpoints/Semester_Two_CA_CODES-checkpoint.ipynb
 create mode 100644 .~lock.conference-template-a4.docx#
Username for 'https://github.com': 

In [None]:
file = sc.textFile

In [1]:
#Import all libraries required for EDA, Preprocessing, Model building, Model Testing, Model EValuation and Visualization
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import pickle
import cv2
from os import listdir
from sklearn.preprocessing import LabelBinarizer
from keras.layers.convolutional import MaxPooling2D
from keras.layers.core import Activation, Flatten, Dropout, Dense
from keras import backend as K
from keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import Adam
from keras.preprocessing import image
from tensorflow.keras.preprocessing.image import img_to_array
from keras.models import Sequential
from tensorflow.compat.v1.keras.layers import BatchNormalization
from keras.layers.convolutional import Conv2D



In [None]:
#Default size for input images
width=256
height=256
depth=3epoch_ = 25
BS = 32
default_image_size = tuple((256, 256))
image_size = 0
root_dir = '/content/New Plant Diseases Dataset(Augmented)/New Plant Diseases Dataset(Augmented)/'
INIT_LR = 1e-3

In [None]:
#Convert Image into NUmPy array
def convert_image_to_array(image_dir):
    try:
        image = cv2.imread(image_dir)
        if image is not None :
            image = cv2.resize(image, default_image_size)   
            return img_to_array(image)
        else :
            return np.array([])
    except Exception as e:
        print(f"Error : {e}")
        return None

In [None]:
#Image and Lable List
image_list, label_list = [], []

In [None]:
try:

In [2]:
import PIL

In [3]:
print("Pillow Version:", PIL.__version__)

Pillow Version: 9.0.1


In [None]:
#Load all the images in a directory
from os import listdir
from matplotlib import image
#Load all images i