In [None]:
# import necessary packages
import cv2
import pandas as pd
import os
import json
import numpy as np
from skimage import feature
from matplotlib import pyplot as plt
from matplotlib import image as mpimg

In [None]:
# Create a function to rename the files
def rename_images(directory, prefix="image_", start_number=1, extension=".jpg"):
  """Renames the matching files in a directory to a numbered naming scheme.

  Entries whose name ends with `extension` are processed in sorted order and
  renamed to f"{prefix}{number:03}{extension}", numbered consecutively from
  `start_number`. Entries that do not match are left untouched and do not
  consume a number.

  Args:
    directory: Folder whose entries are renamed in place.
    prefix: Text placed before the zero-padded number.
    start_number: Number given to the first matching entry.
    extension: Suffix an entry must end with to be renamed; it is also
      appended to the new name.

  Raises:
    FileExistsError: If a temporary name needed during the rename is taken.
  """
  # Sort so the numbering does not depend on the arbitrary os.listdir order.
  matching = sorted(name for name in os.listdir(directory)
                    if name.endswith(extension))

  planned = []
  for offset, old_name in enumerate(matching):
    new_name = f"{prefix}{start_number + offset:03}{extension}"
    if new_name != old_name:
      planned.append((old_name, new_name))

  # Check every temporary name up front so a clash cannot leave the folder
  # half renamed.
  for _, new_name in planned:
    temp_path = os.path.join(directory, new_name + ".renaming")
    if os.path.exists(temp_path):
      raise FileExistsError(f"Temporary name already exists: {temp_path}")

  # Phase 1: park every source under a temporary name. Renaming straight to
  # the final name could overwrite a file that is still waiting for its turn
  # (e.g. when the folder already contains image_001.jpg).
  staged = []
  for old_name, new_name in planned:
    temp_path = os.path.join(directory, new_name + ".renaming")
    os.rename(os.path.join(directory, old_name), temp_path)
    staged.append((temp_path, os.path.join(directory, new_name)))

  # Phase 2: all final names are free now, so move the files into place.
  for temp_path, final_path in staged:
    os.rename(temp_path, final_path)

In [None]:
# Use the created function and rename the images
# Called with the default prefix, start number and extension, so the ".jpg"
# entries of this folder are renamed in place to image_NNN.jpg.
image_directory = "/content/drive/MyDrive/DE_CW2/Original_Images"
rename_images(image_directory)

In [None]:
# Create a function to resize images to 500x500
def resize_images_cv2(input_dir, output_dir, size=(500, 500)):
    """Resizes every .jpg image in input_dir with cv2 and saves it to output_dir.

    Args:
        input_dir: Folder that holds the source images.
        output_dir: Folder that receives the resized copies (same filenames).
            It is created when it does not exist yet.
        size: Target size, passed to cv2.resize as its `dsize` argument.

    Files that cv2 cannot read are reported and skipped.
    """
    # cv2.imwrite does not create missing folders, so make sure the target exists.
    os.makedirs(output_dir, exist_ok=True)

    for filename in os.listdir(input_dir):
        if not filename.endswith('.jpg'):
            continue

        input_filepath = os.path.join(input_dir, filename)
        output_filepath = os.path.join(output_dir, filename)

        img = cv2.imread(input_filepath)
        if img is None:
            # imread signals an unreadable file by returning None instead of raising.
            print(f"Skipping unreadable image: {input_filepath}")
            continue

        resized_img = cv2.resize(img, size)
        cv2.imwrite(output_filepath, resized_img)

In [None]:
# Using the created function to resize the images
# Reads from Original_Images and writes the resized copies to Processed_images,
# using the function's default size of (500, 500).
input_directory = "/content/drive/MyDrive/DE_CW2/Original_Images"
output_directory = "/content/drive/MyDrive/DE_CW2/Processed_images"
resize_images_cv2(input_directory, output_directory)

In [None]:
# function to reduce noise in the images
def apply_gaussian_blur(image_path, kernel_size=(5, 5), sigma=0):
  """Applies a Gaussian blur to an image and overwrites the file with the result.

  Args:
    image_path: Path of the image to blur; the blurred image is written back
      to this same path.
    kernel_size: Kernel size passed to cv2.GaussianBlur.
    sigma: Sigma value passed to cv2.GaussianBlur as its third argument.

  Raises:
    FileNotFoundError: If image_path is not an existing file.
    ValueError: If the file exists but cv2 cannot read it as an image.
    OSError: If cv2 reports that the blurred image could not be written.
  """
  if not os.path.isfile(image_path):
    raise FileNotFoundError(f"Image not found: {image_path}")

  img = cv2.imread(image_path)
  if img is None:
    # imread returns None for unreadable files instead of raising.
    raise ValueError(f"Could not read image: {image_path}")

  blurred_img = cv2.GaussianBlur(img, kernel_size, sigma)
  if not cv2.imwrite(image_path, blurred_img):
    raise OSError(f"Could not write image: {image_path}")

In [None]:
# Applying the function to reduce the noise in the resized images.
# apply_gaussian_blur overwrites each file, so running this cell again blurs
# the already blurred images once more.
for filename in os.listdir('/content/drive/MyDrive/DE_CW2/Processed_images'):
    if not filename.endswith('.jpg'):
        continue
    image_path = os.path.join('/content/drive/MyDrive/DE_CW2/Processed_images', filename)
    apply_gaussian_blur(image_path)

In [None]:
# function to rotate selected images 90 degrees clockwise
def rotate_and_replace_images(directory, image_names, angle=cv2.ROTATE_90_CLOCKWISE):
  """Rotates the named images with cv2.rotate and overwrites the originals.

  Args:
    directory: Folder that contains the images.
    image_names: Filenames (relative to `directory`) of the images to rotate.
    angle: Rotation code handed to cv2.rotate (default
      cv2.ROTATE_90_CLOCKWISE). This is a cv2 rotate flag, not a number of
      degrees.

  Raises:
    FileNotFoundError: If one of the named images does not exist.
    ValueError: If an image exists but cv2 cannot read it.
  """
  for image_name in image_names:
    image_path = os.path.join(directory, image_name)
    if not os.path.isfile(image_path):
      raise FileNotFoundError(f"Image not found: {image_path}")

    img = cv2.imread(image_path)
    if img is None:
      # imread returns None for unreadable files instead of raising.
      raise ValueError(f"Could not read image: {image_path}")

    rotated_img = cv2.rotate(img, angle)
    cv2.imwrite(image_path, rotated_img)

In [None]:
# using the created function, rotate selected images 90 degrees clockwise
# The two listed files are overwritten in place, so running this cell again
# rotates them by a further 90 degrees.
directory = "/content/drive/MyDrive/DE_CW2/Processed_images"
images_to_rotate = ["image_040.jpg", "image_044.jpg"]

rotate_and_replace_images(directory, images_to_rotate)

In [None]:
# Load the image annotations from the Excel workbook
df = pd.read_excel("/content/drive/MyDrive/DE_CW2/Image annotations.xlsx")

# Save to JSON: orient="records" writes a list with one object per row
df.to_json("/content/drive/MyDrive/DE_CW2/Image annotations.json", orient="records", indent=4)


In [None]:
from skimage.feature import graycomatrix, graycoprops
from skimage.measure import label, regionprops

def extract_features(image_path):
  """Extracts colour, texture, and shape features from an image.

  Args:
    image_path: Path of the image file to analyse.

  Returns:
    A dict with three entries:
      'colour_features': dict with 'mean_intensity', 'norm_intensity'
        (mean intensity divided by 255) and the per-channel lists
        'mean_color', 'std_color' and 'skewness_color'.
      'texture_features': list of six GLCM properties in the order contrast,
        dissimilarity, homogeneity, energy, correlation, ASM.
      'shape_features': dict with 'area', 'perimeter', 'centroid' and
        'bounding_box' of the largest external contour of the Otsu-thresholded
        image. All of them are zero when no contour is found.

  Raises:
    FileNotFoundError: If image_path is not an existing file.
    ValueError: If the file exists but cv2 cannot read it as an image.
  """
  if not os.path.isfile(image_path):
    raise FileNotFoundError(f"Image not found: {image_path}")

  # Read the image
  img = cv2.imread(image_path)
  if img is None:
    # imread returns None for unreadable files instead of raising.
    raise ValueError(f"Could not read image: {image_path}")

  # Colour Features
  # Mean intensity over all pixels and channels, plus the same value scaled by 255
  mean_intensity = np.mean(img)
  norm_intensity = mean_intensity / 255.0

  # 3 Color Moments (mean, std, skewness), reduced over rows and columns
  mean_color = np.mean(img, axis=(0, 1))
  std_color = np.std(img, axis=(0, 1))
  # A constant channel has std 0; dividing by it would give NaN skewness.
  # Its deviations are all 0 as well, so dividing by 1 yields a skewness of 0.
  safe_std = np.where(std_color > 0, std_color, 1.0)
  skewness_color = np.mean(((img - mean_color) / safe_std)**3, axis=(0, 1))

  # Collect all the colour features together
  colour_features = {
      'mean_intensity':mean_intensity,
      'norm_intensity':norm_intensity,
      'mean_color':mean_color.tolist(),
      'std_color':std_color.tolist(),
      'skewness_color':skewness_color.tolist()
  }

  # Texture Features (GLCM)
  gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
  glcm = graycomatrix(gray_img, distances=[1], angles=[0, np.pi/4, np.pi/2, 3*np.pi/4], levels=256, symmetric=True, normed=True)
  # NOTE: the matrix is built for four angles, but only entry [0, 0]
  # (distance 1, angle 0) of each property is kept.
  texture_features = np.array([
      graycoprops(glcm, prop)[0, 0]
      for prop in ('contrast', 'dissimilarity', 'homogeneity',
                   'energy', 'correlation', 'ASM')
  ])

  # Shape Features
  # Threshold the grayscale image with Otsu's method
  ret, thresh = cv2.threshold(gray_img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

  # Find contours
  contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

  if contours:
    # Get largest contour
    largest_contour = max(contours, key=cv2.contourArea)

    # Calculate shape features
    area = cv2.contourArea(largest_contour)
    perimeter = cv2.arcLength(largest_contour, True)
    M = cv2.moments(largest_contour)
    if M["m00"] != 0:
      cx = int(M["m10"] / M["m00"])
      cy = int(M["m01"] / M["m00"])
      centroid = (cx, cy)
    else:
      centroid = (0, 0)  # Assign (0, 0) if the moment is zero
    x, y, w, h = cv2.boundingRect(largest_contour)
    bounding_box = (x, y, w, h)
  else:
    # No contour was found; max() would raise on the empty sequence.
    area = 0.0
    perimeter = 0.0
    centroid = (0, 0)
    bounding_box = (0, 0, 0, 0)

  shape_features = {
      'area': area,
      'perimeter': perimeter,
      'centroid': centroid,
      'bounding_box': bounding_box
  }

  # Store features in a dictionary
  features = {
      'colour_features': colour_features,
      'texture_features': texture_features.tolist(),
      'shape_features': shape_features
  }

  return features


In [None]:
# Build one feature record per processed .jpg and write them all to a JSON file
image_directory = '/content/drive/MyDrive/DE_CW2/Processed_images'
all_features = []

for filename in os.listdir(image_directory):
  if not filename.endswith('.jpg'):
    continue
  image_path = os.path.join(image_directory, filename)
  features = extract_features(image_path)
  # The filename goes first; the three feature groups follow in a fixed order.
  ordered_features = {
      'filename': filename,
      **{group: features[group]
         for group in ('colour_features', 'texture_features', 'shape_features')}
  }
  all_features.append(ordered_features)

# Save to JSON
with open('/content/drive/MyDrive/DE_CW2/Feature_Extraction.json', 'w') as f:
  json.dump(all_features, f, indent=4)

In [None]:
# MongoDB Atlas connection string removed: it embedded a username and password. Keep credentials out of the notebook (for example, load the URI from an environment variable) and rotate the exposed password.

In [None]:
# Install pymongo
!pip install pymongo

Collecting pymongo
  Downloading pymongo-4.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Downloading pymongo-4.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dnspython-2.7.0-py3-none-any.whl (313 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.6/313.6 kB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dnspython, pymongo
Successfully installed dnspython-2.7.0 pymongo-4.10.1


In [None]:
from pymongo import MongoClient
import gridfs

In [None]:
!pip install "pymongo[srv]"



In [None]:
# Connect to MongoDB Atlas. The connection string contains the database user's
# password, so it is read from the MONGODB_URI environment variable instead of
# being written into the notebook. The password that used to be hard-coded
# here should be rotated.
mongodb_uri = os.environ.get("MONGODB_URI")
if not mongodb_uri:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection string."
    )
Image_connection = MongoClient(mongodb_uri)

In [None]:
# Select the "CBIR" database on the connected client
db = Image_connection["CBIR"]

In [None]:
# Connectivity check: listing the collections needs a working connection.
# Any failure is printed instead of being raised.
try:
  print(db.list_collection_names())
except Exception as e:
    print(f"Connection Failed: {e}")

['processed_images', 'image_annotations', 'features_extracted']


In [None]:
image_annotations_collection = db["image_annotations"]

# Load the annotation records from the JSON file (read explicitly as UTF-8 so
# decoding does not depend on the platform default).
with open('/content/drive/MyDrive/DE_CW2/Image annotations.json', encoding='utf-8') as file:
    image_data = json.load(file)

# Insert data into MongoDB collection.
# NOTE: running this cell again inserts the same records a second time.
if isinstance(image_data, list):
    if image_data:
        image_annotations_collection.insert_many(image_data)  # For a list of documents
    else:
        # insert_many() rejects an empty list, so there is nothing to do.
        print("No image annotations to insert.")
else:
    image_annotations_collection.insert_one(image_data)  # For a single document

In [None]:
extracted_features_collection = db["features_extracted"]

# Load the extracted feature records from the JSON file (read explicitly as
# UTF-8 so decoding does not depend on the platform default).
with open('/content/drive/MyDrive/DE_CW2/Feature_Extraction.json', encoding='utf-8') as file:
    image_data = json.load(file)

# Insert data into MongoDB collection.
# NOTE: running this cell again inserts the same records a second time.
if isinstance(image_data, list):
    if image_data:
        extracted_features_collection.insert_many(image_data)  # For a list of documents
    else:
        # insert_many() rejects an empty list, so there is nothing to do.
        print("No extracted features to insert.")
else:
    extracted_features_collection.insert_one(image_data)  # For a single document

In [None]:
from bson import Binary

processed_images_collection = db["processed_images"]

image_folder_path = '/content/drive/MyDrive/DE_CW2/Processed_images'

# Store every .png/.jpg file of the folder as one document holding its
# filename and its raw bytes.
for filename in os.listdir(image_folder_path):
    if not filename.endswith((".png", ".jpg")):
        continue
    with open(os.path.join(image_folder_path, filename), "rb") as image_file:
        binary_image = Binary(image_file.read())
    image_doc = {"filename": filename, "image_data": binary_image}
    processed_images_collection.insert_one(image_doc)

In [None]:
# Example lookup: fetch the stored document for a single filename
query = {'filename': "image_012.jpg"}
image_document = processed_images_collection.find_one(query)

# find_one gives back None when nothing matches the query
if not image_document:
    print("No image found with the specified criteria.")
else:
    print("Image found:", image_document["filename"])

Image found: image_012.jpg


Question 2

In [2]:
# import necessary packages
import cv2
import pandas as pd
import os
import json
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import string

In [None]:
# Download the NLTK resources: stopwords, the punkt tokenizer models
# (punkt and punkt_tab) and the WordNet corpus.
# The result of the last call is what the cell displays as its output.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Load the full movie review data set
CSV_file_path = '/content/drive/MyDrive/DE_CW2/movie.csv'
df = pd.read_csv(CSV_file_path)

# Draw a reproducible sample of 65 reviews (fixed random_state) and renumber
# its rows from 0.
df_sample = df.sample(n=65, random_state=42).reset_index(drop=True)

In [None]:
# Check which columns the sampled frame has
print(df_sample.columns)

Index(['text', 'label'], dtype='object')


In [None]:
original_text_directory = '/content/drive/MyDrive/DE_CW2/Original_Text'

# open(..., 'w') does not create missing folders, so make sure the target exists
os.makedirs(original_text_directory, exist_ok=True)

# Iterate through the subset DataFrame and save each row as a text file
for i, row in df_sample.iterrows():
    text = row['text']

    # Create a filename for each text file (the index starts at 0, files at 001)
    filename = f"Review_{i + 1:03}.txt"
    file_path = os.path.join(original_text_directory, filename)

    # Write the text to the file
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(text)

    print(f"Saved {filename}")

Saved Review_001.txt
Saved Review_002.txt
Saved Review_003.txt
Saved Review_004.txt
Saved Review_005.txt
Saved Review_006.txt
Saved Review_007.txt
Saved Review_008.txt
Saved Review_009.txt
Saved Review_010.txt
Saved Review_011.txt
Saved Review_012.txt
Saved Review_013.txt
Saved Review_014.txt
Saved Review_015.txt
Saved Review_016.txt
Saved Review_017.txt
Saved Review_018.txt
Saved Review_019.txt
Saved Review_020.txt
Saved Review_021.txt
Saved Review_022.txt
Saved Review_023.txt
Saved Review_024.txt
Saved Review_025.txt
Saved Review_026.txt
Saved Review_027.txt
Saved Review_028.txt
Saved Review_029.txt
Saved Review_030.txt
Saved Review_031.txt
Saved Review_032.txt
Saved Review_033.txt
Saved Review_034.txt
Saved Review_035.txt
Saved Review_036.txt
Saved Review_037.txt
Saved Review_038.txt
Saved Review_039.txt
Saved Review_040.txt
Saved Review_041.txt
Saved Review_042.txt
Saved Review_043.txt
Saved Review_044.txt
Saved Review_045.txt
Saved Review_046.txt
Saved Review_047.txt
Saved Review_

In [None]:
# Define a function to convert the text files to lowercase
def convert_to_lowercase(direc_in,direc_out):
  """Writes a lowercase copy of every .txt file in direc_in to direc_out.

  Args:
    direc_in: Folder that holds the source .txt files.
    direc_out: Folder that receives the lowercase copies (same filenames).
      It is created when it does not exist yet.

  Entries that do not end with '.txt' are ignored and not reported.
  """
  # open(..., 'w') does not create missing folders.
  os.makedirs(direc_out, exist_ok=True)

  # Sorted so the progress messages appear in a stable order.
  for filename in sorted(os.listdir(direc_in)):
    if not filename.endswith('.txt'):
      continue

    input_file_path = os.path.join(direc_in, filename)
    output_file_path = os.path.join(direc_out, filename)

    with open(input_file_path, 'r', encoding='utf-8') as f:
      text = f.read()
    lowercase_text = text.lower()

    with open(output_file_path, 'w', encoding='utf-8') as f:
      f.write(lowercase_text)

    # Report only files that were actually converted.
    print(f"Converted {filename} to lowercase.")

In [None]:
# Source folder with the original reviews and target folder for the
# preprocessed copies
direc_in = '/content/drive/MyDrive/DE_CW2/Original_Text'
direc_out = '/content/drive/MyDrive/DE_CW2/Preprocessed_Text'

In [None]:
# Use the defined function to apply to all the txt files:
# read from direc_in, write the lowercase copies to direc_out
convert_to_lowercase(direc_in,direc_out)

Converted .ipynb_checkpoints to lowercase.
Converted Review_001.txt to lowercase.
Converted Review_002.txt to lowercase.
Converted Review_003.txt to lowercase.
Converted Review_004.txt to lowercase.
Converted Review_005.txt to lowercase.
Converted Review_006.txt to lowercase.
Converted Review_007.txt to lowercase.
Converted Review_008.txt to lowercase.
Converted Review_009.txt to lowercase.
Converted Review_010.txt to lowercase.
Converted Review_011.txt to lowercase.
Converted Review_012.txt to lowercase.
Converted Review_013.txt to lowercase.
Converted Review_014.txt to lowercase.
Converted Review_015.txt to lowercase.
Converted Review_016.txt to lowercase.
Converted Review_017.txt to lowercase.
Converted Review_018.txt to lowercase.
Converted Review_019.txt to lowercase.
Converted Review_020.txt to lowercase.
Converted Review_021.txt to lowercase.
Converted Review_022.txt to lowercase.
Converted Review_023.txt to lowercase.
Converted Review_024.txt to lowercase.
Converted Review_025.

In [None]:
# Define a function to tokenize the text files into words
def tokenize(directory):
  """Tokenizes every .txt file in the directory in place.

  Each file is read, split with word_tokenize and rewritten with one token
  per line.

  Args:
    directory: Folder whose .txt files are rewritten.

  Entries that do not end with '.txt' are ignored and not reported. Because
  the files are overwritten, calling this again tokenizes the already
  tokenized text.
  """
  # Sorted so the progress messages appear in a stable order.
  for filename in sorted(os.listdir(directory)):
    if not filename.endswith('.txt'):
      continue

    file_path = os.path.join(directory, filename)

    with open(file_path, 'r', encoding='utf-8') as f:
      text = f.read()
    tokens = word_tokenize(text)

    with open(file_path, 'w', encoding='utf-8') as f:
      for token in tokens:
        f.write(token + '\n')

    # Report only files that were actually tokenized.
    print(f"Tokenized {filename}")

In [None]:
# Folder with the lowercased reviews; this rebinds `directory`, which earlier
# pointed at the processed images
directory = '/content/drive/MyDrive/DE_CW2/Preprocessed_Text'

In [None]:
# Use the created function to tokenize all the txt files.
# tokenize() rewrites each file in place with one token per line.
tokenize(directory)

Tokenized Review_001.txt
Tokenized Review_002.txt
Tokenized Review_003.txt
Tokenized Review_004.txt
Tokenized Review_005.txt
Tokenized Review_006.txt
Tokenized Review_007.txt
Tokenized Review_008.txt
Tokenized Review_009.txt
Tokenized Review_010.txt
Tokenized Review_011.txt
Tokenized Review_012.txt
Tokenized Review_013.txt
Tokenized Review_014.txt
Tokenized Review_015.txt
Tokenized Review_016.txt
Tokenized Review_017.txt
Tokenized Review_018.txt
Tokenized Review_019.txt
Tokenized Review_020.txt
Tokenized Review_021.txt
Tokenized Review_022.txt
Tokenized Review_023.txt
Tokenized Review_024.txt
Tokenized Review_025.txt
Tokenized Review_026.txt
Tokenized Review_027.txt
Tokenized Review_028.txt
Tokenized Review_029.txt
Tokenized Review_030.txt
Tokenized Review_031.txt
Tokenized Review_032.txt
Tokenized Review_033.txt
Tokenized Review_034.txt
Tokenized Review_035.txt
Tokenized Review_036.txt
Tokenized Review_037.txt
Tokenized Review_038.txt
Tokenized Review_039.txt
Tokenized Review_040.txt


In [None]:
# Install scikit-learn
!pip install scikit-learn==1.3.1
# Import Tfidfvectorization
from sklearn.feature_extraction.text import TfidfVectorizer



In [None]:
# Folder with the tokenized reviews, and the list that collects one string
# per review
preprocessed_text_directory = '/content/drive/MyDrive/DE_CW2/Preprocessed_Text'
reviews = []

In [None]:
# Rebuild the list from scratch so running this cell again does not append
# every review a second time.
reviews = []

# Sorted so the order of the documents (and therefore the rows of the
# vectorized matrices) is the same on every run.
for filename in sorted(os.listdir(preprocessed_text_directory)):
    if filename.endswith('.txt'):
        file_path = os.path.join(preprocessed_text_directory, filename)
        with open(file_path, 'r', encoding='utf-8') as f:
            # Read all lines in the file and join them into a single string
            reviews.append(' '.join(f.readlines()))

In [None]:
# TF-IDF with min_df=1, so a term only has to occur in one review to be kept
vectorizer = TfidfVectorizer(min_df=1) # since the data set is small, min_df is set to 1
tfidf_matrix = vectorizer.fit_transform(reviews) # Fit and transform the data

In [None]:
# Print the feature names (words) learned by the TF-IDF vectorizer
print(vectorizer.get_feature_names_out())

# Print the TF-IDF matrix as a dense array (one row per review)
print(tfidf_matrix.toarray())

['01' '10' '100' ... 'yours' 'yourself' 'zefram']
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.03374401 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [None]:
# Shape of the TF-IDF matrix: (number of reviews, number of features)
print(tfidf_matrix.shape)

(65, 3195)


In [None]:
print(tfidf_matrix.sum(axis=1))  # Sum of the TF-IDF values of each document (one row per review)


[[11.37943704]
 [13.79840774]
 [ 6.31190333]
 [ 7.65557492]
 [ 6.9614065 ]
 [ 9.85445837]
 [ 9.56791406]
 [ 8.64618497]
 [ 8.01785248]
 [ 6.144366  ]
 [ 8.77588577]
 [ 9.57821376]
 [10.53775895]
 [ 7.45743341]
 [10.77374606]
 [ 8.7914998 ]
 [ 9.09512887]
 [10.44919231]
 [ 8.07805341]
 [12.08774009]
 [11.0781233 ]
 [11.94437927]
 [ 9.53553264]
 [ 8.09523635]
 [15.96482221]
 [ 5.43457727]
 [11.23433387]
 [10.04744261]
 [10.4208438 ]
 [ 9.38788438]
 [ 8.64139868]
 [ 9.04020381]
 [ 9.13620736]
 [ 8.52839497]
 [ 9.18867236]
 [ 8.70314685]
 [10.37059432]
 [12.97585484]
 [ 9.47429022]
 [11.42745602]
 [ 8.46591606]
 [ 9.35559318]
 [ 9.6766835 ]
 [12.52670105]
 [ 7.87186087]
 [ 8.02765093]
 [ 8.27032446]
 [10.10381488]
 [10.87515753]
 [ 9.73335477]
 [ 9.57299508]
 [11.98050564]
 [11.58917451]
 [ 8.94179381]
 [ 9.39994342]
 [10.16270356]
 [10.54037689]
 [ 8.13715617]
 [ 9.4655374 ]
 [ 7.29643754]
 [13.19336598]
 [11.2500558 ]
 [ 6.44259884]
 [ 9.268586  ]
 [10.85199744]]


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag-of-words vectorizer with default settings. This rebinds `vectorizer`,
# which held the TF-IDF vectorizer until now.
vectorizer = CountVectorizer()

In [None]:
# Learn the vocabulary from the reviews
vectorizer.fit(reviews)

In [None]:
# Count the vocabulary terms in every review
bow_matrix = vectorizer.transform(reviews)

In [None]:
# Show the matrix in its sparse form: one "(review, term index)  count" entry
# per non-zero cell
print(bow_matrix)

  (0, 50)	1
  (0, 65)	1
  (0, 69)	2
  (0, 120)	5
  (0, 141)	1
  (0, 142)	1
  (0, 147)	1
  (0, 150)	12
  (0, 183)	1
  (0, 188)	4
  (0, 194)	1
  (0, 198)	2
  (0, 209)	2
  (0, 259)	3
  (0, 266)	1
  (0, 273)	1
  (0, 288)	1
  (0, 309)	1
  (0, 321)	1
  (0, 345)	1
  (0, 398)	2
  (0, 446)	1
  (0, 467)	2
  (0, 474)	3
  (0, 475)	1
  :	:
  (64, 2721)	1
  (64, 2795)	1
  (64, 2805)	1
  (64, 2809)	15
  (64, 2815)	1
  (64, 2828)	1
  (64, 2833)	2
  (64, 2837)	1
  (64, 2870)	4
  (64, 2990)	1
  (64, 3009)	1
  (64, 3023)	1
  (64, 3064)	1
  (64, 3068)	2
  (64, 3073)	1
  (64, 3077)	1
  (64, 3096)	1
  (64, 3102)	1
  (64, 3113)	2
  (64, 3124)	2
  (64, 3138)	2
  (64, 3165)	1
  (64, 3171)	1
  (64, 3172)	1
  (64, 3186)	4


In [None]:
# Vocabulary of the bag-of-words vectorizer
print("Feature Names (Words):", vectorizer.get_feature_names_out())

Feature Names (Words): ['01' '10' '100' ... 'yours' 'yourself' 'zefram']


In [None]:
# Convert the sparse matrix to a dense array for inspection (one row per review)
print("Bag of Words Matrix:\n", bow_matrix.toarray())

Bag of Words Matrix:
 [[0 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [None]:
# Creating the metadata file about the vectorization process.
# The TF-IDF parameters mirror the call TfidfVectorizer(min_df=1): no n-gram
# range is passed there, so the default unigram range [1, 1] applies (the
# earlier value [1, 2] did not match the vectorizer that was actually used).
vectorization_metadata = {
    "dataset": "IMDb Movie Reviews (Processed)",
    "preprocessing": [
        "Lowercased all text",
        "Tokenized into words"
    ],
    "vectorization_methods": [
        {
            "method": "TF-IDF",
            "parameters": {
                "max_features": "No limit",
                "ngram_range": [1, 1],
                "min_df": 1
            }
        },
        {
            "method": "Bag of Words",
            "parameters": {
                "max_features": "No limit",
                "ngram_range": [1, 1]
            }
        }
    ]
}

# Save to JSON file (written relative to the current working directory)
with open("vectorization_metadata.json", "w", encoding="utf-8") as f:
    json.dump(vectorization_metadata, f, indent=4)

print("Metadata saved to vectorization_metadata.json!")


In [None]:
# Metadata describing how the review texts were preprocessed and vectorized.
# Built with dict() calls; the keys keep the order in which they are listed.
vectorization_metadata = dict(
    dataset="IMDb Movie Reviews (Seperated Into Individual Text Files)",
    preprocessing=[
        dict(
            method="Lowercased all text",
            description="Converting all text to lowercase to ensure that words are treated the same regardless of their capitalization.",
        ),
        dict(
            method="Tokenized into words",
            description="Splitting text into individual words.",
        ),
    ],
    vectorization_methods=[
        dict(
            method="TF-IDF",
            description="TF-IDF (Term Frequency-Inverse Document Frequency) assigns weights to words based on their frequency within a document and across the entire corpus.",
            parameters=dict(min_df=1),
        ),
        dict(
            method="Bag of Words",
            description="Bag of Words represents text as a collection of words and their frequencies, ignoring grammar and word order. It creates a numerical representation of text based on the presence and count of words.",
        ),
    ],
)

In [None]:
# Target file for the vectorization metadata
file_path = "/content/drive/MyDrive/DE_CW2/vectorization_metadata.json"
# Save to JSON file (pretty-printed with a 4-space indent)
with open(file_path, "w") as f:
    json.dump(vectorization_metadata, f, indent=4)

Metadata saved to vectorization_metadata.json!


In [3]:
# Load the movie review sentiment labels from the Excel workbook
# NOTE(review): the workbook name spells "Sentimnet" while the JSON written
# below spells "Sentiment" - confirm the name of the file on disk.
df = pd.read_excel("/content/drive/MyDrive/DE_CW2/Movie Review Sentimnet Labels.xlsx")

# Save to JSON: orient="records" writes a list with one object per row
df.to_json("/content/drive/MyDrive/DE_CW2/Movie Review Sentiment Labels.json", orient="records", indent=4)