In [2]:
import os
import shutil

In [3]:
def move_jpg_files(source_dir, destination_dir):
    """
    Move all .jpg files from source_dir and its subdirectories to destination_dir.

    Parameters:
    source_dir (str): The directory to search for .jpg files.
    destination_dir (str): The directory where .jpg files will be moved.
    """
    if not os.path.exists(destination_dir):
        os.makedirs(destination_dir)

    for root, dirs, files in os.walk(source_dir):
        for file in files:
            if file.lower().endswith(".jpg"):
                file_path = os.path.join(root, file)
                shutil.move(file_path, os.path.join(destination_dir, file))

In [8]:
# Example usage
source_directory = r"D:\Neuflo\Neuflo Learn Latest\neuflo dataset\neuflo dataset\finalimages\Phys"
destination_directory = r"D:\Neuflo\Neuflo Learn Latest\neuflo dataset\neuflo dataset\finalimages\Phys"
move_jpg_files(source_directory, destination_directory)

In [9]:
import os

def delete_empty_subfolders(directory):
    """
    Delete all empty subfolders in the given directory.

    Parameters:
    directory (str): The root directory to search for empty subfolders.
    """
    for root, dirs, files in os.walk(directory, topdown=False):
        for dir in dirs:
            dir_path = os.path.join(root, dir)
            if not os.listdir(dir_path):  # Check if the folder is empty
                os.rmdir(dir_path)
                print(f"Deleted empty folder: {dir_path}")

# Example usage
directory_path = r"D:\Neuflo\Neuflo Learn Latest\neuflo dataset\neuflo dataset\finalimages\Phys"
delete_empty_subfolders(directory_path)


Deleted empty folder: D:\Neuflo\Neuflo Learn Latest\neuflo dataset\neuflo dataset\finalimages\Phys\Chapter_1
Deleted empty folder: D:\Neuflo\Neuflo Learn Latest\neuflo dataset\neuflo dataset\finalimages\Phys\Chapter_10
Deleted empty folder: D:\Neuflo\Neuflo Learn Latest\neuflo dataset\neuflo dataset\finalimages\Phys\Chapter_11
Deleted empty folder: D:\Neuflo\Neuflo Learn Latest\neuflo dataset\neuflo dataset\finalimages\Phys\Chapter_12
Deleted empty folder: D:\Neuflo\Neuflo Learn Latest\neuflo dataset\neuflo dataset\finalimages\Phys\Chapter_13
Deleted empty folder: D:\Neuflo\Neuflo Learn Latest\neuflo dataset\neuflo dataset\finalimages\Phys\Chapter_14
Deleted empty folder: D:\Neuflo\Neuflo Learn Latest\neuflo dataset\neuflo dataset\finalimages\Phys\Chapter_15
Deleted empty folder: D:\Neuflo\Neuflo Learn Latest\neuflo dataset\neuflo dataset\finalimages\Phys\Chapter_16
Deleted empty folder: D:\Neuflo\Neuflo Learn Latest\neuflo dataset\neuflo dataset\finalimages\Phys\Chapter_17
Deleted emp

In [10]:
import os

def list_files(directory):
    """
    List all files in the given directory.

    Parameters:
    directory (str): The directory to list files from.
    """
    files = [file for file in os.listdir(directory) 
             if os.path.isfile(os.path.join(directory, file))]
    return files

In [11]:
# Example usage
directory_path = r"D:\Neuflo\Neuflo Learn Latest\neuflo dataset\neuflo dataset\finalimages\Phys"
files = list_files(directory_path)
for file in files:
    print(file)

PHYS_10_10_EXP.jpg
PHYS_10_12_QUE.jpg
PHYS_10_14_EXP.jpg
PHYS_10_22_OptionA.jpg
PHYS_10_22_OptionB.jpg
PHYS_10_22_OptionC.jpg
PHYS_10_22_OptionD.jpg
PHYS_10_23_EXP.jpg
PHYS_11_11_QUE.jpg
PHYS_11_12_QUE.jpg
PHYS_11_13_EXP.jpg
PHYS_11_14_QUE.jpg
PHYS_11_15_EXP.jpg
PHYS_11_17_QUE.jpg
PHYS_11_18_QUE.jpg
PHYS_11_21_QUE.jpg
PHYS_11_22_OptionA.jpg
PHYS_11_22_OptionB.jpg
PHYS_11_22_OptionC.jpg
PHYS_11_22_OptionD.jpg
PHYS_11_35_QUE.jpg
PHYS_11_36_QUE.jpg
PHYS_11_3_QUE.jpg
PHYS_11_4_QUE.jpg
PHYS_12_25_QUE.jpg
PHYS_12_3_EXP.jpg
PHYS_12_3_QUE.jpg
PHYS_13_10_OptionA.jpg
PHYS_13_10_OptionB.jpg
PHYS_13_10_OptionC.jpg
PHYS_13_10_OptionD.jpg
PHYS_13_10_QUE.jpg
PHYS_13_12_EXP.jpg
PHYS_13_29_OptionA.jpg
PHYS_13_29_OptionB.jpg
PHYS_13_29_OptionC.jpg
PHYS_13_29_OptionD.jpg
PHYS_13_2_EXP.jpg
PHYS_13_2_QUE.jpg
PHYS_13_37_EXP.jpg
PHYS_13_48_QUE.jpg
PHYS_13_4_EXP.jpg
PHYS_13_52_QUE.jpg
PHYS_13_7_OptionA.jpg
PHYS_13_7_OptionB.jpg
PHYS_13_7_OptionC.jpg
PHYS_13_7_OptionD.jpg
PHYS_14_43_EXP.jpg
PHYS_14_45_EXP.jpg


In [15]:
import re
directory_path = r"D:\Neuflo\Neuflo Learn Latest\neuflo dataset\neuflo dataset\finalimages\BIO"
files = list_files(directory_path)

def is_valid_filename(filename):
    # Pattern: Subject (uppercase), followed by 1 or 2 sets of numbers, content type, and '.jpg'
    pattern = r'^[A-Z]+_\d+(_\d+)?_(QUE|EXP|Option[A-D])\.jpg$'
    return re.match(pattern, filename) is not None

def find_anomalies(filenames):
    anomalies = [filename for filename in filenames if not is_valid_filename(filename)]
    return anomalies

anomalies = find_anomalies(files)

print("Anomalies detected:")
for anomaly in anomalies:
    print(anomaly)


Anomalies detected:


In [20]:
!pip install azure-storage-blob openpyxl


Collecting azure-storage-blob
  Downloading azure_storage_blob-12.19.0-py3-none-any.whl.metadata (26 kB)
Collecting azure-core<2.0.0,>=1.28.0 (from azure-storage-blob)
  Downloading azure_core-1.29.6-py3-none-any.whl.metadata (36 kB)
Collecting cryptography>=2.1.4 (from azure-storage-blob)
  Downloading cryptography-41.0.7-cp37-abi3-win_amd64.whl.metadata (5.3 kB)
Collecting isodate>=0.6.1 (from azure-storage-blob)
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
     ---------------------------------------- 0.0/41.7 kB ? eta -:--:--
     ---------------------------------------- 41.7/41.7 kB ? eta 0:00:00
Downloading azure_storage_blob-12.19.0-py3-none-any.whl (394 kB)
   ---------------------------------------- 0.0/394.2 kB ? eta -:--:--
   ------- -------------------------------- 71.7/394.2 kB 1.9 MB/s eta 0:00:01
   -------------------- ------------------- 204.8/394.2 kB 2.5 MB/s eta 0:00:01
   ---------------------------------------  389.1/394.2 kB 2.7 MB/s eta 0:00:01
   -


[notice] A new release of pip is available: 23.3.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
from azure.storage.blob import BlobServiceClient
import os
import pandas as pd

ModuleNotFoundError: No module named 'azure'

In [31]:
def upload_images_to_blob(folder_path, account_name, account_key, container_name, output_excel, folder_name):
    # Create a BlobServiceClient
    blob_service_client = BlobServiceClient(account_url=f"https://{account_name}.blob.core.windows.net", credential=account_key)
    container_client = blob_service_client.get_container_client(container_name)

    # Prepare DataFrame to store file names and URLs
    df = pd.DataFrame(columns=["File_Name", "Blob_URL"])

    # Upload each file in the folder
    for file_name in os.listdir(folder_path):
        if file_name.lower().endswith('.jpg'):  # Check if the file is an image
            # Define the blob name with the folder prefix
            blob_name = f"{folder_name}/{file_name}"
            blob_client = container_client.get_blob_client(blob_name)
            file_path = os.path.join(folder_path, file_name)

            # Upload the file
            with open(file_path, "rb") as data:
                blob_client.upload_blob(data)

            # Add file name and URL to the DataFrame
            blob_url = f"https://{account_name}.blob.core.windows.net/{container_name}/{blob_name}"
            df.loc[len(df)] = [file_name, blob_url]

    # Save the DataFrame to an Excel file
    df.to_excel(output_excel, index=False)

In [17]:
sub = "Physics"

# Example usage
upload_images_to_blob(
    folder_path=r"D:\Neuflo\Neuflo Learn Latest\neuflo dataset\neuflo dataset\finalimages\Phys",
    account_name="neuflolearndb",
    account_key="ZfSTvSwmOxpRbL1B2Mn2PzW0YJMK6bLNpnYqemW+gTHfJgmS1KJgn5hPyTf0pf6ZFOAi1180STNQ+AStB9WHrw==",
    container_name="neetimages",
    output_excel=f"{sub}.xlsx",
    folder_name=f"{sub}"
)

NameError: name 'upload_images_to_blob' is not defined

# **Uploading Images to Blob and Creating Excel Sheets**

In [20]:
import os
import pandas as pd
import psycopg2
from azure.storage.blob import BlobServiceClient

In [21]:
def create_connection():
    try:
        conn = psycopg2.connect(
            host="20.244.33.58",
            database="neuflolearndb",
            user="neufloneet",
            password="LearnNEET321",
            port="5432"
        )
        return conn
    except (Exception, psycopg2.DatabaseError) as error:
        print(f"Connection error: {error}")
        return None

In [22]:
def get_question_id(conn, chapter_number, question_no, subject_id):
    try:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT Q.QuestionID
                FROM Questions Q
                JOIN Chapters C ON Q.ChapterID = C.ChapterID
                WHERE C.ChapterNumber = %s
                AND Q.QuestionNo = %s
                AND C.SubjectID = %s
            """, (chapter_number, question_no, subject_id))
            result = cur.fetchone()
            return result[0] if result else None
    except Exception as e:
        print(f"Error in get_question_id: {e}")
        return None

In [23]:
def upload_images_to_blob(folder_path, account_name, account_key, container_name, output_excel, folder_name):
    conn = create_connection()
    if not conn:
        print("Failed to create database connection.")
        return

    blob_service_client = BlobServiceClient(account_url=f"https://{account_name}.blob.core.windows.net", credential=account_key)
    container_client = blob_service_client.get_container_client(container_name)

    df = pd.DataFrame(columns=["QuestionID", "ImageURL", "ContentType"])

    for file_name in os.listdir(folder_path):
        if file_name.lower().endswith('.jpg'):
            parts = file_name.split('_')
            chapter_title, chapter_number, question_no, content_type = parts[0], parts[1], parts[2], parts[3].split('.')[0]

            # Map abbreviated chapter title to SubjectID
            subject_id_map = {'PHYS': 1, 'CHEM': 2, 'BIO': 3}
            subject_id = subject_id_map.get(chapter_title.upper())

            question_id = get_question_id(conn, chapter_number, question_no, subject_id) if subject_id else None

            blob_name = f"{folder_name}/{file_name}"
            blob_client = container_client.get_blob_client(blob_name)
            file_path = os.path.join(folder_path, file_name)

            with open(file_path, "rb") as data:
                blob_client.upload_blob(data)

            blob_url = f"https://{account_name}.blob.core.windows.net/{container_name}/{blob_name}"
            df.loc[len(df)] = [question_id, blob_url, content_type]

    df.to_excel(output_excel, index=False)

    if conn:
        conn.commit()
        conn.close()

In [24]:
sub = "Physics"

# Example usage
upload_images_to_blob(
    folder_path=r"D:\Neuflo\Neuflo Learn Latest\neuflo dataset\neuflo dataset\finalimages\Phys",
    account_name="neuflolearndb",
    account_key="Jx9zzez89il1XIYC2wT5N6s+mm1gJ6QRDBfgtxmU3DDXpIUasBCJSSRKL+HyCeQBqGzZljW4LxH5+ASt7S0Hbw==",  # Replace with your actual account key
    container_name="neetimages",
    output_excel=f"{sub}.xlsx",
    folder_name=f"{sub}"
)

In [25]:
sub = "Chemistry"

# Example usage
upload_images_to_blob(
    folder_path=r"D:\Neuflo\Neuflo Learn Latest\neuflo dataset\neuflo dataset\finalimages\CHEM",
    account_name="neuflolearndb",
    account_key="Jx9zzez89il1XIYC2wT5N6s+mm1gJ6QRDBfgtxmU3DDXpIUasBCJSSRKL+HyCeQBqGzZljW4LxH5+ASt7S0Hbw==",  # Replace with your actual account key
    container_name="neetimages",
    output_excel=f"{sub}.xlsx",
    folder_name=f"{sub}"
)

In [26]:
sub = "Biology"

# Example usage
upload_images_to_blob(
    folder_path=r"D:\Neuflo\Neuflo Learn Latest\neuflo dataset\neuflo dataset\finalimages\BIO",
    account_name="neuflolearndb",
    account_key="Jx9zzez89il1XIYC2wT5N6s+mm1gJ6QRDBfgtxmU3DDXpIUasBCJSSRKL+HyCeQBqGzZljW4LxH5+ASt7S0Hbw==",  # Replace with your actual account key
    container_name="neetimages",
    output_excel=f"{sub}.xlsx",
    folder_name=f"{sub}"
)

# **Uploading Image data to database**

In [27]:
import pandas as pd
import psycopg2

In [28]:
def create_connection():
    try:
        conn = psycopg2.connect(
            host="20.244.33.58",
            database="neuflolearndb",
            user="neufloneet",
            password="LearnNEET321",
            port="5432"
        )
        return conn
    except (Exception, psycopg2.DatabaseError) as error:
        print(f"Connection error: {error}")
        return None

In [29]:
def upload_data_to_images_table(excel_file):
    # Read the Excel file
    df = pd.read_excel(excel_file)

    # Create a database connection
    conn = create_connection()
    if conn is None:
        print("Failed to connect to the database.")
        return

    try:
        with conn.cursor() as cur:
            for _, row in df.iterrows():
                cur.execute(
                    "INSERT INTO Images (QuestionID, ImageURL, ContentType) VALUES (%s, %s, %s)",
                    (row['QuestionID'], row['ImageURL'], row['ContentType'])
                )
            conn.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        print(f"Error inserting data: {error}")
        conn.rollback()
    finally:
        if conn:
            conn.commit()
            conn.close()

In [30]:
upload_data_to_images_table(r"D:\Neuflo\Neuflo Learn Latest\neuflo dataset\Physics.xlsx")

In [31]:
upload_data_to_images_table(r"D:\Neuflo\Neuflo Learn Latest\neuflo dataset\Chemistry.xlsx")

In [32]:
upload_data_to_images_table(r"D:\Neuflo\Neuflo Learn Latest\neuflo dataset\Biology.xlsx")

**setting hasimage to true**

In [33]:
def update_questions_has_image():
    conn = create_connection()
    if conn is None:
        print("Failed to connect to the database.")
        return

    try:
        with conn.cursor() as cur:
            cur.execute("""
                UPDATE Questions
                SET HasImage = TRUE
                WHERE QuestionID IN (
                    SELECT DISTINCT QuestionID
                    FROM Images
                )
            """)
            conn.commit()
            print("Updated 'HasImage' status in 'Questions' table.")
    except (Exception, psycopg2.DatabaseError) as error:
        print(f"Error updating data: {error}")
        conn.rollback()
    finally:
        if conn:
            conn.commit()
            conn.close()

# Example usage
update_questions_has_image()

Updated 'HasImage' status in 'Questions' table.


In [7]:
from azure.storage.blob import BlobServiceClient, ContentSettings

def update_blob_content_type(account_name, account_key, container_name, folder_prefix):
    # Create a BlobServiceClient
    blob_service_client = BlobServiceClient(account_url=f"https://{account_name}.blob.core.windows.net", credential=account_key)
    container_client = blob_service_client.get_container_client(container_name)

    # List all blobs in the folder and update their Content-Type
    blob_list = container_client.list_blobs(name_starts_with=folder_prefix)
    for blob in blob_list:
        if blob.name.lower().endswith('.jpg'):
            blob_client = container_client.get_blob_client(blob.name)
            # Fetch existing blob properties
            props = blob_client.get_blob_properties()
            # Update Content-Type if necessary
            if props.content_settings.content_type != 'image/jpeg':
                print(f"Updating Content-Type for: {blob.name}")
                blob_client.set_http_headers(content_settings=ContentSettings(content_type='image/jpeg'))
            else:
                print(f"Already set to image/jpeg: {blob.name}")

# Call the function for each folder
folders = ["Biology", "Chemistry", "Physics"]
for folder in folders:
    print(f"Processing folder: {folder}")
    update_blob_content_type(
        account_name="neuflolearndb",
        account_key="Jx9zzez89il1XIYC2wT5N6s+mm1gJ6QRDBfgtxmU3DDXpIUasBCJSSRKL+HyCeQBqGzZljW4LxH5+ASt7S0Hbw==",
        container_name="neetimages",
        folder_prefix=folder
    )


Processing folder: Biology
Updating Content-Type for: Biology/BIO_10_21_QUE.jpg
Updating Content-Type for: Biology/BIO_10_24_QUE.jpg
Updating Content-Type for: Biology/BIO_10_28_QUE.jpg
Updating Content-Type for: Biology/BIO_10_53_QUE.jpg
Updating Content-Type for: Biology/BIO_10_9_QUE.jpg
Updating Content-Type for: Biology/BIO_12_15_EXP.jpg
Updating Content-Type for: Biology/BIO_12_1_QUE.jpg
Updating Content-Type for: Biology/BIO_12_33_EXP.jpg
Updating Content-Type for: Biology/BIO_14_2_QUE.jpg
Updating Content-Type for: Biology/BIO_16_29_QUE.jpg
Updating Content-Type for: Biology/BIO_16_37_QUE.jpg
Updating Content-Type for: Biology/BIO_17_22_QUE.jpg
Updating Content-Type for: Biology/BIO_17_23_QUE.jpg
Updating Content-Type for: Biology/BIO_17_33_EXP.jpg
Updating Content-Type for: Biology/BIO_17_7_QUE.jpg
Updating Content-Type for: Biology/BIO_18_11_EXP.jpg
Updating Content-Type for: Biology/BIO_18_1_QUE.jpg
Updating Content-Type for: Biology/BIO_18_2_QUE.jpg
Updating Content-Type for