In [3]:
# Haris Corner
import os
import cv2
import numpy as np
import pandas as pd

def detect_and_mark_corners(image, image_path):
    size = (128, 128)
    # Resize image to 512 x 512
    resized_image = cv2.resize(image, size)

    # Convert the resized image to grayscale
    gray = cv2.cvtColor(resized_image, cv2.COLOR_BGR2GRAY)

    # Detect corners using the Harris corner detection algorithm
    corners = cv2.cornerHarris(gray, 2, 3, 0.04)

    # Threshold the corner response to identify strong corners
    threshold = 0.01 * corners.max()
    corner_matrix = np.zeros(size, dtype=np.uint8)
    corner_matrix[corners > threshold] = 1
    
    # Flatten the corner matrix to a one-dimensional array
    corner_array = corner_matrix.flatten()
    
    # Create a list with the image path and corner values
    lst = [image_path] + list(corner_array)
#     print(len(lst))
    return lst

# Function to extract features for an image
def extract_features(image_path):
    image = cv2.imread(image_path) 
    features = detect_and_mark_corners(image, image_path)
    return features

# Main code
parent_folder_path = '/kaggle/input/image-feature-extraction-project/ProjectData'
output_path = '/kaggle/working/haris_corner_output.csv'
all_features = []

# Get a list of subfolders in the parent folder
subfolders = [subfolder for subfolder in os.listdir(parent_folder_path) if os.path.isdir(os.path.join(parent_folder_path, subfolder))]

# Iterate over each subfolder
for subfolder in subfolders:
    # Construct the path to the current subfolder
    subfolder_path = os.path.join(parent_folder_path, subfolder)
    
    # Construct the path to the folder containing images within the current subfolder
    folder_path = os.path.join(subfolder_path, 'img')
    
    # List all files in the folder
    file_list = os.listdir(folder_path)

    # Iterate over each file in the folder
    for file_name in file_list:
        # Construct the image path
        image_path = os.path.join(folder_path, file_name)

        # Extract features for the current image
        image_features = extract_features(image_path)

        # Append the features to the list
        all_features.append(image_features)

# Create a DataFrame from the list of features
df = pd.DataFrame(all_features)

# Save the DataFrame to a CSV file
df.to_csv(output_path, index=False)


In [4]:
# Canny Edge Detection
import os
import cv2
import numpy as np
import pandas as pd 

def detect_and_mark_edges(image, image_path):
    size = (128, 128)
    # Resize image to 128 x 128
    resized_image = cv2.resize(image, size)

    # Convert the resized image to grayscale
    gray = cv2.cvtColor(resized_image, cv2.COLOR_BGR2GRAY)

    # Detect edges using the Canny edge detection algorithm
    edges = cv2.Canny(gray, 0, 255)

    # Create a one-dimensional matrix to mark edges
    edge_matrix = np.zeros(size, dtype=np.uint8)
    edge_matrix[edges != 0] = 1

    # Flatten the edge matrix to a one-dimensional array
    edge_array = edge_matrix.flatten()

    # Create a list with the image path and edge values
    lst = [image_path] + list(edge_array)

    return lst

# Function to extract features for an image
def extract_features(image_path):
    image = cv2.imread(image_path)  
    edge_features = detect_and_mark_edges(image, image_path) 
    return edge_features

# Main code
parent_folder_path = '/kaggle/input/image-feature-extraction-project/ProjectData'
output_path = '/kaggle/working/edge_output.csv'
all_features = []

# Get a list of subfolders in the parent folder
subfolders = [subfolder for subfolder in os.listdir(parent_folder_path) if os.path.isdir(os.path.join(parent_folder_path, subfolder))]

# Iterate over each subfolder
for subfolder in subfolders:
    # Construct the path to the current subfolder
    subfolder_path = os.path.join(parent_folder_path, subfolder)
    
    # Construct the path to the folder containing images within the current subfolder
    folder_path = os.path.join(subfolder_path, 'img')
    
    # List all files in the folder
    file_list = os.listdir(folder_path)

    # Iterate over each file in the folder
    for file_name in file_list:
        # Construct the image path
        image_path = os.path.join(folder_path, file_name)

        # Extract features for the current image
        image_features = extract_features(image_path)

        # Append the features to the list
        all_features.append(image_features)

# Create a DataFrame from the list of features
df = pd.DataFrame(all_features)

# Save the DataFrame to a CSV file
df.to_csv(output_path, index=False)


# ADDING MAJORITY VOTE TO CSV 

In [5]:
# Animal

file_names = ['/kaggle/working/haris_corner_output.csv', '/kaggle/working/edge_output.csv']
df = pd.read_csv('/kaggle/input/image-feature-extraction-project/ProjectData/AnimalLabels.csv')
for file_name in file_names :
    df2 = pd.read_csv(file_name)

    # Iterate over each column in the first index (column 0)
    for index, column_value in df.iloc[:, 0].items():
        # Extract the desired parts from the input string
        subfolder = int(column_value.split("-")[0].split("_")[1])
        image_name = "s" + str(int(column_value.split("-")[2].split("_")[1])) + ".jpg"

        # Calculate majority voting on columns A1 to A7 
        majority_vote = df.iloc[index, 1:8].value_counts().idxmax()

        # Construct the expected string
        expected_string = f"/kaggle/input/image-feature-extraction-project/ProjectData/s{subfolder}/img/{image_name}"

        # Check if the expected string is present in column index 0 of the second CSV file
        is_present = expected_string in df2.iloc[:, 0].values

        if is_present: 
            # Find the index where the expected string is present in df2
            idx = df2.index[df2.iloc[:, 0] == expected_string].tolist()[0]

            # Update the 'Animal' column in df2 with the majority_vote value
            df2.loc[idx, 'Animal'] = int(majority_vote)

    df2['Animal'] = df2['Animal'].astype(int)
    # Save the updated df2 to a CSV file
    # df2.to_csv('/kaggle/working/haris_corner_output.csv', index=False)
    df2.to_csv(file_name, index=False)
    print('Completed', file_name)


Completed /kaggle/working/haris_corner_output.csv
Completed /kaggle/working/edge_output.csv


In [6]:
# Mythological_Full
df = pd.read_csv('/kaggle/input/image-feature-extraction-project/ProjectData/MythologicalLabels.csv')
for file_name in file_names :
    df2 = pd.read_csv(file_name)
    # Iterate over each column in the first index (column 0)
    for index, column_value in df.iloc[:, 0].items():
        # Extract the desired parts from the input string
        subfolder = int(column_value.split("-")[0].split("_")[1])
        image_name = "s" + str(int(column_value.split("-")[2].split("_")[1])) + ".jpg"

        # Calculate majority voting on columns A1 to A7 
        majority_vote = df.iloc[index, 1:8].value_counts().idxmax()

        # Construct the expected string
        expected_string = f"/kaggle/input/image-feature-extraction-project/ProjectData/s{subfolder}/img/{image_name}"

        # Check if the expected string is present in column index 0 of the second CSV file
        is_present = expected_string in df2.iloc[:, 0].values

        if is_present: 
            # Find the index where the expected string is present in df2
            idx = df2.index[df2.iloc[:, 0] == expected_string].tolist()[0]

            # Update the 'Animal' column in df2 with the majority_vote value
            df2.loc[idx, 'Mythological'] = int(majority_vote)

    df2['Mythological'] = df2['Mythological'].astype(int)
    # Save the updated df2 to a CSV file
    # df2.to_csv('/kaggle/working/haris_corner_output.csv', index=False)
    df2.to_csv(file_name, index=False)
    print('Completed', file_name)

Completed /kaggle/working/haris_corner_output.csv
Completed /kaggle/working/edge_output.csv


In [7]:
# Mythological_A2_A3_A5
df = pd.read_csv('/kaggle/input/image-feature-extraction-project/ProjectData/MythologicalLabels.csv')
for file_name in file_names :
    df2 = pd.read_csv(file_name)
    # Iterate over each column in the first index (column 0)
    for index, column_value in df.iloc[:, 0].items():
        # Extract the desired parts from the input string
        subfolder = int(column_value.split("-")[0].split("_")[1])
        image_name = "s" + str(int(column_value.split("-")[2].split("_")[1])) + ".jpg"

        # Calculate majority voting on columns A2, A3, and A5
        columns_to_consider = ['A2', 'A3', 'A5']
        majority_vote = df.loc[index, columns_to_consider].value_counts().idxmax()

        # Construct the expected string
        expected_string = f"/kaggle/input/image-feature-extraction-project/ProjectData/s{subfolder}/img/{image_name}"

        # Check if the expected string is present in column index 0 of the second CSV file
        is_present = expected_string in df2.iloc[:, 0].values

        if is_present: 
            # Find the index where the expected string is present in df2
            idx = df2.index[df2.iloc[:, 0] == expected_string].tolist()[0]

            # Update the 'Animal' column in df2 with the majority_vote value
            df2.loc[idx, 'Mythological_A2_A3_A5'] = int(majority_vote)

    df2['Mythological_A2_A3_A5'] = df2['Mythological_A2_A3_A5'].astype(int)
    # Save the updated df2 to a CSV file
    # df2.to_csv('/kaggle/working/haris_corner_output.csv', index=False)
    df2.to_csv(file_name, index=False)
    print('Completed', file_name)

Completed /kaggle/working/haris_corner_output.csv
Completed /kaggle/working/edge_output.csv


In [8]:
# Mythological_A6_A7
df = pd.read_csv('/kaggle/input/image-feature-extraction-project/ProjectData/MythologicalLabels.csv')
for file_name in file_names :
    df2 = pd.read_csv(file_name)
    # Iterate over each column in the first index (column 0)
    for index, column_value in df.iloc[:, 0].items():
        # Extract the desired parts from the input string
        subfolder = int(column_value.split("-")[0].split("_")[1])
        image_name = "s" + str(int(column_value.split("-")[2].split("_")[1])) + ".jpg"

        # Calculate majority voting on columns A2, A3, and A5
        columns_to_consider = ['A6', 'A7']
        majority_vote = df.loc[index, columns_to_consider].value_counts().idxmax()

        # Construct the expected string
        expected_string = f"/kaggle/input/image-feature-extraction-project/ProjectData/s{subfolder}/img/{image_name}"

        # Check if the expected string is present in column index 0 of the second CSV file
        is_present = expected_string in df2.iloc[:, 0].values

        if is_present: 
            # Find the index where the expected string is present in df2
            idx = df2.index[df2.iloc[:, 0] == expected_string].tolist()[0]

            # Update the 'Animal' column in df2 with the majority_vote value
            df2.loc[idx, 'Mythological_A6_A7'] = int(majority_vote)

    df2['Mythological_A6_A7'] = df2['Mythological_A6_A7'].astype(int)
    # Save the updated df2 to a CSV file
    # df2.to_csv('/kaggle/working/haris_corner_output.csv', index=False)
    df2.to_csv(file_name, index=False)
    print('Completed', file_name)

Completed /kaggle/working/haris_corner_output.csv
Completed /kaggle/working/edge_output.csv


In [9]:
# Tree 
df = pd.read_csv('/kaggle/input/image-feature-extraction-project/ProjectData/TreeLabels.csv')
for file_name in file_names :
    df2 = pd.read_csv(file_name)
    # Iterate over each column in the first index (column 0)
    for index, column_value in df.iloc[:, 0].items():
        # Extract the desired parts from the input string
        subfolder = int(column_value.split("-")[0].split("_")[1])
        image_name = "s" + str(int(column_value.split("-")[2].split("_")[1])) + ".jpg"

        # Calculate majority voting on columns A1 to A7 
        majority_vote = df.iloc[index, 1:8].value_counts().idxmax()

        # Construct the expected string
        expected_string = f"/kaggle/input/image-feature-extraction-project/ProjectData/s{subfolder}/img/{image_name}"

        # Check if the expected string is present in column index 0 of the second CSV file
        is_present = expected_string in df2.iloc[:, 0].values

        if is_present: 
            # Find the index where the expected string is present in df2
            idx = df2.index[df2.iloc[:, 0] == expected_string].tolist()[0]

            # Update the 'Animal' column in df2 with the majority_vote value
            df2.loc[idx, 'Tree'] = int(majority_vote)

    df2['Tree'] = df2['Tree'].astype(int)
    # Save the updated df2 to a CSV file
    # df2.to_csv('/kaggle/working/haris_corner_output.csv', index=False)
    df2.to_csv(file_name, index=False)
    print('Completed', file_name)

Completed /kaggle/working/haris_corner_output.csv
Completed /kaggle/working/edge_output.csv


In [10]:
import pandas as pd

# Read the two files into DataFrames
file1_path = '/kaggle/working/haris_corner_output.csv'
file2_path = '/kaggle/working/edge_output.csv'

df1 = pd.read_csv(file1_path)
df2 = pd.read_csv(file2_path)

# Select the desired columns for comparison
columns_to_compare = ['Animal', 'Mythological', 'Tree', 'Mythological_A2_A3_A5', 'Mythological_A6_A7']

# Iterate over the columns and compare the values
for column in columns_to_compare:
    column_values1 = df1[column]
    column_values2 = df2[column]
#     print(column_values1,' and ', column_values2)
    # Compare the values and print the differences
    differences = column_values1 != column_values2
    if differences.any():
        print(f"Differences in column '{column}':")
        print(df1[differences][column])
        print(df2[differences][column])
        print("--------------------")
    else:
        print(f"No differences in column '{column}'")


No differences in column 'Animal'
No differences in column 'Mythological'
No differences in column 'Tree'
No differences in column 'Mythological_A2_A3_A5'
No differences in column 'Mythological_A6_A7'


In [11]:
# import pandas as pd

# # Read the CSV files
# file1_path = "/kaggle/working/haris_corner_output.csv"
# file2_path = "/kaggle/working/edge_output.csv"

# file1_data = pd.read_csv(file1_path)
# file2_data = pd.read_csv(file2_path)

# # Define the columns to compare
# columns_to_compare = ['Animal', 'Mythological', 'Tree', 'Mythological_A2_A3_A5', 'Mythological_A6_A7']

# # Iterate over each index and compare the column values
# for index in range(len(file1_data)):
#     for column in columns_to_compare:
#         file1_value = file1_data.loc[index, column]
#         file2_value = file2_data.loc[index, column]
#         print(file1_value, ' and ', file2_value)
#         if file1_value == file2_value:
#             print(f"Value at index {index}, column '{column}' is the same in both files.")
#         else:
#             print(f"Value at index {index}, column '{column}' is different in both files.")


In [12]:
import pandas as pd

# Specify the path to your Excel file
excel_file_path = '/kaggle/working/haris_corner_output.csv'
# excel_file_path = '/kaggle/working/edge_output.csv'

# Read the Excel file into a pandas DataFrame
df = pd.read_csv(excel_file_path)

# Count the number of rows in the DataFrame
num_rows = len(df)

# Print the number of rows
print("Number of rows:", num_rows)
num_columns = len(df.columns)

print("Number of columns:", num_columns)

column_names = df.columns

# Print the column names
print("Column names:", column_names)

column_names = ['0','1','Animal', 'Mythological', 'Tree', 'Mythological_A2_A3_A5', 'Mythological_A6_A7' ]

# Access the column data
column_data = df[column_names]

# Print the column data
print(column_data)

Number of rows: 106
Number of columns: 16390
Column names: Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '16380', '16381', '16382', '16383', '16384', 'Animal', 'Mythological',
       'Mythological_A2_A3_A5', 'Mythological_A6_A7', 'Tree'],
      dtype='object', length=16390)
                                                     0  1  Animal  \
0    /kaggle/input/image-feature-extraction-project...  1       0   
1    /kaggle/input/image-feature-extraction-project...  0       1   
2    /kaggle/input/image-feature-extraction-project...  0       1   
3    /kaggle/input/image-feature-extraction-project...  1       0   
4    /kaggle/input/image-feature-extraction-project...  0       0   
..                                                 ... ..     ...   
101  /kaggle/input/image-feature-extraction-project...  0       0   
102  /kaggle/input/image-feature-extraction-project...  0       1   
103  /kaggle/input/image-feature-extraction-project...  0       0   
104  /

## Training the Model (SVM, Naive Bayes, Logistic Regression)

In [13]:
 
# Assuming 'df' is the DataFrame and 'column_name' is the name of the column
value_counts = df['Mythological_A2_A3_A5'].value_counts()

# Print the count of each unique value
print(value_counts) 

# Assuming 'df' is the DataFrame and 'column_name' is the name of the column
value_counts = df['Mythological_A6_A7'].value_counts()

# Print the count of each unique value
print(value_counts) 

value_counts = df['Mythological'].value_counts()

# Print the count of each unique value
print(value_counts) 

0    61
1    45
Name: Mythological_A2_A3_A5, dtype: int64
1    86
0    20
Name: Mythological_A6_A7, dtype: int64
1    60
0    46
Name: Mythological, dtype: int64


In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

print("SVM ")

# Read the CSV file
for file_name in file_names :
    df = pd.read_csv(file_name)

    # Separate the features (x) and target (y) variables
    x = df.loc[:, '1':'16384']
    y_columns = ['Animal', 'Mythological', 'Tree', 'Mythological_A2_A3_A5', 'Mythological_A6_A7']

    # Define the number of epochs
    num_epochs = 10

    for epoch in range(num_epochs):
        print(f"Epoch {epoch+1}:")

        # Train and test the SVM model for each target column separately
        for column in y_columns:
            y = df[column]

            # Split the data into training and testing sets (80:20 split)
            x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

            # Create an SVM classifier with regularization (e.g., C=1.0)
            svm = SVC(C = 1.0)

            # Train the SVM model
            svm.fit(x_train, y_train)
            
            # Make predictions on the training set
            y_train_pred = svm.predict(x_train)
            print("y_train_pred ", y_train_pred[1])
            # Make predictions on the test set
            y_test_pred = svm.predict(x_test)

            # Calculate the accuracy of the model on training and test sets
            train_accuracy = accuracy_score(y_train, y_train_pred)
            test_accuracy = accuracy_score(y_test, y_test_pred)

            # Print the accuracy for the current target column
            print(f"Accuracy for {column}:")
            print(f"  Training Accuracy: {train_accuracy}")
            print(f"  Testing Accuracy: {test_accuracy}\n")
    print("Completed ", file_name)

SVM 
Epoch 1:
y_train_pred  1
Accuracy for Animal:
  Training Accuracy: 0.9761904761904762
  Testing Accuracy: 0.7272727272727273

y_train_pred  1
Accuracy for Mythological:
  Training Accuracy: 0.9761904761904762
  Testing Accuracy: 0.5454545454545454

y_train_pred  0
Accuracy for Tree:
  Training Accuracy: 0.9642857142857143
  Testing Accuracy: 0.7727272727272727

y_train_pred  1
Accuracy for Mythological_A2_A3_A5:
  Training Accuracy: 1.0
  Testing Accuracy: 0.6363636363636364

y_train_pred  1
Accuracy for Mythological_A6_A7:
  Training Accuracy: 0.9285714285714286
  Testing Accuracy: 0.6818181818181818

Epoch 2:
y_train_pred  1
Accuracy for Animal:
  Training Accuracy: 0.9761904761904762
  Testing Accuracy: 0.7272727272727273

y_train_pred  1
Accuracy for Mythological:
  Training Accuracy: 0.9761904761904762
  Testing Accuracy: 0.5454545454545454

y_train_pred  0
Accuracy for Tree:
  Training Accuracy: 0.9642857142857143
  Testing Accuracy: 0.7727272727272727

y_train_pred  1
Accur

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

print("Neural Network")

# Read the CSV file
for file_name in file_names:
    df = pd.read_csv(file_name)

    # Separate the features (x) and target (y) variables
    x = df.loc[:, '1':'16384']
    y_columns = ['Animal', 'Mythological', 'Tree', 'Mythological_A2_A3_A5', 'Mythological_A6_A7']

    # Define the number of epochs
    num_epochs = 2

    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}:")

        # Train and test the neural network model for each target column separately
        for column in y_columns:
            y = df[column]

            # Split the data into training and testing sets (80:20 split)
            x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

            # Create a neural network classifier
            nn = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', random_state=20)

            # Train the neural network model
            nn.fit(x_train, y_train)

            # Make predictions on the training set
            y_train_pred = nn.predict(x_train)

            # Make predictions on the test set
            y_test_pred = nn.predict(x_test)

            # Calculate the accuracy of the model on training and test sets
            train_accuracy = accuracy_score(y_train, y_train_pred)
            test_accuracy = accuracy_score(y_test, y_test_pred)

            # Print the accuracy for the current target column
            print(f"Accuracy for {column}:")
            print(f"  Training Accuracy: {train_accuracy}")
            print(f"  Testing Accuracy: {test_accuracy}\n")
    print("Completed ", file_name)


Neural Network
Epoch 1:
Accuracy for Animal:
  Training Accuracy: 1.0
  Testing Accuracy: 0.7272727272727273

Accuracy for Mythological:
  Training Accuracy: 1.0
  Testing Accuracy: 0.5909090909090909

Accuracy for Tree:
  Training Accuracy: 0.7380952380952381
  Testing Accuracy: 0.7727272727272727

Accuracy for Mythological_A2_A3_A5:
  Training Accuracy: 1.0
  Testing Accuracy: 0.7727272727272727

Accuracy for Mythological_A6_A7:
  Training Accuracy: 1.0
  Testing Accuracy: 0.6818181818181818

Epoch 2:
Accuracy for Animal:
  Training Accuracy: 1.0
  Testing Accuracy: 0.7272727272727273

Accuracy for Mythological:
  Training Accuracy: 1.0
  Testing Accuracy: 0.5909090909090909

Accuracy for Tree:
  Training Accuracy: 0.7380952380952381
  Testing Accuracy: 0.7727272727272727

Accuracy for Mythological_A2_A3_A5:
  Training Accuracy: 1.0
  Testing Accuracy: 0.7727272727272727

Accuracy for Mythological_A6_A7:
  Training Accuracy: 1.0
  Testing Accuracy: 0.6818181818181818

Completed  /kag

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_auc_score

print("SVM")

# Read the CSV file
for file_name in file_names:
    df = pd.read_csv(file_name)

    # Separate the features (x) and target (y) variables
    x = df.loc[:, '1':'16384']
    y_columns = ['Animal', 'Mythological', 'Tree', 'Mythological_A2_A3_A5', 'Mythological_A6_A7']

    # Define the number of epochs
    num_epochs = 1

    for epoch in range(num_epochs):
        print(f"Epoch {epoch+1}:")

        # Train and test the SVM model for each target column separately
        for column in y_columns:
            y = df[column]

            # Split the data into training and testing sets (80:20 split)
            x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

            # Create an SVM classifier with regularization (e.g., C=1.0)
            svm = SVC(C=1.0, probability=True)  # Set probability=True for calculating AUC

            # Train the SVM model
            svm.fit(x_train, y_train)

            # Make predictions on the training set
            y_train_pred = svm.predict(x_train)

            # Make predictions on the test set
            y_test_pred = svm.predict(x_test)

            # Calculate the accuracy of the model on training and test sets
            train_accuracy = accuracy_score(y_train, y_train_pred)
            test_accuracy = accuracy_score(y_test, y_test_pred)

            # Calculate the AUC of the model on training and test sets
            train_auc = roc_auc_score(y_train, svm.predict_proba(x_train)[:, 1])
            test_auc = roc_auc_score(y_test, svm.predict_proba(x_test)[:, 1])  
            
            # Print the accuracy and AUC for the current target column
            print(f"Accuracy for {column}:")
            print(f"  Training Accuracy: {train_accuracy}")
            print(f"  Testing Accuracy: {test_accuracy}")

            print(f"AUC for {column}:")
            print(f"  Training AUC: {train_auc}")
            print(f"  Testing AUC: {test_auc}\n")

    print("Completed", file_name)


SVM
Epoch 1:
Accuracy for Animal:
  Training Accuracy: 0.9761904761904762
  Testing Accuracy: 0.7272727272727273
AUC for Animal:
  Training AUC: 0.0
  Testing AUC: 0.625

Accuracy for Mythological:
  Training Accuracy: 0.9761904761904762
  Testing Accuracy: 0.5454545454545454
AUC for Mythological:
  Training AUC: 0.0
  Testing AUC: 0.3583333333333333

Accuracy for Tree:
  Training Accuracy: 0.9642857142857143
  Testing Accuracy: 0.7727272727272727
AUC for Tree:
  Training AUC: 0.0
  Testing AUC: 0.1647058823529412

Accuracy for Mythological_A2_A3_A5:
  Training Accuracy: 1.0
  Testing Accuracy: 0.6363636363636364
AUC for Mythological_A2_A3_A5:
  Training AUC: 0.0
  Testing AUC: 0.4821428571428571

Accuracy for Mythological_A6_A7:
  Training Accuracy: 0.9285714285714286
  Testing Accuracy: 0.6818181818181818
AUC for Mythological_A6_A7:
  Training AUC: 0.0
  Testing AUC: 0.3523809523809524

Completed /kaggle/working/haris_corner_output.csv
Epoch 1:
Accuracy for Animal:
  Training Accurac

## Naive Bayes 

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

print("Naive Bayes")

# Read the CSV file
for file_name in file_names :
    df = pd.read_csv(file_name)

    # Separate the features (x) and target (y) variables
    x = df.loc[:, '1':'16384']
    y_columns = ['Animal', 'Mythological', 'Tree', 'Mythological_A2_A3_A5', 'Mythological_A6_A7']

    # Train, test, and validate the Naive Bayes model for each target column separately
    for column in y_columns:
        y = df[column]

        # Split the data into training and remaining data (80:20 split)
        x_train, x_remain, y_train, y_remain = train_test_split(x, y, test_size=0.2, random_state=42)

        # Split the remaining data into testing and validation sets (50:50 split)
        x_test, x_val, y_test, y_val = train_test_split(x_remain, y_remain, test_size=0.5, random_state=42)

        # Create a Naive Bayes classifier
        nb = GaussianNB()

        # Train the Naive Bayes model
        nb.fit(x_train, y_train)

        # Make predictions on the training set
        y_pred_train = nb.predict(x_train)

        # Make predictions on the validation set
        y_pred_val = nb.predict(x_val)

        # Make predictions on the test set
        y_pred_test = nb.predict(x_test)

        # Calculate the accuracy of the model on the training, validation, and test sets
        accuracy_train = accuracy_score(y_train, y_pred_train)
        accuracy_val = accuracy_score(y_val, y_pred_val)
        accuracy_test = accuracy_score(y_test, y_pred_test)

        # Print the accuracies for the current target column
        print(f"Training Accuracy for {column}: {accuracy_train}")
        print(f"Validation Accuracy for {column}: {accuracy_val}")
        print(f"Test Accuracy for {column}: {accuracy_test}\n")
    print(f"Completed : {file_name}\n")

Naive Bayes
Training Accuracy for Animal: 0.9642857142857143
Validation Accuracy for Animal: 0.7272727272727273
Test Accuracy for Animal: 0.8181818181818182

Training Accuracy for Mythological: 0.9761904761904762
Validation Accuracy for Mythological: 0.5454545454545454
Test Accuracy for Mythological: 0.5454545454545454

Training Accuracy for Tree: 1.0
Validation Accuracy for Tree: 0.8181818181818182
Test Accuracy for Tree: 0.7272727272727273

Training Accuracy for Mythological_A2_A3_A5: 0.9761904761904762
Validation Accuracy for Mythological_A2_A3_A5: 0.7272727272727273
Test Accuracy for Mythological_A2_A3_A5: 0.45454545454545453

Training Accuracy for Mythological_A6_A7: 1.0
Validation Accuracy for Mythological_A6_A7: 0.6363636363636364
Test Accuracy for Mythological_A6_A7: 0.7272727272727273

Completed : /kaggle/working/haris_corner_output.csv

Training Accuracy for Animal: 1.0
Validation Accuracy for Animal: 0.7272727272727273
Test Accuracy for Animal: 0.7272727272727273

Training A

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

# Read the CSV file
for file_name in file_names :
    df = pd.read_csv(file_name)

    # Separate the features (x) and target (y) variables
    x = df.loc[:, '1':'16384']
    y_columns = ['Animal','Tree', 'Mythological', 'Mythological_A2_A3_A5', 'Mythological_A6_A7']

    # Define the parameter and kernel options to try
    param_grid = [
        { 'C': [0.1, 0.01, 1, 10], 'kernel': ['linear'] },
        { 'C': [0.1, 0.01, 1, 10], 'kernel': ['poly'], 'degree': [2, 3] },
        { 'C': [0.1, 0.01, 1, 10], 'kernel': ['rbf'], 'gamma': [0.1, 1, 10] }
    ]

    # Train and test the SVM model for each target column separately
    for column in y_columns:
        y = df[column]

        # Split the data into training and testing sets (80:20 split)
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

        # Create an SVM classifier
        svm = SVC()

        # Perform grid search with cross-validation to find the best parameters
        grid_search = GridSearchCV(svm, param_grid, cv=5)
        grid_search.fit(x_train, y_train)

        # Get the best parameters found by grid search
        best_params = grid_search.best_params_
        print(f"Best parameters for {column}: {best_params}")

        # Print the results for each combination of hyperparameters
        cv_results = grid_search.cv_results_
        for mean_score, params in zip(cv_results['mean_test_score'], cv_results['params']):
            print(f"Parameters: {params} | Mean Accuracy: {mean_score}")

        # Create a new SVM classifier with the best parameters
        svm = SVC(**best_params)

        # Train the SVM model
        svm.fit(x_train, y_train)

        # Make predictions on the test set
        y_pred = svm.predict(x_test)

        # Calculate the accuracy of the model
        accuracy = accuracy_score(y_test, y_pred)

        # Print the accuracy for the current target column
        print(f"Accuracy for {column}: {accuracy}\n")
    print(f"Completed : {file_name}\n")

Best parameters for Animal: {'C': 0.1, 'degree': 2, 'kernel': 'poly'}
Parameters: {'C': 0.1, 'kernel': 'linear'} | Mean Accuracy: 0.5713235294117647
Parameters: {'C': 0.01, 'kernel': 'linear'} | Mean Accuracy: 0.5713235294117647
Parameters: {'C': 1, 'kernel': 'linear'} | Mean Accuracy: 0.5713235294117647
Parameters: {'C': 10, 'kernel': 'linear'} | Mean Accuracy: 0.5713235294117647
Parameters: {'C': 0.1, 'degree': 2, 'kernel': 'poly'} | Mean Accuracy: 0.6308823529411764
Parameters: {'C': 0.1, 'degree': 3, 'kernel': 'poly'} | Mean Accuracy: 0.6308823529411764
Parameters: {'C': 0.01, 'degree': 2, 'kernel': 'poly'} | Mean Accuracy: 0.6308823529411764
Parameters: {'C': 0.01, 'degree': 3, 'kernel': 'poly'} | Mean Accuracy: 0.6308823529411764
Parameters: {'C': 1, 'degree': 2, 'kernel': 'poly'} | Mean Accuracy: 0.6308823529411764
Parameters: {'C': 1, 'degree': 3, 'kernel': 'poly'} | Mean Accuracy: 0.6308823529411764
Parameters: {'C': 10, 'degree': 2, 'kernel': 'poly'} | Mean Accuracy: 0.559558

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Read the CSV file
for file_name in file_names :
    df = pd.read_csv(file_name)

    # Separate the features (x) and target (y) variables
    x = df.loc[:, '1':'16384']
    y_columns = ['Animal', 'Tree', 'Mythological', 'Mythological_A2_A3_A5', 'Mythological_A6_A7']

    # Initialize dictionaries to store accuracy scores
    training_accuracy = {}
    testing_accuracy = {}
    validation_accuracy = {}

    # Iterate over each target column
    for column in y_columns:
        # Split the data into training, testing, and validation sets
        x_train, x_test, y_train, y_test = train_test_split(x, df[column], test_size=0.2, random_state=42)
        x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

        # Create the SVM classifier with adjusted hyperparameters
        if column == 'Mythological_A2_A3_A5':
            svm = SVC(C=1, kernel='poly', degree=2)
        else:
            svm = SVC(C=0.1, kernel='linear')

        # Train the model
        svm.fit(x_train, y_train)

        # Make predictions on training, testing, and validation data
        y_train_pred = svm.predict(x_train)
        y_test_pred = svm.predict(x_test)
        y_val_pred = svm.predict(x_val)

        # Calculate accuracy scores
        training_acc = accuracy_score(y_train, y_train_pred)
        testing_acc = accuracy_score(y_test, y_test_pred)
        validation_acc = accuracy_score(y_val, y_val_pred)

        # Store accuracy scores in respective dictionaries
        training_accuracy[column] = training_acc
        testing_accuracy[column] = testing_acc
        validation_accuracy[column] = validation_acc

    # Print the accuracy scores
    print("Training Accuracy:")
    print(training_accuracy)
    print("\nTesting Accuracy:")
    print(testing_accuracy)
    print("\nValidation Accuracy:")
    print(validation_accuracy)
    
    print(f"Completed : {file_name}\n")

Training Accuracy:
{'Animal': 1.0, 'Tree': 1.0, 'Mythological': 1.0, 'Mythological_A2_A3_A5': 0.9850746268656716, 'Mythological_A6_A7': 1.0}

Testing Accuracy:
{'Animal': 0.7272727272727273, 'Tree': 0.7727272727272727, 'Mythological': 0.5909090909090909, 'Mythological_A2_A3_A5': 0.7272727272727273, 'Mythological_A6_A7': 0.6818181818181818}

Validation Accuracy:
{'Animal': 0.5882352941176471, 'Tree': 0.6470588235294118, 'Mythological': 0.4117647058823529, 'Mythological_A2_A3_A5': 0.6470588235294118, 'Mythological_A6_A7': 0.7058823529411765}
Completed : /kaggle/working/haris_corner_output.csv

Training Accuracy:
{'Animal': 1.0, 'Tree': 1.0, 'Mythological': 1.0, 'Mythological_A2_A3_A5': 1.0, 'Mythological_A6_A7': 1.0}

Testing Accuracy:
{'Animal': 0.7272727272727273, 'Tree': 0.7727272727272727, 'Mythological': 0.5454545454545454, 'Mythological_A2_A3_A5': 0.6363636363636364, 'Mythological_A6_A7': 0.6818181818181818}

Validation Accuracy:
{'Animal': 0.5882352941176471, 'Tree': 0.64705882352

In [20]:
# Logistic Regression
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Read the CSV file
for file_name in file_names :
    df = pd.read_csv(file_name)

    # Separate the features (x) and target (y) variables
    x = df.loc[:, '1':'16384']
    y_columns = ['Animal', 'Mythological', 'Tree', 'Mythological_A2_A3_A5', 'Mythological_A6_A7']


    # Train and test the logistic regression model for each target column separately
    for column in y_columns:
        y = df[column]

        # Split the data into training and testing sets (80:20 split)
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

        # Create a logistic regression classifier
        logistic_reg = LogisticRegression()

        # Train the logistic regression model
        logistic_reg.fit(x_train, y_train)

        # Make predictions on the training set
        y_train_pred = logistic_reg.predict(x_train)

        # Calculate the training accuracy
        train_accuracy = accuracy_score(y_train, y_train_pred)

        # Make predictions on the testing set
        y_test_pred = logistic_reg.predict(x_test)

        # Calculate the testing accuracy
        test_accuracy = accuracy_score(y_test, y_test_pred)

        # Print the accuracy for the current target column
        print(f"Training Accuracy for {column}: {train_accuracy}")
        print(f"Testing Accuracy for {column}: {test_accuracy}\n")
    print(f"Completed : {file_name}\n")

Training Accuracy for Animal: 1.0
Testing Accuracy for Animal: 0.7272727272727273

Training Accuracy for Mythological: 1.0
Testing Accuracy for Mythological: 0.5454545454545454

Training Accuracy for Tree: 1.0
Testing Accuracy for Tree: 0.7727272727272727

Training Accuracy for Mythological_A2_A3_A5: 1.0
Testing Accuracy for Mythological_A2_A3_A5: 0.6363636363636364

Training Accuracy for Mythological_A6_A7: 1.0
Testing Accuracy for Mythological_A6_A7: 0.6818181818181818

Completed : /kaggle/working/haris_corner_output.csv

Training Accuracy for Animal: 1.0
Testing Accuracy for Animal: 0.7272727272727273

Training Accuracy for Mythological: 1.0
Testing Accuracy for Mythological: 0.5454545454545454

Training Accuracy for Tree: 1.0
Testing Accuracy for Tree: 0.7727272727272727

Training Accuracy for Mythological_A2_A3_A5: 1.0
Testing Accuracy for Mythological_A2_A3_A5: 0.6363636363636364

Training Accuracy for Mythological_A6_A7: 1.0
Testing Accuracy for Mythological_A6_A7: 0.68181818181

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

x = df.loc[:, '1':'16384']
y_columns = ['Animal', 'Mythological', 'Tree', 'Mythological_A2_A3_A5', 'Mythological_A6_A7']

# Read the CSV file
for file_name in file_names:
    df = pd.read_csv(file_name)

    # Separate the features (x) and target (y) variables

    # Train and test the KNN model for each target column separately
    for column in y_columns:
        y = df[column]

        # Split the data into training and testing sets (80:20 split)
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

        # Create a KNN classifier
        knn = KNeighborsClassifier()

        # Train the KNN model
        knn.fit(x_train, y_train)

        # Make predictions on the training and test sets
        y_train_pred = knn.predict(x_train)
        y_test_pred = knn.predict(x_test)

        # Calculate the accuracy of the model
        training_accuracy = accuracy_score(y_train, y_train_pred)
        testing_accuracy = accuracy_score(y_test, y_test_pred)

        # Print the accuracy for the current target column
        print(f"Training Accuracy for {column}: {training_accuracy}")
        print(f"Testing Accuracy for {column}: {testing_accuracy}\n")
    print(f"Completed : {file_name}\n")

Training Accuracy for Animal: 0.7023809523809523
Testing Accuracy for Animal: 0.6363636363636364

Training Accuracy for Mythological: 0.6547619047619048
Testing Accuracy for Mythological: 0.5

Training Accuracy for Tree: 0.7142857142857143
Testing Accuracy for Tree: 0.7727272727272727

Training Accuracy for Mythological_A2_A3_A5: 0.7857142857142857
Testing Accuracy for Mythological_A2_A3_A5: 0.5909090909090909

Training Accuracy for Mythological_A6_A7: 0.8333333333333334
Testing Accuracy for Mythological_A6_A7: 0.6818181818181818

Completed : /kaggle/working/haris_corner_output.csv

Training Accuracy for Animal: 0.7023809523809523
Testing Accuracy for Animal: 0.6363636363636364

Training Accuracy for Mythological: 0.6547619047619048
Testing Accuracy for Mythological: 0.5

Training Accuracy for Tree: 0.7142857142857143
Testing Accuracy for Tree: 0.7727272727272727

Training Accuracy for Mythological_A2_A3_A5: 0.7857142857142857
Testing Accuracy for Mythological_A2_A3_A5: 0.5909090909090

# Random data and 10 trials 

In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import numpy as np

print("SVM ")

# Read the CSV file
for file_name in file_names:
    df = pd.read_csv(file_name)

    # Separate the features (x) and target (y) variables
    x = df.loc[:, '1':'16384']
    y_columns = ['Animal', 'Mythological', 'Tree', 'Mythological_A2_A3_A5', 'Mythological_A6_A7']

    # Define the number of trials and epochs
    num_trials = 10
    num_epochs = 1

    for trial in range(num_trials):
        print(f"Trial {trial+1}:")
        
        # Shuffle the data before each trial
        indices = np.arange(len(df))
        np.random.shuffle(indices)
        df_shuffled = df.iloc[indices]

        # Train and test the SVM model for each target column separately
        for epoch in range(num_epochs):
            print(f"Epoch {epoch+1}:")

            # Perform the current trial with shuffled data
            for column in y_columns:
                y = df_shuffled[column]

                # Split the data into training and testing sets (80:20 split)
                x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

                # Create an SVM classifier with regularization (e.g., C=1.0)
                svm = SVC(C=1.0)

                # Train the SVM model
                svm.fit(x_train, y_train)

                # Make predictions on the training set
                y_train_pred = svm.predict(x_train) 

                # Make predictions on the test set
                y_test_pred = svm.predict(x_test)

                # Calculate the accuracy of the model on training and test sets
                train_accuracy = accuracy_score(y_train, y_train_pred)
                test_accuracy = accuracy_score(y_test, y_test_pred)

                # Print the accuracy for the current target column
                print(f"Accuracy for {column}:")
                print(f"  Training Accuracy: {train_accuracy}")
                print(f"  Testing Accuracy: {test_accuracy}\n")
    print("Completed ", file_name)


SVM 
Trial 1:
Epoch 1:
Accuracy for Animal:
  Training Accuracy: 0.9761904761904762
  Testing Accuracy: 0.5

Accuracy for Mythological:
  Training Accuracy: 0.9880952380952381
  Testing Accuracy: 0.5909090909090909

Accuracy for Tree:
  Training Accuracy: 0.9285714285714286
  Testing Accuracy: 0.6363636363636364

Accuracy for Mythological_A2_A3_A5:
  Training Accuracy: 0.9761904761904762
  Testing Accuracy: 0.5454545454545454

Accuracy for Mythological_A6_A7:
  Training Accuracy: 0.9404761904761905
  Testing Accuracy: 0.7727272727272727

Trial 2:
Epoch 1:
Accuracy for Animal:
  Training Accuracy: 0.9761904761904762
  Testing Accuracy: 0.6363636363636364

Accuracy for Mythological:
  Training Accuracy: 0.9642857142857143
  Testing Accuracy: 0.5909090909090909

Accuracy for Tree:
  Training Accuracy: 0.9523809523809523
  Testing Accuracy: 0.8181818181818182

Accuracy for Mythological_A2_A3_A5:
  Training Accuracy: 0.9761904761904762
  Testing Accuracy: 0.5909090909090909

Accuracy for My

# Take random data for 10 Trails with 60:40

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import numpy as np

print("SVM ")

# Read the CSV file
for file_name in file_names:
    df = pd.read_csv(file_name)

    # Separate the features (x) and target (y) variables
    x = df.loc[:, '1':'16384']
    y_columns = ['Animal', 'Mythological', 'Tree', 'Mythological_A2_A3_A5', 'Mythological_A6_A7']

    # Define the number of trials and epochs
    num_trials = 10
    num_epochs = 1

    for trial in range(num_trials):
        print(f"Trial {trial+1}:")
        
        # Shuffle the data before each trial
        indices = np.arange(len(df))
        np.random.shuffle(indices)
        df_shuffled = df.iloc[indices]

        # Train and test the SVM model for each target column separately
        for epoch in range(num_epochs):
            print(f"Epoch {epoch+1}:")

            # Perform the current trial with shuffled data
            for column in y_columns:
                y = df_shuffled[column]

                # Split the data into training and testing sets (60:40 split)
                x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=42)

                # Create an SVM classifier with regularization (e.g., C=1.0)
                svm = SVC(C=1.0)

                # Train the SVM model
                svm.fit(x_train, y_train)

                # Make predictions on the training set
                y_train_pred = svm.predict(x_train)

                # Make predictions on the test set
                y_test_pred = svm.predict(x_test)

                # Calculate the accuracy of the model on training and test sets
                train_accuracy = accuracy_score(y_train, y_train_pred)
                test_accuracy = accuracy_score(y_test, y_test_pred)

                # Print the accuracy for the current target column
                print(f"Accuracy for {column}:")
                print(f"  Training Accuracy: {train_accuracy}")
                print(f"  Testing Accuracy: {test_accuracy}\n")
    print("Completed ", file_name)


SVM 
Trial 1:
Epoch 1:
Accuracy for Animal:
  Training Accuracy: 0.9523809523809523
  Testing Accuracy: 0.627906976744186

Accuracy for Mythological:
  Training Accuracy: 0.9523809523809523
  Testing Accuracy: 0.4883720930232558

Accuracy for Tree:
  Training Accuracy: 0.9365079365079365
  Testing Accuracy: 0.7441860465116279

Accuracy for Mythological_A2_A3_A5:
  Training Accuracy: 0.9841269841269841
  Testing Accuracy: 0.6744186046511628

Accuracy for Mythological_A6_A7:
  Training Accuracy: 0.9365079365079365
  Testing Accuracy: 0.7674418604651163

Trial 2:
Epoch 1:
Accuracy for Animal:
  Training Accuracy: 0.9523809523809523
  Testing Accuracy: 0.627906976744186

Accuracy for Mythological:
  Training Accuracy: 0.9682539682539683
  Testing Accuracy: 0.6046511627906976

Accuracy for Tree:
  Training Accuracy: 0.9365079365079365
  Testing Accuracy: 0.6744186046511628

Accuracy for Mythological_A2_A3_A5:
  Training Accuracy: 0.9841269841269841
  Testing Accuracy: 0.5813953488372093

Ac

# Take random data for 10 Trails with 75:25

In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import numpy as np

print("SVM ")

# Read the CSV file
for file_name in file_names:
    df = pd.read_csv(file_name)

    # Separate the features (x) and target (y) variables
    x = df.loc[:, '1':'16384']
    y_columns = ['Animal', 'Mythological', 'Tree', 'Mythological_A2_A3_A5', 'Mythological_A6_A7']

    # Define the number of trials and epochs
    num_trials = 10
    num_epochs = 1

    for trial in range(num_trials):
        print(f"Trial {trial+1}:")
        
        # Shuffle the data before each trial
        indices = np.arange(len(df))
        np.random.shuffle(indices)
        df_shuffled = df.iloc[indices]

        # Train and test the SVM model for each target column separately
        for epoch in range(num_epochs):
            print(f"Epoch {epoch+1}:")

            # Perform the current trial with shuffled data
            for column in y_columns:
                y = df_shuffled[column]

                # Split the data into training and testing sets (75:25 split)
                x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

                # Create an SVM classifier with regularization (e.g., C=1.0)
                svm = SVC(C=1.0)

                # Train the SVM model
                svm.fit(x_train, y_train)

                # Make predictions on the training set
                y_train_pred = svm.predict(x_train) 

                # Make predictions on the test set
                y_test_pred = svm.predict(x_test)

                # Calculate the accuracy of the model on training and test sets
                train_accuracy = accuracy_score(y_train, y_train_pred)
                test_accuracy = accuracy_score(y_test, y_test_pred)

                # Print the accuracy for the current target column
                print(f"Accuracy for {column}:")
                print(f"  Training Accuracy: {train_accuracy}")
                print(f"  Testing Accuracy: {test_accuracy}\n")
    print("Completed ", file_name)


SVM 
Trial 1:
Epoch 1:
Accuracy for Animal:
  Training Accuracy: 0.9746835443037974
  Testing Accuracy: 0.6666666666666666

Accuracy for Mythological:
  Training Accuracy: 0.9620253164556962
  Testing Accuracy: 0.6666666666666666

Accuracy for Tree:
  Training Accuracy: 0.9493670886075949
  Testing Accuracy: 0.6666666666666666

Accuracy for Mythological_A2_A3_A5:
  Training Accuracy: 1.0
  Testing Accuracy: 0.6666666666666666

Accuracy for Mythological_A6_A7:
  Training Accuracy: 0.9620253164556962
  Testing Accuracy: 0.8518518518518519

Trial 2:
Epoch 1:
Accuracy for Animal:
  Training Accuracy: 1.0
  Testing Accuracy: 0.7407407407407407

Accuracy for Mythological:
  Training Accuracy: 0.9873417721518988
  Testing Accuracy: 0.4444444444444444

Accuracy for Tree:
  Training Accuracy: 0.9367088607594937
  Testing Accuracy: 0.6666666666666666

Accuracy for Mythological_A2_A3_A5:
  Training Accuracy: 0.9873417721518988
  Testing Accuracy: 0.5925925925925926

Accuracy for Mythological_A6_A

# Comparing y_train_pred and y_test_pred with original Y 

In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import numpy as np

print("SVM ")

# Read the CSV file
for file_name in file_names:
    df = pd.read_csv(file_name)

    # Separate the features (x) and target (y) variables
    x = df.loc[:, '1':'16384']
    y_columns = ['Animal', 'Mythological', 'Tree', 'Mythological_A2_A3_A5', 'Mythological_A6_A7']

    # Define the number of trials and epochs
    num_trials = 10
    num_epochs = 1

    for trial in range(num_trials):
        print(f"Trial {trial+1}:")
        
        # Shuffle the data before each trial
        indices = np.arange(len(df))
        np.random.shuffle(indices)
        df_shuffled = df.iloc[indices]

        # Train and test the SVM model for each target column separately
        for epoch in range(num_epochs):
            print(f"Epoch {epoch+1}:")

            # Perform the current trial with shuffled data
            for column in y_columns:
                y = df_shuffled[column]

                # Split the data into training and testing sets (75:25 split)
                x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

                # Create an SVM classifier with regularization (e.g., C=1.0)
                svm = SVC(C=1.0)

                # Train the SVM model
                svm.fit(x_train, y_train)

                # Make predictions on the training set
                y_train_pred = svm.predict(x_train)

                # Make predictions on the test set
                y_test_pred = svm.predict(x_test)

                # Calculate the accuracy of the model on training and test sets
                train_accuracy = accuracy_score(y_train, y_train_pred)
                test_accuracy = accuracy_score(y_test, y_test_pred)

                # Print the accuracy for the current target column
                print(f"Accuracy for {column}:")
                print(f"  Training Accuracy: {train_accuracy}")
                print(f"  Testing Accuracy: {test_accuracy}\n")

                # Compare original labels with predictions
                train_correct = (y_train == y_train_pred).sum()
                test_correct = (y_test == y_test_pred).sum()
                train_total = len(y_train)
                test_total = len(y_test)

                print(f"Correctly predicted on training set: {train_correct}/{train_total}")
                print(f"Correctly predicted on testing set: {test_correct}/{test_total}\n")

    print("Completed ", file_name)


SVM 
Trial 1:
Epoch 1:
Accuracy for Animal:
  Training Accuracy: 0.9873417721518988
  Testing Accuracy: 0.7037037037037037

Correctly predicted on training set: 78/79
Correctly predicted on testing set: 19/27

Accuracy for Mythological:
  Training Accuracy: 0.9873417721518988
  Testing Accuracy: 0.5925925925925926

Correctly predicted on training set: 78/79
Correctly predicted on testing set: 16/27

Accuracy for Tree:
  Training Accuracy: 0.9493670886075949
  Testing Accuracy: 0.7037037037037037

Correctly predicted on training set: 75/79
Correctly predicted on testing set: 19/27

Accuracy for Mythological_A2_A3_A5:
  Training Accuracy: 0.9746835443037974
  Testing Accuracy: 0.5555555555555556

Correctly predicted on training set: 77/79
Correctly predicted on testing set: 15/27

Accuracy for Mythological_A6_A7:
  Training Accuracy: 0.9240506329113924
  Testing Accuracy: 0.9259259259259259

Correctly predicted on training set: 73/79
Correctly predicted on testing set: 25/27

Trial 2:
Ep

# Average and standard deviation of accuracies across trials for each target column

In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import numpy as np

print("SVM ")

# Read the CSV file
for file_name in file_names:
    df = pd.read_csv(file_name)

    # Separate the features (x) and target (y) variables
    x = df.loc[:, '1':'16384']
    y_columns = ['Animal', 'Mythological', 'Tree', 'Mythological_A2_A3_A5', 'Mythological_A6_A7']

    # Define the number of trials and epochs
    num_trials = 10
    num_epochs = 10

    # Lists to store accuracies for each target column across trials
    train_accuracies = {col: [] for col in y_columns}
    test_accuracies = {col: [] for col in y_columns}

    for trial in range(num_trials):
        print(f"Trial {trial + 1}:")
        
        # Shuffle the data before each trial
        indices = np.arange(len(df))
        np.random.shuffle(indices)
        df_shuffled = df.iloc[indices]

        # Train and test the SVM model for each target column separately
        for epoch in range(num_epochs):
#             print(f"Epoch {epoch + 1}:")

            # Perform the current trial with shuffled data
            for column in y_columns:
                y = df_shuffled[column]

                # Split the data into training and testing sets (75:25 split)
                x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

                # Create an SVM classifier with regularization (e.g., C=1.0)
                svm = SVC(C=1.0)

                # Train the SVM model
                svm.fit(x_train, y_train)

                # Make predictions on the training set
                y_train_pred = svm.predict(x_train) 

                # Make predictions on the test set
                y_test_pred = svm.predict(x_test)

                # Calculate the accuracy of the model on training and test sets
                train_accuracy = accuracy_score(y_train, y_train_pred)
                test_accuracy = accuracy_score(y_test, y_test_pred)

                # Append accuracies to the corresponding lists
                train_accuracies[column].append(train_accuracy)
                test_accuracies[column].append(test_accuracy)

                # Print the accuracy for the current target column
#                 print(f"Accuracy for {column}:")
#                 print(f"  Training Accuracy: {train_accuracy}")
#                 print(f"  Testing Accuracy: {test_accuracy}\n")

    # Calculate average and standard deviation of accuracies across trials for each target column
    print("Average and Standard Deviation of Accuracies:")
    for column in y_columns:
        avg_train_accuracy = np.mean(train_accuracies[column])
        std_train_accuracy = np.std(train_accuracies[column])
        avg_test_accuracy = np.mean(test_accuracies[column])
        std_test_accuracy = np.std(test_accuracies[column])
        print(f"Target Column: {column}")
        print(f"  Average Training Accuracy: {avg_train_accuracy:.4f} +/- {std_train_accuracy:.4f}")
        print(f"  Average Testing Accuracy: {avg_test_accuracy:.4f} +/- {std_test_accuracy:.4f}\n")

    print("Completed ", file_name)


SVM 
Trial 1:
Trial 2:
Trial 3:
Trial 4:
Trial 5:
Trial 6:
Trial 7:
Trial 8:
Trial 9:
Trial 10:
Average and Standard Deviation of Accuracies:
Target Column: Animal
  Average Training Accuracy: 0.9734 +/- 0.0144
  Average Testing Accuracy: 0.5926 +/- 0.0549

Target Column: Mythological
  Average Training Accuracy: 0.9886 +/- 0.0068
  Average Testing Accuracy: 0.5741 +/- 0.0556

Target Column: Tree
  Average Training Accuracy: 0.9557 +/- 0.0172
  Average Testing Accuracy: 0.7407 +/- 0.0759

Target Column: Mythological_A2_A3_A5
  Average Training Accuracy: 0.9848 +/- 0.0136
  Average Testing Accuracy: 0.5407 +/- 0.0687

Target Column: Mythological_A6_A7
  Average Training Accuracy: 0.9228 +/- 0.0322
  Average Testing Accuracy: 0.8407 +/- 0.0643

Completed  /kaggle/working/haris_corner_output.csv
Trial 1:
Trial 2:
Trial 3:
Trial 4:
Trial 5:
Trial 6:
Trial 7:
Trial 8:
Trial 9:
Trial 10:
Average and Standard Deviation of Accuracies:
Target Column: Animal
  Average Training Accuracy: 1.0000 +

In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import numpy as np 

print("SVM ")

# Function to run SVM with different train and test sizes and return accuracies
def run_svm(df, x, y_columns, train_size, test_size, num_trials=10):
    train_accuracies = {col: [] for col in y_columns}
    test_accuracies = {col: [] for col in y_columns}

    for trial in range(num_trials):
        # Shuffle the data before each trial
        indices = np.arange(len(df))
        np.random.shuffle(indices)
        df_shuffled = df.iloc[indices]

        for column in y_columns:
            y = df_shuffled[column]

            # Split the data into training and testing sets
            x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=train_size, test_size=test_size, random_state=42)

            svm = SVC(C=1.0)
            svm.fit(x_train, y_train)

            # Make predictions on the training set
            y_train_pred = svm.predict(x_train)

            # Make predictions on the test set
            y_test_pred = svm.predict(x_test)

            # Calculate and store the accuracies
            train_accuracy = accuracy_score(y_train, y_train_pred)
            test_accuracy = accuracy_score(y_test, y_test_pred)
            train_accuracies[column].append(train_accuracy)
            test_accuracies[column].append(test_accuracy)

    return train_accuracies, test_accuracies

# Read the CSV file
for file_name in file_names:
    df = pd.read_csv(file_name)

    # Separate the features (x) and target (y) variables
    x = df.loc[:, '1':'16384']
    y_columns = ['Animal', 'Mythological', 'Tree', 'Mythological_A2_A3_A5', 'Mythological_A6_A7']

    # Define the train and test sizes to evaluate
    train_sizes = [0.6, 0.75, 0.8, 0.9]
    test_sizes = [0.4, 0.25, 0.2, 0.1]
 
    for train_size, test_size in zip(train_sizes, test_sizes):
        print(f"Evaluating with Train Size: {train_size:.2f}, Test Size: {test_size:.2f}")

        # Run SVM for each train-test split size
        train_accuracies, test_accuracies = run_svm(df, x, y_columns, train_size, test_size)

        # Save the results in the PDF file
        for column in y_columns:
            avg_train_accuracy = np.mean(train_accuracies[column])
            std_train_accuracy = np.std(train_accuracies[column])
            avg_test_accuracy = np.mean(test_accuracies[column])
            std_test_accuracy = np.std(test_accuracies[column])

            print(f" Target Column: {column} " )
            print(f" Average Training Accuracy: {avg_train_accuracy:.4f} +/- {std_train_accuracy:.4f} " )
            print(f" Average Testing Accuracy: {avg_test_accuracy:.4f} +/- {std_test_accuracy:.4f} " )
        print()
    print("Completed ", file_name)
    print()

SVM 
Evaluating with Train Size: 0.60, Test Size: 0.40
 Target Column: Animal 
 Average Training Accuracy: 0.9698 +/- 0.0111 
 Average Testing Accuracy: 0.6488 +/- 0.0459 
 Target Column: Mythological 
 Average Training Accuracy: 0.9794 +/- 0.0102 
 Average Testing Accuracy: 0.5279 +/- 0.0313 
 Target Column: Tree 
 Average Training Accuracy: 0.9508 +/- 0.0218 
 Average Testing Accuracy: 0.7163 +/- 0.0271 
 Target Column: Mythological_A2_A3_A5 
 Average Training Accuracy: 0.9841 +/- 0.0123 
 Average Testing Accuracy: 0.5837 +/- 0.0493 
 Target Column: Mythological_A6_A7 
 Average Training Accuracy: 0.9444 +/- 0.0216 
 Average Testing Accuracy: 0.7837 +/- 0.0403 

Evaluating with Train Size: 0.75, Test Size: 0.25
 Target Column: Animal 
 Average Training Accuracy: 0.9835 +/- 0.0170 
 Average Testing Accuracy: 0.6778 +/- 0.0704 
 Target Column: Mythological 
 Average Training Accuracy: 0.9886 +/- 0.0105 
 Average Testing Accuracy: 0.5889 +/- 0.0535 
 Target Column: Tree 
 Average Trainin