# Load the dataset from dataset folder

In [12]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Define column names
column_names = ['Height', 'Length', 'Area', 'Eccen', 'P_Black', 'P_And', 'Mean_TR', 'Blackpix', 'Blackand', 'Wb_Trans', 'Class']

# Read data from CSV with specified column names
pageBlocksData = pd.read_csv('dataset/page-blocks.data.csv', names=column_names)


# Display the first few rows of the dataset

In [13]:
print(pageBlocksData.head())
print(pageBlocksData.describe())

   Height  Length  Area  Eccen  P_Black  P_And  Mean_TR  Blackpix  Blackand  \
0       5       7    35  1.400    0.400  0.657     2.33        14        23   
1       6       7    42  1.167    0.429  0.881     3.60        18        37   
2       6      18   108  3.000    0.287  0.741     4.43        31        80   
3       5       7    35  1.400    0.371  0.743     4.33        13        26   
4       6       3    18  0.500    0.500  0.944     2.25         9        17   

   Wb_Trans  Class  
0         6      1  
1         5      1  
2         7      1  
3         3      1  
4         4      1  
            Height       Length           Area        Eccen      P_Black  \
count  5473.000000  5473.000000    5473.000000  5473.000000  5473.000000   
mean     10.473232    89.568244    1198.405628    13.753977     0.368642   
std      18.960564   114.721758    4849.376950    30.703737     0.177757   
min       1.000000     1.000000       7.000000     0.007000     0.052000   
25%       7.000000 

# Define a function to add watermark to the plot

In [14]:
def add_watermark(ax, text):
    ax.text(0.5, 0.5, text, alpha=0.3, fontsize=50, color='gray',
            rotation=0, ha='center', va='center', transform=ax.transAxes)

# Bivariate Plots

In [15]:
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))
sns.boxplot(x='Block Type', y='Height', data=pageBlocksData)
plt.title('Box plot of Height across Block Type')
plt.xlabel('Block Type')
plt.ylabel('Height')
plt.grid(True)
# Add watermark
add_watermark(plt.gca(), '2021SE56')
plt.show()

ValueError: Could not interpret value `Block Type` for `x`. An entry with this name does not appear in `data`.

<Figure size 800x600 with 0 Axes>

In [None]:
import seaborn as sns

plt.figure(figsize=(8, 6))
sns.boxplot(x='Sex', y='Height', data=pageBlocksData)
plt.title('Box plot of Height across different Sexes')
plt.xlabel('Sex')
plt.ylabel('Height')
# Add watermark
add_watermark(plt.gca(), '2021SE56')
plt.show()

# Multivariate Plots

In [None]:
sns.pairplot(pageBlocksData[['Length', 'Diameter', 'Height']])
plt.suptitle('Pairplot of Length, Diameter, and Height')
# Add watermark
add_watermark(plt.gca(), '2021SE56')
plt.show()

In [None]:
# Calculate correlation matrix
corr_matrix = pageBlocksData.corr()

# Plot
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix Heatmap')
plt.show()

In [None]:
# Convert 'Sex' column to numerical values
pageBlocksData['Sex'] = pageBlocksData['Sex'].map({'M': 0, 'F': 1, 'I': 2})

# Plot the correlation heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(pageBlocksData.corr(), annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title('Correlation Heatmap')

# Add watermark
add_watermark(plt.gca(), '2021SE56')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Encode the 'Sex' column using one-hot encoding
abalone_data_encoded = pd.get_dummies(pageBlocksData, columns=['Sex'])

# Split the dataset into features (X) and target (y)
X = abalone_data_encoded.drop('Rings', axis=1)
y = abalone_data_encoded['Rings']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# kNN Classifier
knn_classifier = KNeighborsClassifier()
knn_classifier.fit(X_train, y_train)
knn_predictions = knn_classifier.predict(X_test)

# Na誰ve Bayes Classifier
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)
nb_predictions = nb_classifier.predict(X_test)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data"
columnNames = ["Sex", "Length", "Diameter", "Height", "Whole weight", "Shucked weight", "Viscera weight", "Shell weight", "Rings"]
pageBlocksData = pd.read_csv(url, names=columnNames)

# Encode the 'Sex' column using LabelEncoder
label_encoder = LabelEncoder()
pageBlocksData['Sex'] = label_encoder.fit_transform(pageBlocksData['Sex'])

# Split the dataset into features (X) and target (y)
X = pageBlocksData.drop('Rings', axis=1)
y = pageBlocksData['Rings']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# kNN Classifier
knn_classifier = KNeighborsClassifier()
knn_classifier.fit(X_train, y_train)
knn_predictions = knn_classifier.predict(X_test)

# Na誰ve Bayes Classifier
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)
nb_predictions = nb_classifier.predict(X_test)

# Evaluation Metrics function
def evaluate_classifier(y_true, y_pred):
    # Confusion Matrix
    cm = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:")
    print(cm)
    
    # Accuracy
    accuracy = accuracy_score(y_true, y_pred)
    print("Accuracy:", accuracy)
    
    # Precision
    precision = precision_score(y_true, y_pred, average='weighted')
    print("Precision:", precision)
    
    # Recall
    recall = recall_score(y_true, y_pred, average='weighted')
    print("Recall:", recall)
    
    # F1-Score
    f1 = f1_score(y_true, y_pred, average='weighted')
    print("F1-Score:", f1)
    
    # Plot Confusion Matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, cmap='Blues', fmt='g', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.title('Confusion Matrix')
    plt.show()

# Evaluate kNN Classifier
print("\nEvaluation Metrics for kNN Classifier:")
evaluate_classifier(y_test, knn_predictions)

# Evaluate Na誰ve Bayes Classifier
print("\nEvaluation Metrics for Na誰ve Bayes Classifier:")
evaluate_classifier(y_test, nb_predictions)