In [1]:
import math
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Load the lab dataset from a CSV file in the working directory and display it.
dt = pd.read_csv("Lab08_data.csv")
dt

Unnamed: 0,Age,Income,Student,Credit_Rating,Buys_Computer
0,<=30,high,no,fair,no
1,<=30,high,no,excellent,no
2,31---40,high,no,fair,yes
3,>40,medium,no,fair,yes
4,>40,low,yes,fair,yes
5,>40,low,yes,excellent,no
6,31---40,low,yes,excellent,yes
7,<=30,medium,no,fair,no
8,<=30,low,yes,fair,yes
9,>40,medium,yes,fair,yes


In [3]:
#A1
# Prior P(class): occurrences of each label divided by the number of rows.
total_rows = len(dt)
Buys_comp_count = dt['Buys_Computer'].value_counts()
prob_yes, prob_no = (Buys_comp_count[label] / total_rows for label in ('yes', 'no'))

print("Prior probability of buys_computer = yes:", prob_yes)
print("Prior probability of buys_computer = no:", prob_no)

Prior probability of buys_computer = yes: 0.6428571428571429
Prior probability of buys_computer = no: 0.35714285714285715


In [4]:
# Prior probabilities obtained by counting the class labels directly.
n = len(dt)
# Count with a vectorised comparison instead of dt['Buys_Computer'][i] in a
# Python loop: that per-row lookup is label-based, so it only works while the
# index happens to be exactly 0..n-1 (e.g. it breaks after filtering rows).
c_yes = int((dt['Buys_Computer'] == "yes").sum())
c_no = int((dt['Buys_Computer'] == "no").sum())
prob_yes = c_yes/n
prob_no = c_no/n
print("Prior probability of buys_computer = yes:", prob_yes)
print("Prior probability of buys_computer = no:", prob_no)

Prior probability of buys_computer = yes: 0.6428571428571429
Prior probability of buys_computer = no: 0.35714285714285715


In [5]:
#A2
# Class-conditional distribution of one feature: P(feature value | class).
feature = 'Age'
target = dt['Buys_Computer']
cond_densities = {
    label: dt.loc[target == label, feature].value_counts(normalize=True).to_dict()
    for label in target.unique()
}
print("Class Conditional Densities:")
for label in cond_densities:
    print(f"Class {label}: {feature} probabilities: {cond_densities[label]}")

Class Conditional Densities:
Class no: Age probabilities: {'<=30': 0.6, '>40': 0.4}
Class yes: Age probabilities: {'31---40': 0.4444444444444444, '>40': 0.3333333333333333, '<=30': 0.2222222222222222}


In [6]:
# Class-conditional distribution P(Income | class), one dict per class label.
feature = 'Income'
target = dt['Buys_Computer']
cond_densities = {
    label: dt.loc[target == label, feature].value_counts(normalize=True).to_dict()
    for label in target.unique()
}
print("Class Conditional Densities:")
for label in cond_densities:
    print(f"Class {label}: {feature} probabilities: {cond_densities[label]}")

Class Conditional Densities:
Class no: Income probabilities: {'high': 0.4, 'medium': 0.4, 'low': 0.2}
Class yes: Income probabilities: {'medium': 0.4444444444444444, 'low': 0.3333333333333333, 'high': 0.2222222222222222}


In [7]:
# Class-conditional distribution P(Student | class), one dict per class label.
feature = 'Student'
target = dt['Buys_Computer']
cond_densities = {
    label: dt.loc[target == label, feature].value_counts(normalize=True).to_dict()
    for label in target.unique()
}
print("Class Conditional Densities:")
for label in cond_densities:
    print(f"Class {label}: {feature} probabilities: {cond_densities[label]}")

Class Conditional Densities:
Class no: Student probabilities: {'no': 0.8, 'yes': 0.2}
Class yes: Student probabilities: {'yes': 0.6666666666666666, 'no': 0.3333333333333333}


In [8]:
# Class-conditional distribution P(Credit_Rating | class), one dict per class label.
feature = 'Credit_Rating'
target = dt['Buys_Computer']
cond_densities = {
    label: dt.loc[target == label, feature].value_counts(normalize=True).to_dict()
    for label in target.unique()
}
print("Class Conditional Densities:")
for label in cond_densities:
    print(f"Class {label}: {feature} probabilities: {cond_densities[label]}")

Class Conditional Densities:
Class no: Credit_Rating probabilities: {'excellent': 0.6, 'fair': 0.4}
Class yes: Credit_Rating probabilities: {'fair': 0.6666666666666666, 'excellent': 0.3333333333333333}


In [9]:
# Hand-counted class-conditional probabilities for every feature.
# Result layout: cond_densities[feature][class label][category] -> probability.
features = ['Age', 'Income', 'Student', 'Credit_Rating']
cond_densities = {}
for feature_name in features:
    per_class = {}
    for class_label in dt['Buys_Computer'].unique():
        column = dt.loc[dt['Buys_Computer'] == class_label, feature_name]
        n_rows = len(column)

        # Tally categories in order of first appearance.
        tally = {}
        for category in column:
            tally[category] = tally.get(category, 0) + 1

        per_class[class_label] = {
            category: occurrences / n_rows for category, occurrences in tally.items()
        }
    cond_densities[feature_name] = per_class

for feature_name, per_class in cond_densities.items():
    print(f"\nClass Conditional Densities for {feature_name}:")
    for class_label, probs in per_class.items():
        print(f"Class {class_label}: {feature_name} probabilities: {probs}")


Class Conditional Densities for Age:
Class no: Age probabilities: {'<=30': 0.6, '>40': 0.4}
Class yes: Age probabilities: {'31---40': 0.4444444444444444, '>40': 0.3333333333333333, '<=30': 0.2222222222222222}

Class Conditional Densities for Income:
Class no: Income probabilities: {'high': 0.4, 'low': 0.2, 'medium': 0.4}
Class yes: Income probabilities: {'high': 0.2222222222222222, 'medium': 0.4444444444444444, 'low': 0.3333333333333333}

Class Conditional Densities for Student:
Class no: Student probabilities: {'no': 0.8, 'yes': 0.2}
Class yes: Student probabilities: {'no': 0.3333333333333333, 'yes': 0.6666666666666666}

Class Conditional Densities for Credit_Rating:
Class no: Credit_Rating probabilities: {'fair': 0.4, 'excellent': 0.6}
Class yes: Credit_Rating probabilities: {'fair': 0.6666666666666666, 'excellent': 0.3333333333333333}


In [10]:
#A3
from scipy.stats import chi2_contingency

# Chi-squared test of independence between Age and Income at the 5% level.
# NOTE(review): the cell counts in this table are small, so the chi-squared
# approximation may be rough; a large p-value means "no evidence of
# dependence", not proof of independence.
alpha = 0.05
con_table = pd.crosstab(dt['Age'], dt['Income'])
print("Contingency Table:", con_table, sep="\n")

test_result = chi2_contingency(con_table)
chi2, p = test_result[0], test_result[1]
print(f"\nChi-squared value: {chi2}", f"P-value: {p}", sep="\n")
print("\nSignificance Test:")
verdict = "The variables are dependent." if p <= alpha else "The variables are independent."
print(verdict)

Contingency Table:
Income   high  low  medium
Age                       
31---40     2    1       1
<=30        2    1       2
>40         0    2       3

Chi-squared value: 3.3249999999999997
P-value: 0.5049810026322079

Significance Test:
The variables are independent.


In [11]:
# Chi-squared test of independence between Age and Student at the 5% level.
# NOTE(review): the cell counts in this table are small, so the chi-squared
# approximation may be rough — treat the verdict with caution.
alpha = 0.05
con_table = pd.crosstab(dt['Age'], dt['Student'])
print("Contingency Table:", con_table, sep="\n")

test_result = chi2_contingency(con_table)
chi2, p = test_result[0], test_result[1]
print(f"\nChi-squared value: {chi2}", f"P-value: {p}", sep="\n")
print("\nSignificance Test:")
verdict = "The variables are dependent." if p <= alpha else "The variables are independent."
print(verdict)

Contingency Table:
Student  no  yes
Age             
31---40   2    2
<=30      3    2
>40       2    3

Chi-squared value: 0.4
P-value: 0.8187307530779818

Significance Test:
The variables are independent.


In [12]:
# Chi-squared test of independence between Income and Credit_Rating at the 5% level.
# NOTE(review): the cell counts in this table are small, so the chi-squared
# approximation may be rough — treat the verdict with caution.
alpha = 0.05
con_table = pd.crosstab(dt['Income'], dt['Credit_Rating'])
print("Contingency Table:", con_table, sep="\n")

test_result = chi2_contingency(con_table)
chi2, p = test_result[0], test_result[1]
print(f"\nChi-squared value: {chi2}", f"P-value: {p}", sep="\n")
print("\nSignificance Test:")
verdict = "The variables are dependent." if p <= alpha else "The variables are independent."
print(verdict)

Contingency Table:
Credit_Rating  excellent  fair
Income                        
high                   1     3
low                    2     2
medium                 3     3

Chi-squared value: 0.7291666666666666
P-value: 0.6944859597510076

Significance Test:
The variables are independent.


In [13]:
# Chi-squared test of independence between Student and Income at the 5% level.
# NOTE(review): the cell counts in this table are small, so the chi-squared
# approximation may be rough — treat the verdict with caution.
alpha = 0.05
con_table = pd.crosstab(dt['Student'], dt['Income'])
print("Contingency Table:", con_table, sep="\n")

test_result = chi2_contingency(con_table)
chi2, p = test_result[0], test_result[1]
print(f"\nChi-squared value: {chi2}", f"P-value: {p}", sep="\n")
print("\nSignificance Test:")
verdict = "The variables are dependent." if p <= alpha else "The variables are independent."
print(verdict)

Contingency Table:
Income   high  low  medium
Student                   
no          3    0       4
yes         1    4       2

Chi-squared value: 5.666666666666667
P-value: 0.05881647164242988

Significance Test:
The variables are independent.


In [14]:
#A4
# Train and evaluate a categorical Naive Bayes classifier on the lab data.
feature_cols = ['Age', 'Income', 'Student', 'Credit_Rating']

# Turn each categorical feature into integer codes for CategoricalNB.
# Keep dt's index on the encoded frame so the label column assigned below
# lines up row-for-row even if dt does not have a default 0..n-1 index.
encoder = OrdinalEncoder()
dt_encoder = pd.DataFrame(encoder.fit_transform(dt[feature_cols]), columns=feature_cols, index=dt.index)
dt_encoder['Buys_Computer'] = dt['Buys_Computer']

X = dt_encoder[feature_cols]
y = dt_encoder['Buys_Computer']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Give the model the full number of categories per feature, taken from the
# encoder fitted on all rows. Otherwise a category that only lands in the
# test split is unknown to the fitted model and predict() fails on it.
n_categories = [len(categories) for categories in encoder.categories_]
nb_classifier = CategoricalNB(min_categories=n_categories)
nb_classifier.fit(X_train, y_train)
y_pred = nb_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Display classification report
# zero_division=0 reports 0 (without a warning) for a class that is never predicted.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.33

Classification Report:
              precision    recall  f1-score   support

          no       0.00      0.00      0.00         1
         yes       0.50      0.50      0.50         2

    accuracy                           0.33         3
   macro avg       0.25      0.25      0.25         3
weighted avg       0.33      0.33      0.33         3



In [15]:
#A5
# Load the skin-cancer metadata from a CSV file in the working directory and display it.
dt1 = pd.read_csv("Skin_Cancer_Metadata.csv")
dt1

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear
...,...,...,...,...,...,...,...
10010,HAM_0002867,ISIC_0033084,akiec,histo,40.0,male,abdomen
10011,HAM_0002867,ISIC_0033550,akiec,histo,40.0,male,abdomen
10012,HAM_0002867,ISIC_0033536,akiec,histo,40.0,male,abdomen
10013,HAM_0000239,ISIC_0032854,akiec,histo,80.0,male,face


In [16]:
# Print column dtypes and non-null counts to spot columns with missing values.
dt1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10015 entries, 0 to 10014
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   lesion_id     10015 non-null  object 
 1   image_id      10015 non-null  object 
 2   dx            10015 non-null  object 
 3   dx_type       10015 non-null  object 
 4   age           9958 non-null   float64
 5   sex           10015 non-null  object 
 6   localization  10015 non-null  object 
dtypes: float64(1), object(6)
memory usage: 547.8+ KB


In [17]:
# Fill missing ages with the column mean.
# Assign the result back instead of calling fillna(..., inplace=True) on
# dt1['age']: that form mutates an intermediate Series, which emits a warning
# on recent pandas and leaves dt1 unchanged once Copy-on-Write is enabled.
m = dt1['age'].mean()
dt1['age'] = dt1['age'].fillna(m)

In [18]:
from sklearn.preprocessing import LabelEncoder
# Replace each categorical column with integer codes.
# A separate fitted encoder is kept per column so the codes can be mapped back
# to the original labels later (label_encoders[col].inverse_transform(...));
# refitting one shared encoder would discard the mappings of earlier columns.
cols = ['dx_type','sex','localization']
label_encoders = {}
for i in cols:
    label_enc = LabelEncoder()
    dt1[i] = label_enc.fit_transform(dt1[i])
    label_encoders[i] = label_enc
dt1

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,3,80.0,1,11
1,HAM_0000118,ISIC_0025030,bkl,3,80.0,1,11
2,HAM_0002730,ISIC_0026769,bkl,3,80.0,1,11
3,HAM_0002730,ISIC_0025661,bkl,3,80.0,1,11
4,HAM_0001466,ISIC_0031633,bkl,3,75.0,1,4
...,...,...,...,...,...,...,...
10010,HAM_0002867,ISIC_0033084,akiec,3,40.0,1,0
10011,HAM_0002867,ISIC_0033550,akiec,3,40.0,1,0
10012,HAM_0002867,ISIC_0033536,akiec,3,40.0,1,0
10013,HAM_0000239,ISIC_0032854,akiec,3,80.0,1,5


In [22]:
# Train and evaluate a categorical Naive Bayes classifier that predicts the
# diagnosis `dx` from dx_type, age, sex and localization.
# NOTE(review): age is a numeric column, not a categorical code; CategoricalNB
# expects non-negative integer category codes, so consider binning age or
# using a Gaussian model for it — confirm the intended treatment.
X = dt1[['dx_type', 'age', 'sex', 'localization']]
y = dt1['dx']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Size every feature's category table from the full data (largest code + 1).
# Otherwise a value that only occurs in the test split is unknown to the
# fitted model and predict() fails on it.
min_categories = (X.max().astype(int) + 1).to_numpy()
nb_classifier = CategoricalNB(min_categories=min_categories)
nb_classifier.fit(X_train, y_train)
y_pred = nb_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Display classification report
# zero_division=0 reports 0 (without a warning) for classes that are never predicted.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.71

Classification Report:
              precision    recall  f1-score   support

       akiec       0.20      0.01      0.03        69
         bcc       0.17      0.02      0.04        93
         bkl       0.45      0.37      0.41       228
          df       0.00      0.00      0.00        28
         mel       0.30      0.36      0.33       226
          nv       0.82      0.93      0.87      1338
        vasc       0.00      0.00      0.00        21

    accuracy                           0.71      2003
   macro avg       0.28      0.24      0.24      2003
weighted avg       0.65      0.71      0.67      2003

