In [33]:
import pandas as pd

# Load the glass dataset from the CSV file
glass_data = pd.read_csv('glass.csv')

# Structural overview: column names, non-null counts and dtypes
glass_data.info()

# Per-column null counts, keeping only the columns that actually contain nulls
missing_values = glass_data.isnull().sum().loc[lambda counts: counts > 0]
print("\nColumns with missing values:", missing_values, sep="\n")

# Preview the first rows; left as the last expression so the notebook renders it
print("\nSample Record on the Dataset")
glass_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214 entries, 0 to 213
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   idno    214 non-null    int64  
 1   RI      214 non-null    float64
 2   Na      214 non-null    float64
 3   Mg      214 non-null    float64
 4   Al      214 non-null    float64
 5   Si      214 non-null    float64
 6   K       214 non-null    float64
 7   Ca      214 non-null    float64
 8   Ba      214 non-null    float64
 9   Fe      214 non-null    float64
 10  type    214 non-null    int64  
dtypes: float64(9), int64(2)
memory usage: 18.5 KB

Columns with missing values:
Series([], dtype: int64)

Sample Record on the Dataset


Unnamed: 0,idno,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,type
0,1,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,2,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,3,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,4,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,5,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [34]:
# Drop the 'idno' column (presumably a row identifier rather than a predictor — confirm).
# errors='ignore' keeps this cell re-runnable: glass_data is rebound to the
# result, so on a second execution 'idno' is already gone and a plain drop
# would raise KeyError.
glass_data = glass_data.drop(columns=['idno'], errors='ignore')

# Separate features (every remaining column except 'type') from the target
X = glass_data.drop(columns=['type'])
y = glass_data['type']

In [35]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB

def developNaiveModel(X, y, cv=8):
    """Cross-validate a Gaussian Naive Bayes classifier and print its accuracy.

    Prints the mean cross-validated accuracy (4 decimal places) followed by
    the array of per-fold scores. Nothing is returned.

    Parameters
    ----------
    X : feature matrix passed straight to ``cross_val_score``.
    y : target labels passed straight to ``cross_val_score``.
    cv : cross-validation setting forwarded to ``cross_val_score``;
        defaults to 8 folds, matching the original hard-coded value.

    Notes
    -----
    The previous version also made an 80/20 train/test split and fitted the
    model on the training part, but never evaluated on the held-out part.
    ``cross_val_score`` clones the estimator and refits it on each fold of the
    full ``X, y``, so that split and fit had no effect on the reported numbers
    and have been removed.
    """
    # Unfitted estimator; cross_val_score fits a fresh clone for every fold
    nb_model = GaussianNB()

    # Evaluate the model using k-fold cross-validation on the full data
    cv_scores = cross_val_score(nb_model, X, y, cv=cv)
    print(f"Mean accuracy: {cv_scores.mean():.4f}")
    print(cv_scores)

In [36]:
# Baseline: evaluate the Naive Bayes model on every feature column in X
developNaiveModel(X, y)

Mean accuracy: 0.4293
[0.40740741 0.44444444 0.25925926 0.66666667 0.44444444 0.48148148
 0.61538462 0.11538462]


In [37]:
from sklearn.feature_selection import SelectKBest, chi2

# Univariate selection: score each feature against the target with chi-square.
# NOTE(review): the scores are computed on the full dataset before the
# cross-validation below, so the reported accuracy may be optimistic — confirm
# whether selection should happen inside each fold.
selector = SelectKBest(score_func=chi2, k=5).fit(X, y)

# Attach column names to the raw scores and rank them, highest first
feature_scores = pd.Series(selector.scores_, index=X.columns)
sorted_scores = feature_scores.sort_values(ascending=False)
print("\nChi-Square scores for all features:", sorted_scores, sep="\n")

# The five best-scoring column names (trailing '\n' leaves blank lines in the output)
top_5_univariate = sorted_scores.index[:5]
print("\nTop 5 features (Univariate selection):", top_5_univariate, '\n', sep="\n")

# Restrict X to those columns and evaluate the model on them
X_selected = X.loc[:, top_5_univariate]
developNaiveModel(X_selected, y)



Chi-Square scores for all features:
Ba    145.514077
Mg    100.984212
K      31.670632
Al     16.977488
Na      4.311253
Ca      3.210929
Fe      2.170185
Si      0.110449
RI      0.000048
dtype: float64

Top 5 features (Univariate selection):
Index(['Ba', 'Mg', 'K', 'Al', 'Na'], dtype='object')


Mean accuracy: 0.4339
[0.33333333 0.51851852 0.48148148 0.55555556 0.40740741 0.44444444
 0.57692308 0.15384615]


In [38]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on all features solely to obtain importance scores
# (random_state fixed so the ranking is reproducible).
rf_model = RandomForestClassifier(random_state=10).fit(X, y)

# Label each importance with its column name, then rank from most to least important
feature_importances = pd.Series(rf_model.feature_importances_, index=X.columns)
sorted_importances = feature_importances.sort_values(ascending=False)
print("\nFeature Importance scores (Random Forest):", sorted_importances, sep="\n")

# The five most important column names
top_5_features = sorted_importances.index[:5]
print("\nTop 5 features (Feature Importance):", top_5_features, sep="\n")

# Restrict X to those columns and evaluate the Naive Bayes model on them
X_selected = X.loc[:, top_5_features]
developNaiveModel(X_selected, y)



Feature Importance scores (Random Forest):
Mg    0.169713
RI    0.154783
Al    0.152385
Ca    0.119057
Na    0.100737
K     0.088226
Si    0.086621
Ba    0.086340
Fe    0.042138
dtype: float64

Top 5 features (Feature Importance):
Index(['Mg', 'RI', 'Al', 'Ca', 'Na'], dtype='object')
Mean accuracy: 0.4954
[0.37037037 0.48148148 0.44444444 0.77777778 0.51851852 0.37037037
 0.57692308 0.42307692]


In [39]:
# Build a temporary frame holding the features plus the target column 'type'
X_with_target = X.assign(type=y)

# Pairwise correlation between every column of that frame
corr_matrix = X_with_target.corr()

# Absolute correlation of each feature with 'type' (the target's own row is excluded).
# NOTE(review): 'type' is used as a class label elsewhere in this notebook; a
# linear correlation with a class code may not be a meaningful ranking — confirm.
corr_with_target = corr_matrix.loc[X.columns, 'type'].abs()

# Five features with the largest absolute correlation
top_5_corr_features = corr_with_target.sort_values(ascending=False).head(5)

# Show the full ranking, then the selected five
print("\nCorrelation of features with target 'type':", corr_with_target.sort_values(ascending=False), sep="\n")
print("\nTop 5 features (Correlation with target):", top_5_corr_features, sep="\n")

# Restrict X to the selected columns and evaluate the model on them
X_selected = X.loc[:, top_5_corr_features.index]
developNaiveModel(X_selected, y)



Correlation of features with target 'type':
Mg    0.744993
Al    0.598829
Ba    0.575161
Na    0.502898
Fe    0.188278
RI    0.164237
Si    0.151565
K     0.010054
Ca    0.000952
Name: type, dtype: float64

Top 5 features (Correlation with target):
Mg    0.744993
Al    0.598829
Ba    0.575161
Na    0.502898
Fe    0.188278
Name: type, dtype: float64
Mean accuracy: 0.2286
[0.11111111 0.2962963  0.14814815 0.37037037 0.22222222 0.2962963
 0.38461538 0.        ]
