In [1]:
# GLASS PREDICTION USING MACHINE LEARNING

In [2]:
import pandas as pd
import numpy as np

In [3]:
# READ DATA FILE

In [4]:
df = pd.read_csv("glassdata.csv")
df.head()

Unnamed: 0,ID,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type of glass
0,1,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,2,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,3,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,4,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,5,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [5]:
# NO.OF ROWS AND COLUMNS

In [6]:
df.shape

(214, 11)

In [7]:
# CHECKING FOR NULL VALUES

In [8]:
df.isna().sum()

ID               0
RI               0
Na               0
Mg               0
Al               0
Si               0
K                0
Ca               0
Ba               0
Fe               0
Type of glass    0
dtype: int64

In [9]:
# CHECKING FOR THE DEPENDENCY OF EACH FEATURE ON ANOTHER

In [10]:
correlation=df.corr()['Type of glass']
correlation.abs().sort_values(ascending=False)

Type of glass    1.000000
ID               0.877357
Mg               0.744993
Al               0.598829
Ba               0.575161
Na               0.502898
Fe               0.188278
RI               0.164237
Si               0.151565
K                0.010054
Ca               0.000952
Name: Type of glass, dtype: float64

In [11]:
# SEPERATING DEPENDENT AND INDEPENDENT VARIABLES

In [12]:
y=df['Type of glass']
x=df.drop('Type of glass',axis=1)

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# SPLITTING TRAIN AND TEST DATA

In [15]:
X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.23,random_state=42)

In [16]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


The filename, directory name, or volume label syntax is incorrect.


In [17]:
## USINg DIFFERENT ALGORITHMS LIKE NAIVE BAYES, RANDOM FOREST,
## DECISION TREE, KNN, XGBOOST TO PREDICT THE TARGET VALUE 

In [18]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [25]:
models=[]
models.append(('Naive Bayes', GaussianNB()))
models.append(('RandomForest', RandomForestClassifier()))
models.append(('Decision Tree', DecisionTreeClassifier()))
models.append(('KNN', KNeighborsClassifier()))

In [20]:
import warnings
warnings.filterwarnings(action='once')

In [26]:
for name,model in models:
    print(name)
    model.fit(X_train,y_train)
    predictions=model.predict(X_test)
    from sklearn.metrics import confusion_matrix,accuracy_score
    print(confusion_matrix(predictions,y_test))
    print('\n')
    print(accuracy_score(predictions,y_test))
    print('\n')

Naive Bayes
[[10  6  0  0  0  0]
 [ 1 11  1  0  0  0]
 [ 0  3  3  0  0  0]
 [ 0  0  0  4  0  0]
 [ 0  0  0  0  2  0]
 [ 0  0  0  0  1  8]]


0.76


RandomForest
[[11  0  0  0  0  0]
 [ 0 20  0  0  0  0]
 [ 0  0  4  0  0  0]
 [ 0  0  0  4  0  0]
 [ 0  0  0  0  3  0]
 [ 0  0  0  0  0  8]]


1.0


Decision Tree
[[11  0  0  0  0  0]
 [ 0 20  0  0  0  0]
 [ 0  0  4  0  0  0]
 [ 0  0  0  3  0  0]
 [ 0  0  0  1  3  0]
 [ 0  0  0  0  0  8]]


0.98


KNN
[[10  0  0  0  0  0]
 [ 1 20  0  0  0  0]
 [ 0  0  4  0  0  0]
 [ 0  0  0  3  0  0]
 [ 0  0  0  1  3  0]
 [ 0  0  0  0  0  8]]


0.96




In [22]:
# CONCLUSION:
## Upon all algorithms **RandomForest** and **DecisionTree** gave almost 100% accuracy. 
## On cross validation the accuracy may vary between 98%-100%

In [23]:
res = pd.DataFrame(predictions)
res.index = X_test.index 
res.columns = ["Type of glass"]
res.to_csv("glass_prediction.csv")