<a href="https://colab.research.google.com/github/KTSNVaishnavi/Machine-Learning/blob/main/Neural_Networks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [21]:
!pip install collinearity

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [22]:
import pandas as pd
import numpy as np
#For detecting the low variance
from sklearn.feature_selection import VarianceThreshold
#For detecting the outliers
from scipy import stats
#For detecting the collinearity 
from collinearity import SelectNonCollinear
#For scaling the data, and encoding
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegressionCV
#For class imbalance
from imblearn.over_sampling import SMOTE 
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import mean_absolute_error,mean_squared_error

In [23]:
#Reading the data
data=pd.read_csv('/content/gdrive/MyDrive/MachineLearning/Iris.csv')

In [24]:
data.shape

(150, 6)

In [25]:
data.dtypes

Id                 int64
SepalLengthCm    float64
SepalWidthCm     float64
PetalLengthCm    float64
PetalWidthCm     float64
Species           object
dtype: object

In [26]:
data.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [27]:
data.tail()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica
149,150,5.9,3.0,5.1,1.8,Iris-virginica


In [28]:
data.describe(include='all')

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
count,150.0,150.0,150.0,150.0,150.0,150
unique,,,,,,3
top,,,,,,Iris-setosa
freq,,,,,,50
mean,75.5,5.843333,3.054,3.758667,1.198667,
std,43.445368,0.828066,0.433594,1.76442,0.763161,
min,1.0,4.3,2.0,1.0,0.1,
25%,38.25,5.1,2.8,1.6,0.3,
50%,75.5,5.8,3.0,4.35,1.3,
75%,112.75,6.4,3.3,5.1,1.8,


In [29]:
#Checking for na values
data.isnull().sum()

Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

In [30]:
#Checking the outliers
num=data.select_dtypes('float64').columns
z = np.abs(stats.zscore(data[num]))
print(z)

     SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
0         0.900681      1.032057       1.341272      1.312977
1         1.143017      0.124958       1.341272      1.312977
2         1.385353      0.337848       1.398138      1.312977
3         1.506521      0.106445       1.284407      1.312977
4         1.021849      1.263460       1.341272      1.312977
..             ...           ...            ...           ...
145       1.038005      0.124958       0.819624      1.447956
146       0.553333      1.281972       0.705893      0.922064
147       0.795669      0.124958       0.819624      1.053537
148       0.432165      0.800654       0.933356      1.447956
149       0.068662      0.124958       0.762759      0.790591

[150 rows x 4 columns]


In [31]:
# Position of the outlier
print(np.where(z > 3.5))
print(np.where(z < -3.5))

(array([], dtype=int64), array([], dtype=int64))
(array([], dtype=int64), array([], dtype=int64))


In [32]:
#There is no outlier
# num=data.select_dtypes(['float64','int64'])
data= data[(z < 3.5).all(axis=1)]
data.shape

(150, 6)

In [33]:
#Finding the collinear columns
cor_matrix = data.corr().abs()
print(cor_matrix)

                     Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  \
Id             1.000000       0.716676      0.397729       0.882747   
SepalLengthCm  0.716676       1.000000      0.109369       0.871754   
SepalWidthCm   0.397729       0.109369      1.000000       0.420516   
PetalLengthCm  0.882747       0.871754      0.420516       1.000000   
PetalWidthCm   0.899759       0.817954      0.356544       0.962757   

               PetalWidthCm  
Id                 0.899759  
SepalLengthCm      0.817954  
SepalWidthCm       0.356544  
PetalLengthCm      0.962757  
PetalWidthCm       1.000000  


In [34]:
#Building an upper triangular matrix to see the correaltion of the columns more clearly 
upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape),k=1).astype('bool'))
print(upper_tri)

               Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
Id            NaN       0.716676      0.397729       0.882747      0.899759
SepalLengthCm NaN            NaN      0.109369       0.871754      0.817954
SepalWidthCm  NaN            NaN           NaN       0.420516      0.356544
PetalLengthCm NaN            NaN           NaN            NaN      0.962757
PetalWidthCm  NaN            NaN           NaN            NaN           NaN


In [35]:
#View the columns with high collinearity
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.95)]
print(); print(to_drop)


['PetalWidthCm']


In [36]:
data = data.drop(data[to_drop], axis=1)
print(); print(data.head())


   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm      Species
0   1            5.1           3.5            1.4  Iris-setosa
1   2            4.9           3.0            1.4  Iris-setosa
2   3            4.7           3.2            1.3  Iris-setosa
3   4            4.6           3.1            1.5  Iris-setosa
4   5            5.0           3.6            1.4  Iris-setosa


In [37]:
#Separating the num cols and cat cols
#Dropping the unnecessary columns
waste=['Id']
#In this dataset, all the independant variables have correct data types
data=data.drop(waste,axis=1)
cat='Species'
data[cat]=data[cat].astype('category')

In [38]:
#Check the Shape
data.columns

Index(['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'Species'], dtype='object')

In [39]:
# #Removing any zero variance columns
# # import ordinal encoder from sklearn
# from sklearn.preprocessing import OrdinalEncoder
# ord_enc = OrdinalEncoder()
# # Transform the data
# data[cat]=np.array(data[cat])
# data[cat]=data[cat].values.reshape(-1,1)
# data[cat] = ord_enc.fit_transform(data[cat])

In [40]:
#Removing the low variance
num=data.select_dtypes('float64').columns
from sklearn.feature_selection import VarianceThreshold
var_thr = VarianceThreshold(threshold = 0.25) #Removing both constant and quasi-constant
var_thr.fit(data[num])

var_thr.get_support()


array([ True, False,  True])

In [41]:
concol = [column for column in data[num].columns 
          if column not in data[num].columns[var_thr.get_support()]]

for features in concol:
    print(features)

SepalWidthCm


In [42]:
num=num.drop(concol)

In [43]:
data[num].shape

(150, 2)

In [44]:
data=pd.concat([data[num],data[cat]],axis=1)
data.shape


(150, 3)

In [45]:
#Checking the value counts of target
data['Species'].value_counts()


Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: Species, dtype: int64

In [46]:
#Train_test_split
X_train,X_test,y_train,y_test=train_test_split(data.loc[:, data.columns!='Species'], data['Species'], test_size=0.3, random_state=100,stratify=data['Species'])

In [47]:
#Imputing the null values
sim_imp=SimpleImputer(strategy='mean')
X_train=pd.DataFrame(sim_imp.fit_transform(X_train),columns=X_train.columns)
X_test=pd.DataFrame(sim_imp.transform(X_test),columns=X_train.columns)

In [48]:
#Standardizing the data
std=StandardScaler()
X_train=pd.DataFrame(std.fit_transform(X_train),columns=X_train.columns)
X_test=pd.DataFrame(std.transform(X_test),columns=X_test.columns)

In [49]:
# y_train=y_train.ravel()
# y_test=y_test.ravel()

In [50]:
# y_train=pd.DataFrame(y_train.values,columns=(['Species']))
# y_test=pd.DataFrame(y_test.values,columns=(['Species']))
# y_train.dtypes
# y_train=y_train.astype('category')
# y_test=y_test.astype('category')


In [51]:
#Replacing the string to a no.
y_test=pd.DataFrame(y_test.replace({'Iris-setosa':1,'Iris-versicolor':2,'Iris-virginica':3}))
y_train=pd.DataFrame(y_train.replace({'Iris-setosa':1,'Iris-versicolor':2,'Iris-virginica':3}))

In [52]:
# #Encoding the target variable
# from sklearn.preprocessing import LabelEncoder
# lab=LabelEncoder()
# y_train=pd.DataFrame(lab.fit_transform(y_train))
# y_test=pd.DataFrame(lab.transform(y_test))
# y_train.astype('category')
y_train

Unnamed: 0,Species
71,2
90,2
4,1
17,1
18,1
...,...
26,1
69,2
103,3
65,2


In [53]:
#Model building
model=LogisticRegressionCV(cv=3,solver='newton-cg',class_weight='balanced',max_iter=200,multi_class='multinomial')
log=model.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


In [54]:
#Metrics
train_pred=log.predict(X_train)
test_pred=log.predict(X_test)

In [55]:
print(confusion_matrix(train_pred,y_train))
print(confusion_matrix(test_pred,y_test))
print(classification_report(train_pred,y_train))
print(classification_report(test_pred,y_test))

[[35  0  0]
 [ 0 34  1]
 [ 0  1 34]]
[[15  0  0]
 [ 0 15  3]
 [ 0  0 12]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        35
           2       0.97      0.97      0.97        35
           3       0.97      0.97      0.97        35

    accuracy                           0.98       105
   macro avg       0.98      0.98      0.98       105
weighted avg       0.98      0.98      0.98       105

              precision    recall  f1-score   support

           1       1.00      1.00      1.00        15
           2       1.00      0.83      0.91        18
           3       0.80      1.00      0.89        12

    accuracy                           0.93        45
   macro avg       0.93      0.94      0.93        45
weighted avg       0.95      0.93      0.93        45



In [56]:
#Building the neural network
import tensorflow as tf
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense
from keras.utils.np_utils import to_categorical

In [57]:
model=Sequential()
model.add(Dense(5,input_dim=2, activation='relu'))        
model.add(Dense(3, activation='softmax'))                                                    

In [58]:
from tensorflow.keras.optimizers import SGD 
customized_optimizer=SGD(learning_rate=0.015)

In [59]:
model.compile(loss='categorical_crossentropy',optimizer=customized_optimizer,metrics=['accuracy'])

In [60]:
type(y_train)

pandas.core.frame.DataFrame

In [61]:
y_train=y_train.astype('category')
y_test=y_test.astype('category')

In [62]:
#Convert the target to categorical 
y_train=pd.get_dummies(y_train)
y_test=pd.get_dummies(y_test)
y_train

Unnamed: 0,Species_1,Species_2,Species_3
71,0,1,0
90,0,1,0
4,1,0,0
17,1,0,0
18,1,0,0
...,...,...,...
26,1,0,0
69,0,1,0
103,0,0,1
65,0,1,0


In [63]:
## Convert to numeric
y_train=y_train.values
y_test=y_test.values
X_train = X_train.values
X_test = X_test.values
y_train.shape


(105, 3)

In [64]:
ann = model.fit(X_train, y_train,
                    batch_size=64,
                    validation_split=0.2,
                    epochs= 100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [65]:
print(model.evaluate(X_train,y_train))
print(model.evaluate(X_test,y_test))

[0.7713578939437866, 0.8095238208770752]
[0.7842761278152466, 0.7555555701255798]


In [66]:
y_train_Pred = (model.predict(X_train)>0.5).astype("int32")
y_test_Pred = (model.predict(X_test)>0.5).astype("int32")



In [67]:
# Function for Classification Report
from sklearn.metrics import accuracy_score,classification_report
def classifcation_report_train_test(y_train, y_train_pred, y_test, y_test_pred):

    print('''
            =========================================
               CLASSIFICATION REPORT FOR TRAIN DATA
            =========================================
            ''')
    print(classification_report(y_train, y_train_pred, digits=4))

    print('''
            =========================================
               CLASSIFICATION REPORT FOR TEST DATA
            =========================================
            ''')
    print(classification_report(y_test, y_test_pred, digits=4))

In [68]:
classifcation_report_train_test(y_train, y_train_Pred, y_test, y_test_Pred)


               CLASSIFICATION REPORT FOR TRAIN DATA
            
              precision    recall  f1-score   support

           0     1.0000    0.9714    0.9855        35
           1     0.0000    0.0000    0.0000        35
           2     0.9231    0.3429    0.5000        35

   micro avg     0.9787    0.4381    0.6053       105
   macro avg     0.6410    0.4381    0.4952       105
weighted avg     0.6410    0.4381    0.4952       105
 samples avg     0.4381    0.4381    0.4381       105


               CLASSIFICATION REPORT FOR TEST DATA
            
              precision    recall  f1-score   support

           0     1.0000    0.8667    0.9286        15
           1     0.0000    0.0000    0.0000        15
           2     0.8571    0.4000    0.5455        15

   micro avg     0.9500    0.4222    0.5846        45
   macro avg     0.6190    0.4222    0.4913        45
weighted avg     0.6190    0.4222    0.4913        45
 samples avg     0.4222    0.4222    0.4222        45


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
