# First attempt

## Import the data

In [None]:
import pandas as pd
import numpy as np

# Load the training data from Google Drive. The file id is the second-to-last
# path segment of the share link; it is appended to the direct-download endpoint.
url = "https://drive.google.com/file/d/1ljJfs1Rue1PRouBeZVl3DabqWRrfI8ZL/view?usp=share_link"
file_id = url.split("/")[-2]
path = "https://drive.google.com/uc?export=download&id=" + file_id
df = pd.read_csv(path)

# Independent copy of the raw frame.
data = df.copy()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder,PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error,mean_squared_error,mean_absolute_percentage_error,r2_score
from scipy.sparse import csr_matrix
from scipy import sparse
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [None]:
# Separate the target column from the features, then hold out 20% of the rows
# for evaluation. The fixed random_state keeps the split repeatable.
y = df["poisonous"]
X = df.drop(columns=["poisonous"])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

## Data Exploration

In [None]:
# Peek at the first rows of the training features.
X_train.head()

Unnamed: 0,cap.shape,cap.color,bruises,stalk.color.above.ring,stalk.color.below.ring,population,Id
198,b,b,True,w,w,v,1832
4637,f,n,True,p,g,y,5006
3019,f,p,True,w,w,v,4040
2468,x,g,False,w,w,a,5533
6225,x,w,True,w,w,s,2710


In [None]:
# Column dtypes and non-null counts of the training features.
X_train.info() 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5199 entries, 198 to 3582
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   cap.shape               5199 non-null   object
 1   cap.color               5199 non-null   object
 2   bruises                 5199 non-null   bool  
 3   stalk.color.above.ring  5199 non-null   object
 4   stalk.color.below.ring  5199 non-null   object
 5   population              5199 non-null   object
 6   Id                      5199 non-null   int64 
dtypes: bool(1), int64(1), object(5)
memory usage: 289.4+ KB


The only numeric column of our train set is "Id". Our column transformer below will ignore it.

## Preprocessing the data

In [None]:
# Names of all non-numeric columns; these are the ones we encode.
cat_col = X_train.select_dtypes(exclude="number").columns

# Categorical branch: fill gaps with the placeholder "N_A" (X_train has no
# missing values, but X_test might), then one-hot encode. Categories unseen
# during fitting are ignored instead of raising.
categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OneHotEncoder(handle_unknown="ignore"),
)

# Numeric branch: useful if we had meaningful numeric columns. It is defined
# here but deliberately NOT registered in the transformer below.
numeric_pipe = make_pipeline(SimpleImputer(strategy="mean"))

# Only the categorical branch is wired in; the numeric column is ignored.
preprocessor = ColumnTransformer(
    transformers=[("category", categoric_pipe, cat_col)],
)

# No scaling step is appended: one-hot encoded data does not need it.
dt_pipeline = make_pipeline(preprocessor)

In [None]:
# Fit the preprocessing pipeline on the training split only, then reuse the
# fitted encoder on the held-out split, so both frames share the same columns.
# The sparse result is densified with .toarray(), which yields a plain ndarray;
# .todense() would return the legacy numpy.matrix type instead.
X_train_encoded = pd.DataFrame(dt_pipeline.fit_transform(X_train).toarray())

X_test_encoded = pd.DataFrame(dt_pipeline.transform(X_test).toarray())

## Neural network with 5 layers:
We have a binary classification problem, and therefore set the last activation to be the sigmoid function $f(x)=\frac{1}{1+\exp(-x)}$.

In [None]:
# Seed TensorFlow's global generator so the randomly initialised weights are
# repeatable on a fresh kernel (training can still vary slightly with hardware).
tf.random.set_seed(123)

# Take the input width from the encoded training matrix instead of hard-coding
# it: the number of one-hot columns depends on which categories are present in
# the training split.
n_features = X_train_encoded.shape[1]

# Five dense layers; the final sigmoid unit outputs the probability of the
# positive ("poisonous") class.
model = Sequential(
    [
        tf.keras.Input(shape=(n_features,)),
        Dense(units=30, activation='linear', name='layer1'),
        Dense(20, activation='relu', name='layer2'),
        Dense(5, activation='relu', name='layer3'),
        Dense(3, activation='linear', name='layer4'),
        Dense(1, activation='sigmoid', name='layer5'),
    ]
)

In [None]:
# Print each layer's output shape and parameter count.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 layer1 (Dense)              (None, 30)                1290      
                                                                 
 layer2 (Dense)              (None, 20)                620       
                                                                 
 layer3 (Dense)              (None, 5)                 105       
                                                                 
 layer4 (Dense)              (None, 3)                 18        
                                                                 
 layer5 (Dense)              (None, 1)                 4         
                                                                 
Total params: 2,037
Trainable params: 2,037
Non-trainable params: 0
_________________________________________________________________


The binary cross-entropy loss function $-y\log(\hat{y})-(1-y)\log(1-\hat{y})$ is the standard choice for binary classification problems. We use the Adam optimizer, which usually converges faster than plain gradient descent.

In [None]:
# Compile the model: binary cross-entropy loss, Adam with learning rate 0.01.
bce_loss = tf.keras.losses.BinaryCrossentropy()
adam = tf.keras.optimizers.Adam(learning_rate=0.01)
model.compile(loss=bce_loss, optimizer=adam)

# Fit the model for 60 epochs on the encoded training set. The call is left as
# the last expression so the returned History object is the cell's output.
model.fit(x=X_train_encoded, y=y_train, epochs=60)

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60


<keras.callbacks.History at 0x7f4a529d3280>

## Predictions of the model
The model is now ready to make predictions. 

In [None]:
# Run the fitted network on both encoded splits (train set first, then test set).
predictions_train_set, predictions_test_set = (
    model.predict(encoded_split)
    for encoded_split in (X_train_encoded, X_test_encoded)
)



Recall that the output of our model is a vector of real numbers, each representing the probability that a given mushroom (described by the corresponding row in the data set) is poisonous. We set our decision threshold to $0.25$: we classify a mushroom as poisonous if, and only if, our model predicts that its probability of being poisonous is strictly greater than $0.25$.

In [None]:
# Decision threshold: a probability strictly above it is labelled 1 (poisonous).
treshold = 0.25

# Take the first (only) column of each prediction array as a Series of scores.
train_scores = pd.DataFrame(predictions_train_set).iloc[:, 0]
test_scores = pd.DataFrame(predictions_test_set).iloc[:, 0]

# Compare against the threshold and store the 0/1 labels as plain lists.
yhat_train = list(train_scores.gt(treshold).astype(int))
yhat_test = list(test_scores.gt(treshold).astype(int))

Overview of the predictions.

In [None]:
# Show (predicted, actual) pairs for the first 20 test rows. zip walks the
# labels once (no list(y_test) rebuilt per row) and simply stops early when
# the test set has fewer than 20 rows instead of raising IndexError.
list(zip(yhat_test[:20], y_test))

[(1, 1),
 (0, 0),
 (0, 0),
 (1, 1),
 (1, 1),
 (0, 0),
 (0, 0),
 (0, 0),
 (1, 1),
 (1, 1),
 (1, 1),
 (1, 1),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (1, 1)]

## Model performance

We use the confusion matrix to evaluate the performance of our model. We compute: 
- $tn$: The number of true negatives. It corresponds to the number of edible mushrooms that our model accurately classified.
- $fp$: The number of false positives. It corresponds to the number of edible mushrooms that our model misclassified.
- $fn$: The number of false negatives. It corresponds to the number of poisonous mushrooms that our model misclassified.
- $tp$: The number of true positives. It corresponds to the number of poisonous mushrooms that our model accurately classified.

In [None]:
from sklearn.metrics import confusion_matrix

# labels=[0, 1] pins the matrix to 2x2 (rows = actual, columns = predicted), so
# the four-way unpacking below also works when a class is absent from the
# labels or the predictions. Flattened order: tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y_test, yhat_test, labels=[0, 1]).ravel()
tn, fp, fn, tp

(601, 70, 0, 629)

In [None]:
# Doing the task above manually. With 0/1 labels, each product is 1 exactly for
# the rows belonging to that cell of the confusion matrix, so summing counts
# them. Predictions and labels are walked pairwise with zip; the original
# rebuilt list(y_test) for every row, which made each count quadratic.
tp_test = sum(pred * true for pred, true in zip(yhat_test, y_test))
fp_test = sum(pred * (1 - true) for pred, true in zip(yhat_test, y_test))
tn_test = sum((1 - pred) * (1 - true) for pred, true in zip(yhat_test, y_test))
fn_test = sum((1 - pred) * true for pred, true in zip(yhat_test, y_test))
tn_test, fp_test, fn_test, tp_test


(601, 70, 0, 629)

In [None]:
# Download the data set to be scored; the Drive file id is the second-to-last
# path segment of the share link.
url = "https://drive.google.com/file/d/1rHAgVfd7vtZv3bj4Fb0MqS5PcRwOLC5I/view?usp=share_link"
path = 'https://drive.google.com/uc?export=download&id=' + url.split('/')[-2]
df_testing = pd.read_csv(path)

data_testing = df_testing.copy()

# Reuse the already-fitted preprocessing pipeline (transform only, no refit).
# .toarray() gives a plain ndarray; .todense() would return the legacy
# numpy.matrix type.
X_testing_encoded = pd.DataFrame(dt_pipeline.transform(data_testing).toarray())

# Predict probabilities and apply the same decision threshold as before.
predictions_testing = model.predict(X_testing_encoded)
yhat_testing = list((pd.DataFrame(predictions_testing).iloc[:, 0] > treshold).astype(int))



In [None]:
# Attach the predicted labels to the scored frame, then export only the
# identifier and the prediction (without the index) as the submission file.
data_testing["poisonous"] = yhat_testing
result = data_testing.loc[:, ["Id", "poisonous"]]
result.to_csv(path_or_buf="attemp_2_Gauss.csv", index=False)
