# Flu Shot Learning

## Importing Libraries

In [0]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf 
# Ask TensorFlow for the name of an available GPU device; as the last expression of the cell, the result is displayed.
tf.test.gpu_device_name() 

In [0]:
# Display the TensorFlow version this notebook is running against.
tf.__version__

# Data Preprocessing

## Importing the dataset

In [0]:
# Load the three competition CSVs in one pass; each one is indexed by the
# respondent identifier so rows can be matched across files.
features_dataset, test_dataset, labels_dataset = (
    pd.read_csv(csv_name, index_col="respondent_id")
    for csv_name in (
        'training_set_features.csv',
        'test_set_features.csv',
        'training_set_labels.csv',
    )
)

In [0]:
# Pull plain NumPy arrays out of the frames: every feature column for the
# training and test sets, and one label column per target.
X = features_dataset.to_numpy()
X_test1 = test_dataset.to_numpy()
y_h = labels_dataset.iloc[:, 0].to_numpy()    # first label column
y_s = labels_dataset.iloc[:, -1].to_numpy()   # last label column

## Missing Data

In [0]:
from sklearn.impute import SimpleImputer
# Replace every NaN with the most frequent value of its column. The imputer is
# fitted on the training matrix and the filled values are written back into X.
imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
X[:, :] = imputer.fit_transform(X)

In [0]:
# Reuse the imputer that was fitted on the training features instead of
# re-fitting it here: the test set must be filled with the training set's
# most-frequent values so both matrices go through identical preprocessing.
X_test1[:, :] = imputer.transform(X_test1)

## Encoding Categorical Data




In [0]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# Positions of the columns that are one-hot encoded; all other columns are
# passed through unchanged.
categorical_columns = [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 33, 34]
ct = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore': a category that only occurs in the test set
        # is encoded as all zeros instead of raising during transform.
        ('encoder', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ],
    remainder='passthrough',
    # Always return a dense array so np.array(...) below yields a 2-D matrix.
    sparse_threshold=0,
)
# Learn the category vocabulary from the training data only ...
X = np.array(ct.fit_transform(X))
# ... and apply that same fitted encoding to the test data. Re-fitting on the
# test set could produce a different column layout than the one the models
# are trained on.
X_test1 = np.array(ct.transform(X_test1))

In [0]:
# Fixed seed value; presumably intended to be passed as `random_state` to the
# stochastic steps of the notebook — TODO confirm where it is consumed.
RANDOM_SEED = 6

In [0]:
# Print the encoded training matrix as a quick sanity check.
print(X)

# Hyperparameter Tuning (SVM RBF, Random Forest, Logistic Regression)

## Training the models on the dataset

In [0]:
from sklearn import svm
from sklearn.model_selection import cross_val_score

In [0]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Candidate estimators and the hyperparameter grid searched for each one.
# The outer keys are the names shown in the results table; every entry holds
# an unfitted estimator under 'model' and its search space under 'params'.
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params': {
            'C': [1, 10, 20],
            'kernel': ['rbf'],
        },
    },
    'random_forest': {
        # Seeded so repeated runs build the same forests and report the same
        # cross-validation scores.
        'model': RandomForestClassifier(random_state=RANDOM_SEED),
        'params': {
            'n_estimators': [1, 5, 10],
        },
    },
    'logistic_regression': {
        # `multi_class` is intentionally not passed: with the liblinear solver
        # 'auto' always resolved to one-vs-rest, and the argument is
        # deprecated in recent scikit-learn releases.
        'model': LogisticRegression(solver='liblinear'),
        'params': {
            'C': [1, 5, 10],
        },
    },
}

In [0]:
# Exhaustive grid search (5-fold CV) over every candidate model, using the
# `y_h` labels as the target. One summary row is collected per model.
scores = []
from sklearn.model_selection import GridSearchCV
for model_name, mp in model_params.items():
    clf_h = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf_h.fit(X, y_h)
    # Read the results from the search object fitted in this iteration.
    scores.append({
        'model': model_name,
        'best_score': clf_h.best_score_,
        'best_params': clf_h.best_params_
    })

# NOTE: after the loop `clf_h` is the search for the LAST entry of
# `model_params`, not necessarily the best-scoring one.
df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df

In [0]:
# Randomized search (5-fold CV) over the same candidate models, using the
# `y_h` labels as the target. One summary row is collected per model.
scores = []
from sklearn.model_selection import RandomizedSearchCV
for model_name, mp in model_params.items():
    rs_h = RandomizedSearchCV(
        mp['model'],
        mp['params'],
        cv=5,
        return_train_score=False,
        random_state=RANDOM_SEED,  # make the sampled candidates reproducible
    )
    rs_h.fit(X, y_h)
    # Read the results from the search object fitted in this iteration.
    scores.append({
        'model': model_name,
        'best_score': rs_h.best_score_,
        'best_params': rs_h.best_params_
    })

df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df

## Predicting the test result

In [0]:
# Score the training features with `clf_h` (the search fitted last in the
# tuning loop). ROC AUC needs a continuous score, so take the predicted
# probability of the second class (column 1) rather than the hard labels
# returned by `predict`.
y_prob = clf_h.predict_proba(X)[:, 1]

In [0]:
# Predicted probability of the second class (column 1) for every test row.
# These values are written to the submission, which should hold probabilities
# rather than the hard labels returned by `predict`.
y_prob_h = clf_h.predict_proba(X_test1)[:, 1]

## ROC AUC on the training set (Logistic Regression, H1N1)

In [0]:
from sklearn.metrics import roc_curve, roc_auc_score
# Area under the ROC curve of the training-set scores against the `y_h` labels.
roc_auc_score(y_true=y_h, y_score=y_prob)

## Building the ANN

### Initializing the ANN

In [0]:
# Start from an empty sequential (layer-by-layer) Keras model.
ann_s = tf.keras.Sequential()

### Adding the input layer and the first hidden layer

In [0]:
# First hidden layer: 64 SELU units. No input shape is given, so Keras infers
# it from the first batch it sees.
ann_s.add(
    tf.keras.layers.Dense(units=64, activation='selu')
)

### Adding the second and third hidden layers

In [0]:
# Second hidden layer: 32 SELU units.
ann_s.add(
    tf.keras.layers.Dense(units=32, activation='selu')
)

In [0]:
# Third hidden layer: 16 SELU units.
ann_s.add(
    tf.keras.layers.Dense(units=16, activation='selu')
)

### Adding the output layer

In [0]:
# Output layer: a single sigmoid unit, so each prediction lies in (0, 1).
ann_s.add(
    tf.keras.layers.Dense(units=1, activation='sigmoid')
)

## Training the ANN

### Compiling the ANN

In [0]:
# Train with SGD on binary cross-entropy and report accuracy during training.
ann_s.compile(
    optimizer='SGD',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [0]:
# Cast the feature matrices and both label vectors to float32 before they are
# handed to the network.
X, X_test1, y_h, y_s = (
    np.asarray(array).astype(np.float32)
    for array in (X, X_test1, y_h, y_s)
)

### Training the ANN on the Training set

In [0]:
# Fit the network on the full training matrix against the `y_s` labels:
# 1000 epochs with mini-batches of 1024 rows.
ann_s.fit(x=X, y=y_s, batch_size=1024, epochs=1000)

## Part 4 - Making the predictions and evaluating the model

### Predicting the Test set results

In [0]:
# Network outputs for the training rows (used for the training-set ROC AUC).
y_prob = ann_s.predict(x=X)

In [0]:
# Network outputs for the test rows; these go into the submission file.
y_prob_s = ann_s.predict(x=X_test1)

### ROC AUC (training set)

In [0]:
from sklearn.metrics import roc_curve, roc_auc_score
# Area under the ROC curve of the network's training-set outputs against `y_s`.
roc_auc_score(y_true=y_s, y_score=y_prob)

# Submission

In [0]:
# Inspect the test-set predictions that go into the `h1n1_vaccine` column.
y_prob_h

In [0]:
# Inspect the test-set predictions that go into the `seasonal_vaccine` column.
y_prob_s

In [0]:
# Read the submission template (indexed by respondent) and preview its first rows.
submission_dataset = pd.read_csv(
    'submission_format.csv',
    index_col="respondent_id",
)
submission_dataset.head(5)

In [0]:
# The predictions were made in test-set row order, so the template must list
# exactly the same respondents in the same order.
np.testing.assert_array_equal(test_dataset.index.values, submission_dataset.index.values)
# Flatten before assigning: the network's single-unit output layer makes
# `predict` return an (n, 1) array, while a DataFrame column expects 1-D data.
# `np.ravel` leaves an array that is already 1-D unchanged.
submission_dataset["h1n1_vaccine"] = np.ravel(y_prob_h)
submission_dataset["seasonal_vaccine"] = np.ravel(y_prob_s)
submission_dataset.head()

In [0]:
# Write the filled-in template to disk, keeping the respondent index as the first column.
submission_dataset.to_csv(
    path_or_buf='team_20_submission_4.csv',
    index=True,
)

In [0]:
!head team_20_submission_24.csv

In [0]:
from google.colab import files
# Trigger a browser download (Google Colab) of the file that `to_csv` wrote;
# the name must match the one passed to `to_csv`.
files.download('team_20_submission_4.csv')