In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.patches as mpatches
from plotly.subplots import make_subplots

import seaborn as sns
# Configure seaborn once: white-grid style with colour codes enabled.
sns.set(style="whitegrid", color_codes=True)
import plotly.figure_factory as ff
from plotly.colors import n_colors

In [None]:
# Read the H1N1 vaccine CSV into the working frame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="h1n1_vaccine_prediction.csv")
df.head(n=5)

In [None]:
# Summarise the frame: column names, non-null counts and dtypes.
df.info()

In [None]:
# Show the dtype of every column in the raw frame.
df.dtypes

# data visualization

In [None]:
# Number of rows per value of "sex".
sns.countplot(x="sex", data=df)
# show() must be called; the bare name `plt.show` only echoes the function object.
plt.show()

In [None]:
# Number of rows per value of "marital_status".
sns.countplot(x="marital_status", data=df)
# show() must be called; the bare name `plt.show` only echoes the function object.
plt.show()

In [None]:
# Number of rows per value of "employment".
sns.countplot(x="employment", data=df)
# show() must be called; the bare name `plt.show` only echoes the function object.
plt.show()

In [None]:
# Row counts per "race", split into one bar per "sex".
sns.countplot(data=df, x="race", hue="sex")
plt.show()

In [None]:
# Row counts per "marital_status", split into one bar per "housing_status".
sns.countplot(data=df, x="marital_status", hue="housing_status")
plt.show()

In [None]:
# Row counts per "employment", split into one bar per "housing_status".
sns.countplot(data=df, x="employment", hue="housing_status")
plt.show()

In [None]:
# Row counts per "employment", split into one bar per "income_level".
sns.countplot(data=df, x="employment", hue="income_level")
plt.show()

In [None]:
# Violin plot with "no_of_adults" on the x axis and "income_level" on the y axis.
sns.catplot(data=df, kind="violin", x="no_of_adults", y="income_level")

In [None]:
# Same violin plot with the axes swapped: "income_level" on x, "no_of_adults" on y.
sns.catplot(data=df, kind="violin", x="income_level", y="no_of_adults")

# Data cleaning

In [None]:
# Count the missing values in each column before any cleaning.
df.isna().sum()

In [None]:
# Drop the identifier column and the features left out of the model.
# Every label appears exactly once in the list.
df = df.drop(
    columns=[
        "unique_id",
        "income_level",
        "race",
        "qualification",
        "h1n1_worry",
        "employment",
        "marital_status",
        "housing_status",
        "dr_recc_seasonal_vacc",
        "census_msa",
        "contact_avoidance",
        "h1n1_awareness",
        "reduced_outside_home_cont",
        "is_health_worker",
        "avoid_touch_face",
        "bought_face_mask",
        "wash_hands_frequently",
        "avoid_large_gatherings",
    ]
)

In [None]:
# Count the missing values per column that remain after dropping columns.
df.isna().sum()

In [None]:
# One-hot encode the listed columns; all other columns pass through unchanged.
df = pd.get_dummies(
    data=df,
    columns=["sex", "no_of_adults", "no_of_children", "age_bracket"],
)

In [None]:
# Preview the first five rows of the encoded frame.
df.head(n=5)

In [None]:
# (rows, columns) of the frame after one-hot encoding.
df.shape

In [None]:
# Show the dtype of every column after one-hot encoding.
df.dtypes

In [None]:
# Count the missing values per column that still need imputing.
df.isna().sum()

In [None]:
# Fill missing "dr_recc_h1n1_vacc" values with that column's median.
median1 = df["dr_recc_h1n1_vacc"].median()
df["dr_recc_h1n1_vacc"] = df["dr_recc_h1n1_vacc"].fillna(median1)

In [None]:
# Fill missing "chronic_medic_condition" values with that column's median.
median1 = df["chronic_medic_condition"].median()
df["chronic_medic_condition"] = df["chronic_medic_condition"].fillna(median1)

In [None]:
# Fill missing "cont_child_undr_6_mnths" values with that column's median.
median1 = df["cont_child_undr_6_mnths"].median()
df["cont_child_undr_6_mnths"] = df["cont_child_undr_6_mnths"].fillna(median1)

In [None]:
# Fill missing "has_health_insur" values with that column's median.
median1 = df["has_health_insur"].median()
df["has_health_insur"] = df["has_health_insur"].fillna(median1)

In [None]:
# Fill missing "is_h1n1_vacc_effective" values with that column's own median.
# The median must come from the column being filled, not from "has_health_insur".
median1 = df["is_h1n1_vacc_effective"].median()
df["is_h1n1_vacc_effective"] = df["is_h1n1_vacc_effective"].fillna(median1)

In [None]:
# Re-check the per-column missing-value counts part-way through imputation.
df.isna().sum()

In [None]:
# Fill missing "is_h1n1_risky" values with that column's median.
median1 = df["is_h1n1_risky"].median()
df["is_h1n1_risky"] = df["is_h1n1_risky"].fillna(median1)

In [None]:
# Fill missing "sick_from_h1n1_vacc" values with that column's median.
median1 = df["sick_from_h1n1_vacc"].median()
df["sick_from_h1n1_vacc"] = df["sick_from_h1n1_vacc"].fillna(median1)

In [None]:
# Fill missing "is_seas_vacc_effective" values with that column's median.
median1 = df["is_seas_vacc_effective"].median()
df["is_seas_vacc_effective"] = df["is_seas_vacc_effective"].fillna(median1)

In [None]:
# Fill missing "is_seas_risky" values with that column's median.
median1 = df["is_seas_risky"].median()
df["is_seas_risky"] = df["is_seas_risky"].fillna(median1)

In [None]:
# Fill missing "antiviral_medication" values with that column's median.
median1 = df["antiviral_medication"].median()
df["antiviral_medication"] = df["antiviral_medication"].fillna(median1)

In [None]:
# Fill missing "sick_from_seas_vacc" values with that column's median.
median1 = df["sick_from_seas_vacc"].median()
df["sick_from_seas_vacc"] = df["sick_from_seas_vacc"].fillna(median1)

In [None]:
# Final per-column missing-value count after the imputation cells.
df.isna().sum()

# classifier

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
# Target: "h1n1_vaccine", kept as a one-column frame. Features: every other column.
y = df[["h1n1_vaccine"]]
x = df.drop(columns=["h1n1_vaccine"])

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC

In [None]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=1,
)

# logistic regression

In [None]:
# Logistic regression with scikit-learn's default settings.
model_log = LogisticRegression()

In [None]:
# Y_train is a one-column frame; flatten it to 1-D so scikit-learn does not
# emit a DataConversionWarning about a column-vector y.
model_log.fit(X_train, np.ravel(Y_train))

In [None]:
# Mean accuracy of the logistic model on the held-out test split.
model_log.score(X=X_test, y=Y_test)

In [None]:
# Mean accuracy of the logistic model on the training split.
model_log.score(X=X_train, y=Y_train)

In [None]:
from sklearn.metrics import confusion_matrix,classification_report, accuracy_score

In [None]:
from sklearn import metrics

# Confusion matrix of the logistic model on the test split,
# with rows and columns ordered as class 1, then class 0.
prediction = model_log.predict(X_test)
cm = metrics.confusion_matrix(Y_test, prediction, labels=[1, 0])
df_cm = pd.DataFrame(
    cm,
    index=["1", "0"],
    columns=["Predict 1", "Predict 0"],
)

sns.heatmap(df_cm, annot=True, fmt='g')

# decision tree classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision tree limited to a depth of two.
model_dtc = DecisionTreeClassifier(max_depth=2)

In [None]:
# Train the decision tree on the training split.
model_dtc.fit(X=X_train, y=Y_train)

In [None]:
# Mean accuracy of the decision tree on the held-out test split.
model_dtc.score(X=X_test, y=Y_test)

In [None]:
# Mean accuracy of the decision tree on the training split.
model_dtc.score(X=X_train, y=Y_train)

In [None]:
from sklearn import metrics

# Confusion matrix of the decision tree on the test split,
# with rows and columns ordered as class 1, then class 0.
prediction = model_dtc.predict(X_test)
cm = metrics.confusion_matrix(Y_test, prediction, labels=[1, 0])
df_cm = pd.DataFrame(
    cm,
    index=["1", "0"],
    columns=["Predict 1", "Predict 0"],
)

sns.heatmap(df_cm, annot=True, fmt='g')

# random forest classifier (bagging ensemble)

In [None]:
# Random forest of 15 trees; each split considers 30% of the features and
# needs at least 40 samples. random_state pins the bootstrap and feature
# sampling so the scores are reproducible between runs.
rfc = RandomForestClassifier(
    n_estimators=15,
    max_features=0.3,
    min_samples_split=40,
    random_state=1,
)

In [None]:
# Y_train is a one-column frame; flatten it to 1-D so scikit-learn does not
# emit a DataConversionWarning about a column-vector y.
rfc.fit(X_train, np.ravel(Y_train))

In [None]:
# Mean accuracy of the random forest on the training split.
rfc.score(X=X_train, y=Y_train)

In [None]:
# Mean accuracy of the random forest on the held-out test split.
rfc.score(X=X_test, y=Y_test)

In [None]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Confusion matrix of the random forest on the test split,
# with rows and columns ordered as class 1, then class 0.
predictions = rfc.predict(X_test)
cm = metrics.confusion_matrix(Y_test, predictions, labels=[1, 0])
df_cm = pd.DataFrame(
    cm,
    index=["1", "0"],
    columns=["Predict 1", "Predict 0"],  # label spelled consistently with "Predict 0"
)
sns.heatmap(df_cm, annot=True, fmt='g')

# Test-set accuracy of the random forest.
rfcp = accuracy_score(Y_test, predictions)

# KNeighbors Classifier


In [None]:
# k-nearest-neighbours classifier with scikit-learn's default settings.
knc = KNeighborsClassifier()

In [None]:
# Y_train is a one-column frame; flatten it to 1-D so scikit-learn does not
# emit a DataConversionWarning about a column-vector y.
knc.fit(X_train, np.ravel(Y_train))

In [None]:
# Mean accuracy of the k-NN model on the training split.
knc.score(X=X_train, y=Y_train)

In [None]:
# Mean accuracy of the k-NN model on the held-out test split.
knc.score(X=X_test, y=Y_test)

In [None]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Confusion matrix of the k-NN model on the test split,
# with rows and columns ordered as class 1, then class 0.
predictions = knc.predict(X_test)
cm = metrics.confusion_matrix(Y_test, predictions, labels=[1, 0])
df_cm = pd.DataFrame(
    cm,
    index=["1", "0"],
    columns=["Predict 1", "Predict 0"],  # label spelled consistently with "Predict 0"
)
sns.heatmap(df_cm, annot=True, fmt='g')

# Test-set accuracy of the k-NN model.
kncp = accuracy_score(Y_test, predictions)

# Gradient Boosting Classifier

In [None]:
# Gradient boosting with 50 stages and a learning rate of 0.99.
gbc = GradientBoostingClassifier(n_estimators=50, learning_rate=0.99)

In [None]:
# Y_train is a one-column frame; flatten it to 1-D so scikit-learn does not
# emit a DataConversionWarning about a column-vector y.
gbc.fit(X_train, np.ravel(Y_train))

In [None]:
# Mean accuracy of the gradient-boosting model on the training split.
gbc.score(X=X_train, y=Y_train)

In [None]:
# Mean accuracy of the gradient-boosting model on the held-out test split.
gbc.score(X=X_test, y=Y_test)

In [None]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Confusion matrix of the gradient-boosting model on the test split,
# with rows and columns ordered as class 1, then class 0.
predictions = gbc.predict(X_test)
cm = metrics.confusion_matrix(Y_test, predictions, labels=[1, 0])
df_cm = pd.DataFrame(
    cm,
    index=["1", "0"],
    columns=["Predict 1", "Predict 0"],  # label spelled consistently with "Predict 0"
)
sns.heatmap(df_cm, annot=True, fmt='g')

# Stored under its own name so the k-NN accuracy in `kncp` is not overwritten.
gbcp = accuracy_score(Y_test, predictions)

# AdaBoost Classifier

In [None]:
# AdaBoost with 50 estimators.
# NOTE(review): learning_rate=1e-7 is extremely small, so each boosting round
# presumably contributes almost nothing; confirm this value is intentional.
adc = AdaBoostClassifier(n_estimators=50, learning_rate=0.0000001)

In [None]:
# Y_train is a one-column frame; flatten it to 1-D so scikit-learn does not
# emit a DataConversionWarning about a column-vector y.
adc.fit(X_train, np.ravel(Y_train))

In [None]:
# Mean accuracy of the AdaBoost model on the training split.
adc.score(X=X_train, y=Y_train)

In [None]:
# Mean accuracy of the AdaBoost model on the held-out test split.
adc.score(X=X_test, y=Y_test)

In [None]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Confusion matrix of the AdaBoost model on the test split,
# with rows and columns ordered as class 1, then class 0.
predictions = adc.predict(X_test)
cm = metrics.confusion_matrix(Y_test, predictions, labels=[1, 0])
df_cm = pd.DataFrame(
    cm,
    index=["1", "0"],
    columns=["Predict 1", "Predict 0"],  # label spelled consistently with "Predict 0"
)
sns.heatmap(df_cm, annot=True, fmt='g')

# Stored under its own name so the k-NN accuracy in `kncp` is not overwritten.
adcp = accuracy_score(Y_test, predictions)

# support vector classifier

In [None]:
# Support vector classifier with scikit-learn's default settings.
# This section must build an SVC, not a second AdaBoostClassifier.
svc = SVC()

In [None]:
# Y_train is a one-column frame; flatten it to 1-D so scikit-learn does not
# emit a DataConversionWarning about a column-vector y.
svc.fit(X_train, np.ravel(Y_train))

In [None]:
# Mean accuracy of the `svc` model on the training split.
svc.score(X=X_train, y=Y_train)

In [None]:
# Mean accuracy of the `svc` model on the held-out test split.
svc.score(X=X_test, y=Y_test)

In [None]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Confusion matrix of the `svc` model on the test split,
# with rows and columns ordered as class 1, then class 0.
predictions = svc.predict(X_test)
cm = metrics.confusion_matrix(Y_test, predictions, labels=[1, 0])
df_cm = pd.DataFrame(
    cm,
    index=["1", "0"],
    columns=["Predict 1", "Predict 0"],  # label spelled consistently with "Predict 0"
)
sns.heatmap(df_cm, annot=True, fmt='g')

# Stored under its own name so the k-NN accuracy in `kncp` is not overwritten.
svcp = accuracy_score(Y_test, predictions)