In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
# Load both CSV files into DataFrames: the first becomes the training frame,
# the second the test frame (read in that order).
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ("AllData.csv", "AllDatals.csv")
)

In [3]:
# Preview the first five rows of the training frame.
train_data.head(n=5)


Unnamed: 0,Region,Country,Year,Vaccine,Target,Vaccinated,Coverage
0,East Asia & Pacific,Australia,1997,DTP1,249000,227000,91
1,East Asia & Pacific,Australia,1997,DTP3,249000,192000,77
2,East Asia & Pacific,Australia,1997,Hib3,249000,192000,77
3,East Asia & Pacific,Australia,1997,MCV1,249000,212000,85
4,East Asia & Pacific,Australia,1997,Pol3,249000,192000,77


In [4]:
# Separate the target from the features: pop() returns the "Coverage" column
# and removes it from train_data in the same step.
y_train = train_data.pop("Coverage")

In [5]:
# Stack the training rows on top of the test rows so both go through the same
# preprocessing steps.
# DataFrame.append is deprecated and was removed in pandas 2.0; pd.concat is the
# supported equivalent. The two frames do not share identical columns
# ("Coverage" was removed from train_data only), so sort=False is passed
# explicitly: it keeps the columns in their existing order and avoids the
# "sorting because non-concatenation axis is not aligned" FutureWarning.
full_data = pd.concat([train_data, test_data], sort=False)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [6]:
# Remove the columns that are not used as model inputs.
# NOTE(review): "Unnamed: 0" is not in this list, so it stays in full_data as a
# feature. In the preview above it looks like a saved row index -- confirm
# whether it should be dropped as well.
drop_columns = ["Region", "Country", "Vaccine", "Target", "Vaccinated", "Coverage"]
full_data = full_data.drop(columns=drop_columns)


In [7]:
# One-hot encode the "Year" column into indicator columns, then replace any
# missing values that remain with 0.0.
full_data = pd.get_dummies(full_data, columns=["Year"]).fillna(value=0.0)

In [8]:
# Split the combined feature matrix back into its training and test parts.
#
# full_data holds train_data's rows followed by test_data's rows, so the
# boundary is the number of training rows. Deriving it from len(train_data)
# (instead of the previous hard-coded 0:98 / 99: slices) keeps X_train the same
# length as y_train and no longer skips the row at position 98.
n_train_rows = len(train_data)
feature_matrix = full_data.values

X_train = feature_matrix[:n_train_rows]
X_test = feature_matrix[n_train_rows:]

In [9]:
# Fit a min-max scaler on the training features only, then apply that same
# fitted scaler to both the training and the test features.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Hold out part of the training data for validation. The seed makes the split
# repeatable; test_size is the fraction of rows placed in the validation set.
# Note: this rebinds X_train and y_train to the reduced training subset, so
# running the cell a second time splits the already-split data again.
state = 12
test_size = 0.30

X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=test_size,
    random_state=state,
)

In [None]:
# Train one gradient-boosting classifier per candidate learning rate and print
# its accuracy on the training and validation sets so the rates can be compared.
lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

for learning_rate in lr_list:
    gb_clf = GradientBoostingClassifier(
        n_estimators=20,
        learning_rate=learning_rate,
        max_features=2,
        max_depth=2,
        random_state=0,
    )
    gb_clf.fit(X_train, y_train)

    print("Learning rate: ", learning_rate)

    train_accuracy = gb_clf.score(X_train, y_train)
    print("Accuracy score (training): {0:.3f}".format(train_accuracy))

    validation_accuracy = gb_clf.score(X_val, y_val)
    print("Accuracy score (validation): {0:.3f}".format(validation_accuracy))


In [None]:
# Refit a classifier with the learning rate chosen from the comparison (0.5),
# predict on the validation set, and print a confusion matrix followed by a
# per-class classification report.
gb_clf2 = GradientBoostingClassifier(
    n_estimators=20,
    learning_rate=0.5,
    max_features=2,
    max_depth=2,
    random_state=0,
)
predictions = gb_clf2.fit(X_train, y_train).predict(X_val)

# Each report is computed only after its heading has been printed.
reports = (
    ("Confusion Matrix:", confusion_matrix),
    ("Classification Report", classification_report),
)
for heading, build_report in reports:
    print(heading)
    print(build_report(y_val, predictions))