# Data and Setup

In [1]:
%%capture
# Core data-handling libraries
import pandas as pd
import numpy as np

# Visualisation libraries; switch matplotlib to its built-in 'ggplot' style sheet
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# Make seaborn's "pastel" palette the default colour cycle.
# This runs after plt.style.use so the pastel colours take precedence over the
# colour cycle that the style sheet installs.
sns.set_palette("pastel")

In [2]:
%%capture
# Mount the user's Google Drive inside the Colab runtime at /gdrive.
# force_remount=True asks for a fresh mount even when the drive is already
# mounted, so this cell can be re-run.
# NOTE(review): google.colab is presumably only importable on Colab — this cell
# will fail in other Jupyter environments.
from google.colab import drive
drive.mount('/gdrive',force_remount=True)

In [3]:
# Read the SDG statistics CSV from the mounted Drive folder into a DataFrame.
sdg_csv_path = '/gdrive/My Drive/Github/SDG-Econ-causality/Data/SDG Stats/SDGData.csv'
SDGData = pd.read_csv(sdg_csv_path)

# Preparing Data 

In [None]:
# Hold out a test set for evaluating the classifiers in the cells below.
# random_state=42 fixes the shuffle so the split is reproducible; the split
# proportion is left at scikit-learn's default.
# NOTE(review): X and y are not defined in any cell shown before this one (only
# SDGData is loaded) — TODO build the feature matrix and target from SDGData
# first, otherwise a fresh-kernel "Run All" stops here with a NameError.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Modeling - Voting Classifier

In [None]:
# 1. Build multiple classification models
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# An ensemble tends to help most when its members take different
# methodological approaches, hence three different model families here.
# Every model is seeded with random_state=42.
log_clf, rnd_clf, svm_clf = (
    LogisticRegression(random_state=42),      # logistic regression, default C
    RandomForestClassifier(random_state=42),  # random forest
    SVC(random_state=42),                     # support vector machine
)

In [None]:
from sklearn.ensemble import VotingClassifier

# Plan: every base model predicts the test labels, and the ensemble combines
# those per-model predictions into a single final prediction.
#
# estimators    : (name, model) pairs; the names let tools such as GridSearchCV
#                 refer to an individual member.
# voting='hard' : the final label is the majority vote over the members'
#                 predicted labels.
voting_clf = VotingClassifier(
    voting='hard',
    estimators=[
        ('lr', log_clf),
        ('rf', rnd_clf),
        ('svc', svm_clf),
    ],
)

In [None]:
# Fit the hard-voting ensemble on the training split, then report its
# accuracy on the held-out test split.
vmodel = voting_clf.fit(X_train, y_train)
hard_voting_accuracy = vmodel.score(X_test, y_test)
print(hard_voting_accuracy)

In [None]:

# Baseline: fit each model on its own and print its test accuracy so it can be
# compared with the ensemble's score.
# Printed in this order: logistic regression, random forest, SVM.
log_clf = LogisticRegression(random_state=42)  # default C
rnd_clf = RandomForestClassifier(random_state=42)
svm_clf = SVC(random_state=42)

for base_clf in (log_clf, rnd_clf, svm_clf):
    print(base_clf.fit(X_train, y_train).score(X_test, y_test))

In [None]:
# Soft voting: the ensemble averages the class probabilities predicted by its
# members and picks the class with the highest average.
# Every member therefore has to produce probabilities; SVC only does so when it
# is constructed with probability=True.
svm_clf = SVC(random_state=42, probability=True)
rnd_clf = RandomForestClassifier(random_state=42)
log_clf = LogisticRegression(random_state=42)

voting_clf = VotingClassifier(
    voting='soft',
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
)
voting_clf.fit(X_train, y_train)

In [None]:
# The previous cell already builds this exact soft-voting ensemble (same three
# estimators, same fixed random_state) and fits it on X_train / y_train.
# Rebuilding and refitting it here only repeated that training, so reuse the
# fitted ensemble instead of training it a second time.
# The assertion guards against running this cell while voting_clf still refers
# to the earlier hard-voting ensemble (out-of-order execution).
assert voting_clf.voting == 'soft', "Run the soft-voting cell above before this one."
vmodel = voting_clf

In [None]:
# Accuracy of the soft-voting ensemble on the held-out test split.
soft_voting_accuracy = vmodel.score(X_test, y_test)
print(soft_voting_accuracy)
