# Leitura dos dados

In [960]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Carregamento dos dados
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

Tipos:
Id                 int64
Program           object
Y0s1_enrol       float64
Y0s2_enrol       float64
Y1s1_enrol       float64
Y1s1_complete    float64
Y1s1_grade       float64
Y1s2_enrol       float64
Y1s2_complete    float64
Y1s2_grade       float64
Y2s1_enrol       float64
Y2s1_complete    float64
Y2s1_grade       float64
Y2s2_enrol       float64
Y2s2_complete    float64
Y2s2_grade       float64
Y3s1_enrol       float64
Y3s1_complete    float64
Y3s1_grade       float64
Y3s2_enrol       float64
Y3s2_complete    float64
Y3s2_grade       float64
Y4s1_enrol       float64
Y4s1_complete    float64
Y4s1_grade       float64
Y4s2_enrol       float64
Y4s2_complete    float64
Y4s2_grade       float64
Rest_enrol       float64
Rest_complete    float64
Rest_grade       float64
Failure            int64
dtype: object
Linhas & Colunas:
(1846, 32)
Colunas:
Index(['Id', 'Program', 'Y0s1_enrol', 'Y0s2_enrol', 'Y1s1_enrol',
       'Y1s1_complete', 'Y1s1_grade', 'Y1s2_enrol', 'Y1s2_complete',
      

# Tratamento dos dados

In [None]:
#Descrição da Data para termos a noção o tipo de dados com que estamos a trabalhar
def describe_data(df):
    print("Tipos:")
    print(df.dtypes)
    print("Linhas & Colunas:")
    print(df.shape)
    print("Colunas:")
    print(df.columns)
    print("Valores Vazios:")
    print(df.apply(lambda x: sum(x.isnull()) / len(df)))

describe_data(train)
describe_data(test)

#Conversão das possibilidades de 'Program' para inteiros, para podermos usar esse atributo para treinar e testar os dados
program_map = {'Biology': 0, 'Informatics': 1, 'Management': 2, 'Nursing': 3}

test['Program'] = test['Program'].map(program_map)
train['Program'] = train['Program'].map(program_map)

In [961]:
train.head(3)

Unnamed: 0,Id,Program,Y0s1_enrol,Y0s2_enrol,Y1s1_enrol,Y1s1_complete,Y1s1_grade,Y1s2_enrol,Y1s2_complete,Y1s2_grade,...,Y4s1_enrol,Y4s1_complete,Y4s1_grade,Y4s2_enrol,Y4s2_complete,Y4s2_grade,Rest_enrol,Rest_complete,Rest_grade,Failure
0,131386709,2,46.0,30.0,36.5,36.5,11.452,36.0,36.0,13.236,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,111354305,3,30.0,24.0,30.0,30.0,14.933,30.0,30.0,17.333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,111345828,3,30.0,40.0,26.0,26.0,17.0,20.0,10.0,16.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [962]:
test.head(3)

Unnamed: 0,Id,Program,Y0s1_enrol,Y0s2_enrol,Y1s1_enrol,Y1s1_complete,Y1s1_grade,Y1s2_enrol,Y1s2_complete,Y1s2_grade,...,Y3s2_grade,Y4s1_enrol,Y4s1_complete,Y4s1_grade,Y4s2_enrol,Y4s2_complete,Y4s2_grade,Rest_enrol,Rest_complete,Rest_grade
0,131394910,2,39.0,29.5,27.0,6.0,10.0,42.0,0.0,0.0,...,0.0,41.5,0.0,0.0,29.0,10.0,10.8,143.0,0.0,0.0
1,131391329,0,23.0,24.0,32.0,28.0,13.143,40.0,30.0,14.0,...,12.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,111345779,3,30.0,30.0,26.0,26.0,15.192,30.0,30.0,16.0,...,13.067,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Split dos dados

In [963]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

#Split data
X = train.drop(['Failure'], axis=1)
y = train['Failure']    

X_train, X_test, y_train, y_test = train_test_split(X, y,  test_size=0.33, random_state=42)

## Previsão e comparação de accuracy entre os algoritmos

* Após o Split fomos testar diferentes algoritmos, com os seus parametros padrão, para termos um ponto de partida de qual algoritmo poderá o ser o melhor a ser usado!
* Para essa seleção olhamos para a accuracy de cada um

In [964]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

#RandomForestClassifier
RF = RandomForestClassifier(n_estimators=500, max_leaf_nodes=1000)
RF.fit(X_train, y_train)
y_pred = RF.predict(X_test)
print(classification_report(y_test, y_pred))
print("Train Score:", RF.score(X_train, y_train))
print("Test Score:", RF.score(X_test, y_test))
RF_acc = round(accuracy_score(y_pred, y_test)*100, 2)
print(RF_acc)

              precision    recall  f1-score   support

           0       0.95      0.98      0.97       448
           1       0.95      0.86      0.90       162

    accuracy                           0.95       610
   macro avg       0.95      0.92      0.93       610
weighted avg       0.95      0.95      0.95       610

Train Score: 1.0
Test Score: 0.9491803278688524
94.92


In [965]:
from sklearn.neighbors import KNeighborsClassifier

#KNN
KNN = KNeighborsClassifier()
KNN.fit(X_train, y_train)
y_pred = KNN.predict(X_test)
print(classification_report(y_test, y_pred))
print("Train Score:", KNN.score(X_train, y_train))
print("Test Score:", KNN.score(X_test, y_test))
KNN_acc = round(accuracy_score(y_pred, y_test)*100, 2)
print(KNN_acc)

              precision    recall  f1-score   support

           0       0.84      0.94      0.89       448
           1       0.76      0.51      0.61       162

    accuracy                           0.83       610
   macro avg       0.80      0.72      0.75       610
weighted avg       0.82      0.83      0.81       610

Train Score: 0.8843042071197411
Test Score: 0.8262295081967214
82.62


In [966]:
from sklearn.linear_model import LogisticRegression

#Logistic Regression
LR = LogisticRegression()
LR.fit(X_train, y_train)
y_pred = LR.predict(X_test)
print(classification_report(y_test, y_pred))
print("Train Score:", LR.score(X_train, y_train))
print("Test Score:", LR.score(X_test, y_test))
LR_acc = round(accuracy_score(y_pred, y_test)*100, 2)
print(LR_acc)

              precision    recall  f1-score   support

           0       0.73      1.00      0.85       448
           1       0.00      0.00      0.00       162

    accuracy                           0.73       610
   macro avg       0.37      0.50      0.42       610
weighted avg       0.54      0.73      0.62       610

Train Score: 0.7265372168284789
Test Score: 0.7344262295081967
73.44


In [967]:
from sklearn.tree import DecisionTreeClassifier

#DecisionTreeClassifier
DT = DecisionTreeClassifier()
DT.fit(X_train, y_train)
y_pred = LR.predict(X_test)
DT_acc = round(accuracy_score(y_pred, y_test)*100, 2)
print(classification_report(y_test, y_pred))
print("Train Score:", DT.score(X_train, y_train))
print("Test Score:", DT.score(X_test, y_test))
DT_acc = round(accuracy_score(y_pred, y_test)*100, 2)
print(DT_acc)

              precision    recall  f1-score   support

           0       0.73      1.00      0.85       448
           1       0.00      0.00      0.00       162

    accuracy                           0.73       610
   macro avg       0.37      0.50      0.42       610
weighted avg       0.54      0.73      0.62       610

Train Score: 1.0
Test Score: 0.9065573770491804
73.44


In [968]:
from sklearn.ensemble import GradientBoostingClassifier

#Gradient Boosting Classifier
GBC = GradientBoostingClassifier()
GBC.fit(X_train, y_train)
y_pred = GBC.predict(X_test)
GBC_acc = round(accuracy_score(y_pred, y_test)*100, 2)
print(classification_report(y_test, y_pred))
print("Train Score:", GBC.score(X_train, y_train))
print("Test Score:", GBC.score(X_test, y_test))
GBC_acc = round(accuracy_score(y_pred, y_test)*100, 2)
print(GBC_acc)

              precision    recall  f1-score   support

           0       0.95      0.97      0.96       448
           1       0.92      0.85      0.88       162

    accuracy                           0.94       610
   macro avg       0.93      0.91      0.92       610
weighted avg       0.94      0.94      0.94       610

Train Score: 0.9951456310679612
Test Score: 0.940983606557377
94.1


In [969]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes
GNB = GaussianNB()
GNB.fit(X_train, y_train)
y_pred = GNB.predict(X_test)
GNB_acc = round(accuracy_score(y_pred, y_test)*100, 2)
print(classification_report(y_test, y_pred))
print("Train Score:", GNB.score(X_train, y_train))
print("Test Score:", GNB.score(X_test, y_test))
GNB_acc = round(accuracy_score(y_pred, y_test)*100, 2)
print(GNB_acc)

              precision    recall  f1-score   support

           0       0.97      0.19      0.32       448
           1       0.31      0.98      0.47       162

    accuracy                           0.40       610
   macro avg       0.64      0.59      0.39       610
weighted avg       0.79      0.40      0.36       610

Train Score: 0.3907766990291262
Test Score: 0.4016393442622951
40.16


## Tabela de comparação com base na accuracy

In [970]:
#Comparação dos Modelos Testados para averiguação de qual o possivel a ser usado
models = pd.DataFrame({
    'Modelo' : ['RandomForestClassifier', 'KNN', 'Logistic Regression', 'DecisionTreeClassifier', 'Gradient Boosting Classifier', 'Gaussian Naive Bayes'],
    'Accuracy': [RF_acc, KNN_acc, LR_acc, DT_acc, GBC_acc, GNB_acc]})

models.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Modelo,Accuracy
0,RandomForestClassifier,94.92
4,Gradient Boosting Classifier,94.1
1,KNN,82.62
2,Logistic Regression,73.44
3,DecisionTreeClassifier,73.44
5,Gaussian Naive Bayes,40.16


### Resultado
 
* Com base na tabela acima representada, verificamos que o algoritmo com melhor Accuracy foi o Random Forest.
* Assim sendo, prosseguimos para o teste dos dados com esse algoritmo

## Modelos 

* O primeiro parametro que começamos a alterar, em prol, da obtenção de um modelo mais preciso foi o n_estimators.
* O parametro n_estimators, representa o número de árvores que a floresta irá ter. Esse número é 100 de padrão.

In [971]:
#RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

RF = RandomForestClassifier(n_estimators=500)
RF.fit(X_train, y_train)
y_pred = RF.predict(X_test)
print(classification_report(y_test, y_pred))
print("Train Score:", RF.score(X_train, y_train))
print("Test Score:", RF.score(X_test, y_test))
RF_acc = round(accuracy_score(y_pred, y_test)*100, 2)
print(RF_acc)

              precision    recall  f1-score   support

           0       0.95      0.98      0.96       448
           1       0.93      0.85      0.89       162

    accuracy                           0.94       610
   macro avg       0.94      0.91      0.92       610
weighted avg       0.94      0.94      0.94       610

Train Score: 1.0
Test Score: 0.9426229508196722
94.26


### Resultados obtidos:
* Para o algoritmo foram testados diferentes número de estimadores(100(padrão), 200, 500, 800, 100)

* Os resulatados no Kaggle foram:
<br>
&nbsp&nbsp&nbsp&nbsp&nbsp&nbsp;
Para 100 estimadores: 0.95172
<br>
&nbsp&nbsp&nbsp&nbsp&nbsp&nbsp;
Para 200 estimadores: 0.95172
<br>
&nbsp&nbsp&nbsp&nbsp&nbsp&nbsp;
Para 500 estimadores: 0.95833
<br>
&nbsp&nbsp&nbsp&nbsp&nbsp&nbsp;
Para 800 estimadores: 0.95833
<br>
&nbsp&nbsp&nbsp&nbsp&nbsp&nbsp;
Para 1000 estimadores: 0.95172

### Conclusões:
* Entre 100 e 200 e até 500, score manteve-se inalterável
* Em 500, score sobe consideravelmente
* Entre 500 a 800, score constante
* Entre 800 e 100, score volta a decrescer
<br>
<br>
* **Prosseguimos com o valor de 500 estimadores**

## Outros parametros testados
* max_depth, este parametro representa a profundidade máxima da árvore

* Foram testados os valores:
<br>
&nbsp&nbsp&nbsp&nbsp&nbsp&nbsp;
Valor padrão - None
<br>
&nbsp&nbsp&nbsp&nbsp&nbsp&nbsp;
maxp_depth = 10
<br>
&nbsp&nbsp&nbsp&nbsp&nbsp&nbsp;
maxp_depth = 4


### Conclusões
* **Quer com o valor padrão, quer com 10, o score manteve-se constante em 0.95833**
* **Com max_depth=4 e 500 estimadores, score desceu para 0.93617**

<br>

* max_leaf_nodes, este parametro representa o numero de folhas da árvore

* Foram testados os valores:
<br>
&nbsp&nbsp&nbsp&nbsp&nbsp&nbsp;
max_leaf_nodes = 1000
<br>
&nbsp&nbsp&nbsp&nbsp&nbsp&nbsp;
max_leaf_nodes = 500
<br>
&nbsp&nbsp&nbsp&nbsp&nbsp&nbsp;
max_leaf_nodes = 250
<br>
&nbsp&nbsp&nbsp&nbsp&nbsp&nbsp;
max_leaf_nodes = 50
<br>
&nbsp&nbsp&nbsp&nbsp&nbsp&nbsp;
max_leaf_nodes = 30

### Conclusões
* **Os valores de scores obtidos no kaggle foram todos diferentes. Sendo mais elevado aquele em que max_leaf_nodes=500 e max_leaf_nodes=1000 e menor aquele em que max_leaf_nodes tinha o menor valor, 30.**


# Produção dos Ficheiros de Submissão

In [972]:
ids_for_test = test['Id']

#Constrição do Ficheiro de Submissão
submission = pd.DataFrame({
        "Id": ids_for_test,
        "Failure": RF.predict(test)
    })

submission.to_csv('submission_46.csv', index=False)

* Foram testados diversas combinações dos parametros acim mencionados, porém nunca obtendo score superior a 0.95833.

### Outros Testes

* Foram tambem testados ainda, modelos em que fossem excluídos alguns atributos. Atributos esses que tinham o menor peso no calculo da accuracy.
* Obtivemos esses valor de peso apartir da seguinte tabela.

In [None]:
feature_importances_df = pd.DataFrame(
    {"feature": list(X.columns), "importance": classifier.feature_importances_}
).sort_values("importance", ascending=False)

feature_importances_df

In [None]:
#Visualização das importâncias através de um gráfico de barras

sns.barplot(x=feature_importances_df.feature, y=feature_importances_df.importance)

plt.xlabel("Feature Importance Score")
plt.ylabel("Features")
plt.title("Visualizing Important Features")
plt.xticks(
    rotation=45, horizontalalignment="right", fontweight="light", fontsize="x-large"
)
plt.show()

* Testes estes sem sucesso, com isto dizemos, sem incremento do valor do score, relativamente ao maior obtido

# 5 Melhores submissões no Kaggle

### Submissões
* Notas: 
&nbsp&nbsp&nbsp&nbsp&nbsp&nbsp;
<br>
Todas as submissões foram feitas usando todos os atributos presentes no ficheiro test.csv
&nbsp&nbsp&nbsp&nbsp&nbsp&nbsp;
<br>
Com 0.33 de test_size no split

* RandomForestClassifier(n_estimators=500), com score de 0.95833

* RandomForestClassifier(n_estimators=500, max_leaf_nodes=1000), com score de 0.95833

* RandomForestClassifier(n_estimators=500, max_leaf_nodes=500), com score de 0.95833

* RandomForestClassifer(n_estimators=500, max_depth=10), com score de 0.95833

* RandomForestClassifier(n_estimators=800), com score de 0.95833


### **Score Máximo Obtido:**
* 0.95833