In [19]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [20]:
# Load the labelled training split of the playground-series-s4e6 data.
train = pd.read_csv(
    "../../Apuntes_MiDataSets/playground/playground-series-s4e6/train.csv"
)

In [21]:
# Keep the column index at hand for the inspection in the next cell.
colnames = train.columns

In [22]:
# List every column whose name starts with "Curricular".
list(filter(lambda name: name.startswith("Curricular"), colnames))

['Curricular units 1st sem (credited)',
 'Curricular units 1st sem (enrolled)',
 'Curricular units 1st sem (evaluations)',
 'Curricular units 1st sem (approved)',
 'Curricular units 1st sem (grade)',
 'Curricular units 1st sem (without evaluations)',
 'Curricular units 2nd sem (credited)',
 'Curricular units 2nd sem (enrolled)',
 'Curricular units 2nd sem (evaluations)',
 'Curricular units 2nd sem (approved)',
 'Curricular units 2nd sem (grade)',
 'Curricular units 2nd sem (without evaluations)']

In [23]:
# Separate the label column from the predictors.
y = train['Target']
X = train.drop(columns=['Target'])

In [24]:
def transforma(df):
    """Build the curricular-units feature matrix from a raw competition frame.

    Only the 'Curricular units {1st,2nd} sem (...)' columns are read; every
    other column (for example 'id' or 'Course') is ignored, so the same call
    serves both the train and the test split.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The original 'Curricular units ...' columns, in their original order,
        followed by UnitsCredited, UnitsEnrolled, UnitsEvaluations,
        UnitsApproved, UnitsWithoutEval, MeanGrade, TotalUnits, PercCredited,
        PercEnrolled and PercApproved. NaN values are replaced by 0.

    Raises
    ------
    KeyError
        If one of the required 'Curricular units ...' columns is missing.
    """
    curricular_columns = [col for col in df.columns if col.startswith("Curricular units")]

    # Work on an explicit copy of just the columns that are needed. The
    # previous version dropped 'id' and 'Course' first, which raised KeyError
    # on frames lacking them although neither column is used afterwards.
    data = df[curricular_columns].copy()

    first_sem = 'Curricular units 1st sem ({})'
    second_sem = 'Curricular units 2nd sem ({})'

    # Totals over both semesters: new column name -> label inside the parentheses.
    semester_sums = {
        'UnitsCredited': 'credited',
        'UnitsEnrolled': 'enrolled',
        'UnitsEvaluations': 'evaluations',
        'UnitsApproved': 'approved',
        'UnitsWithoutEval': 'without evaluations',
    }
    for feature, label in semester_sums.items():
        data[feature] = data[first_sem.format(label)] + data[second_sem.format(label)]

    # Plain average of the two semester grades.
    data['MeanGrade'] = (data[first_sem.format('grade')] + data[second_sem.format('grade')]) / 2

    # Share of each unit kind over the combined total.
    data['TotalUnits'] = data['UnitsCredited'] + data['UnitsEnrolled'] + data['UnitsApproved']
    shares = {
        'PercCredited': 'UnitsCredited',
        'PercEnrolled': 'UnitsEnrolled',
        'PercApproved': 'UnitsApproved',
    }
    for feature, numerator in shares.items():
        data[feature] = data[numerator] / data['TotalUnits']

    new_columns = ["UnitsCredited", "UnitsEnrolled", "UnitsEvaluations", "UnitsApproved",
                   "UnitsWithoutEval", "MeanGrade", "TotalUnits",
                   "PercCredited", "PercEnrolled", "PercApproved"]

    # The shares are 0/0 = NaN when TotalUnits == 0; replace those NaNs by 0.
    data = data.replace(np.nan, 0)

    # Fix the column order explicitly: raw curricular columns, then the new features.
    return data[curricular_columns + new_columns]

In [25]:
# Replace the raw predictors by the engineered curricular features.
X = transforma(X)

In [26]:
# Display the engineered feature matrix (pandas truncates rows and columns).
X

Unnamed: 0,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),...,UnitsCredited,UnitsEnrolled,UnitsEvaluations,UnitsApproved,UnitsWithoutEval,MeanGrade,TotalUnits,PercCredited,PercEnrolled,PercApproved
0,0,6,6,6,14.500000,0,0,6,7,6,...,0,12,13,12,0,13.464286,24,0.0,0.500000,0.500000
1,0,6,8,4,11.600000,0,0,6,9,0,...,0,12,17,4,0,5.800000,16,0.0,0.750000,0.250000
2,0,6,0,0,0.000000,0,0,6,0,0,...,0,12,0,0,0,0.000000,12,0.0,1.000000,0.000000
3,0,7,9,7,12.591250,0,0,8,11,7,...,0,15,20,14,0,12.705625,29,0.0,0.517241,0.482759
4,0,7,12,6,12.933333,0,0,7,12,6,...,0,14,24,12,0,12.933333,26,0.0,0.538462,0.461538
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76513,0,6,9,6,10.666667,0,0,6,8,5,...,0,12,17,11,0,10.633333,23,0.0,0.521739,0.478261
76514,0,6,22,4,13.000000,0,0,6,9,6,...,0,12,31,10,0,13.437500,22,0.0,0.545455,0.454545
76515,0,5,13,4,12.500000,2,0,5,8,5,...,0,10,21,9,3,11.950000,19,0.0,0.526316,0.473684
76516,0,6,0,0,0.000000,0,0,6,0,0,...,0,12,0,0,0,0.000000,12,0.0,1.000000,0.000000


## Modelo

In [27]:
# Hold out a third of the rows for validation. The fixed seed makes the split,
# and every score computed from it, reproducible across kernel restarts;
# stratifying on y keeps the class proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)

In [28]:

# Depth-limited random forest. The fixed seed makes the fitted model, and
# therefore the predictions and the submission file, reproducible.
clf = RandomForestClassifier(max_depth=10, random_state=42)
clf.fit(X_train, y_train)

In [29]:
# Accuracy on the rows the forest was fitted on.
yhat_train = clf.predict(X_train)
accuracy_score(y_true=y_train, y_pred=yhat_train)

0.8221272943608949

In [30]:
# Per-class probabilities for every training row.
clf.predict_proba(X_train)
# Column order: Dropout, Enrolled, Graduate (presumably the order of clf.classes_ — verify)

array([[0.18615474, 0.70425281, 0.10959245],
       [0.02346057, 0.0697784 , 0.90676103],
       [0.06777414, 0.0762604 , 0.85596546],
       ...,
       [0.06333856, 0.03036683, 0.90629462],
       [0.56814435, 0.42648716, 0.00536848],
       [0.04006013, 0.08988716, 0.87005271]])

In [31]:
# Display the predicted labels for the training rows.
yhat_train

array(['Enrolled', 'Graduate', 'Graduate', ..., 'Graduate', 'Dropout',
       'Graduate'], dtype=object)

In [32]:
# Accuracy on the held-out rows.
yhat_test = clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=yhat_test)

0.806344303195913

# Repetimos el proceso para test

In [33]:
# Load the unlabelled test split and the sample submission file.
test = pd.read_csv(
    "../../Apuntes_MiDataSets/playground/playground-series-s4e6/test.csv"
)
submission = pd.read_csv(
    "../../Apuntes_MiDataSets/playground/playground-series-s4e6/sample_submission.csv"
)


#### Puedo utilizar la misma transformación para test

In [34]:
# Build the test features with the same function used for the training data.
X_submission = transforma(test)
# No need to repeat the feature-engineering code.

In [35]:
# Display the first rows of the test feature matrix.
X_submission.head()

Unnamed: 0,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),...,UnitsCredited,UnitsEnrolled,UnitsEvaluations,UnitsApproved,UnitsWithoutEval,MeanGrade,TotalUnits,PercCredited,PercEnrolled,PercApproved
0,0,7,0,0,0.0,0,0,8,0,0,...,0,15,0,0,0,0.0,15,0.0,1.0,0.0
1,0,6,7,6,14.857143,0,0,6,6,6,...,0,12,13,12,0,14.178571,24,0.0,0.5,0.5
2,0,6,11,6,12.0,0,0,6,11,5,...,0,12,22,11,0,11.5,23,0.0,0.521739,0.478261
3,2,6,15,5,11.5,0,3,8,14,5,...,5,14,29,10,0,11.25,29,0.172414,0.482759,0.344828
4,0,6,9,3,11.0,0,0,6,9,4,...,0,12,18,7,2,10.833333,19,0.0,0.631579,0.368421


In [36]:
# Predict a label for every row of the test feature matrix.
yhat_submission = clf.predict(X_submission)

In [37]:
# Display the predicted labels for the test rows.
yhat_submission

array(['Dropout', 'Graduate', 'Graduate', ..., 'Dropout', 'Enrolled',
       'Dropout'], dtype=object)

In [38]:
# Write the predictions into the 'Target' column. Bracket assignment always
# targets the column; attribute assignment (submission.Target = ...) only does
# so when the column already exists and otherwise just sets a Python attribute.
submission["Target"] = yhat_submission
submission.head()

Unnamed: 0,id,Target
0,76518,Dropout
1,76519,Graduate
2,76520,Graduate
3,76521,Enrolled
4,76522,Enrolled


In [39]:
# Save the submission file without the pandas index column.
submission.to_csv("randomforest.csv", index=False)