In [54]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [55]:
# Load the training data from train.csv in the notebook's working directory.
train=pd.read_csv("train.csv")

In [56]:
# Keep the column index of the training frame for inspection in the next cell.
colnames=train.columns

In [57]:
# List every column whose name begins with "Curricular".
[name for name in colnames if name.startswith("Curricular")]

['Curricular units 1st sem (credited)',
 'Curricular units 1st sem (enrolled)',
 'Curricular units 1st sem (evaluations)',
 'Curricular units 1st sem (approved)',
 'Curricular units 1st sem (grade)',
 'Curricular units 1st sem (without evaluations)',
 'Curricular units 2nd sem (credited)',
 'Curricular units 2nd sem (enrolled)',
 'Curricular units 2nd sem (evaluations)',
 'Curricular units 2nd sem (approved)',
 'Curricular units 2nd sem (grade)',
 'Curricular units 2nd sem (without evaluations)']

In [58]:
# Separate the label column from the remaining columns of the training frame.
y = train['Target']
X = train.drop(columns=['Target'])

In [30]:
def transforma(df):
    """Build the curricular-unit feature frame used for modelling.

    Adds the first- and second-semester "Curricular units" counters into
    combined totals, averages the two semester grades, expresses three of
    the totals as a share of ``TotalUnits`` and casts the two categorical
    columns to strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the twelve ``Curricular units ... sem (...)``
        columns plus ``Application order`` and ``Nacionality``. The ``id``
        and ``Course`` columns are dropped when present but are not required.

    Returns
    -------
    pandas.DataFrame
        An independent copy holding, in order: the original
        ``Curricular units`` columns, the ten derived columns,
        ``Application order`` and ``Nacionality``. The input frame is not
        modified.

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``df``.
    """
    # errors='ignore' keeps the function usable on frames that no longer carry
    # these identifiers (for instance when it is fed its own output).
    df = df.drop(columns=['id', 'Course'], errors='ignore')

    curricular_columns = [col for col in df.columns if col.startswith("Curricular units")]

    # Combined totals: first-semester counter + second-semester counter.
    summed = {
        'UnitsCredited': 'credited',
        'UnitsEnrolled': 'enrolled',
        'UnitsEvaluations': 'evaluations',
        'UnitsApproved': 'approved',
        'UnitsWithoutEval': 'without evaluations',
    }
    for total_name, kind in summed.items():
        df[total_name] = (
            df[f'Curricular units 1st sem ({kind})']
            + df[f'Curricular units 2nd sem ({kind})']
        )
    df['MeanGrade'] = (
        df['Curricular units 1st sem (grade)'] + df['Curricular units 2nd sem (grade)']
    ) / 2

    # Share of each total within TotalUnits.
    df['TotalUnits'] = df['UnitsCredited'] + df['UnitsEnrolled'] + df['UnitsApproved']
    shares = {
        'PercCredited': 'UnitsCredited',
        'PercEnrolled': 'UnitsEnrolled',
        'PercApproved': 'UnitsApproved',
    }
    for share_name, total_name in shares.items():
        df[share_name] = df[total_name] / df['TotalUnits']

    # 0 / 0 produces NaN when TotalUnits == 0; NaNs are replaced by 0.
    # Note that the replacement is applied to every column of the frame.
    df = df.replace(np.nan, 0)

    # These two columns are kept as strings so they can be treated as categories.
    df['Application order'] = df['Application order'].astype("str")
    df["Nacionality"] = df["Nacionality"].astype("str")

    new_columns = [
        "UnitsCredited", "UnitsEnrolled", "UnitsEvaluations", "UnitsApproved",
        "UnitsWithoutEval", "MeanGrade", "TotalUnits",
        "PercCredited", "PercEnrolled", "PercApproved",
    ]
    # .copy() hands the caller a standalone frame, so adding columns to it later
    # does not operate on a slice of the intermediate frame.
    return df[curricular_columns + new_columns + ["Application order", "Nacionality"]].copy()

In [59]:
# Replace the raw feature frame with the engineered one.
# NOTE(review): X is rebound here, so re-running this cell feeds the
# already-transformed frame back into transforma rather than the raw columns.
X=transforma(X)


In [60]:
# Preview the first rows of the transformed feature frame.
X.head()

Unnamed: 0,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),...,UnitsEvaluations,UnitsApproved,UnitsWithoutEval,MeanGrade,TotalUnits,PercCredited,PercEnrolled,PercApproved,Application order,Nacionality
0,0,6,6,6,14.5,0,0,6,7,6,...,13,12,0,13.464286,24,0.0,0.5,0.5,1,1
1,0,6,8,4,11.6,0,0,6,9,0,...,17,4,0,5.8,16,0.0,0.75,0.25,1,1
2,0,6,0,0,0.0,0,0,6,0,0,...,0,0,0,0.0,12,0.0,1.0,0.0,2,1
3,0,7,9,7,12.59125,0,0,8,11,7,...,20,14,0,12.705625,29,0.0,0.517241,0.482759,3,1
4,0,7,12,6,12.933333,0,0,7,12,6,...,24,12,0,12.933333,26,0.0,0.538462,0.461538,2,1


## OneHotEncoder - GetDummies bien hecho

In [61]:
# Hold out 33% of the rows for validation. The fixed random_state makes the
# split, and every score computed from it, reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [62]:
# Frequency of each nationality code within the training split.
X_train["Nacionality"].value_counts()

Nacionality
1      50941
41       151
26        45
6         36
22        31
11        11
24        10
2          7
105        6
100        5
103        5
101        5
25         5
62         3
21         3
109        2
17         1
Name: count, dtype: int64

In [63]:
# The encoder is fitted on the training split only.
from sklearn.preprocessing import OneHotEncoder

objeto_dummies = OneHotEncoder(
    handle_unknown='ignore',  # categories first seen in later data are ignored
    drop='first',             # no column is created for the first category of each feature
    sparse_output=False,      # return a dense array (the default output is sparse)
)

objeto_dummies.fit(X_train[["Application order", "Nacionality"]])
# More columns could be listed here; they would be handled the way get_dummies does.
# The fitted object remembers the categories of each column, so applying it to
# another dataset (test or submission) produces the same number of columns.

In [67]:
objeto_dummies.transform(X_train[["Application order","Nacionality"]])
# Returns the dummy variables as a plain array, so the columns carry no names.

array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [68]:
# Show the names the encoder generates for its output columns.
objeto_dummies.get_feature_names_out(["Application order","Nacionality"])

array(['Application order_1', 'Application order_2',
       'Application order_3', 'Application order_4',
       'Application order_5', 'Application order_6', 'Nacionality_100',
       'Nacionality_101', 'Nacionality_103', 'Nacionality_105',
       'Nacionality_109', 'Nacionality_11', 'Nacionality_17',
       'Nacionality_2', 'Nacionality_21', 'Nacionality_22',
       'Nacionality_24', 'Nacionality_25', 'Nacionality_26',
       'Nacionality_41', 'Nacionality_6', 'Nacionality_62'], dtype=object)

In [69]:
nombre_columnas= objeto_dummies.get_feature_names_out(["Application order","Nacionality"])
# nombre_columnas holds the names of the new dummy columns.

In [70]:
# Insert the encoded values into the training frame,
X_train[nombre_columnas] = objeto_dummies.transform(X_train[["Application order","Nacionality"]])
# creating one new column per name in nombre_columnas.

In [71]:
# Remove the original categorical columns now that their dummies exist.
X_train = X_train.drop(["Application order", "Nacionality"], axis=1)

In [72]:
# Preview the training frame after encoding.
X_train.head()

Unnamed: 0,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),...,Nacionality_17,Nacionality_2,Nacionality_21,Nacionality_22,Nacionality_24,Nacionality_25,Nacionality_26,Nacionality_41,Nacionality_6,Nacionality_62
37166,0,7,8,7,12.82875,0,0,8,8,8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50881,0,5,5,0,0.0,0,0,5,5,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4990,0,6,9,6,10.833333,0,0,6,7,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25394,0,5,8,2,12.5,0,0,5,11,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18727,0,6,7,3,10.666667,0,0,6,7,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [73]:
# Fit a depth-limited random forest on the encoded training split.
# A LogisticRegression assigned to clf here was overwritten before being
# fitted, so only the estimator that is actually trained is built.
# random_state is fixed so repeated runs grow the same forest.
clf = RandomForestClassifier(max_depth=13, random_state=42)
clf.fit(X_train, y_train)

In [74]:
# Accuracy of the fitted model on the data it was trained on.
yhat_train = clf.predict(X_train)
accuracy_score(y_true=y_train, y_pred=yhat_train)

0.8442272807068875

In [75]:
# Create the dummy columns on the hold-out split using the already-fitted encoder.
X_test[nombre_columnas] = objeto_dummies.transform(
    X_test[["Application order", "Nacionality"]]
)
# Remove the original categorical columns.
X_test = X_test.drop(["Application order", "Nacionality"], axis=1)



In [76]:
# Preview the hold-out frame after encoding.
X_test.head()

Unnamed: 0,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),...,Nacionality_17,Nacionality_2,Nacionality_21,Nacionality_22,Nacionality_24,Nacionality_25,Nacionality_26,Nacionality_41,Nacionality_6,Nacionality_62
50428,0,6,9,6,12.333333,0,0,6,8,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70338,0,6,6,0,0.0,0,0,5,5,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11004,0,7,12,6,12.385714,0,0,8,9,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
72562,0,5,9,0,0.0,0,0,5,9,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70900,0,6,0,0,0.0,0,0,6,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [77]:
# Accuracy on the hold-out split. The author's note: the model is overfitting.
yhat_test = clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=yhat_test)

0.8034533285810463

## Una vez tengamos un modelo, hay que repetir el preprocesamiento a test (al conjunto que sirve para hacer una submission)

In [78]:
# Load the submission template and the rows that need predictions.
submission = pd.read_csv("sample_submission.csv")
test = pd.read_csv("test.csv")


In [79]:
# Run the preprocessing function on the submission rows.
test = transforma(test)
# Placeholders for further fitted steps, none of which are applied here:
# imputer.transform (...)
# scaler.tranform (...)
# Encode the categorical columns with the already-fitted encoder.
test[nombre_columnas] = objeto_dummies.transform(
    test[["Application order", "Nacionality"]]
)
# Remove the original categorical columns.
test = test.drop(["Application order", "Nacionality"], axis=1)



In [None]:
# Not executed. NOTE(review): this looks like a sketch of the same flow using a
# `pipeline` object, which is not defined in the visible cells. Confirm or remove.
# test = transforma(test)
# pipeline.predict(X_test)

In [80]:
# Predict a label for every preprocessed submission row.
yhat_submission = clf.predict(test)

In [81]:
# Display the predicted labels.
yhat_submission

array(['Dropout', 'Graduate', 'Graduate', ..., 'Dropout', 'Enrolled',
       'Dropout'], dtype=object)

In [53]:
# Write the predictions into the Target column. Item assignment is used because
# attribute-style assignment only updates a column that already exists and would
# otherwise set a plain attribute on the frame instead of creating the column.
# NOTE(review): values are assigned by position, so this assumes the rows of
# sample_submission.csv are in the same order as test.csv. Confirm.
submission["Target"] = yhat_submission
submission.head()

Unnamed: 0,id,Target
0,76518,Dropout
1,76519,Graduate
2,76520,Graduate
3,76521,Dropout
4,76522,Enrolled


In [54]:
# Save the submission file without writing the row index as a column.
submission.to_csv("submission.csv",index=False)