In [9]:
# Importamos las bibliotecas necesarias
import pandas as pd

In [10]:
# Cargar los archivos CSV
train_df = pd.read_csv('Data/train.csv')
test_df = pd.read_csv('Data/test.csv')
misconception_mapping_df = pd.read_csv('Data/misconception_mapping.csv')
sample_submission_df = pd.read_csv('Data/sample_submission.csv')

In [11]:
# Mostrar las primeras filas de cada archivo para verificar la estructura
print("Train CSV:")
print(train_df.head())

print("\nTest CSV:")
print(test_df.head())

print("\nMisconception Mapping CSV:")
print(misconception_mapping_df.head())

print("\nSample Submission CSV:")
print(sample_submission_df.head())

Train CSV:
   QuestionId  ConstructId                                      ConstructName  \
0           0          856  Use the order of operations to carry out calcu...   
1           1         1612  Simplify an algebraic fraction by factorising ...   
2           2         2774            Calculate the range from a list of data   
3           3         2377  Recall and use the intersecting diagonals prop...   
4           4         3387  Substitute positive integer values into formul...   

   SubjectId                                        SubjectName CorrectAnswer  \
0         33                                             BIDMAS             A   
1       1077                    Simplifying Algebraic Fractions             D   
2        339  Range and Interquartile Range from a List of Data             B   
3         88                       Properties of Quadrilaterals             C   
4         67                          Substitution into Formula             A   

               

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [13]:
# Combinar los textos de las preguntas y respuestas para entrenamiento
train_df['combined_text'] = train_df['QuestionText'] + ' ' + train_df['AnswerAText'] + ' ' + train_df['AnswerBText'] + ' ' + train_df['AnswerCText'] + ' ' + train_df['AnswerDText']
# Vectorización con TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)  # Puedes ajustar el número de características
X = vectorizer.fit_transform(train_df['combined_text'])

# Etiquetas: Misconception para cada respuesta incorrecta
y_A = train_df['MisconceptionAId'].fillna(-1)  # Rellenar NaN con un valor para indicar que no hay misconception
y_B = train_df['MisconceptionBId'].fillna(-1)
y_C = train_df['MisconceptionCId'].fillna(-1)
y_D = train_df['MisconceptionDId'].fillna(-1)

# Dividir en conjunto de entrenamiento y prueba
X_train, X_val, y_train_A, y_val_A = train_test_split(X, y_A, test_size=0.2, random_state=42)

In [14]:
# Ahora puedes entrenar un modelo de clasificación, por ejemplo, un Random Forest
from sklearn.ensemble import RandomForestClassifier

In [15]:
clf_A = RandomForestClassifier(n_estimators=100, random_state=42)
clf_A.fit(X_train, y_train_A)

# Evaluar en el conjunto de validación
y_pred_A = clf_A.predict(X_val)
print("Predicciones para AnswerA:", y_pred_A)

Predicciones para AnswerA: [-1.000e+00 -1.000e+00 -1.000e+00  1.786e+03 -1.000e+00 -1.000e+00
 -1.000e+00 -1.000e+00  1.035e+03 -1.000e+00 -1.000e+00 -1.000e+00
 -1.000e+00 -1.000e+00 -1.000e+00 -1.000e+00 -1.000e+00 -1.000e+00
 -1.000e+00 -1.000e+00 -1.000e+00 -1.000e+00  1.764e+03 -1.000e+00
 -1.000e+00 -1.000e+00 -1.000e+00 -1.000e+00 -1.000e+00 -1.000e+00
 -1.000e+00 -1.000e+00 -1.000e+00  8.430e+02 -1.000e+00  1.348e+03
 -1.000e+00 -1.000e+00  1.233e+03 -1.000e+00 -1.000e+00 -1.000e+00
 -1.000e+00 -1.000e+00 -1.000e+00 -1.000e+00  2.500e+03 -1.000e+00
 -1.000e+00 -1.000e+00 -1.000e+00 -1.000e+00 -1.000e+00 -1.000e+00
 -1.000e+00 -1.000e+00  3.100e+01  2.488e+03 -1.000e+00 -1.000e+00
 -1.000e+00 -1.000e+00 -1.000e+00 -1.000e+00 -1.000e+00 -1.000e+00
 -1.000e+00 -1.000e+00 -1.000e+00 -1.000e+00 -1.000e+00 -1.000e+00
 -1.000e+00 -1.000e+00 -1.000e+00 -1.000e+00 -1.000e+00 -1.000e+00
  8.670e+02 -1.000e+00 -1.000e+00 -1.000e+00 -1.000e+00  1.213e+03
  3.910e+02 -1.000e+00  9.780e+02 -