In [9]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [59]:
# Load the filtered student records and the table that maps each column to its data type.
# index_col=False tells pandas not to use the first CSV column as the row index.
student_data = pd.read_csv(
    '../1.Data/student_data_filt.csv',
    index_col=False,
)
data_mapping = pd.read_csv(
    '../1.Data/data_type_mapping.csv',
    index_col=False,
)

In [60]:
# Separate the label column from the predictors, then hold out a test set.

# Every column except 'Target' is a feature; 'Target' is the label to predict.
features = student_data.drop(columns='Target')
target = student_data['Target']

# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable.
# NOTE(review): the split is not stratified on the target - confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=42,
)


#### Encode Categorical Variables

In [61]:
# Names of the columns that the data-type mapping marks as 'Categorical'.
is_categorical = data_mapping['Data Type'] == 'Categorical'
categorical_columns = data_mapping.loc[is_categorical, 'Column'].values

In [62]:
# Partition each split into its categorical part (to be one-hot encoded)
# and the remaining columns (passed through unchanged).
X_train_cat = X_train.loc[:, categorical_columns]
X_train_num = X_train.drop(columns=categorical_columns)

X_test_cat = X_test.loc[:, categorical_columns]
X_test_num = X_test.drop(columns=categorical_columns)

In [63]:
# One-hot encode the categorical features.
# Each distinct value of a categorical column becomes its own 0/1 indicator column,
# giving a purely numerical representation that most machine learning algorithms can consume.

# Dense output (sparse_output=False); categories that appear only at transform time
# are encoded as all zeros instead of raising (handle_unknown='ignore').
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# The encoder returns plain arrays, so remember the row labels of each split
# in order to re-attach them afterwards.
index_train = X_train_cat.index
index_test = X_test_cat.index

# Learn the categories from the training split only, then apply them to both splits.
encoder.fit(X_train_cat)
X_train_encoded = encoder.transform(X_train_cat)
X_test_encoded = encoder.transform(X_test_cat)

In [64]:
# Show (rows, columns) of the encoded training split, then of the encoded testing split.
for encoded_split in (X_train_encoded, X_test_encoded):
    print(encoded_split.shape)

(2722, 66)
(908, 66)


In [65]:
# Wrap the encoded arrays in DataFrames, labelling columns with the encoder's
# generated feature names and rows with the original index of each split.
encoded_feature_names = encoder.get_feature_names_out()
X_train_encoded = pd.DataFrame(
    X_train_encoded,
    columns=encoded_feature_names,
    index=index_train,
)
X_test_encoded = pd.DataFrame(
    X_test_encoded,
    columns=encoded_feature_names,
    index=index_test,
)

In [71]:
# Put the one-hot columns and the untouched numerical columns side by side
# (aligned on the row index) to form the final feature matrices.
# Note: this rebinds X_train / X_test, so the raw categorical columns are no longer
# present in them; re-running the categorical/numerical partition cell afterwards
# requires re-running the train/test split first.
X_train = pd.concat([X_train_encoded, X_train_num], axis="columns")
X_test = pd.concat([X_test_encoded, X_test_num], axis="columns")

In [73]:
# Persist the train/test splits as CSV files for the downstream notebooks.
# to_csv does not create missing parent directories, so make sure the output
# folder exists first (no-op when it is already there).
output_dir = Path('../1.Data/TrainTest')
output_dir.mkdir(parents=True, exist_ok=True)

# index=False drops the original row labels: features and labels of a split
# are matched by row position when the files are read back.
X_train.to_csv(output_dir / 'X_train.csv', index=False)
X_test.to_csv(output_dir / 'X_test.csv', index=False)
y_train.to_csv(output_dir / 'y_train.csv', index=False)
y_test.to_csv(output_dir / 'y_test.csv', index=False)
