In [3]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [4]:
# Read both inputs with identical settings: the filtered student records and
# the per-column metadata table (data type / association / impact).
# index_col=False stops pandas from using the first CSV column as the row index.
# NOTE(review): data_mapping ends up with an 'Unnamed: 0' column (visible in the
# sort_values output further down), which looks like a previously saved index.
# Confirm student_data has no such column, because every non-'Target' column
# is later used as a feature.
student_data, data_mapping = (
    pd.read_csv(csv_path, index_col=False)
    for csv_path in (
        '../1.Data/student_data_filt.csv',
        '../1.Data/data_type_mapping.csv',
    )
)

#### Review Data Types, Association, and Scaled Impact

In [3]:
# Group the mapping rows by their impact label for review.
# 'Impact' holds text labels (e.g. 'High', 'Low'), so the resulting order is
# alphabetical rather than by strength of association.
data_mapping.sort_values('Impact')

Unnamed: 0,Column,Data Type,Association Type,Association Result,Impact
16,Scholarship holder,Binary,Phi Coefficient,0.31238,High
1,Application mode,Categorical,Cramer's V,0.311989,High
14,Tuition fees up to date,Binary,Phi Coefficient,0.441309,High
3,Course,Categorical,Cramer's V,0.340179,High
29,Curricular units 2nd sem (grade),Continuous,Point Biserial,-0.60535,High
28,Curricular units 2nd sem (approved),Numerical-discrete,Point Biserial,-0.653995,High
15,Gender,Categorical,Cramer's V,0.251955,High
22,Curricular units 1st sem (approved),Numerical-discrete,Point Biserial,-0.554881,High
23,Curricular units 1st sem (grade),Continuous,Point Biserial,-0.519927,High
19,Curricular units 1st sem (credited),Numerical-discrete,Point Biserial,-0.0469,Low


#### Split Data into Train and Test Sets

In [4]:
# Separate the label from the predictors, then hold out a test set.

# 'Target' is the label; every remaining column is treated as a feature
features = student_data.drop(columns='Target')
target = student_data['Target']

# 75/25 train/test partition; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=42,
)


#### Encode Categorical Variables

In [5]:
# Names of the columns that the mapping table labels as 'Categorical'
is_categorical = data_mapping['Data Type'] == 'Categorical'
categorical_columns = data_mapping.loc[is_categorical, 'Column'].values

In [6]:
# Partition each split column-wise: the categorical columns go to the encoder,
# all remaining columns are carried over unchanged.
# Note: a later cell rebinds X_train / X_test to the encoded frames, so this
# cell has to run before that one (re-running it afterwards raises KeyError
# because the raw categorical columns are no longer present).
X_train_cat, X_test_cat = X_train[categorical_columns], X_test[categorical_columns]
X_train_num = X_train.drop(columns=categorical_columns)
X_test_num = X_test.drop(columns=categorical_columns)

In [7]:
#One hot encode the categorical data
#One hot encoding is used to convert categorical data, which is represented as strings, into numerical data
#This is done because most machine learning algorithms cannot handle categorical data
#One hot encoding creates a new column for each unique value in a categorical column

#keep track of the index of the training and testing sets
index_train = X_train_cat.index
index_test = X_test_cat.index

encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_train_encoded = encoder.fit_transform(X_train_cat)
X_test_encoded = encoder.transform(X_test_cat)

In [8]:
# Sanity check: report (rows, encoded columns) for the train and test arrays
for encoded in (X_train_encoded, X_test_encoded):
    print(encoded.shape)

(2722, 65)
(908, 65)


In [9]:
# Wrap the encoded arrays in DataFrames: columns are named after the encoder's
# generated feature names and the original row labels are restored so the
# frames line up with the numerical features.
encoded_names = encoder.get_feature_names_out()
X_train_encoded = pd.DataFrame(X_train_encoded, index=index_train, columns=encoded_names)
X_test_encoded = pd.DataFrame(X_test_encoded, index=index_test, columns=encoded_names)

In [10]:
# Put the one-hot columns and the untouched columns back together side by side
# (encoded columns first); rows are matched on their index labels
X_train = pd.concat([X_train_encoded, X_train_num], axis='columns')
X_test = pd.concat([X_test_encoded, X_test_num], axis='columns')

In [11]:
# Persist the prepared splits to disk.
# to_csv does not create missing folders, so make sure the output directory
# exists first; otherwise this cell fails on a checkout that lacks it.
train_test_dir = Path('../1.Data/TrainTest')
train_test_dir.mkdir(parents=True, exist_ok=True)

# index=False: row labels are not written; features and labels stay aligned
# purely by row order, which train_test_split keeps identical for X and y
X_train.to_csv(train_test_dir / 'X_train.csv', index=False)
X_test.to_csv(train_test_dir / 'X_test.csv', index=False)
y_train.to_csv(train_test_dir / 'y_train.csv', index=False)
y_test.to_csv(train_test_dir / 'y_test.csv', index=False)


##### Remove 2nd-Semester Features to Identify Early-Intervention Opportunities

In [5]:
# Identify the columns that describe the second semester ('2nd' in their name)
second_semester = [name for name in student_data.columns if '2nd' in name]

In [6]:
# Show the matched column names to confirm what will be removed
second_semester

['Curricular units 2nd sem (credited)',
 'Curricular units 2nd sem (enrolled)',
 'Curricular units 2nd sem (evaluations)',
 'Curricular units 2nd sem (approved)',
 'Curricular units 2nd sem (grade)',
 'Curricular units 2nd sem (without evaluations)']

In [8]:
# New frame without the second-semester columns; student_data itself is left untouched
student_data_nosecond = student_data.drop(columns=second_semester)

In [9]:
# Split the reduced data (no 2nd-semester columns) into features and label
features = student_data_nosecond.drop(columns='Target')
target = student_data_nosecond['Target']

# Use the same test_size and seed as the full-feature split above (it was 36
# here before): with an identical row count and seed, both feature sets are
# partitioned into exactly the same train/test rows, so any difference in
# model performance comes from the removed columns and not from a different
# partition.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=42,
)

In [10]:
# Names of the columns that the mapping table labels as 'Categorical'
# (same lookup as in the first encoding section)
is_categorical = data_mapping['Data Type'] == 'Categorical'
categorical_columns = data_mapping.loc[is_categorical, 'Column'].values

In [11]:
# Column-wise partition of the reduced splits: categorical columns are sent to
# the encoder, everything else is carried over as-is
X_train_cat, X_test_cat = X_train[categorical_columns], X_test[categorical_columns]
X_train_num = X_train.drop(columns=categorical_columns)
X_test_num = X_test.drop(columns=categorical_columns)

# One-hot encoding: every distinct value of a categorical column becomes its
# own indicator column, giving the models a purely numerical representation
# of the categories.
# A fresh encoder is fitted on this section's training split (rebinding
# `encoder`); with handle_unknown='ignore', a category that appears only in
# the test split is encoded as all zeros for that column instead of raising.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_train_encoded = encoder.fit_transform(X_train_cat)
X_test_encoded = encoder.transform(X_test_cat)

# The encoder returns plain arrays (sparse_output=False), so remember the row
# labels of each split in order to re-attach them afterwards
index_train, index_test = X_train_cat.index, X_test_cat.index

In [12]:
# Sanity check: report (rows, encoded columns) for the train and test arrays
for encoded in (X_train_encoded, X_test_encoded):
    print(encoded.shape)

(2722, 65)
(908, 65)


In [13]:
# Wrap the encoded arrays in DataFrames: columns are named after the encoder's
# generated feature names and the original row labels are restored so the
# frames line up with the numerical features.
encoded_names = encoder.get_feature_names_out()
X_train_encoded = pd.DataFrame(X_train_encoded, index=index_train, columns=encoded_names)
X_test_encoded = pd.DataFrame(X_test_encoded, index=index_test, columns=encoded_names)

In [14]:
# Put the one-hot columns and the untouched columns back together side by side
# (encoded columns first); rows are matched on their index labels
X_train = pd.concat([X_train_encoded, X_train_num], axis='columns')
X_test = pd.concat([X_test_encoded, X_test_num], axis='columns')

In [15]:
# Persist the splits built without the second-semester columns ('_no2' files).
# to_csv does not create missing folders, so make sure the output directory
# exists first; otherwise this cell fails on a checkout that lacks it.
train_test_dir = Path('../1.Data/TrainTest')
train_test_dir.mkdir(parents=True, exist_ok=True)

# index=False: row labels are not written; features and labels stay aligned
# purely by row order, which train_test_split keeps identical for X and y
X_train.to_csv(train_test_dir / 'X_train_no2.csv', index=False)
X_test.to_csv(train_test_dir / 'X_test_no2.csv', index=False)
y_train.to_csv(train_test_dir / 'y_train_no2.csv', index=False)
y_test.to_csv(train_test_dir / 'y_test_no2.csv', index=False)