### Import packages

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import Lasso
import numpy as np

### Read the data

In [None]:
# Load the raw train/test splits from disk.
train = pd.read_csv('/root/autodl-tmp/data/train.csv')
test = pd.read_csv('/root/autodl-tmp/data/test.csv')

# Features: every column except the last one.
train_data, test_data = train.iloc[:, :-1], test.iloc[:, :-1]
# Labels: the last column only.
train_label, test_label = train.iloc[:, -1], test.iloc[:, -1]

### Exploratory Data Analysis
#### 1. Explore the dataset

In [None]:
# Show the first row of the training features.
train_data.head(1)

In [None]:
# Display the training feature table.
train_data

In [None]:
# Show the first row of the test features.
test_data.head(1)

In [None]:
# Display the test feature table.
test_data

#### 2. Dataset cleaning: remove the 'ID' column, which is irrelevant to our prediction

In [None]:
# Remove the first column (the 'ID' column) from both feature tables.
train_data = train_data.drop(columns=train_data.columns[0])
test_data = test_data.drop(columns=test_data.columns[0])

#### 3. Check NAs

In [None]:
# Count missing values per column in each feature table and print only the
# columns that have at least one. The printed labels name the frames that are
# actually inspected here (train_data / test_data).
train_na_summary = train_data.isna().sum()
print("Missing values in train_data:")
print(train_na_summary[train_na_summary > 0])

test_na_summary = test_data.isna().sum()
print("\nMissing values in test_data:")
print(test_na_summary[test_na_summary > 0])

#### 4. Visualization of categorical and continuous variables

In [None]:
# Columns that hold category codes. The variable keeps its original spelling
# ("categorial") because other cells refer to it by this name.
categorial_variables = [
    'Marital status', 'Application mode', 'Course', 'Daytime/evening attendance',
    'Previous qualification', 'Nacionality', "Mother's qualification", "Father's qualification",
    "Mother's occupation", "Father's occupation", 'Displaced', 'Educational special needs',
    'Debtor', 'Tuition fees up to date', 'Gender', 'Scholarship holder', 'International',
]

# Every remaining feature is treated as numeric. Filtering the dataframe's
# columns (instead of taking a set difference) keeps them in column order, so
# the list -- and the plots built from it -- is identical on every run.
numeric_variables = [
    column for column in train_data.columns if column not in categorial_variables
]

In [None]:
# Categorical variables: one histogram per feature on a 3x6 grid of panels.
fig, axes = plt.subplots(3, 6, figsize=(40, 20))
panels = axes.ravel()  # row-major, so panels[i] is axes[i // 6, i % 6]
for position, column in enumerate(categorial_variables):
    panel = panels[position]
    sns.histplot(data=train, x=column, ax=panel, bins=10, kde=False)
    panel.set_ylabel('Number of Students')
    panel.set_title(column)
plt.tight_layout()
plt.show()

In [None]:
# Numeric variables: one filled density curve per feature on a 3x6 grid of panels.
fig, axes = plt.subplots(3, 6, figsize=(40, 20))
panels = axes.ravel()  # row-major, so panels[i] is axes[i // 6, i % 6]
for position, column in enumerate(numeric_variables):
    panel = panels[position]
    sns.kdeplot(data=train, x=column, ax=panel, fill=True, color='darkblue', alpha=0.5)
    panel.set_ylabel('Density')
    panel.set_title(column)
plt.tight_layout()
plt.show()

##### 4.1 Normalization: Z-score standardization (chosen based on the observed distributions of the variables)

In [None]:
# Z-score standardisation. The scaler is fitted on the training features only;
# the test features are then transformed with those same fitted statistics.
# Refitting on the test set would put the two splits on different scales and
# leak test-set statistics into preprocessing.
scaler = StandardScaler()
train_data_scaled = scaler.fit_transform(train_data)
test_data_scaled = scaler.transform(test_data)

# scaler = MinMaxScaler()
# train_data_scaled = scaler.fit_transform(train_data)
# test_data_scaled = scaler.transform(test_data)

# train_data_scaled = train_data
# test_data_scaled = test_data

#### 5. Encode categorical variables ("Target")

In [None]:
# Integer code assigned to each target class name.
type_dict = dict(Graduate=0, Dropout=1, Enrolled=2)

# Replace the string labels with their integer codes in both splits.
train_label, test_label = (
    labels.map(type_dict) for labels in (train_label, test_label)
)

#### 6. Imbalanced data

In [None]:
# Bar chart of the number of training rows per 'Target' value.
palette = sns.color_palette("deep", 3)
plt.figure(figsize=(8, 6))
ax = sns.countplot(data=train, x='Target', hue='Target', palette=palette, legend=False)
ax.set_title('Distribution of Target Variable')
ax.set_xlabel('Target')
ax.set_ylabel('Count')
plt.show()

##### 6.1 Handle the class imbalance with SMOTE

In [None]:
# Resample the scaled training set with SMOTE (fixed seed).
smote = SMOTE(random_state=42)
train_data_scaled_resampled, train_label_resampled = smote.fit_resample(
    train_data_scaled,
    train_label,
)

# Put the resampled features back into a dataframe with the original column
# names, then attach the resampled labels as an extra column for plotting.
train_data_scaled_resampled_table = pd.DataFrame(
    train_data_scaled_resampled,
    columns=train_data.columns,
)
train_resampled = pd.concat(
    [train_data_scaled_resampled_table, train_label_resampled],
    axis=1,
)

# Bar chart of the number of resampled rows per 'Target' value.
palette = sns.color_palette("deep", 3)
plt.figure(figsize=(8, 6))
ax = sns.countplot(data=train_resampled, x='Target', hue='Target', palette=palette, legend=False)
ax.set_title('Distribution of Target Variable After SMOTE')
ax.set_xlabel('Target')
ax.set_ylabel('Count')
plt.show()

#### 7. Analyze the relationships between features (e.g., demographics, academic performance)

In [None]:
# Column order used for the correlation heatmap, so that related features sit
# next to each other. The grouping comments are only a reading aid.
column_order = [
    # Personal and family background
    'Marital status', 'Nacionality', 'Displaced', 'Gender', 'Age at enrollment',
    'International',
    "Mother's qualification", "Father's qualification",
    "Mother's occupation", "Father's occupation",
    # Special needs and financial standing
    'Educational special needs', 'Debtor', 'Tuition fees up to date', 'Scholarship holder',
    # Economic indicators
    'Unemployment rate', 'Inflation rate', 'GDP',
    # Application and course details
    'Application mode', 'Application order', 'Course',
    'Daytime/evening attendance', 'Previous qualification',
    # Curricular-unit columns: the same six measures for the 1st, then the 2nd semester
    *(
        f'Curricular units {semester} sem ({measure})'
        for semester in ('1st', '2nd')
        for measure in (
            'credited', 'enrolled', 'evaluations',
            'approved', 'grade', 'without evaluations',
        )
    ),
]

# Select the columns in that order from both feature tables.
train_data_reordered = train_data.loc[:, column_order]
test_data_reordered = test_data.loc[:, column_order]

In [None]:
# Annotated heatmap of the pairwise correlations between the reordered
# training features.
plt.figure(figsize=(30, 30))
correlation_matrix = train_data_reordered.corr()
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap="Blues",
)
plt.title("Correlation Heatmap of Train Dataset")
plt.xticks(rotation=70)
plt.show()

##### 7.1 Handle the collinearity (LASSO Regression)

In [None]:
# Fit a LASSO regression on the resampled, scaled training set; alpha sets the
# regularization strength.
lasso = Lasso(alpha=0.001)
lasso.fit(train_data_scaled_resampled_table, train_label_resampled)

feature_weights = lasso.coef_
print("Feature Weights:", feature_weights)

# Keep only the features whose fitted coefficient is non-zero.
selected_features = train_data.columns[feature_weights != 0]
print("Selected Features:", selected_features)

# Restrict the training table to the selected features.
train_data_new = train_data_scaled_resampled_table.loc[:, selected_features]

# Wrap the scaled test array in a dataframe so the same columns can be picked by name.
test_data_scaled_table = pd.DataFrame(test_data_scaled, columns=test_data.columns)
test_data_new = test_data_scaled_table.loc[:, selected_features]

In [None]:
# Display the training table restricted to the selected features.
train_data_new

In [None]:
# Display the test table restricted to the selected features.
test_data_new

#### 8. Save the data to csv and npy

In [None]:
# Write both selected-feature tables to CSV, omitting the index column.
for frame, destination in (
    (train_data_new, "/root/autodl-tmp/data/train_data_new.csv"),
    (test_data_new, "/root/autodl-tmp/data/test_data_new.csv"),
):
    frame.to_csv(destination, index=False)

In [None]:
# One-hot encode the resampled labels: row i of the 3x3 identity matrix is the
# encoding of class i, so indexing it with the label codes gives one row per sample.
train_label_oh = np.identity(3)[train_label_resampled]
np.save("/root/autodl-tmp/data/train_label_new.npy", train_label_oh)

In [None]:
# train_data_new_0 = train_data_new[train_label_resampled==0].to_csv("/root/autodl-tmp/data/train_data_new_0.csv", index=False)
# train_data_new_1 = train_data_new[train_label_resampled==1].to_csv("/root/autodl-tmp/data/train_data_new_1.csv", index=False)
# train_data_new_2 = train_data_new[train_label_resampled==2].to_csv("/root/autodl-tmp/data/train_data_new_2.csv", index=False)