In [None]:
import pandas as pd

In [None]:
# Raw student records. This is an absolute Windows path, so it has to be
# adjusted when the notebook is run on another machine.
raw_csv_path = "D:/Projects/personalized-study-path/data/raw/students_data.csv"

# Load the dataset and preview its first rows as the cell output.
df = pd.read_csv(raw_csv_path)
df.head()

## EDA

In [None]:
# Dataset dimensions as (rows, columns), then the column index, one per line.
print(df.shape, df.columns, sep="\n")

In [None]:
# Per-column summary of the frame (dtype and non-null count for each column).
df.info()

In [None]:
# Descriptive statistics of the frame, shown as the cell output.
df.describe()

In [None]:
# Number of missing values in each column.
df.isna().sum()

### Visual EDA Before Preprocessing

#### 1. Target Variable Distribution

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Class balance of the target: one bar per distinct Dropout_Likelihood value.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='Dropout_Likelihood', data=df, ax=ax)
ax.set_title("Dropout Likelihood Distribution")
plt.show()


#### 2. Numeric Column Distributions

In [None]:
# Histogram grid produced by DataFrame.hist, 30 bins per panel with
# black bar outlines so neighbouring bins stay distinguishable.
hist_options = {'figsize': (15, 10), 'bins': 30, 'edgecolor': 'black'}
df.hist(**hist_options)
plt.tight_layout()
plt.show()

#### 3. Box Plots to Spot Outliers

In [None]:
# Box plot of every numeric column, two per row, to eyeball outliers.
numeric_cols = df.select_dtypes(include='number').columns

# Size the grid from the data instead of hard-coding 4x2: plt.subplot raises
# ValueError as soon as there are more numeric columns than grid slots.
# Never going below 4 rows keeps the original 16x20 layout for <= 8 columns.
n_grid_cols = 2
n_grid_rows = max(4, -(-len(numeric_cols) // n_grid_cols))  # ceiling division

plt.figure(figsize=(16, 5 * n_grid_rows))
for i, col in enumerate(numeric_cols, 1):
    plt.subplot(n_grid_rows, n_grid_cols, i)
    sns.boxplot(y=df[col])
    plt.title(f'Box Plot of {col}')

# Adjust the layout once, after all axes exist, rather than on every iteration.
plt.tight_layout()
plt.show()


#### 4. Correlation Heatmap

In [None]:
# Correlations are only computed over the numeric columns.
numeric_df = df.select_dtypes(include='number')
corr_matrix = numeric_df.corr()

# Annotated heatmap of the pairwise correlation matrix.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Matrix")
plt.show()


## Preprocessing

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [None]:
# Remove the Student_ID column before any features are built.
# errors='ignore' keeps this cell re-runnable: because it rebinds `df`, a
# second run would otherwise raise KeyError once the column is already gone.
df = df.drop(columns=['Student_ID'], errors='ignore')

In [None]:
# Split the frame into the feature matrix X and the target y.
# The target is kept aside for later (it is optional for clustering).
target_column = 'Dropout_Likelihood'
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Group the feature columns by the kind of encoding they need.
categorical_nominal = ['Gender', 'Course_Name', 'Learning_Style']
categorical_ordinal = ['Education_Level']

# Numeric features: every int64/float64 column of X that is not ordinal.
numeric_dtype_cols = X.select_dtypes(include=['int64', 'float64']).columns
numerical_features = list(
    filter(lambda name: name not in categorical_ordinal, numeric_dtype_cols)
)

In [None]:
# Ordinal encoding of Education_Level: each label becomes its rank in
# education_order (0 = lowest level).
education_order = ['High School', 'Bachelor', 'Master', 'PhD']
education_mapping = {level: idx for idx, level in enumerate(education_order)}

# Map from the untouched column in `df` rather than from X, so re-running this
# cell does not feed already-encoded integers back through the mapping (which
# would turn the whole column into NaN).
X['Education_Level'] = df['Education_Level'].map(education_mapping)

# Labels missing from education_order are turned into NaN by .map(); report
# them instead of letting them vanish silently.
unmapped_levels = set(df['Education_Level'].dropna().unique()) - set(education_order)
if unmapped_levels:
    print(f"Warning: Education_Level values not in education_order were mapped to NaN: {sorted(unmapped_levels)}")

# Education_Level was still a string column when numerical_features was built,
# so it is in neither feature list and would be dropped by the
# ColumnTransformer. Register the encoded column as a numeric feature so it is
# imputed and scaled with the others. Rebuilding the list (instead of
# appending) keeps this cell idempotent.
numerical_features = [col for col in numerical_features
                      if col not in categorical_ordinal] + categorical_ordinal

In [None]:
# Per-type preprocessing pipelines.

# Numeric columns: fill gaps with the column mean, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numeric_steps)

# Nominal columns: fill gaps with the most frequent value, then one-hot encode
# into a dense array; categories unseen during fitting are ignored.
nominal_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
categorical_transformer = Pipeline(steps=nominal_steps)

In [None]:
# Route each group of columns to its pipeline. Columns listed in neither group
# are dropped (remainder='drop' is the default, spelled out here for clarity).
transformer_specs = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_nominal),
]
preprocessor = ColumnTransformer(transformers=transformer_specs, remainder='drop')

In [None]:
# Fit the preprocessor on X and transform X in the same step; the output
# holds the 'num' columns first, followed by the 'cat' (one-hot) columns.
X_preprocessed = preprocessor.fit_transform(X)

In [None]:
# Recover readable column names for the transformed matrix (optional, but it
# makes the result much easier to inspect as a DataFrame).
# The one-hot step expands each nominal column into several, so ask the fitted
# encoder for the names it generated.
onehot_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
ohe_cols = onehot_encoder.get_feature_names_out(categorical_nominal)

# Same order as the ColumnTransformer output: numeric first, then one-hot.
final_columns = [*numerical_features, *ohe_cols]

In [None]:
# Wrap the transformed matrix in a DataFrame labelled with final_columns.
preprocessed_df = pd.DataFrame(X_preprocessed, columns=final_columns)

In [None]:
# Display the preprocessed frame as the cell output.
preprocessed_df

In [None]:
# Save the preprocessed features without the index column. The path is
# relative, so the file is written to the current working directory.
preprocessed_df.to_csv("preprocessed_students.csv", index=False)

In [None]:
# Data type of each column in the preprocessed frame.
preprocessed_df.dtypes