In [None]:
from azureml.core import Dataset, Workspace

# Resolve the workspace from its saved configuration, look up the registered
# "student-data" dataset, and materialize it as a pandas DataFrame.
ws = Workspace.from_config()
dataset = Dataset.get_by_name(workspace=ws, name="student-data")
df = dataset.to_pandas_dataframe()

In [None]:
# Remove the G1 and G2 columns. `df` is rebound to the result, so this cell
# expects a frame that still contains both columns (re-running it on the
# already-reduced frame raises KeyError).
df = df.drop(labels=['G1', 'G2'], axis='columns')


In [None]:
from azureml.core import Workspace, Dataset

# Register the current in-memory frame as a new tabular dataset, stored in the
# workspace's default datastore.
ws = Workspace.from_config()
datastore = ws.get_default_datastore()

cleaned_dataset = Dataset.Tabular.register_pandas_dataframe(
    name="student-data-no-g1-g2",
    description="Student dataset without G1/G2",
    dataframe=df,
    target=datastore,
)


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# Features whose importance falls below this value are listed as drop candidates.
IMPORTANCE_THRESHOLD = 0.01

# Work on a copy of `df` so label-encoding does not modify it. The cell
# previously copied `df_preview`, a name no cell defines, which raised
# NameError on a fresh kernel.
df_encoded = df.copy()

# Replace every object-typed column with integer codes; a separate encoder is
# fitted per column.
for col in df_encoded.select_dtypes(include=['object']).columns:
    df_encoded[col] = LabelEncoder().fit_transform(df_encoded[col])

X = df_encoded.drop(columns=['G3'])
y = df_encoded['G3']

# Bucket the G3 target into three classes using the right-closed intervals
# (-1, 10], (10, 15] and (15, 20].
y_class = pd.cut(y, bins=[-1,10,15,20], labels=['low','medium','high'])

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y_class)

# One row per feature, most important first.
feat_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf.feature_importances_
}).sort_values(by='importance', ascending=False)

print(feat_importance)

low_importance = feat_importance[feat_importance['importance'] < IMPORTANCE_THRESHOLD]
print("\nFeatures that can likely be dropped:", low_importance['feature'].tolist())


In [None]:
from azureml.core import Dataset, Workspace

ws = Workspace.from_config()

# Reload the registered dataset that has no G1/G2 and remove the 'higher'
# column from it.
cleaned_dataset = Dataset.get_by_name(workspace=ws, name="student-data-no-g1-g2")
df = cleaned_dataset.to_pandas_dataframe().drop(labels=['higher'], axis='columns')

# Register the reduced frame under its own name in the default datastore.
datastore = ws.get_default_datastore()
final_dataset = Dataset.Tabular.register_pandas_dataframe(
    name="student-data-final",
    description="Student dataset without G1, G2 and higher",
    dataframe=df,
    target=datastore,
)

print("New dataset registered:", final_dataset.name)


In [None]:
from azureml.core import Workspace, Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Load the final registered dataset and separate the G3 target from the features.
ws = Workspace.from_config()
dataset = Dataset.get_by_name(ws, name='student-data-final')
df = dataset.to_pandas_dataframe()

X = df.drop(columns=['G3'])
y = df['G3']

cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
# Select every numeric width plus bool, not only int64/float64: ColumnTransformer
# drops any column that is not listed in a transformer (remainder='drop' is the
# default), so an int32/float32/bool column would otherwise vanish silently.
num_cols = X.select_dtypes(include=['number', 'bool']).columns.tolist()

# Make any remaining silent drop visible instead of hiding it.
assigned_cols = set(cat_cols) | set(num_cols)
unassigned_cols = [col for col in X.columns if col not in assigned_cols]
if unassigned_cols:
    print("Warning: columns ignored by the preprocessor:", unassigned_cols)

# One-hot encode categoricals (categories unseen during fit encode as all
# zeros at predict time); pass numeric columns through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
        ('num', 'passthrough', num_cols)
    ]
)

# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.pipeline import Pipeline

# Two tree-based regressors with the same size and seed, combined by a
# VotingRegressor.
rf = RandomForestRegressor(random_state=42, n_estimators=100)
gb = GradientBoostingRegressor(random_state=42, n_estimators=100)

# Preprocessing lives inside the pipeline so it is fitted on the training
# split only and travels with the model.
voting_model = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('ensemble', VotingRegressor(estimators=[('rf', rf), ('gb', gb)])),
])

voting_model.fit(X_train, y_train)


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Score the fitted pipeline on the held-out split.
y_pred = voting_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)  # derived from MSE rather than recomputed

# One metric per line, four decimal places each.
print(
    f"MSE: {mse:.4f}",
    f"RMSE: {rmse:.4f}",
    f"MAE: {mae:.4f}",
    f"R2: {r2:.4f}",
    sep="\n",
)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Predicted vs actual: points on the dashed red diagonal are exact predictions.
grade_min, grade_max = y_test.min(), y_test.max()
fig_scatter, ax_scatter = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred, ax=ax_scatter)
ax_scatter.plot([grade_min, grade_max], [grade_min, grade_max], 'r--')
ax_scatter.set_xlabel("Actual G3")
ax_scatter.set_ylabel("Predicted G3")
ax_scatter.set_title("Predicted vs Actual Grades")
plt.show()

# Residual distribution: histogram of (actual - predicted) with a KDE overlay.
residuals = y_test - y_pred
fig_resid, ax_resid = plt.subplots(figsize=(8, 6))
sns.histplot(residuals, kde=True, bins=20, ax=ax_resid)
ax_resid.set_xlabel("Residuals (Actual - Predicted)")
ax_resid.set_title("Residual Distribution")
plt.show()


In [None]:
import joblib

# Serialize the whole pipeline (preprocessing + ensemble) to a file at this
# relative path; the call's return value is left as the cell output.
joblib.dump(value=voting_model, filename="votingensemble_student_final.pkl")


In [None]:
from azureml.core import Model, Workspace

ws = Workspace.from_config()

# Register the serialized pipeline file as a model in the workspace, then
# report the name and version the service assigned.
model = Model.register(
    model_name="votingensemble_student_final",
    model_path="votingensemble_student_final.pkl",
    description="Voting ensemble model predicting G3",
    workspace=ws,
)
print(f"Model registered: {model.name}, version: {model.version}")
