In [None]:
# ==========================================
# STEP 1: IMPORT LIBRARIES
# ==========================================
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression 


# STEP 2: IMPORT DATASET
# Load the CSV into `dataset` and normalise its column names.
file_path = r"D:\project Data Mining\student-por.csv"

try:
    dataset = pd.read_csv(file_path)
except FileNotFoundError:
    print(" Error: File not found. Please check the path.")
    # Re-raise so execution stops here with the real cause; otherwise the
    # next statement would fail with an unrelated NameError on `dataset`.
    raise
else:
    print(" Data Loaded Successfully.")

# Clean column names (strip leading/trailing whitespace from the headers)
dataset.columns = dataset.columns.str.strip()


# STEP 3: HANDLING MISSING DATA (CRITICAL STEP)

# Check and print missing values BEFORE handling
print("\n--- Missing Data Check (Before) ---")
missing_count = dataset.isnull().sum()
print(missing_count[missing_count > 0])

# The target column (G3, extracted in STEP 4) must not be mean-filled: that
# would fabricate labels. Rows without a target are dropped instead.
if 'G3' in dataset.columns and dataset['G3'].isnull().any():
    dataset = dataset.dropna(subset=['G3']).reset_index(drop=True)

# Strategy: replace missing values with the column mean.
# Applied ONLY to numerical columns; text columns are left untouched.
# NOTE(review): the imputer is fit on all rows, before the train/test split
# in STEP 6, so test rows influence the fill values. Fit on the training
# split only if a leak-free evaluation is required.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Select numerical columns automatically (any numeric dtype, not only
# int64/float64).
numerical_cols = dataset.select_dtypes(include='number').columns

# Impute every numeric column except the target; skip the call entirely when
# there is nothing to impute, since the imputer rejects an empty selection.
impute_cols = numerical_cols.drop('G3', errors='ignore')
if len(impute_cols) > 0:
    dataset[impute_cols] = imputer.fit_transform(dataset[impute_cols])

# Check and print missing values AFTER handling (should be empty unless
# non-numerical columns contain gaps)
print("\n--- Missing Data Check (After) ---")
missing_after = dataset.isnull().sum()
if missing_after.sum() == 0:
    print(" All missing values have been handled successfully.")
else:
    print(" Warning: Some missing values remain!")
    print(missing_after[missing_after > 0])

# STEP 4: DEFINE FEATURES (X) AND TARGET (y)

# The grade columns G1/G2/G3 are removed from the features; G3 is the target.
df_features = dataset.drop(columns=['G1', 'G2', 'G3'])
X = df_features.values
y = dataset['G3'].values

# STEP 5: ENCODING CATEGORICAL DATA

# Positional indices are needed because X is a plain NumPy array without
# column names. Every non-numeric column is treated as categorical, so
# string/category dtypes are caught as well as plain `object` columns.
categorical_indices = [
    i for i, col in enumerate(df_features.columns)
    if not pd.api.types.is_numeric_dtype(df_features[col])
]

# sparse_threshold=0 forces a dense result. Without it the transformer may
# return a SciPy sparse matrix, which np.array() would wrap into a useless
# 0-d object array instead of a 2-D feature matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), categorical_indices)],
    remainder='passthrough',
    sparse_threshold=0,
)

# After encoding every column is numeric, so store X as a float matrix
# rather than the object array produced by stacking mixed-type columns.
X = np.asarray(ct.fit_transform(X), dtype=float)

print("\n Categorical Data Encoded.")


# STEP 6: SPLITTING THE DATASET

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# STEP 7: FEATURE SCALING

# with_mean=False: columns are scaled but not centred.
sc = StandardScaler(with_mean=False)

# Scaling statistics are learned from the training rows only and then reused
# unchanged for the test rows.
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

print("\n Preprocessing Complete.")
print(f"Training Data Size: {len(X_train)}")
print(f"Testing Data Size: {len(X_test)}")