In [14]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
# Explicit alias: the bare name `LinearRegression` is rebound to the Spark
# estimator by the pyspark import further down.
from sklearn.linear_model import LinearRegression as SklearnLinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler
# NOTE: this import shadows scikit-learn's LinearRegression imported above;
# after this cell the bare name refers to pyspark.ml.regression.LinearRegression.
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession

In [2]:
# Categorical columns to encode; note the leading space in each header name.
ordinal_features = [' khoa', ' hedt', ' chuyennganh2']

df = pd.read_csv("final_data.csv")

# Missing categories are replaced with the literal 'Unknown' before encoding.
df[ordinal_features] = df[ordinal_features].fillna('Unknown')

# Encode each categorical column as an ordinal code in a sibling "<name>_mahoa"
# column, aligned on the original row index.
encoder = OrdinalEncoder()
df_encoded = pd.DataFrame(
    encoder.fit_transform(df[ordinal_features]),
    columns=[f"{name}_mahoa" for name in ordinal_features],
    index=df.index,
)

# Append the encoded columns, then remove the raw categorical ones.
df = pd.concat([df, df_encoded], axis=1).drop(columns=ordinal_features)

print(df.head())

                                       mssv  hocky  namhoc  dtbhk  sotchk  \
0  0000AC05XPvAibaEXe9B2tolTZ0JLoBGbkQixQS6    2.0  2021.0   8.28    21.0   
1  0000AC05XPvAibaEXe9B2tolTZ0JLoBGbkQixQS6    1.0  2022.0   7.56    16.0   
2  0001EB57XPvAibaEXe/twT+sf632fUXnsgPGeB4G    2.0  2019.0   9.00    21.0   
3  0001EB57XPvAibaEXe/twT+sf632fUXnsgPGeB4G    1.0  2020.0   9.11    15.0   
4  0001EB57XPvAibaEXe/twT+sf632fUXnsgPGeB4G    2.0  2020.0   8.75    19.0   

        id   namsinh   gioitinh     noisinh      lopsh   khoahoc   tinhtrang  \
0      NaN       NaN        NaN         NaN        NaN       NaN         NaN   
1      NaN       NaN        NaN         NaN        NaN       NaN         NaN   
2  18570.0    2001.0        0.0   'Nghệ An'   TMĐT2019      14.0         1.0   
3  18570.0    2001.0        0.0   'Nghệ An'   TMĐT2019      14.0         1.0   
4  18570.0    2001.0        0.0   'Nghệ An'   TMĐT2019      14.0         1.0   

     diachi_tinhtp Column1  dtbhk2   khoa_mahoa   hedt_m

In [3]:
# Impute missing values: numeric columns get their median first, then any
# column that still has gaps gets its most frequent value.
df = df.fillna(df.median(numeric_only=True))
df = df.fillna(df.mode().iloc[0])

# Select the input features and the target variable
X = df.drop(columns=['nhom', 'dtbhk', 'id', ' namsinh', 'mssv', ' noisinh', ' diachi_tinhtp', 'Column1', ' lopsh'], errors='ignore')  # Drop columns that are not used as features
y = df['dtbhk']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Use the explicitly aliased scikit-learn estimator. The bare name
# `LinearRegression` is rebound to pyspark.ml's estimator by the import cell,
# and that class cannot be fitted with fit(X_train, y_train) on pandas data.
lr_model = SklearnLinearRegression()

In [4]:
# Fit on the training split, then predict on the held-out test split
# (fit() returns the fitted estimator itself, so lr_model stays trained).
y_pred = lr_model.fit(X_train, y_train).predict(X_test)

# Evaluate the predictions against the true test targets
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is derived from MSE rather than recomputed
r2 = r2_score(y_test, y_pred)

print(
    f"Mean Absolute Error (MAE): {mae}",
    f"Mean Squared Error (MSE): {mse}",
    f"Root Mean Squared Error (RMSE): {rmse}",
    f"R^2 Score: {r2}",
    sep="\n",
)

Mean Absolute Error (MAE): 0.9144273316818844
Mean Squared Error (MSE): 1.792692656017364
Root Mean Squared Error (RMSE): 1.3389147306745728
R^2 Score: 0.5175107420501133


In [2]:
# Start (or reuse) the Spark session for this notebook.
# getOrCreate() returns an already-running session unchanged, so whichever
# cell creates the session first decides the application name. Use the same
# name as the modelling cell below so that name is the one that takes effect.
spark = (
    SparkSession.builder
    .appName("StudentGradePrediction")
    .getOrCreate()
)



25/04/08 22:10:44 WARN Utils: Your hostname, kpubuntu resolves to a loopback address: 127.0.1.1; using 10.0.240.155 instead (on interface wlp0s20f3)
25/04/08 22:10:44 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/08 22:10:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [16]:
# Reuse the running Spark session if there is one, otherwise start it
spark = (
    SparkSession.builder
    .appName("StudentGradePrediction")
    .getOrCreate()
)

# Load data (first row is the header; column types are inferred)
df = spark.read.csv("/DoAn/final_data.csv", header=True, inferSchema=True)

# Handle null values in ordinal features: one fill call covering all of them
ordinal_features = [" khoa", " hedt", " chuyennganh2"]
df = df.na.fill("Unknown", subset=ordinal_features)

# One StringIndexer per ordinal feature, writing to "<name>_mahoa";
# handleInvalid="keep" is set on each indexer.
indexers = []
for feature_name in ordinal_features:
    indexers.append(
        StringIndexer(
            inputCol=feature_name,
            outputCol=f"{feature_name}_mahoa",
            handleInvalid="keep",
        )
    )


25/04/08 22:30:48 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [17]:
# Columns excluded from the imputation below: the label 'dtbhk', the columns
# listed by name, and the raw categorical columns that the indexers re-encode.
columns_to_drop = ['nhom', 'dtbhk', 'id', ' namsinh', 'mssv', ' noisinh', ' diachi_tinhtp', 'Column1', ' lopsh'] + ordinal_features

# Define the pipeline stages (currently only the string indexers)
pipeline_stages = indexers.copy()

# Fit the pipeline and apply it to add the "<name>_mahoa" columns
pipeline = Pipeline(stages=pipeline_stages)
transformed_df = pipeline.fit(df).transform(df)

# Display the transformed data
transformed_df.show(5)

# Handle remaining nulls - median for numeric columns.
# Every Spark numeric type name is accepted, not only 'integer'/'double', so
# columns inferred as e.g. 'long' are median-filled instead of mode-filled.
numeric_type_names = {"byte", "short", "integer", "long", "float", "double", "decimal"}
numeric_cols = [
    field.name
    for field in transformed_df.schema.fields
    if field.name not in columns_to_drop
    and field.dataType.typeName() in numeric_type_names
]

# Compute all medians in a single approxQuantile call and fill them at once.
# A column with no non-null values yields an empty quantile list; it is
# skipped instead of raising IndexError.
medians = {}
if numeric_cols:
    quantiles = transformed_df.approxQuantile(numeric_cols, [0.5], 0.001)
    medians = {
        name: values[0]
        for name, values in zip(numeric_cols, quantiles)
        if values
    }
if medians:
    transformed_df = transformed_df.na.fill(medians)

# For the remaining non-numeric columns, fill with the mode.
# NULLs are filtered out before counting: otherwise a mostly-null column would
# report NULL as its most frequent value, which is not a usable fill value.
modes = {}
for name in transformed_df.columns:
    if name in numeric_cols or name in columns_to_drop:
        continue
    top_row = (
        transformed_df
        .where(transformed_df[name].isNotNull())
        .groupBy(name)
        .count()
        .orderBy("count", ascending=False)
        .first()
    )
    # first() returns None when the column has no non-null values at all
    if top_row is not None:
        modes[name] = top_row[0]
if modes:
    transformed_df = transformed_df.na.fill(modes)


+--------------------+-----+------+-----+------+-------+--------+---------+----------+---------+-------+-------+--------+-------------+----------+---------------+-------+------+-----------+-----------+-------------------+
|                mssv|hocky|namhoc|dtbhk|sotchk|     id| namsinh| gioitinh|   noisinh|    lopsh|   khoa|   hedt| khoahoc| chuyennganh2| tinhtrang|  diachi_tinhtp|Column1|dtbhk2| khoa_mahoa| hedt_mahoa| chuyennganh2_mahoa|
+--------------------+-----+------+-----+------+-------+--------+---------+----------+---------+-------+-------+--------+-------------+----------+---------------+-------+------+-----------+-----------+-------------------+
|0000AC05XPvAibaEX...|  2.0|2021.0| 8.28|  21.0|   NULL|    NULL|     NULL|      NULL|     NULL|Unknown|Unknown|    NULL|      Unknown|      NULL|           NULL|   NULL|  7.79|        3.0|        2.0|                1.0|
|0000AC05XPvAibaEX...|  1.0|2022.0| 7.56|  16.0|   NULL|    NULL|     NULL|      NULL|     NULL|Unknown|Unknown|

In [18]:
# Every column that is not explicitly excluded becomes a model feature
feature_cols = list(
    filter(lambda name: name not in columns_to_drop, transformed_df.columns)
)

# Pack the feature columns into a single vector column named "features"
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
assembled_df = assembler.transform(transformed_df)

# 80/20 train/test split with a fixed seed
train_df, test_df = assembled_df.randomSplit([0.8, 0.2], seed=42)

# Create and train the linear regression model on the training split
lr = LinearRegression(featuresCol="features", labelCol="dtbhk")
lr_model = lr.fit(train_df)

# Make predictions on the test split
predictions = lr_model.transform(test_df)

# Evaluate the model: one evaluator, one pass per metric, in a fixed order
evaluator = RegressionEvaluator(labelCol="dtbhk", predictionCol="prediction")
mae, mse, rmse, r2 = [
    evaluator.evaluate(predictions, {evaluator.metricName: metric_name})
    for metric_name in ("mae", "mse", "rmse", "r2")
]

print(
    f"Mean Absolute Error (MAE): {mae}",
    f"Mean Squared Error (MSE): {mse}",
    f"Root Mean Squared Error (RMSE): {rmse}",
    f"R^2 Score: {r2}",
    sep="\n",
)

# Save the model, replacing any previously saved copy at this path
lr_model.write().overwrite().save("spark_lr_model")

# Stop the Spark session
spark.stop()

25/04/08 22:31:14 WARN Instrumentation: [db087753] regParam is zero, which might cause numerical instability and overfitting.
25/04/08 22:31:17 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/04/08 22:31:18 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
                                                                                

Mean Absolute Error (MAE): 0.899235506863826
Mean Squared Error (MSE): 1.7005400994995126
Root Mean Squared Error (RMSE): 1.30404758329576
R^2 Score: 0.5283404378295444


                                                                                