### Machine Learning Class Activity
Group Member: Guo Zimo (20233006327), Xie Bingcai (20233006377)

In [173]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score, accuracy_score

## Part A: Data Preprocessing
1. Load data and check missing values.

In [174]:
df = pd.read_csv("ML-Activity-Data.csv")

print(df.shape)

print(df.isnull().sum())

(40, 15)
ID                                                          0
56947: \nHours studied per week (e.g., 6)\n\n               0
56948: \nAttendance percentage (e.g., 60)\n\n               0
56949: Average sleep hours per day (e.g., 6)                0
56950: Average daily social media use in hours (e.g., 2)    0
56951: Midterm score in % \n\n\n\n                          0
56952: \nInterest in Machine Learning\n\n                   0
56953: Confidence in programming                            0
56954: \nHours of exercise per week (e.g., 6)\n\n           0
56955: \nGroup work preference\n\n                          0
56956: English understanding level                          0
56957: Favorite study method                                0
56958: \nAverage caffeine drinks per day (e.g., 3)          0
56960: Grade Expectations                                   0
Target grade label                                          0
dtype: int64


2. Change the name of features.

In [175]:
df.columns = ["ID", "study hour", "attendance", "sleep hour", "phone use", "mid score", 
              "interest", "confidence", "sport hour", "group work", "English", "study method", 
              "caffine", "grade expect", "target grade"]

print(df.head())

   ID  study hour  attendance  sleep hour  phone use  mid score interest  \
0   1           7          95         8.0        2.0         75   medium   
1   2           8          95         8.0        3.0         80   medium   
2   3          20         100         6.0        2.0         40   medium   
3   4           7         100         7.5        4.0         95     high   
4   5           9         100         5.0        5.0        100     high   

  confidence  sport hour   group work       English study method  caffine  \
0        low           5  Small Group  Intermediate       Visual      1.0   
1     medium           2  Small Group  Intermediate     Hands-on      2.0   
2        low           2  Small Group  Intermediate       Visual      2.0   
3       high           3  Small Group       Advance     Hands-on      1.0   
4       high           3   Individual       Advance     Auditory      1.0   

  grade expect                   target grade  
0            B  Pass if i work h

3. Drop the first feature ID which is not important to model prediction.

In [176]:
df = df.drop(["ID"], axis=1)
print(df.shape)

(40, 14)


4. Use feature scaling to numerical data and ont-hot encoder to catogery data.

In [177]:
X = df.drop(["target grade"], axis=1)  
y = df["target grade"]

y = y.replace(["Definitely Pass", "Probably Pass", "Pass if i work hard from now.", "Not sure", "Probably Fail if i don't work hard."], [5, 4, 3, 2, 1])

preprocesser = ColumnTransformer(transformers=[
    ("num", StandardScaler(), ["study hour", "attendance", "sleep hour", "phone use", "mid score", "sport hour", "caffine"]),
    ("cat", OneHotEncoder(), ["interest", "confidence", "group work", "English", "study method", "grade expect"])
    
])

X_preprocessed = preprocesser.fit_transform(X)

  y = y.replace(["Definitely Pass", "Probably Pass", "Pass if i work hard from now.", "Not sure", "Probably Fail if i don't work hard."], [5, 4, 3, 2, 1])


5. Split training data set ang test data set.

In [178]:
random_state = 12704 # 6377 + 6327
X_trainging, X_test, y_training, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=random_state)

# print(X_trainging.shape)
# print(X_test.shape)
# print(y_training.shape)
# print(y_test.shape)
# print(y_test.shape)
# print(y_test)
# print(X_trainging[:3, :])

## Part B: Training Model

1. Ridge regression model.

In [179]:
ridgeClf_model = RidgeClassifier(alpha=0.4, random_state=random_state)

ridgeClf_model.fit(X_trainging, y_training)

y_pred = ridgeClf_model.predict(X_test)

y_pred = np.round(y_pred)

In [180]:
accuracy = accuracy_score(y_test, y_pred)
print("y text:", y_test)
print("y pred:", y_pred)
print(f"Accuracy: {accuracy:.4f}")

print("R2 score: ", r2_score(y_test, y_pred))

y text: 11    3
37    5
39    5
34    3
17    4
18    5
9     5
14    5
Name: target grade, dtype: int64
y pred: [5 5 5 5 4 5 5 5]
Accuracy: 0.7500
R2 score:  -0.36170212765957444


5. Logistic regression model.

In [181]:
logistic_model = LogisticRegression(penalty="l2", multi_class="ovr", max_iter=1000)

logistic_model.fit(X_trainging, y_training)

y_pred = logistic_model.predict(X_test)

y_pred = np.round(y_pred)

In [182]:
accuracy = accuracy_score(y_test, y_pred)
print("y text:", y_test)
print("y pred:", y_pred)
print(f"Accuracy: {accuracy:.4f}")

print("R2 score: ", r2_score(y_test, y_pred))

y text: 11    3
37    5
39    5
34    3
17    4
18    5
9     5
14    5
Name: target grade, dtype: int64
y pred: [5 5 5 5 4 5 5 5]
Accuracy: 0.7500
R2 score:  -0.36170212765957444


2. Decision tree model.

In [183]:
decisiontree_model = DecisionTreeClassifier(max_depth=6, random_state=random_state)

decisiontree_model.fit(X_trainging, y_training)

y_pred = decisiontree_model.predict(X_test)

y_pred = np.round(y_pred)

In [184]:
accuracy = accuracy_score(y_test, y_pred)
print("y text:", y_test)
print("y pred:", y_pred)
print(f"Accuracy: {accuracy:.4f}")

print("R2 score: ", r2_score(y_test, y_pred))

y text: 11    3
37    5
39    5
34    3
17    4
18    5
9     5
14    5
Name: target grade, dtype: int64
y pred: [5 5 5 5 4 4 5 5]
Accuracy: 0.6250
R2 score:  -0.5319148936170213


3. Gaussian Naive Bayes model.

In [185]:
GNB_model = GaussianNB()

GNB_model.fit(X_trainging, y_training)

y_pred = GNB_model.predict(X_test)

y_pred = np.round(y_pred)

In [186]:
accuracy = accuracy_score(y_test, y_pred)
print("y text:", y_test)
print("y pred:", y_pred)
print(f"Accuracy: {accuracy:.4f}")

print("R2 score: ", r2_score(y_test, y_pred))

y text: 11    3
37    5
39    5
34    3
17    4
18    5
9     5
14    5
Name: target grade, dtype: int64
y pred: [4 5 5 4 4 4 5 3]
Accuracy: 0.5000
R2 score:  -0.1914893617021276


4. Support vector machine model.

In [187]:
svm_model = SVC(kernel="linear", random_state=random_state)

svm_model.fit(X_trainging, y_training)

y_pred = svm_model.predict(X_test)

y_pred = np.round(y_pred)

In [188]:
accuracy = accuracy_score(y_test, y_pred)
print("y text:", y_test)
print("y pred:", y_pred)
print(f"Accuracy: {accuracy:.4f}")

print("R2 score: ", r2_score(y_test, y_pred))

y text: 11    3
37    5
39    5
34    3
17    4
18    5
9     5
14    5
Name: target grade, dtype: int64
y pred: [5 5 5 5 4 4 5 3]
Accuracy: 0.5000
R2 score:  -1.2127659574468086


Random forest model.

In [189]:
forest_model = RandomForestClassifier(n_estimators=1000, random_state=random_state)

forest_model.fit(X_trainging, y_training)

y_pred = forest_model.predict(X_test)

y_pred = np.round(y_pred)

In [190]:
accuracy = accuracy_score(y_test, y_pred)
print("y text:", y_test)
print("y pred:", y_pred)
print(f"Accuracy: {accuracy:.4f}")

print("R2 score: ", r2_score(y_test, y_pred))

y text: 11    3
37    5
39    5
34    3
17    4
18    5
9     5
14    5
Name: target grade, dtype: int64
y pred: [5 5 5 5 5 5 5 3]
Accuracy: 0.5000
R2 score:  -1.2127659574468086
