# 작업형 2 : 학생성적 예측 (회귀)
- 데이터 설명 : 학생성적 예측 (종속변수 : G3)
- 평가지표 : rmse 혹은 r2_score

- 제출 형태
```
   StudentID         G3
0       1000  15.970918
1       1008  10.737522
2       1013  14.769053
3       1014   7.326618
4       1017  10.755363
```

[데이터 마님 작업형2 연습문제 - 회귀, 학생 성적 예측](https://www.datamanim.com/dataset/03_dataq/typetwo.html#id15)

[캐글 경진대회 링크](https://www.kaggle.com/datasets/kukuroo3/student-performance-data-set-competition-form/code?select=y_train.csv)

[캐글 개인 노트북 공유](https://www.kaggle.com/code/minjunim/lgbm-r2-score-0-88)

In [None]:
import numpy as np
import pandas as pd  # was aliased as `np`, which left `pd` undefined for every read_csv call below

# Load the data: training features, training labels and test features.
x_train = pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/X_train.csv")
y_train = pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/y_train.csv")
x_test = pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/studentscore/X_test.csv")

# Quick look at the shape and the first rows of each table.
print(x_train.shape)
print(x_train.head(), "\n")
print(x_test.shape)

print(x_test.head(), "\n")
print(y_train.head())


# 1. EDA
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None".

# 1-1) train EDA
x_train.info()
print("\n")
print(x_train.isnull().sum(), "\n")
print(x_train.describe(), "\n")

# 1-2) test EDA
x_test.info()
print("\n")
print(x_test.isnull().sum(), "\n")
print(x_test.describe(), "\n")

# 1-3) Compare the categorical columns of train and test
# (describe(include="O") reports count / unique / top / freq per object column)
print(x_train.describe(include="O"), "\n")
print(x_test.describe(include="O"), "\n")



# 2. Preprocessing
# 2-1) Handle missing values, remove the ID column, separate X and y.
# The original author notes that neither train nor test has missing values,
# so nothing is imputed or dropped here.

# The ID is not a feature: discard it from train ...
x_train = x_train.drop(columns=["StudentID"])

# ... and pull it out of test, keeping it for the submission file.
test_id = x_test.pop("StudentID")

# Labels were delivered as a separate table; only the target column is needed.
# NOTE(review): this relies on y_train rows being in the same order as
# x_train rows — confirm against the dataset.
y = y_train["G3"]

for preview in (x_train, x_test, y):
    print(preview.head(3), "\n")


# 2-2) Scaling
from sklearn.preprocessing import StandardScaler

# Numeric columns = every column whose dtype is not object.
con_cols = x_train.select_dtypes(exclude="object").columns

# Fit the scaler on train only, then apply the same transform to both sets.
scaler = StandardScaler()
scaler.fit(x_train[con_cols])
x_train[con_cols] = scaler.transform(x_train[con_cols])
x_test[con_cols] = scaler.transform(x_test[con_cols])

for scaled in (x_train, x_test):
    print(scaled.head(3), "\n")

# 2-3) Encoding
# There are many categorical columns, so label encoding is used.
from sklearn.preprocessing import LabelEncoder

# Categorical columns = every column with object dtype.
cat_cols = x_train.select_dtypes(include="object").columns

for col in cat_cols:
    le = LabelEncoder()
    # Fit on the values of train AND test: an encoder fitted on train alone
    # raises ValueError in transform() for any category that only appears in
    # test. When test adds no new category the resulting codes are unchanged,
    # because the encoder's classes are the sorted unique values either way.
    le.fit(x_train[col].tolist() + x_test[col].tolist())
    x_train[col] = le.transform(x_train[col])
    x_test[col] = le.transform(x_test[col])

print(x_train.head(3), "\n")
print(x_test.head(3), "\n")



# 3. Hold out a validation set (80% train / 20% validation, fixed seed)
from sklearn.model_selection import train_test_split

# NOTE: y_train is rebound here from the raw label table to the training
# part of the target.
splits = train_test_split(x_train, y, test_size=0.2, random_state=2023)
X_train, X_val, y_train, y_val = splits
print(X_train.shape, X_val.shape, "\n")



# 4. Modeling
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error


def _rmse_and_r2(actual, predicted):
    """Return (RMSE, R^2) of *predicted* against *actual*."""
    return np.sqrt(mean_squared_error(actual, predicted)), r2_score(actual, predicted)


# Random forest regressor, scored on the validation split
rf = RandomForestRegressor(random_state=2023)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_val)
rf_rmse, rf_r2 = _rmse_and_r2(y_val, rf_pred)

print("랜덤포레스트 rmse : ", rf_rmse)
print("랜덤포레스트 r2_score : ", rf_r2)

# LightGBM regressor, scored on the same validation split
lgbm = LGBMRegressor(random_state=2023)
lgbm.fit(X_train, y_train)
lgbm_pred = lgbm.predict(X_val)
lgbm_rmse, lgbm_r2 = _rmse_and_r2(y_val, lgbm_pred)

print("lgbm rmse : ", lgbm_rmse)
print("lgbm r2_score : ", lgbm_r2)


# Final model: lgbm
pred = lgbm.predict(x_test)
print(pred)

# 5. Submission: build the frame, write it to CSV
# The IDs set aside during preprocessing become the first column and the
# predictions the second.
submit = test_id.to_frame(name="StudentID").assign(G3=pred)
out_path = "grade.csv"
submit.to_csv(out_path, index=False)

# Read the file back to check what was actually written.
check = pd.read_csv(out_path)
print(check.head())

(678, 33)
   StudentID school sex  age address famsize Pstatus  Medu  Fedu      Mjob  \
0       1714     GP   F   18       U     GT3       T     4     3     other   
1       1254     GP   F   17       U     GT3       T     4     3    health   
2       1639     GP   F   16       R     GT3       T     4     4    health   
3       1118     GP   M   16       U     GT3       T     4     4  services   
4       1499     GP   M   19       U     GT3       T     3     2  services   

   ... romantic famrel freetime  goout  Dalc  Walc health absences  G1  G2  
0  ...       no      4        3      3     1     1      3        0  14  13  
1  ...      yes      4        4      3     1     3      4        0  13  15  
2  ...       no      2        4      4     2     3      4        6  10  11  
3  ...       no      5        3      3     1     3      5        0  15  13  
4  ...      yes      4        5      4     1     1      4        0   5   0  

[5 rows x 33 columns] 

(366, 33)
   StudentID school sex 