In [69]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
import lightgbm as lgb
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
import matplotlib.pyplot as plt

In [4]:
y_true = [1, 0, 1, 1, 0, 1]
y_pred = [1, 0, 1, 0, 0, 1]

print(accuracy_score(y_true, y_pred))
print(precision_score(y_true, y_pred))
print(recall_score(y_true, y_pred))
print(f1_score(y_true, y_pred))

0.8333333333333334
1.0
0.75
0.8571428571428571


In [6]:
confusion_matrix(y_true, y_pred)

array([[2, 0],
       [1, 3]])

In [10]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.67      1.00      0.80         2
           1       1.00      0.75      0.86         4

    accuracy                           0.83         6
   macro avg       0.83      0.88      0.83         6
weighted avg       0.89      0.83      0.84         6



# 1. Iris 분류 모델

In [11]:
from sklearn.datasets import load_iris

In [15]:
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['target'] = iris.target
df['target_name'] = df['target'].map(lambda x: iris.target_names[x])
print(df.head())

print(df['target_name'].value_counts())
print(len(df))

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target target_name  
0       0      setosa  
1       0      setosa  
2       0      setosa  
3       0      setosa  
4       0      setosa  
target_name
setosa        50
versicolor    50
virginica     50
Name: count, dtype: int64
150


In [28]:
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y)

In [29]:
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(120, 4) (30, 4)
(120,) (30,)


In [30]:
model = LogisticRegression(max_iter = 200)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,200


In [31]:
y_pred = model.predict(X_test)
print("정확도:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names = iris.target_names))


정확도: 1.0
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      1.00      1.00        10
   virginica       1.00      1.00      1.00        10

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [67]:
penguins = pd.read_csv('../data/penguins_size.csv')
penguins = penguins.dropna(subset=['culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'body_mass_g'])

model = lgb.LGBMClassifier()

X = penguins[['culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'body_mass_g']]
y = []
for i in range(len(X)):
    penguins_species = penguins['species'].iloc[i]
    penguins_island = penguins['island'].iloc[i]
    sum = penguins_species + penguins_island
    y.append(sum)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("정확도:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000145 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 223
[LightGBM] [Info] Number of data points in the train set: 256, number of used features: 4
[LightGBM] [Info] Start training from score -2.111190
[LightGBM] [Info] Start training from score -1.783977
[LightGBM] [Info] Start training from score -2.048670
[LightGBM] [Info] Start training from score -1.537844
[LightGBM] [Info] Start training from score -1.001883
정확도: 0.627906976744186
                 precision    recall  f1-score   support

   AdelieBiscoe       0.30      0.23      0.26        13
    AdelieDream       0.33      0.54      0.41        13
AdelieTorgersen       0.17      0.11      0.13        18
 ChinstrapDream       0.93      1.00      0.96        13
   GentooBiscoe       1.00      1.00      1.00        29

       accuracy                           0.63        86
      macro avg      

# 2. 회귀

In [70]:
housing = fetch_california_housing()
df = pd.DataFrame(housing.data, columns=housing.feature_names)
df['target'] = housing.target
df.head()


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [71]:
def evaluate_regression(y_true, y_pred):
    print("MSE:", mean_squared_error(y_true, y_pred))
    print("MAE:", mean_absolute_error(y_true, y_pred))
    print("R²:", r2_score(y_true, y_pred))

In [72]:
X = housing.data
y = housing.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## 1. 선형 회귀

In [73]:
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("=== Linear Regression ===")
evaluate_regression(y_test, y_pred)

=== Linear Regression ===
MSE: 0.5558915986952425
MAE: 0.5332001304956989
R²: 0.5757877060324521


## 2. 릿지 회귀

In [95]:
model = Ridge(alpha=1.0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("=== Ridge Regression ===")
evaluate_regression(y_test, y_pred)

=== Ridge Regression ===
MSE: 0.5558034669932196
MAE: 0.5332039182571576
R²: 0.5758549611440138


## 3. 라쏘 회귀

In [84]:
model = Lasso(alpha=0.01)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("=== Lasso Regression ===")
evaluate_regression(y_test, y_pred)

=== Lasso Regression ===
MSE: 0.544449158124652
MAE: 0.5362504645663381
R²: 0.5845196673976367


## 4. 의사결정트리 회귀

In [93]:
model = DecisionTreeRegressor(max_depth=9, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("=== Decision Tree Regression ===")
evaluate_regression(y_test, y_pred)

=== Decision Tree Regression ===
MSE: 0.4150681340811693
MAE: 0.43568046661955595
R²: 0.6832529836308363


## 5. 랜덤 포레스트 회귀

In [77]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("=== Random Forest Regression ===")
evaluate_regression(y_test, y_pred)

=== Random Forest Regression ===
MSE: 0.2553684927247781
MAE: 0.32754256845930246
R²: 0.8051230593157366


## 6. 서포트 벡터 회귀(SVR)

In [78]:
model = SVR(kernel='rbf', C=10, epsilon=0.1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("=== Support Vector Regression ===")
evaluate_regression(y_test, y_pred)


=== Support Vector Regression ===
MSE: 1.109306666935002
MAE: 0.7837683817103089
R²: 0.1534653033100083


## 7. 그래디언트 부스팅 회귀

In [79]:
model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("=== Gradient Boosting Regression ===")
evaluate_regression(y_test, y_pred)


=== Gradient Boosting Regression ===
MSE: 0.2939973248643864
MAE: 0.3716425690425596
R²: 0.7756446042829697
