<a href="https://colab.research.google.com/github/JK-the-Ko/Thermo-Fluid-Dynamics-Experiment/blob/main/2022-2/%EC%97%B4%EC%9C%A0%EC%B2%B4%EA%B3%B5%ED%95%99%EC%8B%A4%ED%97%98_Week_9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Regression

## Import Library

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Import Scikit-Learn

In [None]:
import sklearn

In [None]:
sklearn.__version__

## Get Regression Dataset

In [None]:
from sklearn import datasets

In [None]:
data = datasets.load_diabetes(as_frame = True)

In [None]:
x, y = data.data, data.target

In [None]:
# Keep the column labels now: x is converted to a bare NumPy array during
# preprocessing, and the feature-importance plots use these names as their index.
inputFeatures = x.columns
print(inputFeatures)

## Dataset Preprocessing

In [None]:
from sklearn.preprocessing import MinMaxScaler

### Scale Input Dataset

In [None]:
x, y = np.array(x), np.array(y)

In [None]:
# Rescale every input feature with a min-max scaler (fit and transform in one step).
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split further down, so test-set minima/maxima influence the scaling.
mmScalerX = MinMaxScaler()
x = mmScalerX.fit_transform(x)

### Scale Target Dataset

In [None]:
# The scaler works on 2-D input, so the 1-D target is viewed as a single column
# for scaling and flattened back to 1-D afterwards. mmScalerY is kept so the
# predictions can be mapped back to the original units during evaluation.
mmScalerY = MinMaxScaler()
y = mmScalerY.fit_transform(y.reshape(-1, 1)).reshape(-1)

### Split Dataset

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size = 0.3, random_state = 42)

In [None]:
# Report the (inputs, targets) shapes of the training split, then the test split.
for features, targets in ((xTrain, yTrain), (xTest, yTest)) :
  print(features.shape, targets.shape)

## Linear Regression

In [None]:
def getParameter(x: np.ndarray, y: np.ndarray) -> np.ndarray :
  """Estimate ordinary-least-squares coefficients for ``y ~ x @ beta``.

  The coefficients are obtained with ``np.linalg.lstsq`` rather than by
  explicitly inverting ``x.T @ x``: the result is the same for well-conditioned
  inputs, but a rank-deficient design matrix (e.g. duplicated or collinear
  columns) yields the minimum-norm solution instead of raising
  ``LinAlgError`` or returning numerically meaningless values.

  No intercept column is added; if a bias term is wanted the caller has to
  append a column of ones to ``x`` (and to the matrix used for prediction).

  Args:
    x: Design matrix of shape (n_samples, n_features).
    y: Targets of shape (n_samples,) or (n_samples, n_targets).

  Returns:
    Coefficient array of shape (n_features,) or (n_features, n_targets).
  """
  x = np.asarray(x, dtype = float)
  y = np.asarray(y, dtype = float)

  # rcond = None selects the machine-precision based cut-off for small singular values.
  output, _, _, _ = np.linalg.lstsq(x, y, rcond = None)

  return output

In [None]:
betaHat = getParameter(xTrain, yTrain)

In [None]:
# Predict on the held-out inputs. Features and the fitted target were both
# min-max scaled, so yTestHat is still in scaled units at this point.
# NOTE(review): no column of ones was added to the inputs, so this linear model
# has no intercept term.
yTestHat = np.matmul(xTest, betaHat)

### Model Evaluation

In [None]:
def plotBarChart(yTest, yTestHat, nSamples: int = 50) :
  """Plot ground-truth and predicted values side by side as grouped bars.

  Args:
    yTest: Ground-truth target values (array-like, flattened to 1-D).
    yTestHat: Predicted target values (array-like, flattened to 1-D).
    nSamples: Maximum number of leading samples to draw. Defaults to 50, the
      value that used to be hard-coded. It is clamped to the number of
      available samples so shorter inputs no longer cause a shape mismatch.
  """
  yTest = np.asarray(yTest).reshape(-1)
  yTestHat = np.asarray(yTestHat).reshape(-1)

  # Never ask for more bars than either array can provide.
  nShown = max(0, min(nSamples, yTest.shape[0], yTestHat.shape[0]))
  idx = np.arange(nShown)
  width = 0.2

  fig, ax = plt.subplots(figsize = (10,4))
  # Labels are attached to the bars themselves so the legend cannot get out of
  # sync with the plotting order.
  ax.bar(idx, yTest[:nShown], width = width, label = "Ground Truth")
  ax.bar(idx+width, yTestHat[:nShown], width = width, label = "Prediction")
  ax.set_xticks(idx)
  ax.legend()
  ax.set_xlabel("# samples")
  ax.set_ylabel("Value")

  fig.tight_layout()
  plt.show()

In [None]:
def RMSE(yHat: np.ndarray, y: np.ndarray) -> float :
  """Root-mean-square error between predictions and ground truth.

  Both arguments are converted with ``np.asarray``, so plain Python sequences
  are accepted as well as NumPy arrays.

  Args:
    yHat: Predicted values.
    y: Ground-truth values; must be broadcast-compatible with ``yHat``.

  Returns:
    The square root of the mean squared difference ``y - yHat``.
  """
  residual = np.asarray(y) - np.asarray(yHat)
  output = np.sqrt(np.mean(np.power(residual, 2)))

  return output

In [None]:
# Map ground truth and predictions from the min-max scaled range back to the
# original target units before plotting and scoring.
# NOTE: yTest is overwritten in place, so this cell must run exactly once per
# train/test split (a second run would apply the inverse transform twice). The
# later regression sections reuse this already-restored yTest.
yTest = mmScalerY.inverse_transform(yTest.reshape(-1, 1)).reshape(-1)
yTestHat = mmScalerY.inverse_transform(yTestHat.reshape(-1, 1)).reshape(-1)

In [None]:
plotBarChart(yTest, yTestHat)

In [None]:
RMSE(yTestHat, yTest)

## Decision Tree Regression

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Fit a decision-tree regressor (seeded for reproducibility) on the scaled
# training data and predict the scaled test targets.
dtReg = DecisionTreeRegressor(random_state = 42).fit(xTrain, yTrain)
yTestHat = dtReg.predict(xTest)

### Feature Importance Visualization

In [None]:
import seaborn as sns

In [None]:
def visualizeImportances(data : pd.Series) :
  """Draw a horizontal bar chart of feature importances.

  Args:
    data: Importance scores indexed by feature name. The callers in this
      notebook pass the series sorted in descending order.
  """
  # Explicit figure/axes instead of the pyplot state machine, so the title and
  # axis labels are guaranteed to land on this chart.
  fig, ax = plt.subplots(figsize = (8, 6))
  sns.barplot(x = data, y = data.index, ax = ax)
  ax.set_title("Feature Importances")
  ax.set_xlabel("Importance")
  ax.set_ylabel("Feature")

  # Long feature names would otherwise be clipped at the left edge.
  fig.tight_layout()
  plt.show()

In [None]:
visualizeImportances(pd.Series(dtReg.feature_importances_, index = inputFeatures).sort_values(ascending = False))

### Visualize Decision Tree

In [None]:
from sklearn.tree import plot_tree

In [None]:
# Draw the fitted regression tree. Passing the dataset's column names labels
# each split with the real feature instead of a positional "x[i]" placeholder.
fig, ax = plt.subplots(figsize = (24, 18))
plot_tree(dtReg, feature_names = list(inputFeatures), ax = ax)
plt.show()

### Model Evaluation

In [None]:
# Bring the decision tree's predictions back to the original target units.
# yTest was already restored during the linear-regression evaluation, so only
# yTestHat is converted here. Run once per prediction: the variable is overwritten.
yTestHat = mmScalerY.inverse_transform(yTestHat.reshape(-1, 1)).reshape(-1)

In [None]:
plotBarChart(yTest, yTestHat)

In [None]:
RMSE(yTestHat, yTest)

## Random Forest Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Fit a random-forest regressor (seeded for reproducibility) on the scaled
# training data and predict the scaled test targets.
rfReg = RandomForestRegressor(random_state = 42).fit(xTrain, yTrain)
yTestHat = rfReg.predict(xTest)

### Feature Importance Visualization

In [None]:
visualizeImportances(pd.Series(rfReg.feature_importances_, index = inputFeatures).sort_values(ascending = False))

### Model Evaluation

In [None]:
# Bring the random forest's predictions back to the original target units.
# yTest was already restored during the linear-regression evaluation, so only
# yTestHat is converted here. Run once per prediction: the variable is overwritten.
yTestHat = mmScalerY.inverse_transform(yTestHat.reshape(-1, 1)).reshape(-1)

In [None]:
plotBarChart(yTest, yTestHat)

In [None]:
RMSE(yTestHat, yTest)

# Binary Class Classification

## Get Classification Dataset

In [None]:
data = datasets.load_breast_cancer(as_frame = True)

In [None]:
x, y = data.data, data.target

In [None]:
# Replace the regression feature names with those of the classification data;
# they are needed later as the index of the feature-importance plots.
inputFeatures = x.columns
print(inputFeatures)

## Dataset Analysis

In [None]:
x.describe()

In [None]:
# Histogram of the target labels to inspect the class balance. y is still a
# pandas Series here; it is converted to a NumPy array in the preprocessing step.
y.plot.hist()
plt.show()

## Dataset Preprocessing

### Scale Input Dataset

In [None]:
x, y = np.array(x), np.array(y)

In [None]:
# Rescale every input feature with a min-max scaler (fit and transform in one step).
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split further down, so test-set minima/maxima influence the scaling.
mmScalerX = MinMaxScaler()
x = mmScalerX.fit_transform(x)

### Split Dataset

In [None]:
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size = 0.3, random_state = 42)

In [None]:
# Report the (inputs, targets) shapes of the training split, then the test split.
for features, targets in ((xTrain, yTrain), (xTest, yTest)) :
  print(features.shape, targets.shape)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Fit logistic regression with a generous iteration cap and predict the
# test-set class labels.
lgReg = LogisticRegression(max_iter = 10000, random_state = 42).fit(xTrain, yTrain)
yTestHat = lgReg.predict(xTest)

### Model Evaluation

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Share of test samples whose predicted label equals the true label.
accScore = accuracy_score(y_true = yTest, y_pred = yTestHat)
print(accScore)

In [None]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_true = yTest, y_pred = yTestHat)
print(cm)

In [None]:
# Text summary of per-class precision, recall, F1 and support.
clsRp = classification_report(y_true = yTest, y_pred = yTestHat)
print(clsRp)

## Decision Tree Classification

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Fit a decision-tree classifier (seeded for reproducibility) and predict the
# test-set class labels.
dtCls = DecisionTreeClassifier(random_state = 42).fit(xTrain, yTrain)
yTestHat = dtCls.predict(xTest)

### Feature Importance Visualization

In [None]:
visualizeImportances(pd.Series(dtCls.feature_importances_, index = inputFeatures).sort_values(ascending = False))

### Visualize Decision Tree

In [None]:
# Draw the fitted classification tree. Passing the dataset's column names labels
# each split with the real feature instead of a positional "x[i]" placeholder.
fig, ax = plt.subplots(figsize = (24, 18))
plot_tree(dtCls, feature_names = list(inputFeatures), ax = ax)
plt.show()

### Model Evaluation

In [None]:
# Share of test samples the decision tree classified correctly.
accScore = accuracy_score(y_true = yTest, y_pred = yTestHat)
print(accScore)

In [None]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_true = yTest, y_pred = yTestHat)
print(cm)

In [None]:
# Text summary of per-class precision, recall, F1 and support.
clsRp = classification_report(y_true = yTest, y_pred = yTestHat)
print(clsRp)

## Random Forest Classification

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Fit a random-forest classifier (seeded for reproducibility) and predict the
# test-set class labels.
rfCls = RandomForestClassifier(random_state = 42).fit(xTrain, yTrain)
yTestHat = rfCls.predict(xTest)

### Feature Importance Visualization

In [None]:
visualizeImportances(pd.Series(rfCls.feature_importances_, index = inputFeatures).sort_values(ascending = False))

### Model Evaluation

In [None]:
# Share of test samples the random forest classified correctly.
accScore = accuracy_score(y_true = yTest, y_pred = yTestHat)
print(accScore)

In [None]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_true = yTest, y_pred = yTestHat)
print(cm)

In [None]:
# Text summary of per-class precision, recall, F1 and support.
clsRp = classification_report(y_true = yTest, y_pred = yTestHat)
print(clsRp)

# Multi Class Classification

## Load Dataset

In [None]:
# The string below is a placeholder (Korean for "enter the absolute path of the
# CSV file"); replace it with the real dataset path before running this cell.
# The preprocessing cells that follow expect the file to have a "Class" column.
df = pd.read_csv("#csv 파일의 절대경로 입력")

In [None]:
df.shape

In [None]:
df

## Data Preprocessing

In [None]:
# Separate the label column from the input features; `columns=` already
# identifies the axis, so no separate axis argument is needed.
y = df["Class"]
x = df.drop(columns = ["Class"])

In [None]:
inputFeatures = x.columns

### Scale Input Dataset

In [None]:
# Rescale every input feature with a min-max scaler; the result is a NumPy
# array, which is why the column names were saved beforehand.
# NOTE(review): fitted on the full dataset before the train/test split below.
mmScaler = MinMaxScaler()
x = mmScaler.fit_transform(x)

### Label Encode Target Dataset

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
y

In [None]:
# Replace the class labels with integer codes; lbEnc keeps the mapping so the
# codes can be translated back to the original labels if needed.
lbEnc = LabelEncoder()
y = lbEnc.fit_transform(y)

In [None]:
y

### Split Dataset

In [None]:
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size = 0.3, random_state = 42)

In [None]:
# Report the (inputs, targets) shapes of the training split, then the test split.
for features, targets in ((xTrain, yTrain), (xTest, yTest)) :
  print(features.shape, targets.shape)

## Random Forest Classification

In [None]:
# Fit a fresh random-forest classifier on the multi-class data (this rebinds
# rfCls from the binary section) and predict the test-set class codes.
rfCls = RandomForestClassifier(random_state = 42).fit(xTrain, yTrain)
yTestHat = rfCls.predict(xTest)

### Feature Importance Visualization

In [None]:
visualizeImportances(pd.Series(rfCls.feature_importances_, index = inputFeatures).sort_values(ascending = False))

### Model Evaluation

In [None]:
# Share of test samples the random forest classified correctly.
accScore = accuracy_score(y_true = yTest, y_pred = yTestHat)
print(accScore)

In [None]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_true = yTest, y_pred = yTestHat)
print(cm)

In [None]:
# Text summary of per-class precision, recall, F1 and support.
clsRp = classification_report(y_true = yTest, y_pred = yTestHat)
print(clsRp)

# Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# 5-fold cross-validation of the logistic-regression estimator; accScore holds
# one score per fold.
# NOTE(review): x and y are whatever was assigned last, i.e. the multi-class CSV
# data, not the breast-cancer data lgReg was originally fitted on. Confirm that
# this is the intended dataset. x was also scaled on the full data beforehand.
accScore = cross_val_score(lgReg, x, y, cv = 5)

In [None]:
# Print each fold's score, numbering the folds from 1.
for i, subScore in enumerate(accScore) :
  print(f"Part{i+1} Accuracy : {subScore:.4f}")

In [None]:
print(f"Average Score : {np.mean(accScore):.4f}")

# 실습

## Scikit-Learn의 Digits Dataset을 기반으로 Multi Class Classification을 진행하세요.
### 1) Dataset을 불러온 후 Min-Max Scaling을 진행하세요.
### 2) 분류 모델로는 Decision Tree와 Random Forest를 사용한 후 성능을 비교하세요.
### 3) Random Forest 모델을 기반으로 4-Fold Cross Validation을 진행하세요.

In [None]:
data = datasets.load_digits(as_frame = True)