# [선형회귀모델 실습: 하이닉스 데이터]

## 1. 모듈 불러오기

#### import 불러올 패키지명 as 그 패키지를 파이썬에서 사용할 이름

In [None]:
# 데이터 전처리 패키지
import pandas as pd
import numpy as np

# 선형회귀 모델 구축 및 평가 패키지
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# 데이터 시각화 패키지
import matplotlib.pyplot as plt

## 2. 데이터 불러오기: 하이닉스 FDC 데이터

#### X 데이터(입력변수) 불러오기

+ 관측치 개수: 508,206개
+ 변수 개수: 24개
    + TIME: 관측 시간
    + EQP 2: 장비
    + MODULE 2: 모듈
    + LOT_ID 2: LOT ID
    + TIME_PROCESS: LOT별 관측 번호
    + Para 01~19: 관측값

In [None]:
# Read the X (input) table from CSV and list its column names.
x_path = 'C:/Users/Baek/Downloads/FD2_Analysis.L_XDATA.csv'
x_data = pd.read_csv(x_path)
print(x_data.columns)

In [None]:
# Show the (rows, columns) shape of the X data.
x_data.shape

In [None]:
# Display the X data.
x_data

#### Y data(출력변수) 불러오기

+ 관측치 개수: 35,178개
+ 변수 개수: 6개
    + LOT_ID 2: LOT ID
    + WF_ID: Wafer ID
    + Site: Wafer 내 관측 위치
    + X axis: Wafer 내 관측 X 좌표
    + Y axis: Wafer 내 관측 Y 좌표
    + Response: 반응률(예측하고자 하는 반응 변수)

In [None]:
# Read the Y (output) table from CSV and list its column names.
y_path = 'C:/Users/Baek/Downloads/FD2_Analysis.L_YDATA.csv'
y_data = pd.read_csv(y_path)
print(y_data.columns)

In [None]:
# Show the (rows, columns) shape of the Y data.
y_data.shape

In [None]:
# Display the Y data.
y_data

## 3. 데이터 전처리

#### 데이터 내에 결측치 파악

In [None]:
# Number of missing values in each column of the X data.
x_data.isna().sum()

In [None]:
# Number of missing values in each column of the Y data.
y_data.isna().sum()

#### 결측치가 존재하는 관측치 제거

In [None]:
# Remove every Y row that contains at least one missing value.
y_data = y_data.dropna(axis=0, how='any')

In [None]:
# Show the shape of the Y data after the rows with missing values were dropped.
y_data.shape

#### 범주형 변수, 수치형 변수 분리

+ X 데이터(입력 변수): 24개
+ Key 변수: 1개
    + LOT_ID 2
+ 범주형 변수: 2개
    + EQP 2, MODULE 2
+ 수치형 변수: 19개
    + Para01~19
+ 사용하지 않는 변수: 2개
    + TIME, TIME_PROCESS

In [None]:
# Keep the two categorical columns together with the LOT_ID key column.
categorical_and_key_cols = ['EQP 2', 'MODULE 2', 'LOT_ID 2']
data_categorical = x_data[categorical_and_key_cols]

In [None]:
# Collapse to the unique rows and renumber the index from 0.
data_categorical = data_categorical.drop_duplicates().reset_index(drop=True)

In [None]:
# Set the key column (LOT_ID) aside so it can be re-attached after encoding.
lot_id = data_categorical['LOT_ID 2']

In [None]:
# Leave only the categorical columns by removing the key column.
data_categorical = data_categorical.drop(columns=['LOT_ID 2'])

In [None]:
# One-hot encode the categorical columns; drop_first=True drops the first
# level of each variable.
data_categorical = pd.get_dummies(data_categorical, drop_first=True)

In [None]:
# Put the key column back in front of the encoded categorical columns.
data_categorical = pd.concat([lot_id, data_categorical], axis='columns')

In [None]:
# Display the key column plus encoded categorical columns.
data_categorical

In [None]:
# Numeric part of X: drop the time columns and the categorical columns,
# which leaves every remaining column (including the LOT_ID key).
dropped_cols = ['TIME', 'EQP 2', 'MODULE 2', 'TIME_PROCESS']
x_data_num = x_data.drop(columns=dropped_cols)

In [None]:
# Display the numeric X data.
x_data_num

#### LOT_ID별 mean, max, min, variance, median, sum 값 요약

In [None]:
# Per-LOT mean of every column; 'Para' in column names becomes 'mean'.
mean_data = (
    x_data_num
    .groupby(x_data_num['LOT_ID 2'])
    .mean()
    .rename(columns=lambda col: col.replace('Para', 'mean'))
)

In [None]:
# Per-LOT maximum of every column; 'Para' in column names becomes 'max'.
max_data = (
    x_data_num
    .groupby(x_data_num['LOT_ID 2'])
    .max()
    .rename(columns=lambda col: col.replace('Para', 'max'))
)

In [None]:
# Per-LOT minimum of every column; 'Para' in column names becomes 'min'.
min_data = (
    x_data_num
    .groupby(x_data_num['LOT_ID 2'])
    .min()
    .rename(columns=lambda col: col.replace('Para', 'min'))
)

In [None]:
# Per-LOT variance of every column; 'Para' in column names becomes 'var'.
var_data = (
    x_data_num
    .groupby(x_data_num['LOT_ID 2'])
    .var()
    .rename(columns=lambda col: col.replace('Para', 'var'))
)

In [None]:
# Per-LOT median of every column; 'Para' in column names becomes 'median'.
median_data = (
    x_data_num
    .groupby(x_data_num['LOT_ID 2'])
    .median()
    .rename(columns=lambda col: col.replace('Para', 'median'))
)

In [None]:
# Per-LOT sum of every column; 'Para' in column names becomes 'sum'.
sum_data = (
    x_data_num
    .groupby(x_data_num['LOT_ID 2'])
    .sum()
    .rename(columns=lambda col: col.replace('Para', 'sum'))
)

#### 분석 데이터 Set 구축
+ LOT_ID 기준으로 X, Y 데이터 병합
+ 최종 데이터
    + 관측치 개수: 35,164개
    + 입력변수: 119개
    + 출력변수: 1개(Response)

In [None]:
# Attach each per-LOT summary table to the Y data, one merge on LOT_ID at a
# time, in the order mean, max, min, var, median, sum.
full_data = y_data
for summary in (mean_data, max_data, min_data, var_data, median_data, sum_data):
    full_data = full_data.merge(summary, on='LOT_ID 2')

In [None]:
# Merge the encoded categorical columns into the data set on the LOT_ID key.
full_data = full_data.merge(data_categorical, on='LOT_ID 2')

In [None]:
# List the column names of the merged data set.
full_data.columns

In [None]:
# Display the merged data set.
full_data

In [None]:
# Build the modelling inputs and target: every column except the identifiers,
# wafer-position columns and the target is a feature; Response is the target.
non_feature_cols = ['LOT_ID 2', 'WF_ID', 'Site', 'X axis', 'Y axis', 'Response']
data_y = full_data['Response']
data_x = full_data.drop(columns=non_feature_cols)

In [None]:
# Display the feature table.
data_x

In [None]:
# Display the target column.
data_y

## 4. 전체 데이터에 대한 모델링

#### 모델 구축을 위한 Train, Test 데이터 분리

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    data_x,
    data_y,
    test_size=0.3,
    random_state=2020,
)

#### 평균, 표준편차를 활용한 Column별 표준화(Standardization) 진행

In [None]:
# Standardize every feature column.
# The scaler is fitted on the training split only, and those same training
# statistics are then applied to the test split. Calling fit_transform on the
# test split would refit the scaler and scale the test features with their
# own mean/std, so they would no longer match the scale the model is trained on.
sc = StandardScaler()
train_x_sc = sc.fit_transform(train_x)
test_x_sc = sc.transform(test_x)

### 전체 data에 대한 Linear 모델 구축
#### statsmodels 의 OLS를 사용하여 선형회귀분석 시행(OLS: Ordinary Least Squares)
+ 모델 선언: model = sm.OLS(Y데이터, X데이터)
+ 모델 학습: model_trained = model.fit()

In [None]:
# Declare the OLS model: target first (endog), scaled features second (exog).
# NOTE(review): sm.OLS does not add an intercept column and none is added to
# train_x_sc in the visible cells — confirm a through-the-origin fit is intended.
model = sm.OLS(endog=train_y, exog=train_x_sc)

In [None]:
# Estimate the regression coefficients.
model_trained = model.fit()

## 5. 전체 데이터에 대한 모델 결과 해석

+ R-squared (결정계수, coefficient of determination): 모형의 성능
+ coef (회귀계수): X가 한 단위 증가할 때 Y의 변화량
+ P>|t| (p-value): 0.1(or 0.05) 이하일 때 변수가 유의미

In [None]:
# Display the fitted model's summary table.
model_trained.summary()

## 6. 실제값 vs 모델 예측값 비교

#### Linear Model 예측값 출력

In [None]:
# Predictions of the fitted model on the scaled training and test features.
y_pred_tr = model_trained.predict(train_x_sc)
y_pred_ts = model_trained.predict(test_x_sc)

#### [TEST] 실제값과 모델 예측값에 대한 산점도

In [None]:
# Scatter plot of true vs. predicted values on the test split, drawn on a
# square figure with both axes fixed to the same range.
plt.figure(figsize=(8,8))
plt.scatter(test_y, y_pred_ts)
plt.title('[Testing]True vs Predictied')
plt.xlabel('true value')
plt.ylabel('prediction')
plt.xlim(-0.75,0.75)
plt.ylim(-0.75,0.75)
plt.show()

### 모델 예측값 평가를 위한 지표들(MSE, MAE, R-squared, MAPE)

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Both inputs are converted to NumPy arrays. Each error is divided by the
    corresponding true value, so a true value of 0 leads to a division by
    zero.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_error = (actual - predicted) / actual
    return np.abs(relative_error).mean() * 100

In [None]:
# Mean squared error on the training and test splits.
train_mse = mean_squared_error(train_y, y_pred_tr)
print('Training MSE in Linear: {:.3f}'.format(train_mse))
test_mse = mean_squared_error(test_y, y_pred_ts)
print('Testing MSE in Linear: {:.3f}'.format(test_mse))

In [None]:
# Mean absolute error on the training and test splits.
train_mae = mean_absolute_error(train_y, y_pred_tr)
print('Training MAE in Linear: {:.3f}'.format(train_mae))
test_mae = mean_absolute_error(test_y, y_pred_ts)
print('Testing MAE in Linear: {:.3f}'.format(test_mae))

In [None]:
# R-squared on the training and test splits.
train_r2 = r2_score(train_y, y_pred_tr)
print('Training R2 in Linear: {:.3f}'.format(train_r2))
test_r2 = r2_score(test_y, y_pred_ts)
print('Testing R2 in Linear: {:.3f}'.format(test_r2))

#### 실제값에 '0'이 존재하는 경우에는 MAPE를 계산할 수 없음

In [None]:
# Mean absolute percentage error on the training and test splits.
train_mape = mean_absolute_percentage_error(train_y, y_pred_tr)
print('Training MAPE in Linear: {:.3f}'.format(train_mape))
test_mape = mean_absolute_percentage_error(test_y, y_pred_ts)
print('Testing MAPE in Linear: {:.3f}'.format(test_mape))

## 7. Site(측정 위치 동일)별 선형 회귀 모델(Linear Regression) 구축

### Site=1인 데이터로 분석

#### 모델 구축을 위한 데이터 전처리

In [None]:
# Keep only the rows measured at Site 1.
site_1_mask = full_data['Site'] == 1
data_1 = full_data[site_1_mask]

In [None]:
# Drop the identifier and wafer-position columns; features and Response remain.
data_1 = data_1.drop(columns=['LOT_ID 2', 'WF_ID', 'Site', 'X axis', 'Y axis'])

In [None]:
# Split the Site 1 table into the target (Response) and the feature columns.
data_1_Y = data_1['Response']
data_1_X = data_1.drop(columns=['Response'])

#### 선형 회귀 모델 구축

In [None]:
# Hold out 30% of the Site 1 rows for testing; the fixed seed keeps the split reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    data_1_X,
    data_1_Y,
    test_size=0.3,
    random_state=2020,
)

In [None]:
# Standardize every feature column.
# The scaler is fitted on the training split only, and those same training
# statistics are then applied to the test split. Calling fit_transform on the
# test split would refit the scaler and scale the test features with their
# own mean/std, so they would no longer match the scale the model is trained on.
sc = StandardScaler()
train_x_sc = sc.fit_transform(train_x)
test_x_sc = sc.transform(test_x)

In [None]:
# Declare the OLS model (target first, scaled features second) and fit it.
# NOTE(review): sm.OLS does not add an intercept column and none is added to
# train_x_sc in the visible cells — confirm a through-the-origin fit is intended.
model = sm.OLS(endog=train_y, exog=train_x_sc)
model_trained = model.fit()

In [None]:
# Predictions of the fitted model on the scaled training and test features.
y_pred_tr = model_trained.predict(train_x_sc)
y_pred_ts = model_trained.predict(test_x_sc)

#### 모델에 대한 성능 평가

In [None]:
# Mean squared error on both splits.
train_mse = mean_squared_error(train_y, y_pred_tr)
print('Training MSE in Linear: {:.3f}'.format(train_mse))
test_mse = mean_squared_error(test_y, y_pred_ts)
print('Testing MSE in Linear: {:.3f}'.format(test_mse))

# Mean absolute error on both splits.
train_mae = mean_absolute_error(train_y, y_pred_tr)
print('Training MAE in Linear: {:.3f}'.format(train_mae))
test_mae = mean_absolute_error(test_y, y_pred_ts)
print('Testing MAE in Linear: {:.3f}'.format(test_mae))

# R-squared on both splits.
train_r2 = r2_score(train_y, y_pred_tr)
print('Training R2 in Linear: {:.3f}'.format(train_r2))
test_r2 = r2_score(test_y, y_pred_ts)
print('Testing R2 in Linear: {:.3f}'.format(test_r2))

In [None]:
# Scatter plot of true vs. predicted values on the test split, drawn on a
# square figure with both axes fixed to the same range.
plt.figure(figsize=(8,8))
plt.title('[Testing]True vs Predictied')
plt.xlabel('true value')
plt.ylabel('prediction')
plt.xlim(-0.75,0.75)
plt.ylim(-0.75,0.75)
plt.scatter(test_y, y_pred_ts)

## Site=2일 때 결과 도출

In [None]:
# Site 2: build and evaluate a linear regression on the rows measured at this site.
data_2 = full_data[full_data['Site'] == 2]
# Drop the identifier and wafer-position columns; features and Response remain.
data_2 = data_2.drop(['LOT_ID 2', 'WF_ID', 'Site', 'X axis', 'Y axis'], axis=1)

data_2_X = data_2.drop(['Response'], axis=1)
data_2_Y = data_2['Response']

# 70/30 train/test split with a fixed seed so the split is reproducible.
train_x, test_x, train_y, test_y = train_test_split(data_2_X, data_2_Y, test_size=0.3, random_state=2020)

# Fit the scaler on the training split only, then apply those same training
# statistics to the test split. Refitting on the test split (fit_transform)
# would scale it with its own mean/std, which would not match the scale the
# model was trained on.
sc = StandardScaler()
train_x_sc = sc.fit_transform(train_x)
test_x_sc = sc.transform(test_x)

# NOTE(review): sm.OLS adds no intercept and none is added here — confirm a
# through-the-origin fit is intended.
model = sm.OLS(train_y, train_x_sc)
model_trained = model.fit()

y_pred_tr = model_trained.predict(train_x_sc)
y_pred_ts = model_trained.predict(test_x_sc)

# Error metrics on both splits.
print('Training MSE in Linear: {:.3f}' .format(mean_squared_error(train_y, y_pred_tr)))
print('Testing MSE in Linear: {:.3f}' .format(mean_squared_error(test_y, y_pred_ts)))

print('Training MAE in Linear: {:.3f}' .format(mean_absolute_error(train_y, y_pred_tr)))
print('Testing MAE in Linear: {:.3f}' .format(mean_absolute_error(test_y, y_pred_ts)))

print('Training R2 in Linear: {:.3f}' .format(r2_score(train_y, y_pred_tr)))
print('Testing R2 in Linear: {:.3f}' .format(r2_score(test_y, y_pred_ts)))

# True vs. predicted scatter for the test split.
plt.figure(figsize=(8,8))
plt.title('[Testing]True vs Predictied')
plt.xlim(-0.75,0.75)
plt.ylim(-0.75,0.75)
plt.xlabel('true value')
plt.ylabel('prediction')
plt.scatter(test_y, y_pred_ts)

## Site=3일 때 결과 도출

In [None]:
# Site 3: build and evaluate a linear regression on the rows measured at this site.
data_3 = full_data[full_data['Site'] == 3]
# Drop the identifier and wafer-position columns; features and Response remain.
data_3 = data_3.drop(['LOT_ID 2', 'WF_ID', 'Site', 'X axis', 'Y axis'], axis=1)

data_3_X = data_3.drop(['Response'], axis=1)
data_3_Y = data_3['Response']

# 70/30 train/test split with a fixed seed so the split is reproducible.
train_x, test_x, train_y, test_y = train_test_split(data_3_X, data_3_Y, test_size=0.3, random_state=2020)

# Fit the scaler on the training split only, then apply those same training
# statistics to the test split. Refitting on the test split (fit_transform)
# would scale it with its own mean/std, which would not match the scale the
# model was trained on.
sc = StandardScaler()
train_x_sc = sc.fit_transform(train_x)
test_x_sc = sc.transform(test_x)

# NOTE(review): sm.OLS adds no intercept and none is added here — confirm a
# through-the-origin fit is intended.
model = sm.OLS(train_y, train_x_sc)
model_trained = model.fit()

y_pred_tr = model_trained.predict(train_x_sc)
y_pred_ts = model_trained.predict(test_x_sc)

# Error metrics on both splits.
print('Training MSE in Linear: {:.3f}' .format(mean_squared_error(train_y, y_pred_tr)))
print('Testing MSE in Linear: {:.3f}' .format(mean_squared_error(test_y, y_pred_ts)))

print('Training MAE in Linear: {:.3f}' .format(mean_absolute_error(train_y, y_pred_tr)))
print('Testing MAE in Linear: {:.3f}' .format(mean_absolute_error(test_y, y_pred_ts)))

print('Training R2 in Linear: {:.3f}' .format(r2_score(train_y, y_pred_tr)))
print('Testing R2 in Linear: {:.3f}' .format(r2_score(test_y, y_pred_ts)))

# True vs. predicted scatter for the test split.
plt.figure(figsize=(8,8))
plt.title('[Testing]True vs Predictied')
plt.xlim(-0.75,0.75)
plt.ylim(-0.75,0.75)
plt.xlabel('true value')
plt.ylabel('prediction')
plt.scatter(test_y, y_pred_ts)

## Site=4일 때 결과 도출

In [None]:
# Site 4: build and evaluate a linear regression on the rows measured at this site.
data_4 = full_data[full_data['Site'] == 4]
# Drop the identifier and wafer-position columns; features and Response remain.
data_4 = data_4.drop(['LOT_ID 2', 'WF_ID', 'Site', 'X axis', 'Y axis'], axis=1)

data_4_X = data_4.drop(['Response'], axis=1)
data_4_Y = data_4['Response']

# 70/30 train/test split with a fixed seed so the split is reproducible.
train_x, test_x, train_y, test_y = train_test_split(data_4_X, data_4_Y, test_size=0.3, random_state=2020)

# Fit the scaler on the training split only, then apply those same training
# statistics to the test split. Refitting on the test split (fit_transform)
# would scale it with its own mean/std, which would not match the scale the
# model was trained on.
sc = StandardScaler()
train_x_sc = sc.fit_transform(train_x)
test_x_sc = sc.transform(test_x)

# NOTE(review): sm.OLS adds no intercept and none is added here — confirm a
# through-the-origin fit is intended.
model = sm.OLS(train_y, train_x_sc)
model_trained = model.fit()

y_pred_tr = model_trained.predict(train_x_sc)
y_pred_ts = model_trained.predict(test_x_sc)

# Error metrics on both splits.
print('Training MSE in Linear: {:.3f}' .format(mean_squared_error(train_y, y_pred_tr)))
print('Testing MSE in Linear: {:.3f}' .format(mean_squared_error(test_y, y_pred_ts)))

print('Training MAE in Linear: {:.3f}' .format(mean_absolute_error(train_y, y_pred_tr)))
print('Testing MAE in Linear: {:.3f}' .format(mean_absolute_error(test_y, y_pred_ts)))

print('Training R2 in Linear: {:.3f}' .format(r2_score(train_y, y_pred_tr)))
print('Testing R2 in Linear: {:.3f}' .format(r2_score(test_y, y_pred_ts)))

# True vs. predicted scatter for the test split.
plt.figure(figsize=(8,8))
plt.title('[Testing]True vs Predictied')
plt.xlim(-0.75,0.75)
plt.ylim(-0.75,0.75)
plt.xlabel('true value')
plt.ylabel('prediction')
plt.scatter(test_y, y_pred_ts)

## Site=5일 때 결과 도출

In [None]:
# Site 5: build and evaluate a linear regression on the rows measured at this site.
data_5 = full_data[full_data['Site'] == 5]
# Drop the identifier and wafer-position columns; features and Response remain.
data_5 = data_5.drop(['LOT_ID 2', 'WF_ID', 'Site', 'X axis', 'Y axis'], axis=1)

data_5_X = data_5.drop(['Response'], axis=1)
data_5_Y = data_5['Response']

# 70/30 train/test split with a fixed seed so the split is reproducible.
train_x, test_x, train_y, test_y = train_test_split(data_5_X, data_5_Y, test_size=0.3, random_state=2020)

# Fit the scaler on the training split only, then apply those same training
# statistics to the test split. Refitting on the test split (fit_transform)
# would scale it with its own mean/std, which would not match the scale the
# model was trained on.
sc = StandardScaler()
train_x_sc = sc.fit_transform(train_x)
test_x_sc = sc.transform(test_x)

# NOTE(review): sm.OLS adds no intercept and none is added here — confirm a
# through-the-origin fit is intended.
model = sm.OLS(train_y, train_x_sc)
model_trained = model.fit()

y_pred_tr = model_trained.predict(train_x_sc)
y_pred_ts = model_trained.predict(test_x_sc)

# Error metrics on both splits.
print('Training MSE in Linear: {:.3f}' .format(mean_squared_error(train_y, y_pred_tr)))
print('Testing MSE in Linear: {:.3f}' .format(mean_squared_error(test_y, y_pred_ts)))

print('Training MAE in Linear: {:.3f}' .format(mean_absolute_error(train_y, y_pred_tr)))
print('Testing MAE in Linear: {:.3f}' .format(mean_absolute_error(test_y, y_pred_ts)))

print('Training R2 in Linear: {:.3f}' .format(r2_score(train_y, y_pred_tr)))
print('Testing R2 in Linear: {:.3f}' .format(r2_score(test_y, y_pred_ts)))

# True vs. predicted scatter for the test split.
plt.figure(figsize=(8,8))
plt.title('[Testing]True vs Predictied')
plt.xlim(-0.75,0.75)
plt.ylim(-0.75,0.75)
plt.xlabel('true value')
plt.ylabel('prediction')
plt.scatter(test_y, y_pred_ts)

## Site=6일 때 결과 도출

In [None]:
# Site 6: build and evaluate a linear regression on the rows measured at this site.
data_6 = full_data[full_data['Site'] == 6]
# Drop the identifier and wafer-position columns; features and Response remain.
data_6 = data_6.drop(['LOT_ID 2', 'WF_ID', 'Site', 'X axis', 'Y axis'], axis=1)

data_6_X = data_6.drop(['Response'], axis=1)
data_6_Y = data_6['Response']

# 70/30 train/test split with a fixed seed so the split is reproducible.
train_x, test_x, train_y, test_y = train_test_split(data_6_X, data_6_Y, test_size=0.3, random_state=2020)

# Fit the scaler on the training split only, then apply those same training
# statistics to the test split. Refitting on the test split (fit_transform)
# would scale it with its own mean/std, which would not match the scale the
# model was trained on.
sc = StandardScaler()
train_x_sc = sc.fit_transform(train_x)
test_x_sc = sc.transform(test_x)

# NOTE(review): sm.OLS adds no intercept and none is added here — confirm a
# through-the-origin fit is intended.
model = sm.OLS(train_y, train_x_sc)
model_trained = model.fit()

y_pred_tr = model_trained.predict(train_x_sc)
y_pred_ts = model_trained.predict(test_x_sc)

# Error metrics on both splits.
print('Training MSE in Linear: {:.3f}' .format(mean_squared_error(train_y, y_pred_tr)))
print('Testing MSE in Linear: {:.3f}' .format(mean_squared_error(test_y, y_pred_ts)))

print('Training MAE in Linear: {:.3f}' .format(mean_absolute_error(train_y, y_pred_tr)))
print('Testing MAE in Linear: {:.3f}' .format(mean_absolute_error(test_y, y_pred_ts)))

print('Training R2 in Linear: {:.3f}' .format(r2_score(train_y, y_pred_tr)))
print('Testing R2 in Linear: {:.3f}' .format(r2_score(test_y, y_pred_ts)))

# True vs. predicted scatter for the test split.
plt.figure(figsize=(8,8))
plt.title('[Testing]True vs Predictied')
plt.xlim(-0.75,0.75)
plt.ylim(-0.75,0.75)
plt.xlabel('true value')
plt.ylabel('prediction')
plt.scatter(test_y, y_pred_ts)

## Site=7일 때 결과 도출

In [None]:
# Site 7: build and evaluate a linear regression on the rows measured at this site.
data_7 = full_data[full_data['Site'] == 7]
# Drop the identifier and wafer-position columns; features and Response remain.
data_7 = data_7.drop(['LOT_ID 2', 'WF_ID', 'Site', 'X axis', 'Y axis'], axis=1)

data_7_X = data_7.drop(['Response'], axis=1)
data_7_Y = data_7['Response']

# 70/30 train/test split with a fixed seed so the split is reproducible.
train_x, test_x, train_y, test_y = train_test_split(data_7_X, data_7_Y, test_size=0.3, random_state=2020)

# Fit the scaler on the training split only, then apply those same training
# statistics to the test split. Refitting on the test split (fit_transform)
# would scale it with its own mean/std, which would not match the scale the
# model was trained on.
sc = StandardScaler()
train_x_sc = sc.fit_transform(train_x)
test_x_sc = sc.transform(test_x)

# NOTE(review): sm.OLS adds no intercept and none is added here — confirm a
# through-the-origin fit is intended.
model = sm.OLS(train_y, train_x_sc)
model_trained = model.fit()

y_pred_tr = model_trained.predict(train_x_sc)
y_pred_ts = model_trained.predict(test_x_sc)

# Error metrics on both splits.
print('Training MSE in Linear: {:.3f}' .format(mean_squared_error(train_y, y_pred_tr)))
print('Testing MSE in Linear: {:.3f}' .format(mean_squared_error(test_y, y_pred_ts)))

print('Training MAE in Linear: {:.3f}' .format(mean_absolute_error(train_y, y_pred_tr)))
print('Testing MAE in Linear: {:.3f}' .format(mean_absolute_error(test_y, y_pred_ts)))

print('Training R2 in Linear: {:.3f}' .format(r2_score(train_y, y_pred_tr)))
print('Testing R2 in Linear: {:.3f}' .format(r2_score(test_y, y_pred_ts)))

# True vs. predicted scatter for the test split.
plt.figure(figsize=(8,8))
plt.title('[Testing]True vs Predictied')
plt.xlim(-0.75,0.75)
plt.ylim(-0.75,0.75)
plt.xlabel('true value')
plt.ylabel('prediction')
plt.scatter(test_y, y_pred_ts)

## Site=8일 때 결과 도출

In [None]:
# Site 8: build and evaluate a linear regression on the rows measured at this site.
data_8 = full_data[full_data['Site'] == 8]
# Drop the identifier and wafer-position columns; features and Response remain.
data_8 = data_8.drop(['LOT_ID 2', 'WF_ID', 'Site', 'X axis', 'Y axis'], axis=1)

data_8_X = data_8.drop(['Response'], axis=1)
data_8_Y = data_8['Response']

# 70/30 train/test split with a fixed seed so the split is reproducible.
train_x, test_x, train_y, test_y = train_test_split(data_8_X, data_8_Y, test_size=0.3, random_state=2020)

# Fit the scaler on the training split only, then apply those same training
# statistics to the test split. Refitting on the test split (fit_transform)
# would scale it with its own mean/std, which would not match the scale the
# model was trained on.
sc = StandardScaler()
train_x_sc = sc.fit_transform(train_x)
test_x_sc = sc.transform(test_x)

# NOTE(review): sm.OLS adds no intercept and none is added here — confirm a
# through-the-origin fit is intended.
model = sm.OLS(train_y, train_x_sc)
model_trained = model.fit()

y_pred_tr = model_trained.predict(train_x_sc)
y_pred_ts = model_trained.predict(test_x_sc)

# Error metrics on both splits.
print('Training MSE in Linear: {:.3f}' .format(mean_squared_error(train_y, y_pred_tr)))
print('Testing MSE in Linear: {:.3f}' .format(mean_squared_error(test_y, y_pred_ts)))

print('Training MAE in Linear: {:.3f}' .format(mean_absolute_error(train_y, y_pred_tr)))
print('Testing MAE in Linear: {:.3f}' .format(mean_absolute_error(test_y, y_pred_ts)))

print('Training R2 in Linear: {:.3f}' .format(r2_score(train_y, y_pred_tr)))
print('Testing R2 in Linear: {:.3f}' .format(r2_score(test_y, y_pred_ts)))

# True vs. predicted scatter for the test split.
plt.figure(figsize=(8,8))
plt.title('[Testing]True vs Predictied')
plt.xlim(-0.75,0.75)
plt.ylim(-0.75,0.75)
plt.xlabel('true value')
plt.ylabel('prediction')
plt.scatter(test_y, y_pred_ts)

## Site=9일 때 결과 도출

In [None]:
# Site 9: build and evaluate a linear regression on the rows measured at this site.
data_9 = full_data[full_data['Site'] == 9]
# Drop the identifier and wafer-position columns; features and Response remain.
data_9 = data_9.drop(['LOT_ID 2', 'WF_ID', 'Site', 'X axis', 'Y axis'], axis=1)

data_9_X = data_9.drop(['Response'], axis=1)
data_9_Y = data_9['Response']

# 70/30 train/test split with a fixed seed so the split is reproducible.
train_x, test_x, train_y, test_y = train_test_split(data_9_X, data_9_Y, test_size=0.3, random_state=2020)

# Fit the scaler on the training split only, then apply those same training
# statistics to the test split. Refitting on the test split (fit_transform)
# would scale it with its own mean/std, which would not match the scale the
# model was trained on.
sc = StandardScaler()
train_x_sc = sc.fit_transform(train_x)
test_x_sc = sc.transform(test_x)

# NOTE(review): sm.OLS adds no intercept and none is added here — confirm a
# through-the-origin fit is intended.
model = sm.OLS(train_y, train_x_sc)
model_trained = model.fit()

y_pred_tr = model_trained.predict(train_x_sc)
y_pred_ts = model_trained.predict(test_x_sc)

# Error metrics on both splits.
print('Training MSE in Linear: {:.3f}' .format(mean_squared_error(train_y, y_pred_tr)))
print('Testing MSE in Linear: {:.3f}' .format(mean_squared_error(test_y, y_pred_ts)))

print('Training MAE in Linear: {:.3f}' .format(mean_absolute_error(train_y, y_pred_tr)))
print('Testing MAE in Linear: {:.3f}' .format(mean_absolute_error(test_y, y_pred_ts)))

print('Training R2 in Linear: {:.3f}' .format(r2_score(train_y, y_pred_tr)))
print('Testing R2 in Linear: {:.3f}' .format(r2_score(test_y, y_pred_ts)))

# True vs. predicted scatter for the test split.
plt.figure(figsize=(8,8))
plt.title('[Testing]True vs Predictied')
plt.xlim(-0.75,0.75)
plt.ylim(-0.75,0.75)
plt.xlabel('true value')
plt.ylabel('prediction')
plt.scatter(test_y, y_pred_ts)

## Site=10일 때 결과 도출

In [None]:
# Site 10: build and evaluate a linear regression on the rows measured at this site.
data_10 = full_data[full_data['Site'] == 10]
# Drop the identifier and wafer-position columns; features and Response remain.
data_10 = data_10.drop(['LOT_ID 2', 'WF_ID', 'Site', 'X axis', 'Y axis'], axis=1)

data_10_X = data_10.drop(['Response'], axis=1)
data_10_Y = data_10['Response']

# 70/30 train/test split with a fixed seed so the split is reproducible.
train_x, test_x, train_y, test_y = train_test_split(data_10_X, data_10_Y, test_size=0.3, random_state=2020)

# Fit the scaler on the training split only, then apply those same training
# statistics to the test split. Refitting on the test split (fit_transform)
# would scale it with its own mean/std, which would not match the scale the
# model was trained on.
sc = StandardScaler()
train_x_sc = sc.fit_transform(train_x)
test_x_sc = sc.transform(test_x)

# NOTE(review): sm.OLS adds no intercept and none is added here — confirm a
# through-the-origin fit is intended.
model = sm.OLS(train_y, train_x_sc)
model_trained = model.fit()

y_pred_tr = model_trained.predict(train_x_sc)
y_pred_ts = model_trained.predict(test_x_sc)

# Error metrics on both splits.
print('Training MSE in Linear: {:.3f}' .format(mean_squared_error(train_y, y_pred_tr)))
print('Testing MSE in Linear: {:.3f}' .format(mean_squared_error(test_y, y_pred_ts)))

print('Training MAE in Linear: {:.3f}' .format(mean_absolute_error(train_y, y_pred_tr)))
print('Testing MAE in Linear: {:.3f}' .format(mean_absolute_error(test_y, y_pred_ts)))

print('Training R2 in Linear: {:.3f}' .format(r2_score(train_y, y_pred_tr)))
print('Testing R2 in Linear: {:.3f}' .format(r2_score(test_y, y_pred_ts)))

# True vs. predicted scatter for the test split.
plt.figure(figsize=(8,8))
plt.title('[Testing]True vs Predictied')
plt.xlim(-0.75,0.75)
plt.ylim(-0.75,0.75)
plt.xlabel('true value')
plt.ylabel('prediction')
plt.scatter(test_y, y_pred_ts)

## Site=11일 때 결과 도출

In [None]:
# Site 11: build and evaluate a linear regression on the rows measured at this site.
data_11 = full_data[full_data['Site'] == 11]
# Drop the identifier and wafer-position columns; features and Response remain.
data_11 = data_11.drop(['LOT_ID 2', 'WF_ID', 'Site', 'X axis', 'Y axis'], axis=1)

data_11_X = data_11.drop(['Response'], axis=1)
data_11_Y = data_11['Response']

# 70/30 train/test split with a fixed seed so the split is reproducible.
train_x, test_x, train_y, test_y = train_test_split(data_11_X, data_11_Y, test_size=0.3, random_state=2020)

# Fit the scaler on the training split only, then apply those same training
# statistics to the test split. Refitting on the test split (fit_transform)
# would scale it with its own mean/std, which would not match the scale the
# model was trained on.
sc = StandardScaler()
train_x_sc = sc.fit_transform(train_x)
test_x_sc = sc.transform(test_x)

# NOTE(review): sm.OLS adds no intercept and none is added here — confirm a
# through-the-origin fit is intended.
model = sm.OLS(train_y, train_x_sc)
model_trained = model.fit()

y_pred_tr = model_trained.predict(train_x_sc)
y_pred_ts = model_trained.predict(test_x_sc)

# Error metrics on both splits.
print('Training MSE in Linear: {:.3f}' .format(mean_squared_error(train_y, y_pred_tr)))
print('Testing MSE in Linear: {:.3f}' .format(mean_squared_error(test_y, y_pred_ts)))

print('Training MAE in Linear: {:.3f}' .format(mean_absolute_error(train_y, y_pred_tr)))
print('Testing MAE in Linear: {:.3f}' .format(mean_absolute_error(test_y, y_pred_ts)))

print('Training R2 in Linear: {:.3f}' .format(r2_score(train_y, y_pred_tr)))
print('Testing R2 in Linear: {:.3f}' .format(r2_score(test_y, y_pred_ts)))

# True vs. predicted scatter for the test split.
plt.figure(figsize=(8,8))
plt.title('[Testing]True vs Predictied')
plt.xlim(-0.75,0.75)
plt.ylim(-0.75,0.75)
plt.xlabel('true value')
plt.ylabel('prediction')
plt.scatter(test_y, y_pred_ts)

## Site=12일 때 결과 도출

In [None]:
# Site 12: build and evaluate a linear regression on the rows measured at this site.
data_12 = full_data[full_data['Site'] == 12]
# Drop the identifier and wafer-position columns; features and Response remain.
data_12 = data_12.drop(['LOT_ID 2', 'WF_ID', 'Site', 'X axis', 'Y axis'], axis=1)

data_12_X = data_12.drop(['Response'], axis=1)
data_12_Y = data_12['Response']

# 70/30 train/test split with a fixed seed so the split is reproducible.
train_x, test_x, train_y, test_y = train_test_split(data_12_X, data_12_Y, test_size=0.3, random_state=2020)

# Fit the scaler on the training split only, then apply those same training
# statistics to the test split. Refitting on the test split (fit_transform)
# would scale it with its own mean/std, which would not match the scale the
# model was trained on.
sc = StandardScaler()
train_x_sc = sc.fit_transform(train_x)
test_x_sc = sc.transform(test_x)

# NOTE(review): sm.OLS adds no intercept and none is added here — confirm a
# through-the-origin fit is intended.
model = sm.OLS(train_y, train_x_sc)
model_trained = model.fit()

y_pred_tr = model_trained.predict(train_x_sc)
y_pred_ts = model_trained.predict(test_x_sc)

# Error metrics on both splits.
print('Training MSE in Linear: {:.3f}' .format(mean_squared_error(train_y, y_pred_tr)))
print('Testing MSE in Linear: {:.3f}' .format(mean_squared_error(test_y, y_pred_ts)))

print('Training MAE in Linear: {:.3f}' .format(mean_absolute_error(train_y, y_pred_tr)))
print('Testing MAE in Linear: {:.3f}' .format(mean_absolute_error(test_y, y_pred_ts)))

print('Training R2 in Linear: {:.3f}' .format(r2_score(train_y, y_pred_tr)))
print('Testing R2 in Linear: {:.3f}' .format(r2_score(test_y, y_pred_ts)))

# True vs. predicted scatter for the test split.
plt.figure(figsize=(8,8))
plt.title('[Testing]True vs Predictied')
plt.xlim(-0.75,0.75)
plt.ylim(-0.75,0.75)
plt.xlabel('true value')
plt.ylabel('prediction')
plt.scatter(test_y, y_pred_ts)

## Site=13일 때 결과 도출

In [None]:
# Site 13: build and evaluate a linear regression on the rows measured at this site.
data_13 = full_data[full_data['Site'] == 13]
# Drop the identifier and wafer-position columns; features and Response remain.
data_13 = data_13.drop(['LOT_ID 2', 'WF_ID', 'Site', 'X axis', 'Y axis'], axis=1)

data_13_X = data_13.drop(['Response'], axis=1)
data_13_Y = data_13['Response']

# 70/30 train/test split with a fixed seed so the split is reproducible.
train_x, test_x, train_y, test_y = train_test_split(data_13_X, data_13_Y, test_size=0.3, random_state=2020)

# Fit the scaler on the training split only, then apply those same training
# statistics to the test split. Refitting on the test split (fit_transform)
# would scale it with its own mean/std, which would not match the scale the
# model was trained on.
sc = StandardScaler()
train_x_sc = sc.fit_transform(train_x)
test_x_sc = sc.transform(test_x)

# NOTE(review): sm.OLS adds no intercept and none is added here — confirm a
# through-the-origin fit is intended.
model = sm.OLS(train_y, train_x_sc)
model_trained = model.fit()

y_pred_tr = model_trained.predict(train_x_sc)
y_pred_ts = model_trained.predict(test_x_sc)

# Error metrics on both splits.
print('Training MSE in Linear: {:.3f}' .format(mean_squared_error(train_y, y_pred_tr)))
print('Testing MSE in Linear: {:.3f}' .format(mean_squared_error(test_y, y_pred_ts)))

print('Training MAE in Linear: {:.3f}' .format(mean_absolute_error(train_y, y_pred_tr)))
print('Testing MAE in Linear: {:.3f}' .format(mean_absolute_error(test_y, y_pred_ts)))

print('Training R2 in Linear: {:.3f}' .format(r2_score(train_y, y_pred_tr)))
print('Testing R2 in Linear: {:.3f}' .format(r2_score(test_y, y_pred_ts)))

# True vs. predicted scatter for the test split.
plt.figure(figsize=(8,8))
plt.title('[Testing]True vs Predictied')
plt.xlim(-0.75,0.75)
plt.ylim(-0.75,0.75)
plt.xlabel('true value')
plt.ylabel('prediction')
plt.scatter(test_y, y_pred_ts)