In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Toy example: a 3x4 array whose columns have different ranges, used to show
# what each scaler does to the per-column distribution.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
input_data = np.array([[3, -1.5, 3, -6.4], [0, 3, -1.3, 4.1], [1, 2.3, -2.9, -4.3]])
input_data

In [None]:
# One box per column of the raw data (before scaling).
plt.boxplot(input_data)
plt.show()

In [None]:
# Silence warning messages for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Fit the MinMaxScaler on the toy data and rescale it in one step.
data_scaled = scaler.fit_transform(input_data)

In [None]:
# Same boxplot after min-max scaling.
plt.boxplot(data_scaled)
plt.show()

In [None]:
# Repeat with StandardScaler; note that `scaler` and `data_scaled` are rebound here.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
data_scaled = scaler.fit_transform(input_data)
plt.boxplot(data_scaled)
plt.show()

# BREAST CANCER 예제

In [None]:
# Load the breast cancer dataset bundled with scikit-learn.
from sklearn import datasets
cancer = datasets.load_breast_cancer()

In [None]:
# Print the dataset's own description text.
print(cancer.DESCR)

In [None]:
# Build a DataFrame: feature matrix, feature names as column labels,
# and the target appended as the last column 'class'.
df = pd.DataFrame(cancer.data)
df.columns = cancer.feature_names
df['class'] = cancer.target
df

# 데이터 검사

In [None]:
df.head() # first 5 rows

In [None]:
df.shape # (rows, columns)

In [None]:
df.dtypes # dtype of every column

In [None]:
df.isna().sum() # number of missing values per column

In [None]:
df.describe() # basic descriptive statistics

In [None]:
df.groupby('class').size() # number of rows in each class

# PRELIMINARY ANALYSIS

In [None]:
# Univariate analysis (histogram, boxplot, count plot)

In [None]:
# One histogram per column.
df.hist(figsize=(15,15))
plt.tight_layout()
plt.show()

In [None]:
df.boxplot() # author's note: the columns need scaling

In [None]:
# Multivariate analysis (scatter plots, correlation)
import seaborn as sns
sns.pairplot(df, hue='class')

In [None]:
# Pairwise correlation matrix of all columns (including 'class').
df.corr()

In [None]:
# The same correlation matrix as an annotated heatmap.
plt.figure(figsize=(10,10))
sns.heatmap(df.corr(), annot=True)

# 모델링

In [None]:
# Separate the target (y) from the features (x).
y = df['class']
x = df.drop('class', axis=1)

In [None]:
# Min-max scale every feature; the scaler is fitted on the full feature table.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
x_scaled = scaler.fit_transform(x)
plt.boxplot(x_scaled)
plt.show()

In [None]:
# Oversample with SMOTE and plot the resulting class counts.
# NOTE(review): SMOTE() is created without a random_state, so the resampled
# rows presumably differ from run to run -- confirm whether that matters here.
from imblearn.over_sampling import SMOTE
smote = SMOTE()
x_smote, y_smote = smote.fit_resample(x_scaled, y)
sns.countplot(x=y_smote)

In [None]:
# Train/test split.
# Split the original scaled rows FIRST and oversample only the training part.
# Splitting the already-oversampled table (x_smote, y_smote) would place
# synthetic points interpolated from test rows into the training set, so the
# test score would no longer be measured on unseen data.
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=.2, random_state=1, stratify=y)
# Balance the classes of the training portion only; the test set stays real data.
x_train, y_train = SMOTE(random_state=1).fit_resample(x_train, y_train)

# SCALING

In [None]:
# SCALING
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Normalization: raw toy data (3 rows x 4 columns) before any scaling.
data = np.array([[3, -1.5, 3, -6.4],
                 [0, 3, -1.3, 4.1],
                 [1, 2.3, -2.9, -4.3]])
plt.boxplot(data)
warnings.filterwarnings('ignore')

In [None]:
# Min-max scaling.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(data)
plt.boxplot(scaled_data)
warnings.filterwarnings('ignore')

In [None]:
# Standardization.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)
plt.boxplot(scaled_data)
warnings.filterwarnings('ignore')

In [None]:
from sklearn.preprocessing import scale # function form; author's note: same result as StandardScaler
data_scaled = scale(data)
plt.boxplot(data_scaled)
warnings.filterwarnings('ignore')

In [None]:
from sklearn.preprocessing import normalize
data_normalized = normalize(data, norm='l2') # 'l1' divides by the sum of absolute values, 'l2' by the square root of the sum of squares
plt.boxplot(data_normalized)
warnings.filterwarnings('ignore')

In [None]:
# Estimator form of normalize(); created with default settings.
from sklearn.preprocessing import Normalizer
scaler = Normalizer()
data_normalized = scaler.fit_transform(data)
plt.boxplot(data_normalized)
warnings.filterwarnings('ignore')

In [None]:
# Binarize the data around the threshold 1.4.
from sklearn.preprocessing import Binarizer
data_scaled = Binarizer(threshold=1.4).transform(data)
plt.boxplot(data_scaled)
warnings.filterwarnings('ignore')

In [None]:
# Iris example: load the dataset bundled with scikit-learn.
import pandas as pd
from sklearn import datasets
iris = datasets.load_iris()

In [None]:
# Target labels and feature matrix.
y = iris.target
x = iris.data

In [None]:
# Hold out 20% of the rows for testing (fixed seed).
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=1)

In [None]:
# Baseline random forest with default settings; the last line shows the test accuracy.
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(x_train, y_train)
rf.score(x_test, y_test)

In [None]:
# Per-class precision / recall / F1 on the test set.
from sklearn.metrics import classification_report
print(classification_report(y_test, rf.predict(x_test)))

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid searched below (2 * 4 * 3 * 3 = 72 combinations).
params = { 'n_estimators' : [10, 100],
           'max_depth' : [6, 8, 10, 12],
           'min_samples_leaf' : [8, 12, 18],
           'min_samples_split' : [8, 16, 20]
            }

In [None]:
# Run GridSearchCV with 3-fold cross-validation on the training split.
rf = RandomForestClassifier(random_state = 0, n_jobs = -1)
grid_cv = GridSearchCV(rf, param_grid = params, cv = 3, n_jobs = -1)
grid_cv.fit(x_train, y_train)

In [None]:
# Best parameter combination and its mean cross-validated score.
print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))

In [None]:
grid_cv.best_estimator_

In [None]:
# Test-set report for the estimator selected by the grid search.
print(classification_report(y_test, grid_cv.predict(x_test)))

# Breast Cancer 예제

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets

In [None]:
# Build the DataFrame: features with their names as columns, target in 'class'.
cancer = datasets.load_breast_cancer()
df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
df['class'] = cancer.target

In [None]:
print(cancer.DESCR)

In [None]:
# Data inspection.
# NOTE: a notebook cell renders only its last expression, so only
# df.describe() is displayed here; the lines above it produce no visible output.
df.head()
df.shape
df.dtypes
df.isna().sum()
df.describe()

In [None]:
# Preliminary analysis: one histogram per column.
df.hist(figsize=(10,10))
plt.tight_layout()

In [None]:
df.boxplot(figsize=(10,10)) # author's note: scaling is needed
plt.xticks(rotation=60)
# `warnings` is not imported in this section; this relies on an earlier cell having imported it.
warnings.filterwarnings('ignore')

In [None]:
# Pairwise scatter plots coloured by class.
import seaborn as sns
sns.pairplot(df, hue='class')

In [None]:
# Annotated correlation heatmap.
plt.figure(figsize=(10,10))
sns.heatmap(df.corr(), annot=True)

In [None]:
# Show only correlations >= 0.7; the boolean mask turns the other cells into NaN.
sns.heatmap(df.corr()[df.corr()>=.7])

In [None]:
# x and y split
y = df['class'] # dependent variable
x = df.drop('class', axis=1) # independent variables
# x = df[col_lst]

In [None]:
# Boxplot of the raw features, before scaling.
x.boxplot()
plt.xticks(rotation=60)
plt.show()

In [None]:
# Min-max scale all features; the scaler is fitted on the full feature table.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
x_scaled = scaler.fit_transform(x)
plt.boxplot(x_scaled)
plt.show()

In [None]:
# Class counts before resampling.
sns.countplot(x=df['class'])

In [None]:
# !pip install imblearn
# Random undersampling, then plot the class counts again.
# NOTE(review): no random_state is passed, so the kept rows presumably change between runs.
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler()
x_rus, y_rus = rus.fit_resample(x_scaled, y)
sns.countplot(x=y_rus)

In [None]:
# Train/test split of the undersampled data (30% test, fixed seed).
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_rus, y_rus, test_size=.3, random_state=1)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

In [None]:
# Fit six default classifiers; for each, record the test accuracy and the
# ten per-fold accuracies from 10-fold cross-validation on the training split.
model_lst = [LogisticRegression(), DecisionTreeClassifier(), RandomForestClassifier(), LinearDiscriminantAnalysis(), KNeighborsClassifier(), SVC()]
score_lst = []
cv_lst = []
for model in model_lst:
    model.fit(x_train, y_train)
    score_lst.append(model.score(x_test, y_test))
    cv_lst.append(cross_val_score(model, x_train, y_train, cv=10, scoring='accuracy'))

In [None]:
# Test accuracy per model; the index labels must stay in the same order as model_lst.
acc_df = pd.DataFrame(score_lst, columns=['accuracy'], index=['LOG', 'DT', 'RF', 'LDA', 'KNN', 'SVM'])
acc_df

In [None]:
# Cross-validation scores: transpose so each model is a column and each fold a row.
cv_df = pd.DataFrame(cv_lst)
cv_df = cv_df.T
cv_df.columns = ['LOG', 'DT', 'RF', 'LDA', 'KNN', 'SVM']
cv_df

In [None]:
# Spread of the fold scores per model.
cv_df.boxplot()

In [None]:
# Mean and standard deviation of the fold scores per model.
means = cv_df.mean()
stds = cv_df.std()
cv_summary = pd.concat([means, stds], axis=1)
cv_summary.columns = ['MEAN', 'STD']
cv_summary

In [None]:
# SVM with default settings.
svm = SVC()
svm.fit(x_train, y_train)

In [None]:
# classification_report is not imported in this section; this relies on the import in an earlier cell.
y_pred = svm.predict(x_test)
print(classification_report(y_test, y_pred))

In [None]:
# Parameter tuning
# https://www.geeksforgeeks.org/svm-hyperparameter-tuning-using-gridsearchcv-ml/
param_grid = {'C' : [.1, 1, 10, 100, 1000],
              'gamma': [1, .1, .01, .001, .0001],
              'kernel': ['rbf']}

In [None]:
# Grid search over the 25 combinations above; GridSearchCV likewise comes from an earlier cell's import.
# refit=True refits the best combination so grid_cv can predict directly.
grid_cv = GridSearchCV(svm, param_grid, refit=True, verbose=3)
grid_cv.fit(x_train, y_train)

In [None]:
grid_cv.best_params_

In [None]:
grid_cv.best_estimator_

In [None]:
# Test-set report for the tuned model.
y_pred = grid_cv.predict(x_test)
print(classification_report(y_test, y_pred))

In [None]:
# Predict the class of one real record: the first row of df without its 'class' label.
test_data = df.iloc[0,:-1]
# Use transform(), not fit_transform(): re-fitting the MinMaxScaler on a single
# row maps every feature to 0 (min == max) and overwrites the ranges learned
# from the full feature table. Passing a one-row DataFrame keeps the column
# names the scaler saw when it was fitted.
# NOTE(review): the original run annotated this prediction as '양성'; re-check
# the predicted label now that the record is scaled correctly.
grid_cv.predict(scaler.transform(test_data.to_frame().T))

In [None]:
# Confusion matrix of the tuned model on the test set, drawn as a heatmap.
from sklearn.metrics import confusion_matrix, classification_report
# , plot_roc_curve
sns.heatmap(confusion_matrix(y_test, grid_cv.predict(x_test)), annot=True)

In [None]:
print(classification_report(y_test, grid_cv.predict(x_test)))

In [None]:
# ROC curve and AUC for the tuned SVM.
# https://scikit-learn.org/1.0/modules/generated/sklearn.metrics.plot_roc_curve.html
from sklearn.metrics import roc_curve, roc_auc_score
# Rank the test rows by the continuous decision score. Hard 0/1 predictions
# carry a single threshold, which collapses the ROC curve to one corner point
# and understates the AUC.
y_score = grid_cv.decision_function(x_test)
fpr, tpr, _ = roc_curve(y_test, y_score)
auc = roc_auc_score(y_test, y_score)

In [None]:
# Plot the ROC curve with the AUC in the legend (loc=4 is the lower-right corner).
plt.plot(fpr,tpr,label="AUC="+str(auc))
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc=4)
plt.show()

In [None]:
# Feature importance ==> used to reduce the number of variables
rf_model = RandomForestClassifier()
rf_model.fit(x_train, y_train)
rf_model.feature_importances_ # importance score of each feature
sns.barplot(y=df.columns[:-1], x=rf_model.feature_importances_)
plt.show()
col_lst = df.columns[:-1][rf_model.feature_importances_ >= .06] # features whose importance is at least 0.06
col_lst

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets

In [None]:
# Load housing data from OpenML.
# NOTE(review): the original comment says "Boston", but the dataset requested
# here is named "house_prices", and the analysis below reads
# /content/boston.csv instead -- confirm whether this download is still needed.
from sklearn.datasets import fetch_openml
housing = fetch_openml(name="house_prices", as_frame=True)

In [None]:
# Dumps the whole returned object.
print(housing)

In [None]:
housing.keys()

In [None]:
housing.feature_names

In [None]:
# Load the whitespace-delimited Boston housing file and label its 14 columns.
# NOTE(review): read_csv is called without header=None, so the first line of the
# file is consumed as a header before the labels are replaced -- confirm the
# file really starts with a header row, otherwise one record is lost.
df = pd.read_csv('/content/boston.csv', delimiter=r"\s+")
# 'LSTAT' previously carried a stray leading space (' LSTAT'), which made the
# column unreachable as df.LSTAT / df['LSTAT'].
df.columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
df

In [None]:
# Data exploration (head, shape, dtypes, describe, isna().sum()).
# NOTE: only the last expression of a cell is rendered, so only
# df.isna().sum() is displayed here.
df.head()
df.shape
df.dtypes
df.describe()
df.isna().sum()

In [None]:
df.columns

In [None]:
# Visualization: one histogram per column.
df.hist()
plt.tight_layout()
plt.show()

In [None]:
df.boxplot() # author's note: scaling is needed
plt.show()

In [None]:
# Pairwise scatter plots of all columns.
sns.pairplot(df)
plt.show()

In [None]:
# Annotated correlation heatmap.
plt.figure(figsize=(10,10))
sns.heatmap(df.corr(), annot=True)
plt.show()

In [None]:
# X and y split: MEDV is the dependent variable, all other columns are features.
y = df.MEDV
x = df.drop('MEDV', axis=1)

In [None]:
# Data preprocessing (StandardScaler); fitted on the full feature table.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)
plt.boxplot(x_scaled)
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Train and test split (sklearn.model_selection.train_test_split, test size=.2).
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=.2, random_state=1)

In [None]:
# Linear regression.
# Only the last expression (the intercept) is rendered by the cell.
from sklearn.linear_model import LinearRegression
lin_model = LinearRegression()
lin_model.fit(x_train, y_train)
lin_model.coef_ # slopes (one coefficient per feature)
lin_model.intercept_ # y-intercept

In [None]:
# Model evaluation (r2_score) on the test split.
lin_model.score(x_test, y_test)
from sklearn.metrics import r2_score
r2_score(y_test, lin_model.predict(x_test)) #0.7545577207950153

In [None]:
# 10 fold cross validation (r2) 10겹 교차검증
from sklearn.model_selection import cross_val_score
lin_cv = cross_val_score(lin_model, x_test, y_test, scoring='r2', cv=10)

In [None]:
# Decision Tree Regressor 의사결정 트리
from sklearn.tree import DecisionTreeRegressor
dtr = DecisionTreeRegressor()
dtr.fit(x_train, y_train)

In [None]:
# Model evaluation (r2_score) 모델평가
dtr.score(x_test, y_test) # 0.8274774714959514
r2_score(y_test, dtr.predict(x_test))

In [None]:
# 10 fold cross validation 10겹 교차검증
dtr_cv = cross_val_score(dtr, x_test, y_test, scoring='r2', cv=10)

In [None]:
# Random Forest Regressor 랜덤 포레스트
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor()
rfr.fit(x_train, y_train)

In [None]:
# Model evaluation (r2_score) 모델평가
rfr.score(x_test, y_test) #0.9059660350016387
r2_score(y_test, rfr.predict(x_test))

In [None]:
# 10 fold cross validation 10겹 교차검증
rfr_cv = cross_val_score(rfr, x_test, y_test, scoring='r2', cv=10)

In [None]:
# Model comparison
# Table of the mean and standard deviation of each model's fold scores.
df_com = pd.DataFrame({
    'mean': [lin_cv.mean(), dtr_cv.mean(), rfr_cv.mean()],
    'std':[lin_cv.std(), dtr_cv.std(), rfr_cv.std()]
    }, index=['LR', 'DT', 'RF'])
df_com # author's note from the original run: RF scored highest

In [None]:
# Boxplot of the fold scores, one box per model.
plt.boxplot([lin_cv, dtr_cv, rfr_cv])
plt.xticks([1,2,3], ['LR', 'DT', 'RF'])
plt.show() # author's note from the original run: RF scored highest

In [None]:
# Pima Indians diabetes dataset: the file is read without a header row and the
# nine columns are labelled by hand.
pima = pd.read_csv('/content/pima-indians-diabetes.csv', header=None)
pima.columns = ['preg', 'plas','pres', 'skin', 'test', 'mass', 'pedi', 'age',  'class']

In [None]:
# X and y split: 'class' is the dependent variable, the rest are features.
y = pima['class']
x = pima.drop('class', axis=1)
x.boxplot()

In [None]:
# Min-max scale the features; this rebinds `scaler` and `x_scaled` from earlier sections.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
x_scaled = scaler.fit_transform(x)
plt.boxplot(x_scaled)
warnings.filterwarnings('ignore')

In [None]:

# Splitting the dataset: hold out 20% of the rows for testing (fixed seed).
# The helpers are imported here so the cell does not depend on an earlier section.
from sklearn.model_selection import train_test_split, cross_val_score
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=.2, random_state=1)

# Resampling using SMOTE (training split only).
# The labels are passed by keyword, as the other countplot calls in this
# notebook do; newer seaborn releases treat the first positional argument as `data`.
sns.countplot(x=y_train)
from imblearn.over_sampling import SMOTE
smote = SMOTE()
x_smote, y_smote = smote.fit_resample(x_train, y_train)
# NOTE(review): x_smote / y_smote are not used by the models fitted in this
# cell (they are used by the voting-classifier cell) -- confirm that training
# on the un-resampled split here is intended.

# Build DT, RF, LDA, kNN, and SVM models.
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

models = [DecisionTreeClassifier(),RandomForestClassifier(),LinearDiscriminantAnalysis(),KNeighborsClassifier(),SVC()]
cv_list = []
acc_list = []
for model in models:
    model.fit(x_train, y_train) # train
    # Cross-validate on the training split (not the hold-out split), matching
    # the breast-cancer comparison earlier in this notebook.
    cv_list.append(cross_val_score(model, x_train, y_train, cv=10, scoring='accuracy'))
    acc_list.append(model.score(x_test, y_test))

# Use accuracy score to compare the models; labels follow the order of `models`.
model_columns = ['DT', 'RF', 'LDA', 'KNN', 'SVM']
df_com = pd.DataFrame(acc_list, index=model_columns)
df_com # author's note from the original run: RF, KNN and SVM scored highest

# Use 10-fold cross validation to compare the models (one row per model, one column per fold).
df_cv = pd.DataFrame(cv_list, index=model_columns)
df_summary = pd.concat([df_cv.mean(axis=1),df_cv.std(axis=1)], axis=1)
df_summary.columns = ['MEAN', 'STD']
df_summary # re-check which model ranks first now that CV runs on the training split

# Use a confusion matrix and ROC curve to inspect the SVM.
svm = SVC()
svm.fit(x_train, y_train)

# plot_roc_curve no longer exists in current scikit-learn (and importing it
# would abort the whole cell); RocCurveDisplay.from_estimator is its replacement.
from sklearn.metrics import confusion_matrix, RocCurveDisplay
cfm = confusion_matrix(y_test, svm.predict(x_test))
sns.heatmap(cfm, annot=True)

RocCurveDisplay.from_estimator(svm, x_test, y_test)

# Predict with actual data: the first Pima record, scaled with the already-fitted scaler.
test_data = pima.iloc[0,:-1].values.reshape(1,-1)
test_data = scaler.transform(test_data)
svm.predict(test_data) # author's note from the original run: predicted as positive

In [None]:
# Voting classifier: hard (majority) vote over logistic regression, random forest and SVM.
# The trailing numbers are scores recorded by the author in an earlier run.
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC

log_model = LogisticRegression() #0.935672514619883
rf_model = RandomForestClassifier() #0.9415204678362573
svm_model = SVC() #0.8713450292397661
voting_model = VotingClassifier(estimators=[('log_model', log_model),
                                            ('rf_model', rf_model),
                                            ('svm_model', svm_model)], voting='hard')
voting_model.fit(x_smote, y_smote)
print('{}: {}'.format(voting_model.__class__.__name__, voting_model.score(x_test, y_test))) #0.935672514619883

# Compare against each member model fitted on its own.
for model in (log_model, rf_model, svm_model):
    model.fit(x_smote, y_smote)
    print('{}: {}'.format(model.__class__.__name__, model.score(x_test, y_test)))

# Random forest
rf_model = RandomForestClassifier()
print(rf_model.n_estimators) #100
rf_model.fit(x_smote, y_smote)
print(rf_model.score(x_test, y_test)) #0.935672514619883
rf_model.feature_importances_ # importance score of each feature

# Feature importance ==> used to reduce the number of variables
rf_model.feature_importances_ # importance score of each feature
ranking = pd.Series(rf_model.feature_importances_, index=x.columns)
ranking = ranking.sort_values(ascending=False)
sns.barplot(y=ranking.index, x=ranking)
plt.show()
x.shape

# Parameter tuning: grid search with 2-fold cross-validation.
# NOTE(review): the search and rf2 below are fitted on x_train / y_train while
# the models above use x_smote / y_smote -- confirm this is intended.
from sklearn.model_selection import GridSearchCV
params = {
    'n_estimators': [100],
    'max_depth': [6, 8, 10, 12],
    'min_samples_leaf': [8, 12, 18],
    'min_samples_split': [8, 16, 20]}
rf = RandomForestClassifier(random_state=0, n_jobs=-1)
grid_cv = GridSearchCV(rf, param_grid=params, cv=2, n_jobs=-1)
grid_cv.fit(x_train, y_train)
print(grid_cv.best_params_) #best hyper parameter
print(grid_cv.best_score_) #best score 0.9271356783919598

# Refit a forest with hand-copied parameters (presumably the best ones reported above -- verify).
rf2 = RandomForestClassifier(random_state=0, n_jobs=1, max_depth=6, min_samples_leaf=8, min_samples_split=8, n_estimators=100)
rf2.fit(x_train, y_train)
rf2.score(x_test, y_test) #0.9298245614035088

# AdaBoost
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Build the AdaBoost model: 200 decision stumps (depth-1 trees), learning rate 0.5.
# The explicit algorithm='SAMME.R' argument is dropped: it was the default in
# the releases that supported it, and newer scikit-learn releases deprecate and
# then reject that value, so passing it only makes the cell version-fragile.
ada_model = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators = 200,
    learning_rate=0.5
)

# Train the model and report its test accuracy.
ada_model.fit(x_train, y_train)
ada_model.score(x_test, y_test) #0.9649122807017544

# Gradient boosting
from sklearn.ensemble import GradientBoostingClassifier
import time
from sklearn.metrics import accuracy_score

# Wall-clock timer covering fit + predict of the baseline model.
start_time = time.time()

gb_model = GradientBoostingClassifier(random_state=0)
gb_model.fit(x_train,y_train)
gb_pred = gb_model.predict(x_test)
gb_accuracy = accuracy_score(y_test, gb_pred)

# Prints the GBM accuracy and the elapsed time in seconds.
print(f'GBM 정확도: {np.round(gb_accuracy, 4)}') #0.9385964912280702
print(f'GBM 수행 시간: {np.round((time.time() - start_time), 1)} 초')

# Parameter tuning: 2-fold grid search over 4 combinations; rebinds `params` and `grid_cv`.
params = {
    'n_estimators': [100, 500],
    'learning_rate': [.05, .1]}
grid_cv = GridSearchCV(gb_model, param_grid=params, cv=2, verbose=1, n_jobs=-1)
grid_cv.fit(x_train, y_train)
# Prints the best hyper-parameters and the best cross-validated accuracy.
print('최적 하이퍼 파라미터: \n', grid_cv.best_params_)
print('최고 예측 정확도: {0:.4f}'.format(grid_cv.best_score_))

# Predict with the best estimator found by GridSearchCV.
gb_pred = grid_cv.best_estimator_.predict(x_test)
gb_accuracy = accuracy_score(y_test, gb_pred)
print('GBM 정확도: {0:.4f}'.format(gb_accuracy)) #0.9649

# XGBoost
# https://velog.io/@fiifa92/%EC%95%99%EC%83%81%EB%B8%94Ensemble-%EA%B8%B0%EB%B2%95#:~:text=%EC%95%99%EC%83%81%EB%B8%94%EC%9D%80%20%EC%97%AC%EB%9F%AC%20%EA%B0%9C%EC%9D%98%20%EB%B6%84%EB%A5%98%EA%B8%B0,%EB%8B%A4%EC%96%91%ED%95%9C%20%EC%95%99%EC%83%81%EB%B8%94%20%EB%B0%A9%EC%8B%9D%EC%9D%B4%20%EC%9E%88%EB%8B%A4.
# !pip install xgboost
from xgboost import XGBClassifier
xgb_wrapper = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3)
xgb_wrapper.fit(x_train, y_train)
print(xgb_wrapper.score(x_test, y_test)) #0.9473684210526315
w_preds = xgb_wrapper.predict(x_test)
w_pred_proba = xgb_wrapper.predict_proba(x_test)[:, 1]  # second column of the probability output

# Early stopping after 100 rounds without improvement.
# NOTE(review): the test split doubles as the early-stopping evaluation set, so
# the score printed below is measured on data that steered training.
# NOTE(review): recent xgboost releases appear to expect early_stopping_rounds /
# eval_metric on the constructor rather than fit() -- confirm against the
# installed version.
xgb_wrapper.fit(x_train, y_train, early_stopping_rounds=100, eval_metric="logloss", eval_set=[(x_test, y_test)], verbose=True)
print(xgb_wrapper.score(x_test, y_test)) #0.9532163742690059
ws100_preds = xgb_wrapper.predict(x_test)
ws100_pred_proba = xgb_wrapper.predict_proba(x_test)[:, 1]

# Early stopping after 10 rounds; refits the same estimator object again.
xgb_wrapper.fit(x_train, y_train, early_stopping_rounds=10, eval_metric="logloss", eval_set=[(x_test, y_test)], verbose=True)
print(xgb_wrapper.score(x_test, y_test)) #0.9532163742690059
ws10_preds = xgb_wrapper.predict(x_test)
ws10_pred_proba = xgb_wrapper.predict_proba(x_test)[:, 1]

from xgboost import plot_importance

# Feature-importance chart of the last fitted XGBoost model.
fig, ax = plt.subplots(figsize=(10, 12))
plot_importance(xgb_wrapper, ax=ax)

# LightGBM
!pip install lightgbm
# This import rebinds `plot_importance` from the xgboost function to the lightgbm one.
from lightgbm import LGBMClassifier, plot_importance

lgbm_model = LGBMClassifier(n_estimators=400)
# NOTE(review): recent lightgbm releases appear to configure early stopping and
# logging through callbacks instead of these fit() keywords -- confirm against
# the installed version.
lgbm_model.fit(x_train, y_train, early_stopping_rounds=100, eval_metric='logloss', eval_set = [(x_test, y_test)], verbose=True)
lgbm_model.score(x_test, y_test) #0.9473684210526315

plot_importance(lgbm_model, max_num_features=15)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
os.chdir('E:/Data')

# Import the training data of the Loan Prediction data set, do some basic
# exploration tasks, and use graphs to visualize the data.
# The relative path resolves against the working directory set by os.chdir above.
loan = pd.read_csv('loan.csv')
loan.columns
# loan.set_index('Loan_ID', inplace=True) # code to use when changing the index

# head(), shape, dtypes, describe(), isna().sum()
# These bare expressions are not the last line of the cell, so they render nothing here.
loan.head()
loan.shape
loan.dtypes # author's note: a classification algorithm is required
a = loan.describe()
loan.isna().sum() # author's note: missing values need handling

# Histogram, boxplot, countplot, pairplot, heatmap
loan.hist()
plt.tight_layout()
plt.show()

loan.boxplot() # author's note: scaling is needed
plt.show()

# Class balance of the target (author's note: resampling is needed). The column
# is passed by keyword, as the other countplot calls in this notebook do; newer
# seaborn releases treat the first positional argument as `data`.
sns.countplot(x=loan.Loan_Status)
plt.show()

sns.pairplot(loan, hue='Loan_Status')
plt.show()

# Correlate the numeric columns only: at this point the frame still holds
# string columns that have not been encoded yet, and recent pandas raises
# instead of silently skipping them.
sns.heatmap(loan.select_dtypes(include='number').corr(), annot=True)
plt.show()

# Data exploration and visualization
# Drawing several charts in one figure (2x2 grid)
loan.columns
fig = plt.figure(figsize=(12, 8))
fig.add_subplot(221)   #top left
plt.boxplot(loan.ApplicantIncome)
plt.title('Applicant Income')
fig.add_subplot(222)   #top right
plt.hist(loan.ApplicantIncome, bins=20)
plt.title('Applicant Income')
fig.add_subplot(223)   #bottom left
plt.boxplot(loan.CoapplicantIncome)
plt.title('Coapplicant Income')
fig.add_subplot(224)   #bottom right
plt.hist(loan.CoapplicantIncome, bins=20)
plt.title('Coapplicant Income')
plt.show()

# Comparing a numeric variable across the levels of a categorical one
loan.boxplot(column='ApplicantIncome', by='Education')
plt.show()

# Frequency tables of categorical variables (dropna=False also counts missing values)
loan.Property_Area.value_counts()
loan.Credit_History.value_counts(dropna=False)

# Frequency table of two categorical variables
a = pd.crosstab(loan.Loan_Status, loan.Credit_History)
a.plot(kind='bar')

# fmt='g' prints the counts without scientific notation
sns.heatmap(a, annot=True, fmt='g')

# Frequency table of three categorical variables
a = pd.crosstab(loan.Loan_Status, [loan.Credit_History, loan.Gender])
a.plot(kind='bar', stacked=True)

# Converting a numeric variable into categories and tabulating it
loan.LoanAmount.hist(bins=30)

# Four bins with edges [min, 90, 140, 190, max]; include_lowest keeps the minimum in the first bin.
labels = ["low","medium","high","very high"]
cut_points = [90,140,190]
minval = loan.LoanAmount.min()
maxval = loan.LoanAmount.max()
break_points = [minval] + cut_points + [maxval]
loan["LoanAmount_Bin"] = pd.cut(loan.LoanAmount, bins=break_points, labels=labels, include_lowest=True)
loan.LoanAmount_Bin.value_counts(sort=False, dropna=False)

# Filling missing data
loan.isna().sum()
# LoanAmount: replace missing values with the column mean.
loan.LoanAmount.hist()
# Assign the filled column back instead of calling fillna(..., inplace=True) on
# a column pulled out of the frame: the in-place form works on an intermediate
# Series and is not guaranteed to write through to `loan` (it does not under
# pandas copy-on-write).
loan['LoanAmount'] = loan['LoanAmount'].fillna(loan['LoanAmount'].mean())

# Self_Employed, Gender, Married, Dependents, Loan_Amount_Term and
# Credit_History: replace missing values with the most frequent value (mode).
for col in ['Self_Employed', 'Gender', 'Married', 'Dependents', 'Loan_Amount_Term', 'Credit_History']:
    loan[col] = loan[col].fillna(loan[col].mode()[0])

# Encoding non-numeric values (categorical variables).
loan.dtypes
# Encode using get_dummies --> Property_Area
# The indicator columns are appended; the original Property_Area column is dropped from x below.
dummy = pd.get_dummies(loan.Property_Area)
loan = pd.concat([loan, dummy], axis=1)

# Encode using sklearn label encoder --> the remaining variables
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
loan.Gender = le.fit_transform(loan.Gender)
le.inverse_transform(loan.Gender) # maps the integer codes back to the original labels

# Encode multiple variables using a for loop (a fresh encoder per column)
for i in ['Married','Dependents','Education','Self_Employed','Loan_Status']:
    le = LabelEncoder()
    loan[i] = le.fit_transform(loan[i])

# X and y split
# Loan_Status for dependent variables
# Other variables for independent variables
loan.columns
y = loan.Loan_Status
x = loan.drop(['Loan_ID', 'Property_Area', 'LoanAmount_Bin', 'Loan_Status'], axis=1)

# Scaling the x values (MinMaxScaler, StandardScaler)
x.boxplot()
plt.show()

# NOTE: `x` is rebound from a DataFrame to the array returned by the scaler.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x = scaler.fit_transform(x)

plt.boxplot(x)
plt.show()

# Train and test split: 80% of the rows for training, 20% for testing (fixed seed).
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=1)

# Resampling (oversampling, undersampling, SMOTE)
# Class counts of the training labels before resampling. The labels are passed
# by keyword, as the other countplot calls in this notebook do; newer seaborn
# releases treat the first positional argument as `data`.
sns.countplot(x=y_train)
plt.show()

# Oversample the training split only; x_train / y_train are rebound to the resampled data.
from imblearn.over_sampling import SMOTE
smote = SMOTE()
x_train, y_train = smote.fit_resample(x_train, y_train)

# Class counts after resampling.
sns.countplot(x=y_train)
plt.show()

# Build a logistic regression model on the (resampled) training split.
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(x_train, y_train)

# Predict the loan status and check the accuracy of the model, plus cross-validation.
model.score(x_test, y_test) #0.7642276422764228

# NOTE(review): this cross-validates on the hold-out split. x_train has already
# been SMOTE-resampled at this point, so cross-validating on it would let
# synthetic neighbours cross fold boundaries; a resample-inside-the-fold
# pipeline would be the clean alternative -- confirm the intended protocol.
from sklearn.model_selection import cross_val_score
cv_score = cross_val_score(model, x_test, y_test, cv=10, scoring='accuracy')
cv_score.mean()
cv_score.std()

# Create a confusion matrix, classification report, and ROC curve.
# plot_roc_curve no longer exists in current scikit-learn (and importing it
# would abort the cell); RocCurveDisplay.from_estimator is its replacement.
from sklearn.metrics import confusion_matrix, classification_report, RocCurveDisplay
y_pred = model.predict(x_test)
confusion_matrix(y_test, y_pred)
print(classification_report(y_test, y_pred))
RocCurveDisplay.from_estimator(model, x_test, y_test)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Document encoding: a four-sentence toy corpus.
text = ['This is the first document.',
          'This document is the second document.',
          'And this is the third one.',
          'Is this the first document?']

# COUNT VECTORIZER
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
cv = CountVectorizer()
df_x = pd.DataFrame(cv.fit_transform(text).toarray()) # term counts as a DataFrame
# get_feature_names() was removed from scikit-learn; get_feature_names_out() is
# its replacement and returns the vocabulary in column order.
df_x.columns = cv.get_feature_names_out() # use the vocabulary words as column names
df_x

# N-GRAM
cv = CountVectorizer(analyzer='word', ngram_range=(2,2)) # count pairs of consecutive words
df_x2 = pd.DataFrame(cv.fit_transform(text).toarray()) # bigram counts as a DataFrame
df_x2.columns = cv.get_feature_names_out() # bigrams as column names
df_x2

# TFIDF VECTORIZER
tfidf = TfidfVectorizer()
df_x3 = pd.DataFrame(tfidf.fit_transform(text).toarray())
df_x3.columns = tfidf.get_feature_names_out()
df_x3

# REGULAR EXPRESSION TOKENIZER
from nltk.tokenize import RegexpTokenizer
sentence  = "Think and wonder, wonder and think."
token = RegexpTokenizer(r"\w+") # runs of one or more word characters
words = token.tokenize(sentence)
# A similar approach:
# import re
# words = re.split(r'\W+', sentence) # split on runs of non-word characters
print(words)

# COUNT VECTORIZER PARAMETERS
text = ['This is the first document.',
          'This document is the second document.',
          'And this is the third one.',
          'Is this the first document?']
token = RegexpTokenizer(r'[a-zA-Z0-9]+') # letters and digits only
cv3 = CountVectorizer(lowercase=True, # lower-case the text
                      stop_words='english', # drop English stop words
                      ngram_range = (1,1), # unigrams
                      tokenizer = token.tokenize) # tokenize with the regex above
text_counts= cv3.fit_transform(text)
df_x4 = pd.DataFrame(text_counts.toarray(), columns=cv3.get_feature_names_out())
df_x4

# Read the training data (tab-separated) and use PhraseId as the index.
sent = pd.read_csv('/content/sent_train.tsv', sep='\t')
sent.set_index('PhraseId', inplace=True)
sent

# DataFrame summary: head(), shape, dtypes, isna().sum(), describe()
sent.head() # first 5 rows

sent.shape # author's note: 156060 records

sent.dtypes # author's note: Sentiment is categorical data

sent.isna().sum() # missing-value check

sent.describe() # basic statistics

"""# 데이터시각화"""

# Univariate analysis with a boxplot and a histogram.
sent.boxplot('Sentiment')
plt.show()

sent.hist('Sentiment')
plt.show()

# Number of rows per sentiment score.
sent.columns
tab = sent.groupby('Sentiment').size()
tab

# Bar chart of the sentiment scores. The column is passed by keyword, as the
# other countplot calls in this notebook do; newer seaborn releases treat the
# first positional argument as `data`.
sns.countplot(x=sent.Sentiment)
plt.xticks([0,1,2,3,4],['negative', 'somewhat negative', 'neutral', 'somewhat positive', 'positive'], rotation=90)
plt.show()

"""# 전처리"""

# Download the NLTK 'popular' resource collection (needs network access on first run).
import nltk
nltk.download('popular')

# Tools for removing stop words and punctuation.
import string
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
ps = PorterStemmer()  # shared stemmer instance used by clean_text

def clean_text(text):
    """Normalize a phrase into a list of stemmed, non-stop-word tokens.

    Steps: drop every character found in ``string.punctuation``, lower-case the
    rest, split on runs of non-word characters, discard English stop words and
    empty tokens, and stem what remains with the module-level Porter stemmer
    ``ps``.

    Args:
        text: The raw phrase (a string).

    Returns:
        A list of stemmed tokens, in their original order.
    """
    # Build the stop-word set once per call instead of once per token.
    stop_words = set(stopwords.words('english'))
    text = "".join([char.lower() for char in text if char not in string.punctuation])
    # Raw string: '\W' in a plain string literal is an invalid escape sequence.
    tokens = re.split(r'\W+', text) #or word_tokenize(text)
    # re.split yields '' at the edges when the text starts or ends with a
    # separator; those empty tokens are skipped so joined output has no stray spaces.
    return [ps.stem(word) for word in tokens if word and word not in stop_words]

# Cleaning every phrase takes too long, so keep only the first 1000 rows.
# Take an explicit copy: writing a column into a bare slice of the original
# frame triggers pandas' SettingWithCopy warning and is not guaranteed to behave
# the same way across pandas versions.
sent = sent.iloc[:1000].copy()
sent.shape

lst = []
for p in sent.Phrase: # process one phrase per iteration
    lst.append(' '.join(clean_text(p))) # clean, then join the tokens back into one string
    print('{} added'.format(p))
sent['Phrase'] = lst # store the cleaned text back in the Phrase column
sent.to_csv('sent.tsv', sep='\t')

# Separate the dependent (Sentiment) and independent (Phrase) variables.
y = sent.Sentiment
x = sent.Phrase

# Word vectorization
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer

token = RegexpTokenizer(r'[a-zA-Z0-9]+') # keep only letters and digits
cv = CountVectorizer(lowercase=True, # lower-case the text
                     stop_words='english', # drop English stop words
                     ngram_range = (1,1), # unigrams
                     tokenizer = token.tokenize) # tokenize with the regex above
# get_feature_names() was removed from scikit-learn; get_feature_names_out() is
# its replacement and returns the vocabulary in column order.
df_x = pd.DataFrame(cv.fit_transform(x).toarray(), columns=cv.get_feature_names_out())
df_x.head()

# Train/test split (sklearn.model_selection, test=.30, random_state)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df_x, y, test_size=.30, random_state=1)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

# Logistic regression
from sklearn.linear_model import LogisticRegression
model = LogisticRegression() # create the model
model.fit(x_train, y_train) # train the model
model.score(x_test, y_test) # evaluate with accuracy

from sklearn.metrics import confusion_matrix, classification_report
y_pred = model.predict(x_test) # predictions for the test data
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='g') # confusion matrix
plt.show()

# Classification report
print(classification_report(y_test, y_pred))

# Import the machine-learning model classes
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report

# Fit each model, then show its confusion matrix (titled with name and accuracy)
# and its classification report. `names` must stay in the same order as `models`.
models = [DecisionTreeClassifier(), RandomForestClassifier(), KNeighborsClassifier(), SVC()]
names = ['DT', 'RF', 'KNN', 'SVM']
for model, name in zip(models, names):
    model.fit(x_train,y_train) # train
    acc_score = model.score(x_test, y_test) # evaluate
    y_pred = model.predict(x_test) # predict
    sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='g') # confusion matrix
    plt.title('{}: {}'.format(name, acc_score))
    plt.show()
    print(classification_report(y_test, y_pred)) # classification report

# Predict the sentiment of a new sentence with the fitted random forest (models[1]).
text = 'So there is no way for me to plug it in here in the US unless I go by a converter.'
text = ' '.join(clean_text(text))
test_data = pd.DataFrame({'Phrase':text},index=[0])
# Vectorize the Phrase COLUMN, not the DataFrame itself: iterating a DataFrame
# yields its column labels, so passing the frame made the vectorizer encode the
# single string "Phrase" instead of the sentence.
# get_feature_names() was removed from scikit-learn; get_feature_names_out() replaces it.
df_test = pd.DataFrame(cv.transform(test_data['Phrase']).toarray(), columns=cv.get_feature_names_out())
print(models[1].predict(df_test))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# COUNT VECTORIZER
from sklearn.feature_extraction.text import CountVectorizer

corpus = ['This is the first document.',
          'This document is the second document.',
          'And this is the third one.',
          'Is this the first document?']
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
# get_feature_names() was removed from scikit-learn; get_feature_names_out() is
# its replacement. .tolist() keeps the printed form a plain list of strings.
feature_names = vectorizer.get_feature_names_out().tolist()
print(feature_names)
print(X.toarray())
X_df = pd.DataFrame(X.toarray(), columns=feature_names)
a = X_df.head(10)

# n-gram: count pairs of consecutive words
vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 2)) #word n-gram
X2 = vectorizer2.fit_transform(corpus)
feature_names2 = vectorizer2.get_feature_names_out().tolist()
print(feature_names2)
print(X2.toarray())
X2_df = pd.DataFrame(X2.toarray(), columns=feature_names2)
b = X2_df.head(10)

# tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer3 = TfidfVectorizer()
X3 = vectorizer3.fit_transform(corpus)
feature_names3 = vectorizer3.get_feature_names_out().tolist()
print(feature_names3)
print(X3.toarray())
X3_df = pd.DataFrame(X3.toarray(), columns=feature_names3)
c = X3_df.head(10)

# RegexpTokenizer: split a sentence into runs of word characters
from nltk.tokenize import RegexpTokenizer
sentence  = "Think and wonder, wonder and think."
token = RegexpTokenizer(r"\w+")
new_words = token.tokenize(sentence)
print(new_words)

# Posterior probability P(play = yes | weather = sunny) from a contingency table.
weather = ['sunny', 'overcast', 'rainy', 'sunny', 'sunny', 'overcast', 'rainy', 'rainy', 'sunny', 'rainy', 'sunny', 'overcast', 'overcast', 'rainy']
play = ['no', 'yes', 'yes', 'yes', 'yes', 'yes', 'no', 'no', 'yes', 'yes', 'no', 'yes', 'yes', 'no']
df = pd.DataFrame({'weather':weather,
                   'play':play})
# Weather x play counts, with row and column totals labelled 'total'.
tab = pd.crosstab(df.weather, df.play, margins=True, margins_name='total')
tab
# Look the totals up by label. The table's index and columns are strings, so an
# integer key such as tab.total[0] relies on positional fallback, which is
# deprecated in pandas 2.x; labels also do not depend on the sort order.
overcast = tab.loc['overcast', 'total']
rainy = tab.loc['rainy', 'total']
sunny = tab.loc['sunny', 'total']
no = tab.loc['total', 'no']
yes = tab.loc['total', 'yes']
# (count(sunny and yes) / count(yes)) * count(yes) / count(sunny)
p_yes_sunny = (len(df[(df.weather == 'sunny')&(df.play == 'yes')])/yes)*yes/sunny

# Text classification: two hand-computed scores, each a product of four
# add-one ratios and a final weight (3/5 and 2/5).
# NOTE(review): looks like Laplace-smoothed naive Bayes with a 14-term
# vocabulary and class sizes 11 and 9 -- confirm against the worked example.
denom_1 = 11 + 14
denom_2 = 9 + 14
p1 = (2+1)/denom_1 * (1+1)/denom_1 * (0+1)/denom_1 * (2+1)/denom_1 * (3/5)
p2 = (1+1)/denom_2 * (0+1)/denom_2 * (1+1)/denom_2 * (0+1)/denom_2 * (2/5)

# Naive Bayes example on the PlayTennis data set.
tennis = pd.read_csv('PlayTennis.csv')
# Work on a copy so the frame as loaded stays available in `tennis`.
df = tennis.copy()

# Encoding: replace every column's values with integer codes.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
col_names = df.columns.tolist()
for i in col_names:
    # The same encoder object is re-fitted for each column, so after the loop
    # `le` only reflects the last column.
    df[i] = le.fit_transform(df[i])

# Split into features (x) and target (y).
df.columns
x = df.drop(columns='Play Tennis')
y = df['Play Tennis']

# Train / test split: 30% held out, fixed seed.
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=.3, random_state=1
)

# Models: fit each naive Bayes variant and record its accuracy on the test split.
# The trailing numbers are the scores the author noted down.
gnb = GaussianNB()
y_pred = gnb.fit(x_train, y_train).predict(x_test)
gnb_score = accuracy_score(y_test, y_pred)  # .8
# Optional extra diagnostics:
# confusion_matrix(y_test, y_pred)
# print(classification_report(y_test, y_pred))

bnb = BernoulliNB()  # binary
y_pred = bnb.fit(x_train, y_train).predict(x_test)
bnb_score = accuracy_score(y_test, y_pred)  # .6

mnb = MultinomialNB()  # categorical
y_pred = mnb.fit(x_train, y_train).predict(x_test)
mnb_score = accuracy_score(y_test, y_pred)  # .6

print(gnb_score, bnb_score, mnb_score)

# One hand-built sample, given directly as encoded values for the first four columns.
test_data = pd.DataFrame([[1, 2, 0, 0]], columns=col_names[:4])
gnb.predict(test_data)  # 0
mnb.predict(test_data)  # 0
bnb.predict(test_data)  # 0

# Fruit prediction problem.
fruit = pd.read_csv('fruit_data_with_colors.txt', sep='\t')
fruit.head()
fruit.columns

# Split into target (y) and features (x); the label and name columns are dropped from x.
y = fruit['fruit_label']
x = fruit.drop(columns=['fruit_label', 'fruit_name', 'fruit_subtype'])

# Box plot of the raw features.
x.boxplot()
plt.show()

# Min-max scale the features and box-plot them again.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler().fit(x)
x_scaled = scaler.transform(x)
x_df = pd.DataFrame(x_scaled, columns=x.columns)
x_df.boxplot()

# Train / test split: 20% held out, fixed seed.
# NOTE(review): the split uses the unscaled `x`; `x_scaled` / `x_df` computed
# above are not used here -- confirm this is intended.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=.2, random_state=1
)

# Naive Bayes variants (author's note: all of them scored .6).
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score
gnb = GaussianNB()
y_pred = gnb.fit(x_train, y_train).predict(x_test)
gnb_score = accuracy_score(y_test, y_pred)

mnb = MultinomialNB()
y_pred = mnb.fit(x_train, y_train).predict(x_test)
mnb_score = accuracy_score(y_test, y_pred)

bnb = BernoulliNB()
y_pred = bnb.fit(x_train, y_train).predict(x_test)
bnb_score = accuracy_score(y_test, y_pred)

print(gnb_score, mnb_score, bnb_score)

# Sanity check: predict the first training row and look at its true label.
test_data = x_train.iloc[[0]]
y_train.iloc[[0]]
# Scaling the sample is left disabled, matching the unscaled training data:
# test_data = scaler.transform(test_data)
# test_data = pd.DataFrame(test_data, columns=x.columns)
gnb.predict(test_data)

# Phrase and sentiment analysis.
# Read the training data from sent_train.tsv (tab-separated).
data = pd.read_csv('sent_train.tsv', sep='\t')
data.head()
data.dtypes

# Count phrases per sentiment value and draw a bar chart.
data.columns
tab = data['Sentiment'].value_counts(sort=False)
# Alternative: tab = data.groupby('Sentiment').count()
plt.style.use('ggplot')
plt.bar(tab.index, tab)
plt.xlabel('Review Sentiments')
plt.ylabel('Number of Review')
plt.show()

#import string
#import re
#from nltk.corpus import stopwords
#from nltk.stem import PorterStemmer
#ps = PorterStemmer()
#
#def clean_text(text):
#    text = "".join([word.lower() for word in text if word not in string.punctuation])
#    tokens = re.split('\W+', text) #or word_tokenize(text)
#    text = [ps.stem(word) for word in tokens if word not in set(stopwords.words('english'))]
#    return text
#clean_text(text)

# Remove stop words and punctuation, then build a bag-of-words matrix.
# Only CountVectorizer is used here.
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer

# Tokenizer that keeps runs of ASCII letters and digits, which drops symbols.
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(
    tokenizer=token.tokenize,
    stop_words='english',
    lowercase=True,
    ngram_range=(1, 1),
)
phrase_encoded = cv.fit_transform(data['Phrase'])

# Train / test split with 30% held out for testing.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    phrase_encoded, data['Sentiment'], test_size=0.3, random_state=1
)

# Build and evaluate a naive Bayes model (BernoulliNB, MultinomialNB).
from sklearn.naive_bayes import MultinomialNB  # categorical
# Model generation using multinomial naive Bayes.
mnb = MultinomialNB()
mnb.fit(x_train, y_train)
# Predict on the held-out split created above, which binds lower-case `x_test`;
# the previous upper-case `X_test` is not a name this section defines.
y_pred = mnb.predict(x_test)
mnb_score = accuracy_score(y_test, y_pred)

#evaluating output
#from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
#print("MultinomialNB Test Accuracy:", accuracy_score(y_test, y_pred)) #test performance 0.6049169122986885
#print("MultinomialNB Training Accuracy:", accuracy_score(y_train, mnb.predict(x_train))) #train performance
#confusion_matrix(y_test, y_pred) #test performance
#print(classification_report(y_test, y_pred))

# gnb = GaussianNB()
# gnb.fit(x_train.toarray(), y_train) #***
# y_pred = gnb.predict(x_test)
# gnb_score = accuracy_score(y_test, y_pred) #*****

# Bernoulli naive Bayes on the same split.
bnb = BernoulliNB()
y_pred = bnb.fit(x_train, y_train).predict(x_test)
bnb_score = accuracy_score(y_test, y_pred)  # author's recorded value: 0.6043615703361955

# Compare the results (MultinomialNB first, then BernoulliNB).
print(mnb_score, bnb_score)

# Prediction with new test data.
test_data = pd.DataFrame({'Phrase':'hello my name is kim'}, index=[0])
# Pass the Phrase column, not the whole frame: iterating a DataFrame yields its
# column labels, so cv.transform(test_data) would vectorize the word "Phrase"
# instead of the sentence.
test_data_num = cv.transform(test_data['Phrase'])
test_data_num.toarray()
mnb.predict(test_data_num)