# 데이터 전처리 실습
## Orange Telecom Churn Data 실습
### 1. 패키지 불러오기

In [None]:
# Mount Google Drive into the runtime's filesystem so the CSV used below can be read.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

### 2. 데이터 분석
1. 데이터의 크기 출력
2. 데이터의 특징 출력
3. 데이터의 정보 출력

In [None]:
# Load the churn dataset from the mounted Drive folder.
data_path = '/content/drive/MyDrive/Colab Notebooks/opensource/week11/Orange_Telecom_Churn_Data.csv'
data = pd.read_csv(data_path)

# Dataset size as (rows, columns).
print("Shape of the dataset: ", data.shape)
# Column labels.
print("Columns of the dataset: ", data.columns)
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called as a statement; passing it to print() would append a stray "None".
print("Info of the dataset: ")
data.info()


### 3. 데이터 전처리
1. 학습데이터와 테스트 데이터 나누기
2. 데이터 중복 제거
3. 필요없는 특징 제거

In [None]:
# Target column, and the feature matrix without the target and without the
# columns that are not used as features.
y = data['churned']
X = data.drop(columns=['churned', 'state', 'area_code', 'phone_number'])

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Sizes of both splits before any row is removed.
print("Shape of the training dataset: ", X_train.shape)
print("Shape of the testing dataset: ", X_test.shape)

# Keep only the first occurrence of each duplicated training row, then realign
# the labels to the surviving row index.
X_train = X_train.loc[~X_train.duplicated()]
y_train = y_train.loc[X_train.index]

print("Shape of the training dataset after deleting duplicates: ", X_train.shape)


### 3. 데이터 전처리
3. 누락 데이터 대체

(1) 평균값, 최빈값으로 대체

연속값이 있는 특징 -> 평균값

범주형 데이터가 있는 특징 -> 최빈값

In [None]:
# Impute missing values with simple per-column statistics:
#   numeric columns     -> column mean
#   categorical columns -> column mode (most frequent value)
# The statistics are computed on the training split only and reused for the
# test split, so the test data never influences the preprocessing.

# Numeric feature names.
num_columns = X_train.select_dtypes(include=['int64', 'float64']).columns
# Categorical (object-typed) feature names.
cat_columns = X_train.select_dtypes(include=['object']).columns

X_mean_imputed = X_train.copy()
y_mean_imputed = y_train.copy()

X_test_mean_imputed = X_test.copy()
y_test_mean_imputed = y_test.copy()

# One fill value per column, learned from the training split.
fill_values = {col: X_train[col].mean() for col in num_columns}
fill_values.update({col: X_train[col].mode()[0] for col in cat_columns})

# fillna() returns a new frame that is assigned back. This replaces the chained
# `df[col].fillna(..., inplace=True)` pattern, which may act on a temporary
# object and leave the frame unchanged.
X_mean_imputed = X_mean_imputed.fillna(value=fill_values)
X_test_mean_imputed = X_test_mean_imputed.fillna(value=fill_values)


print("Missing values filled")
# Remaining missing values in the training split.
print("Number of missing values in the training dataset: ", X_mean_imputed.isnull().sum().sum())
# Remaining missing values in the testing split.
print("Number of missing values in the testing dataset: ", X_test_mean_imputed.isnull().sum().sum())


(2) k-최근접 이웃을 활용한 대체

In [None]:
# Impute missing values with k-nearest-neighbours for the numeric columns and
# the training-split mode for the categorical columns.
X_knn_imputed = X_train.copy()
y_knn_imputed = y_train.copy()

X_test_knn_imputed = X_test.copy()
y_test_knn_imputed = y_test.copy()

knn_imputer = KNNImputer(n_neighbors=5)

# The imputer is fit on the training split only and then applied to the test split.
X_knn_imputed[num_columns] = knn_imputer.fit_transform(X_knn_imputed[num_columns])
X_test_knn_imputed[num_columns] = knn_imputer.transform(X_test_knn_imputed[num_columns])

for col in cat_columns:
    # The most frequent training value is used for both splits so the test
    # data does not determine its own fill value.
    train_mode = X_train[col].mode()[0]
    # Assign the result back instead of chained `fillna(..., inplace=True)`,
    # which may act on a temporary object and leave the frame unchanged.
    X_knn_imputed[col] = X_knn_imputed[col].fillna(train_mode)
    X_test_knn_imputed[col] = X_test_knn_imputed[col].fillna(train_mode)


print("Missing values filled using k-NN")
print("Number of missing values in the training dataset: ", X_knn_imputed.isnull().sum().sum())
print("Number of missing values in the testing dataset: ", X_test_knn_imputed.isnull().sum().sum())


### 3. 데이터 전처리
4. 데이터 형식 문제 제거 (레이블 인코딩)

In [None]:
# Replace every categorical column with integer codes.
le = LabelEncoder()
cat_columns = X_train.select_dtypes(include=['object']).columns.tolist()

# The encoder is fit per column on the *imputed* training frame and the same
# mapping is then applied to the matching test frame. Fitting on the raw
# X_train would expose the encoder to the missing values that the imputation
# step has just filled in.
# NOTE(review): LabelEncoder.transform is expected to reject categories that are
# absent from the training split; confirm none occur in the test split.
for train_frame, test_frame in (
    (X_knn_imputed, X_test_knn_imputed),
    (X_mean_imputed, X_test_mean_imputed),
):
    for col in cat_columns:
        le.fit(train_frame[col])
        train_frame[col] = le.transform(train_frame[col])
        test_frame[col] = le.transform(test_frame[col])

print("Label encoding completed")


5. 이상치 탐지 및 제거

In [None]:
# Z-score based outlier filter.
def remove_outliers(X, y, columns=None, threshold=3):
    """Drop rows whose z-score magnitude exceeds *threshold* in any checked column.

    Z-scores are computed per column from the mean and standard deviation of
    ``X`` itself. Both tails are tested (``|z| > threshold``), so unusually
    low values are removed as well as unusually high ones.

    Args:
        X: Feature DataFrame.
        y: Label Series indexed like ``X``.
        columns: Columns to test. Defaults to the notebook-level ``num_columns``.
        threshold: Maximum allowed absolute z-score (default 3).

    Returns:
        A ``(X_no_outliers, y_no_outliers)`` pair holding the surviving rows
        and their labels.
    """
    if columns is None:
        columns = num_columns
    numeric = X[columns]
    z_scores = (numeric - numeric.mean()) / numeric.std()
    # A row is an outlier as soon as one of its columns is too far from the mean.
    is_outlier = (z_scores.abs() > threshold).any(axis=1)
    X_no_outliers = X.loc[~is_outlier]
    y_no_outliers = y.loc[X_no_outliers.index]
    return X_no_outliers, y_no_outliers

# Report the size of every split before filtering.
for label, frame in (
    ("Shape of the training dataset before removing outliers (KNN): ", X_knn_imputed),
    ("Shape of the testing dataset before removing outliers (KNN): ", X_test_knn_imputed),
    ("Shape of the training dataset before removing outliers (MEAN): ", X_mean_imputed),
    ("Shape of the testing dataset before removing outliers (MEAN): ", X_test_mean_imputed),
):
    print(label, frame.shape)

# Filter the training splits of both imputation variants.
X_knn_no_outliers, y_knn_no_outliers = remove_outliers(X_knn_imputed, y_knn_imputed)
X_mean_no_outliers, y_mean_no_outliers = remove_outliers(X_mean_imputed, y_mean_imputed)

# Filter the test splits with the same routine; each call works from the
# statistics of the frame it is given.
X_test_knn_no_outliers, y_test_knn_no_outliers = remove_outliers(X_test_knn_imputed, y_test_knn_imputed)
X_test_mean_no_outliers, y_test_mean_no_outliers = remove_outliers(X_test_mean_imputed, y_test_mean_imputed)

# Report the size of every split after filtering.
for label, frame in (
    ("Shape of the training dataset after removing outliers (KNN): ", X_knn_no_outliers),
    ("Shape of the testing dataset after removing outliers (KNN): ", X_test_knn_no_outliers),
    ("Shape of the training dataset after removing outliers (MEAN): ", X_mean_no_outliers),
    ("Shape of the testing dataset after removing outliers (MEAN): ", X_test_mean_no_outliers),
):
    print(label, frame.shape)



### 데이터 변환
1. 데이터 표준화

In [None]:
# Standardize the numeric columns of both imputation variants with StandardScaler.
scaler = StandardScaler()

# Work on copies so the outlier-filtered frames stay untouched.
X_knn_standardized, X_test_knn_standardized = X_knn_no_outliers.copy(), X_test_knn_no_outliers.copy()
X_mean_standardized, X_test_mean_standardized = X_mean_no_outliers.copy(), X_test_mean_no_outliers.copy()

# (source train, source test, destination train, destination test) per variant.
standardize_jobs = (
    (X_knn_no_outliers, X_test_knn_no_outliers, X_knn_standardized, X_test_knn_standardized),
    (X_mean_no_outliers, X_test_mean_no_outliers, X_mean_standardized, X_test_mean_standardized),
)
for src_train, src_test, dst_train, dst_test in standardize_jobs:
    # The scaler is re-fit on each variant's training split and then applied to its test split.
    dst_train[num_columns] = scaler.fit_transform(src_train[num_columns])
    dst_test[num_columns] = scaler.transform(src_test[num_columns])

print("Standardization completed")

2. 데이터 정규화

In [None]:
# Min-max scale the numeric columns, starting from the standardized frames.
scaler = MinMaxScaler()

# Work on copies so the standardized frames stay available for evaluation.
X_knn_normalized, X_test_knn_normalized = X_knn_standardized.copy(), X_test_knn_standardized.copy()
X_mean_normalized, X_test_mean_normalized = X_mean_standardized.copy(), X_test_mean_standardized.copy()

# (source train, source test, destination train, destination test) per variant.
normalize_jobs = (
    (X_knn_standardized, X_test_knn_standardized, X_knn_normalized, X_test_knn_normalized),
    (X_mean_standardized, X_test_mean_standardized, X_mean_normalized, X_test_mean_normalized),
)
for src_train, src_test, dst_train, dst_test in normalize_jobs:
    # The scaler is re-fit on each variant's training split and then applied to its test split.
    dst_train[num_columns] = scaler.fit_transform(src_train[num_columns])
    dst_test[num_columns] = scaler.transform(src_test[num_columns])

print("Normalization completed")

### 성능 평가
kNN 분류기를 사용하여 전처리 단계별(이상치 제거 전/후, 표준화, 정규화) 데이터의 분류 정확도를 비교 평가

In [None]:
# Compare test accuracy of a 5-nearest-neighbour classifier across the
# preprocessing stages of the KNN-imputed data.

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

knn = KNeighborsClassifier(n_neighbors=5)

# Min-max normalized features.
y_pred_knn = knn.fit(X_knn_normalized, y_knn_no_outliers).predict(X_test_knn_normalized)
accuracy_knn_normalized = accuracy_score(y_test_knn_no_outliers, y_pred_knn)

# Standardized features.
y_pred_knn = knn.fit(X_knn_standardized, y_knn_no_outliers).predict(X_test_knn_standardized)
accuracy_knn_standardized = accuracy_score(y_test_knn_no_outliers, y_pred_knn)

# Outliers removed, no scaling.
y_pred_knn = knn.fit(X_knn_no_outliers, y_knn_no_outliers).predict(X_test_knn_no_outliers)
accuracy_knn_no_outliers = accuracy_score(y_test_knn_no_outliers, y_pred_knn)

# Imputed only; outliers still present.
y_pred_knn = knn.fit(X_knn_imputed, y_knn_imputed).predict(X_test_knn_imputed)
accuracy_knn_imputed = accuracy_score(y_test_knn_imputed, y_pred_knn)

# Collect the four scores into a table, one row per preprocessing stage.
accuracy_scores_knn = pd.DataFrame(
    [
        ('No outliers', accuracy_knn_no_outliers),
        ('With outliers', accuracy_knn_imputed),
        ('Standardized', accuracy_knn_standardized),
        ('Normalized', accuracy_knn_normalized),
    ],
    columns=['Method', 'Accuracy'],
)

print(accuracy_scores_knn)


In [None]:
# Repeat the comparison on the mean/mode-imputed data. The accuracy_knn_*
# names refer to the kNN classifier and are overwritten with the new scores.

# Min-max normalized features.
y_pred_knn = knn.fit(X_mean_normalized, y_mean_no_outliers).predict(X_test_mean_normalized)
accuracy_knn_normalized = accuracy_score(y_test_mean_no_outliers, y_pred_knn)

# Standardized features.
y_pred_knn = knn.fit(X_mean_standardized, y_mean_no_outliers).predict(X_test_mean_standardized)
accuracy_knn_standardized = accuracy_score(y_test_mean_no_outliers, y_pred_knn)

# Outliers removed, no scaling.
y_pred_knn = knn.fit(X_mean_no_outliers, y_mean_no_outliers).predict(X_test_mean_no_outliers)
accuracy_knn_no_outliers = accuracy_score(y_test_mean_no_outliers, y_pred_knn)

# Imputed only; outliers still present.
y_pred_knn = knn.fit(X_mean_imputed, y_mean_imputed).predict(X_test_mean_imputed)
accuracy_knn_imputed = accuracy_score(y_test_mean_imputed, y_pred_knn)

# Collect the four scores into a table, one row per preprocessing stage.
accuracy_scores_mean = pd.DataFrame(
    [
        ('No outliers', accuracy_knn_no_outliers),
        ('With outliers', accuracy_knn_imputed),
        ('Standardized', accuracy_knn_standardized),
        ('Normalized', accuracy_knn_normalized),
    ],
    columns=['Method', 'Accuracy'],
)
print(accuracy_scores_mean)

In [None]:
# Stack the two result tables on top of each other; the mapping keys
# ('knn', 'mean') become the outer index level that labels each table's rows.
accuracy_scores = pd.concat({'knn': accuracy_scores_knn, 'mean': accuracy_scores_mean})
print(accuracy_scores)

