# 0. System Check

In [1]:
# Shell escape: print the NVIDIA driver/CUDA versions and the current GPU memory and utilisation.
!nvidia-smi

Thu Feb  8 01:41:38 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.147.05   Driver Version: 525.147.05   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0 Off |                  Off |
|  0%   32C    P8    24W / 450W |     71MiB / 24564MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# 1. Module

In [2]:
import ast
import os, logging
import time

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import Ridge
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm
from xgboost import XGBClassifier

from helpers import get_ip_address, has_write_permission, save_preprocessed_data, read_preprocessed_data

import warnings
# Ignore warning messages: this installs a global "ignore" filter, so every
# warning category is silenced for the rest of the session.
warnings.filterwarnings("ignore")

# 2. Data Load_Y

In [3]:
# Raw-data root directory for each known host, keyed by the host's IP address.
data_locations = dict([
    ('223.195.111.48', '/project/datacamp/team11/data'),
    ('147.47.44.229', '/home/jinhyun/data/1kGP'),
])

# String labels "1" .. "22" (presumably the autosomal chromosomes -- confirm).
chr_list = list(map(str, range(1, 23)))

# Genotype dict for converting the genotype string to an integer code.
gt_dict = {
    genotype: code
    for code, genotype in enumerate(("0|0", "0|1", "1|0", "1|1"))
}

In [4]:
# Resolve the data root for the machine running the notebook. An IP that is not
# listed in `data_locations` falls back to '/not_found', so the checks below fail.
raw_data_path = data_locations.get(get_ip_address(), '/not_found')
sample_annotation_file = os.path.join(raw_data_path, "igsr-1000 genomes 30x on grch38.tsv")
preprocess_path = os.path.join(raw_data_path, "preprocessed")

# Fail fast before any heavy work. Each message names the path that was actually
# checked (the first check used to report `raw_data_path` while testing
# `preprocess_path`, which was misleading when only the sub-directory was missing).
assert os.path.exists(preprocess_path), (
    f"Preprocessed data path not exists: {preprocess_path} "
    f"OR IP setting is incorrect: {get_ip_address()}"
)
assert os.path.isfile(sample_annotation_file), f"File not exists : {sample_annotation_file}"
assert has_write_permission(preprocess_path), f"You do not have write permission for {preprocess_path}"

In [5]:
# Read the tab-separated sample annotation table and report its dimensions.
sample_annotation_df = pd.read_csv(
    sample_annotation_file,
    sep="\t",
)
print(f"Read sample annotation info with shape : {sample_annotation_df.shape}")

Read sample annotation info with shape : (3202, 9)


In [None]:
# sample_annotation_df

## Superpopulation code

In [None]:
# continent = sample_annotation_df['Superpopulation code']
# continent

## Population code

In [6]:
# Locate the sample(s) carrying the combined population code 'IBS,MSL' and show
# their index labels.
remove_row = sample_annotation_df[sample_annotation_df['Population code'] == 'IBS,MSL']
print(remove_row.index)

# Drop those samples. The index is not reset, so the remaining rows keep their
# original labels.
New_sample_annotation_df_ = sample_annotation_df.drop(remove_row.index)

# Target labels for the classifiers. Despite its name, `continent` holds the
# per-sample 'Population code' column.
# (A bare `value_counts('Population code')` expression used to sit here; its result
# was neither stored nor displayed, so it has been removed.)
continent = New_sample_annotation_df_['Population code']
continent

Index([2192], dtype='int64')


0       FIN
1       FIN
2       FIN
3       FIN
4       FIN
       ... 
3197    GIH
3198    GIH
3199    GIH
3200    GIH
3201    CEU
Name: Population code, Length: 3201, dtype: object

# 3. Data_Load_X

## 3.1 merged_random_100_matrix

In [None]:
# Load merged_random_100_matrix.npy from the host-specific preprocessed directory
# (`preprocess_path`) rather than a hard-coded absolute path. For the host mapped to
# '/project/datacamp/team11/data' this is the same file as before.
npy_path = os.path.join(preprocess_path, 'merged_random_100_matrix.npy')
data = np.load(npy_path)

# Feature matrix and target labels consumed by the split / modeling cells.
X = data
y = continent

## 3.2 merged_random_1k_matrix

In [None]:
# Load merged_random_1k_matrix.npy from the host-specific preprocessed directory
# (`preprocess_path`) rather than a hard-coded absolute path. For the host mapped to
# '/project/datacamp/team11/data' this is the same file as before.
npy_path = os.path.join(preprocess_path, 'merged_random_1k_matrix.npy')
data = np.load(npy_path)

# Feature matrix and target labels consumed by the split / modeling cells.
X = data
y = continent

## 3.3 merged_random_10k_matrix

In [33]:
# Load merged_random_10k_annotated_matrix.npy from the host-specific preprocessed
# directory (`preprocess_path`) rather than a hard-coded absolute path. For the host
# mapped to '/project/datacamp/team11/data' this is the same file as before.
npy_path = os.path.join(preprocess_path, 'merged_random_10k_annotated_matrix.npy')
data = np.load(npy_path)

# Feature matrix and target labels consumed by the split / modeling cells.
X = data
y = continent

## 3.4 merged_random_100k_matrix

In [None]:
# Load merged_random_100k_matrix.npy from the host-specific preprocessed directory
# (`preprocess_path`) rather than a hard-coded absolute path. For the host mapped to
# '/project/datacamp/team11/data' this is the same file as before.
npy_path = os.path.join(preprocess_path, 'merged_random_100k_matrix.npy')
data = np.load(npy_path)

# Feature matrix and target labels consumed by the split / modeling cells.
X = data
y = continent

## 3.5 merged_random_1M_matrix

In [None]:
# Load merged_random_1M_matrix.npy from the host-specific preprocessed directory
# (`preprocess_path`) rather than a hard-coded absolute path. For the host mapped to
# '/project/datacamp/team11/data' this is the same file as before.
npy_path = os.path.join(preprocess_path, 'merged_random_1M_matrix.npy')
data = np.load(npy_path)

# Feature matrix and target labels consumed by the split / modeling cells.
X = data
y = continent

## 3.6 merged_zerofilter_0.0005_matrix

In [None]:
# Load merged_zerofilter_0.0005_matrix.npy from the host-specific preprocessed
# directory (`preprocess_path`) rather than a hard-coded absolute path. For the host
# mapped to '/project/datacamp/team11/data' this is the same file as before.
npy_path = os.path.join(preprocess_path, 'merged_zerofilter_0.0005_matrix.npy')
data = np.load(npy_path)

# Feature matrix and target labels consumed by the split / modeling cells.
X = data
y = continent

## Data of Population_name

In [34]:
# Remove from X the row(s) of the sample(s) that were dropped from the labels, so
# that X and y stay aligned.
# - `remove_row.index` replaces the hard-coded 2192. Assumes `sample_annotation_df`
#   kept its default 0..n-1 index (labels == row positions) and that the rows of X
#   follow the annotation-table order -- TODO confirm.
# - The length guard makes the cell safe to re-run; without it every re-execution
#   deleted one more row from X.
if X.shape[0] != len(y):
    X = np.delete(X, remove_row.index.to_numpy(), axis=0)
assert X.shape[0] == len(y), f"X has {X.shape[0]} rows but y has {len(y)} labels"
print(X)
print(X.shape)

[[0 4 0 ... 8 0 7]
 [0 4 0 ... 8 0 7]
 [0 4 0 ... 8 0 7]
 ...
 [0 4 0 ... 8 0 7]
 [0 4 0 ... 8 0 7]
 [0 4 0 ... 8 0 7]]
(3201, 20000)


## Data Check

In [35]:
# Sanity check: show the features and labels together with their shapes, one per line.
print(X, X.shape, y, y.shape, sep="\n")

[[0 4 0 ... 8 0 7]
 [0 4 0 ... 8 0 7]
 [0 4 0 ... 8 0 7]
 ...
 [0 4 0 ... 8 0 7]
 [0 4 0 ... 8 0 7]
 [0 4 0 ... 8 0 7]]
(3201, 20000)
0       FIN
1       FIN
2       FIN
3       FIN
4       FIN
       ... 
3197    GIH
3198    GIH
3199    GIH
3200    GIH
3201    CEU
Name: Population code, Length: 3201, dtype: object
(3201,)


## Data_Split

In [36]:
# Split the data into a training set and a test set (20% held out); the fixed
# seed keeps the split reproducible.
random_seed = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_seed,
)


In [37]:
# Show the shapes of the two feature splits, followed by the two label splits.
print(X_train.shape, X_test.shape, y_train, y_test, sep="\n")

(2560, 20000)
(641, 20000)
561     MSL
282     IBS
1465    PUR
1576    IBS
807     ITU
       ... 
1095    YRI
1130    MXL
1294    GIH
860     STU
3175    ASW
Name: Population code, Length: 2560, dtype: object
1374    FIN
1195    MXL
2938    STU
1506    PUR
214     CLM
       ... 
43      CHS
87      FIN
1812    IBS
2939    TSI
2479    YRI
Name: Population code, Length: 641, dtype: object


# 4. Modeling

## 4.1 SVM

In [None]:
# from sklearn.model_selection import train_test_split
# from sklearn.svm import SVC
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import accuracy_score
# import time

# svm_model = SVC()
# param_grid = {'C': [0.1, 1, 10], 'gamma': [0.01, 0.1, 1], 'kernel': ['linear', 'rbf']}
# grid_search = GridSearchCV(svm_model, param_grid, cv=2, n_jobs=-1, verbose=2)

# grid_search.fit(X_train, y_train)
# print("Best Parameters: ", grid_search.best_params_)
# best_svm_model = grid_search.best_estimator_
# best_svm_model_params = grid_search.best_params_
# y_pred = best_svm_model.predict(X_test)

### 4.1.1 1M SVM(Previous parameters)

In [38]:
# Hyper-parameter sets kept from earlier runs; the trailing tags are the original notes.
best_params = dict(C=1, gamma=0.01, kernel='linear')  # 100
# An earlier value of `previous_best_params`, tagged "1k", was
# {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}; it is superseded by the line below.
previous_best_params = dict(C=0.1, gamma=0.01, kernel='linear')
svm_model = SVC(**previous_best_params)

# Time the fit and the prediction separately with wall-clock timestamps.
start_time = time.time()
svm_model.fit(X_train, y_train)
fitting_end_time = time.time()
y_pred = svm_model.predict(X_test)
end_time = time.time()

fitting_elapsed_time = fitting_end_time - start_time
prediction_elapsed_time = end_time - fitting_end_time

# Break each duration into whole minutes and whole remaining seconds.
fitting_minutes, fitting_seconds = map(int, divmod(fitting_elapsed_time, 60))
prediction_minutes, prediction_seconds = map(int, divmod(prediction_elapsed_time, 60))

# Human-readable local timestamps for the three checkpoints.
start_time_formatted, fitting_end_time_formatted, end_time_formatted = (
    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(stamp))
    for stamp in (start_time, fitting_end_time, end_time)
)

print(
    f"Start time: {start_time_formatted}",
    f"Model fitting end time: {fitting_end_time_formatted}",
    f"End time: {end_time_formatted}",
    f"Total model fitting time: {fitting_minutes} minutes {fitting_seconds} seconds",
    f"Total prediction time: {prediction_minutes} minutes {prediction_seconds} seconds",
    sep="\n",
)


Start time: 2024-02-08 01:54:00
Model fitting end time: 2024-02-08 01:54:37
End time: 2024-02-08 01:54:47
Total model fitting time: 0 minutes 36 seconds
Total prediction time: 0 minutes 10 seconds


In [39]:
# Total wall-clock time of the fit + predict run, computed from the timestamps
# recorded by the SVM cell. `end_time` is deliberately not re-sampled here: taking a
# fresh time.time() would add whatever time passed between the two cell executions.
elapsed_time_seconds = end_time - start_time

print(f"Total model fitting and prediction time: {elapsed_time_seconds} seconds")

Total model fitting and prediction time: 47.265421628952026 seconds


## 4.2 Model and parameter save / load

### 4.2.1 merged_random_100_matrix

In [None]:
# NOTE(review): `best_svm_model` and `grid_search` are only created by the grid-search
# code in section 4.1, which is currently commented out; run that search first.
# Persist the fitted estimator and the best hyper-parameters found by the search.
joblib.dump(best_svm_model, 'best_svm_model100.joblib')

with open('best_params100.txt', 'w') as f:
    f.write(str(grid_search.best_params_))
#######################################################
# Restore both artifacts and predict with the reloaded model.
loaded_model = joblib.load('best_svm_model100.joblib')

with open('best_params100.txt', 'r') as f:
    # literal_eval only parses Python literals, so text in the file can never be
    # executed as code (which eval() would do). Assumes the saved parameters are
    # plain literals (numbers / strings), as in the 4.1 parameter grid.
    loaded_params = ast.literal_eval(f.read())

y_pred = loaded_model.predict(X_test)

### 4.2.2 merged_random_1k_matrix

In [None]:
# NOTE(review): `best_svm_model` and `grid_search` are only created by the grid-search
# code in section 4.1, which is currently commented out; run that search first.
# Persist the fitted estimator and the best hyper-parameters found by the search.
joblib.dump(best_svm_model, 'best_svm_model1k.joblib')

with open('best_params1k.txt', 'w') as f:
    f.write(str(grid_search.best_params_))
#######################################################
# Restore both artifacts and predict with the reloaded model.
loaded_model = joblib.load('best_svm_model1k.joblib')

with open('best_params1k.txt', 'r') as f:
    # literal_eval only parses Python literals, so text in the file can never be
    # executed as code (which eval() would do). Assumes the saved parameters are
    # plain literals (numbers / strings), as in the 4.1 parameter grid.
    loaded_params = ast.literal_eval(f.read())

y_pred = loaded_model.predict(X_test)

### 4.2.3 merged_random_10k_matrix

In [None]:
# NOTE(review): `best_svm_model` and `grid_search` are only created by the grid-search
# code in section 4.1, which is currently commented out; run that search first.
# Persist the fitted estimator and the best hyper-parameters found by the search.
joblib.dump(best_svm_model, 'best_svm_model10k.joblib')

with open('best_params10k.txt', 'w') as f:
    f.write(str(grid_search.best_params_))
#######################################################
# Restore both artifacts and predict with the reloaded model.
loaded_model = joblib.load('best_svm_model10k.joblib')

with open('best_params10k.txt', 'r') as f:
    # literal_eval only parses Python literals, so text in the file can never be
    # executed as code (which eval() would do). Assumes the saved parameters are
    # plain literals (numbers / strings), as in the 4.1 parameter grid.
    loaded_params = ast.literal_eval(f.read())

y_pred = loaded_model.predict(X_test)

### 4.2.4 merged_random_100k_matrix

In [None]:
# NOTE(review): `best_svm_model` and `grid_search` are only created by the grid-search
# code in section 4.1, which is currently commented out; run that search first.
# Persist the fitted estimator and the best hyper-parameters found by the search.
joblib.dump(best_svm_model, 'best_svm_model100k.joblib')

with open('best_params100k.txt', 'w') as f:
    f.write(str(grid_search.best_params_))
#######################################################
# Restore both artifacts and predict with the reloaded model.
loaded_model = joblib.load('best_svm_model100k.joblib')

with open('best_params100k.txt', 'r') as f:
    # literal_eval only parses Python literals, so text in the file can never be
    # executed as code (which eval() would do). Assumes the saved parameters are
    # plain literals (numbers / strings), as in the 4.1 parameter grid.
    loaded_params = ast.literal_eval(f.read())

y_pred = loaded_model.predict(X_test)

### 4.2.5 merged_random_1M_matrix

In [None]:
# NOTE(review): `best_svm_model` and `grid_search` are only created by the grid-search
# code in section 4.1, which is currently commented out; run that search first.
# Persist the fitted estimator and the best hyper-parameters found by the search.
joblib.dump(best_svm_model, 'best_svm_model1M.joblib')

with open('best_params1M.txt', 'w') as f:
    f.write(str(grid_search.best_params_))
#######################################################
# Restore both artifacts and predict with the reloaded model.
loaded_model = joblib.load('best_svm_model1M.joblib')

with open('best_params1M.txt', 'r') as f:
    # literal_eval only parses Python literals, so text in the file can never be
    # executed as code (which eval() would do). Assumes the saved parameters are
    # plain literals (numbers / strings), as in the 4.1 parameter grid.
    loaded_params = ast.literal_eval(f.read())

y_pred = loaded_model.predict(X_test)

### 4.2.6 merged_zerofilter_0.0005_matrix

In [None]:
# NOTE(review): `best_svm_model` and `grid_search` are only created by the grid-search
# code in section 4.1, which is currently commented out; run that search first.
# Persist the fitted estimator and the best hyper-parameters found by the search.
joblib.dump(best_svm_model, 'best_svm_model_zerofilter_0.0005.joblib')

with open('best_params_zerofilter_0.0005.txt', 'w') as f:
    f.write(str(grid_search.best_params_))
#######################################################
# Restore both artifacts and predict with the reloaded model.
loaded_model = joblib.load('best_svm_model_zerofilter_0.0005.joblib')

with open('best_params_zerofilter_0.0005.txt', 'r') as f:
    # literal_eval only parses Python literals, so text in the file can never be
    # executed as code (which eval() would do). Assumes the saved parameters are
    # plain literals (numbers / strings), as in the 4.1 parameter grid.
    loaded_params = ast.literal_eval(f.read())

y_pred = loaded_model.predict(X_test)

### y_test, y_pred

In [None]:
# print(y_test)
# print(y_pred)

# 5 Quality_evaluation metrics

## 5.1 Accuracy

In [40]:
# Accuracy of the predictions against the held-out test labels.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.7800312012480499


## 5.2 F1-score

In [None]:
# F1-score under the three averaging schemes: micro, macro and weighted.
f1_mi, f1_ma, f1_w = (
    f1_score(y_test, y_pred, average=averaging)
    for averaging in ('micro', 'macro', 'weighted')
)
print(f'F1-score_micro: {f1_mi}')
print(f'F1-score_macro: {f1_ma}')
print(f'F1-score_weighted: {f1_w}')

## 5.3 Confusion matrix

In [None]:
# Sorted set of every label that occurs in the truth or the predictions. Passing it
# explicitly fixes the row/column order of the matrix and gives the heatmap its real
# tick labels (they used to be hard-coded to 'Class 0' / 'Class 1', which is only
# right for a binary problem, while the targets here are population codes).
class_labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))

conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
print('Confusion Matrix:')
print(conf_matrix)

# Grow the figure with the number of classes so the annotations stay readable;
# never go below the original 8 x 6 size.
n_classes = len(class_labels)
plt.figure(figsize=(max(8, 0.5 * n_classes), max(6, 0.45 * n_classes)))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()