In [None]:
from sklearn.metrics import silhouette_score, root_mean_squared_error, r2_score
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.ensemble import RandomForestRegressor
from hirarchy import build_tree, get_level_data
from yellowbrick.cluster import KElbowVisualizer
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import chisquare, pearsonr
from sklearn.model_selection import KFold
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from matplotlib import rcParams
from tqdm import tqdm
import seaborn as sns
import pickle as pkl
import pandas as pd
import numpy as np
import warnings
import sys
import os

In [None]:
# Notebook-wide setup: plot font, silenced warnings, current directory on sys.path.
rcParams['font.family'] = 'DejaVu Sans'
warnings.filterwarnings('ignore')
sys.path.append('./')

# Output directories for the concatenated-feature run. Only the mordred data
# directory is created; the dragon and morgan equivalents remain disabled:
#   './submission/concatenated/dragon/data'
#   './submission/concatenated/morgan/data'
for out_dir in ('./submission/concatenated', './submission/concatenated/mordred/data'):
    os.makedirs(out_dir, exist_ok=True)

# mordred

In [None]:
# Tables keyed by the 'CID' column.
mordred_cluster_center_df = (
    pd.read_csv('./submission/mordred/data/cluster_center_df.csv')
    .set_index('CID')
)
mordred_features_20_df = (
    pd.read_csv('./submission/mordred/data/features_20.csv')
    .set_index('CID')
)
mordred_features_20_distance_df = (
    pd.read_csv('./submission/mordred/data/features_20_distances.csv')
    .set_index('CID')
)

# X_* tables: the four distance-metric columns receive a source-specific suffix,
# and reset_index() exposes the row position as an explicit 'index' column.
mordred_X_cluster_center = (
    pd.read_csv('./submission/mordred/data/X_cluster_center.csv')
    .rename(columns={
        'chi2': 'chi2_cluster',
        'pearsonr': 'pearsonr_cluster',
        'cosine': 'cosine_cluster',
        'euclidean': 'euclidean_cluster',
    })
    .reset_index()
)
mordred_X_features_20 = (
    pd.read_csv('./submission/mordred/data/X_features_20.csv')
    .rename(columns={
        'chi2': 'chi2_f20',
        'pearsonr': 'pearsonr_f20',
        'cosine': 'cosine_f20',
        'euclidean': 'euclidean_f20',
    })
    .reset_index()
)
mordred_X_features_20_distance = (
    pd.read_csv('./submission/mordred/data/X_features_20_distances.csv')
    .rename(columns={
        'chi2': 'chi2_f20_distance',
        'pearsonr': 'pearsonr_f20_distance',
        'cosine': 'cosine_f20_distance',
        'euclidean': 'euclidean_f20_distance',
    })
    .reset_index()
)

In [None]:
# Sanity check: the identifying columns and the target must agree element-wise
# between the cluster-center table and each of the other two tables.
meta_cols = ['index', 'Dataset', 'Mixture 1', 'Mixture 2', 'expected']
for other in (mordred_X_features_20, mordred_X_features_20_distance):
    vlas = mordred_X_cluster_center[meta_cols] == other[meta_cols]
    print(f'VLA: {vlas.all().all()}')

In [None]:
# Keep one copy of the identifying columns plus the target, then strip the
# target from all three feature tables.
meta = mordred_X_features_20_distance[['index', 'Dataset', 'Mixture 1', 'Mixture 2', 'expected']]
mordred_X_cluster_center = mordred_X_cluster_center.drop(columns=['expected'])
mordred_X_features_20 = mordred_X_features_20.drop(columns=['expected'])
mordred_X_features_20_distance = mordred_X_features_20_distance.drop(columns=['expected'])
meta

In [None]:
# Show the first row of the cluster-center feature table to inspect its columns.
mordred_X_cluster_center.head(1)

In [None]:
# Show the first row of the features_20 table to inspect its columns.
mordred_X_features_20.head(1)

In [None]:
# Show the first row of the features_20 distance table to inspect its columns.
mordred_X_features_20_distance.head(1)

In [None]:
# Rename columns
for col in mordred_X_cluster_center.columns:
    # Add cluster_center suffix
    mordred_X_cluster_center.rename(columns={col: f'{col}_cluster'}, inplace=True)
for col in mordred_X_features_20.columns:
    # Add f20 suffix
    mordred_X_features_20.rename(columns={col: f'{col}_f20'}, inplace=True)
for col in mordred_X_features_20_distance.columns:
    # Add f20_distance suffix
    mordred_X_features_20_distance.rename(columns={col: f'{col}_f20_distance'}, inplace=True)

In [None]:
# Shape of each table once its first four columns are left out.
tuple(
    frame.iloc[:, 4:].shape
    for frame in (mordred_X_cluster_center, mordred_X_features_20, mordred_X_features_20_distance)
)

In [None]:
# Total number of columns across the three tables, first four columns of each excluded.
print(sum(
    frame.iloc[:, 4:].shape[1]
    for frame in (mordred_X_cluster_center, mordred_X_features_20, mordred_X_features_20_distance)
))

In [None]:
# Place the three tables side by side, leaving out the first four columns of each.
combined_X = pd.concat(
    [
        frame.iloc[:, 4:]
        for frame in (mordred_X_cluster_center, mordred_X_features_20, mordred_X_features_20_distance)
    ],
    axis=1,
)
combined_X

In [None]:
# List every combined column name, one per line.
for column_name in combined_X.columns.tolist():
    print(column_name)

In [None]:
# Final layout: identifying columns first (without the positional 'index'
# helper), then every feature column, then the 'expected' target as the last column.
combined_X = pd.concat(
    [meta.drop(columns=['index', 'expected']), combined_X, meta['expected']],
    axis=1,
)
combined_X.to_csv('./submission/concatenated/mordred/data/combined_X.csv')

In [None]:
# Histogram of the 'expected' target values.
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(combined_X['expected'])
plt.show()

In [None]:
# 10th and 90th percentiles of the target.
combined_X['expected'].quantile([0.1, 0.9])

In [None]:
# Rows whose target lies strictly above 0.8 or strictly below 0.2.
extreme_mask = (combined_X['expected'] > 0.8) | (combined_X['expected'] < 0.2)
extreme = combined_X[extreme_mask]
extreme.to_csv('./submission/concatenated/mordred/data/extreme.csv', index=False)
# Row labels of the extreme rows, kept for later use.
extreme_idx = extreme.index.values
extreme.shape

In [None]:
# Complement of the extreme set: target within [0.2, 0.8], both ends inclusive.
combined_X_no_extreme = combined_X[combined_X['expected'].between(0.2, 0.8)]
combined_X_no_extreme.shape

In [None]:
# Show the first rows of the non-extreme subset.
combined_X_no_extreme.head()

In [None]:
# Saved index arrays for the external train/test split. They are used below as
# row labels (via .loc) into combined_X_no_extreme.
external_train_idx = np.load('./submission/concatenated/data/external_train.npy')
external_test_idx = np.load('./submission/concatenated/data/external_test.npy')

In [None]:
# Label-based selection of the external train and test rows from the non-extreme subset.
combined_X_no_extreme_train = combined_X_no_extreme.loc[external_train_idx]
combined_X_no_extreme_test = combined_X_no_extreme.loc[external_test_idx]

In [None]:
# Show the first two rows of the external training split.
combined_X_no_extreme_train.head(2)

In [None]:
# Show the first two rows of the external test split.
combined_X_no_extreme_test.head(2)

In [None]:
# Persist both splits. index=False means the row labels are not written to these files.
combined_X_no_extreme_test.to_csv('./submission/concatenated/mordred/data/combined_X_no_extreme_test.csv', index=False)
combined_X_no_extreme_train.to_csv('./submission/concatenated/mordred/data/combined_X_no_extreme_train.csv', index=False)

In [None]:
# Cross-validation fold indices are read from disk rather than regenerated.
# NOTE(review): the disabled block below looks like the recipe that produced the
# saved files (4-fold shuffle split of the non-extreme training rows, with the
# extreme row labels appended to every training fold) - confirm before relying on it.
# kf = KFold(n_splits=4, shuffle=True, random_state=42)
# train_idx = []
# test_idx = []
# for idx1, idx2 in kf.split(combined_X_no_extreme_train):
#     include = list(combined_X_no_extreme_train.iloc[idx1].index.values)
#     include.extend(extreme_idx)
#     include = np.array(include)
#     train_idx.append(include)
#     test_idx.append(combined_X_no_extreme_train.iloc[idx2].index.values)
# Saved K-fold indices: train_idx[i] / test_idx[i] hold the row labels of fold i.
train_idx = np.load('./submission/concatenated/data/train_idx.npy')
test_idx = np.load('./submission/concatenated/data/test_idx.npy')

In [None]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [None]:
# Cross-validation: for each of the 4 saved folds, fit an XGBoost and a Random
# Forest regressor on the fold's training rows, score them on the fold's
# validation rows, and write predictions, plots, metrics and pickled models to
# ./submission/concatenated/mordred/results/fold_<n>/.
for i in tqdm(range(4)):
    fold_dir = f'./submission/concatenated/mordred/results/fold_{i+1}'
    os.makedirs(fold_dir, exist_ok=True)

    # Fold indices are row labels of combined_X.
    train = combined_X.loc[train_idx[i]]
    val = combined_X.loc[test_idx[i]]

    # Features are every column except the first three and the last one;
    # the target is read from the 'expected' column.
    y_train = train['expected'].values
    X_train = train[train.columns[3:-1]].values
    y_val = val['expected'].values
    X_val = val[val.columns[3:-1]].values

    # Fixed seeds make the pickled models and reported metrics reproducible
    # across runs (42 matches the seed of the KFold recipe in this notebook).
    print('\tTraining XGBoost')
    xg_model = XGBRegressor(random_state=42)
    xg_model.fit(X_train, y_train)

    print('\tTraining RF')
    rf_model = RandomForestRegressor(random_state=42)
    rf_model.fit(X_train, y_train)

    y_pred_xg = xg_model.predict(X_val)
    y_pred_rf = rf_model.predict(X_val)

    # Per-row validation predictions, keyed by the original row label.
    result_df = pd.DataFrame({'index': val.index.values, 'expected': y_val, 'xg': y_pred_xg, 'rf': y_pred_rf})
    result_df.to_csv(f'{fold_dir}/result.csv', index=False)

    rmse_xg = root_mean_squared_error(y_val, y_pred_xg)
    rmse_rf = root_mean_squared_error(y_val, y_pred_rf)
    r2_xg = r2_score(y_val, y_pred_xg)
    r2_rf = r2_score(y_val, y_pred_rf)

    # One expected-vs-predicted regression plot per model; figures are closed
    # after saving so they do not accumulate across folds.
    for key, label in (('xg', 'XGBoost'), ('rf', 'Random Forest')):
        plt.figure(figsize=(6, 4))
        sns.regplot(x='expected', y=key, data=result_df, line_kws={'color': 'red'})
        plt.title(f'{label} - Fold {i+1}')
        plt.xlabel('Expected')
        plt.ylabel('Predicted')
        plt.savefig(f'{fold_dir}/{key}.png')
        plt.close()

    print(f'Fold {i+1} - XGBoost RMSE: {rmse_xg}, R2: {r2_xg}')
    print(f'Fold {i+1} - Random Forest RMSE: {rmse_rf}, R2: {r2_rf}')

    # Persist the fitted models so the test-evaluation cell can reload them.
    with open(f'{fold_dir}/model_xg.pkl', 'wb') as f:
        pkl.dump(xg_model, f)
    with open(f'{fold_dir}/model_rf.pkl', 'wb') as f:
        pkl.dump(rf_model, f)

    # Plain-text copy of the fold metrics.
    with open(f'{fold_dir}/results.txt', 'w') as f:
        f.write(f'Fold {i+1} - XGBoost RMSE: {rmse_xg}, R2: {r2_xg}\n')
        f.write(f'Fold {i+1} - Random Forest RMSE: {rmse_rf}, R2: {r2_rf}')

In [None]:
# Testing Data
# External test matrix: same feature columns as the training split (everything
# except the first three columns and the last one), plus the 'expected' target.
feature_cols = combined_X_no_extreme_train.columns[3:-1]
test_X = combined_X_no_extreme_test[feature_cols].values
test_y = combined_X_no_extreme_test['expected'].values
test_X.shape

In [None]:
# Testing Data
# Evaluate every fold's saved models on the external test split and write
# predictions, plots and metrics next to the fold's training artefacts.
# Features are every column except the first three and the last one.
test_X = combined_X_no_extreme_test[combined_X_no_extreme_train.columns[3:-1]].values
test_y = combined_X_no_extreme_test['expected'].values
for i in tqdm(range(4)):
    fold_dir = f'./submission/concatenated/mordred/results/fold_{i+1}'

    # Reload the fold's models. The context managers close the file handles
    # deterministically instead of leaving them to the garbage collector.
    # SECURITY: unpickling executes arbitrary code - only load model files
    # that were produced by this notebook.
    with open(f'{fold_dir}/model_xg.pkl', 'rb') as f:
        xg_model = pkl.load(f)
    with open(f'{fold_dir}/model_rf.pkl', 'rb') as f:
        rf_model = pkl.load(f)

    y_pred_xg = xg_model.predict(test_X)
    y_pred_rf = rf_model.predict(test_X)

    # Per-row test predictions, keyed by the original row label.
    result_df = pd.DataFrame({'index': combined_X_no_extreme_test.index.values, 'expected': test_y, 'xg': y_pred_xg, 'rf': y_pred_rf})
    result_df.to_csv(f'{fold_dir}/test_result.csv', index=False)

    rmse_xg = root_mean_squared_error(test_y, y_pred_xg)
    rmse_rf = root_mean_squared_error(test_y, y_pred_rf)
    r2_xg = r2_score(test_y, y_pred_xg)
    r2_rf = r2_score(test_y, y_pred_rf)

    # One expected-vs-predicted regression plot per model; figures are closed
    # after saving so they do not accumulate across folds.
    for key, label in (('xg', 'XGBoost'), ('rf', 'Random Forest')):
        plt.figure(figsize=(6, 4))
        sns.regplot(x='expected', y=key, data=result_df, line_kws={'color': 'red'})
        plt.title(f'{label} - Fold {i+1}')
        plt.xlabel('Expected')
        plt.ylabel('Predicted')
        plt.savefig(f'{fold_dir}/test_{key}.png')
        plt.close()

    print(f'Fold {i+1} - XGBoost RMSE: {rmse_xg}, R2: {r2_xg}')
    print(f'Fold {i+1} - Random Forest RMSE: {rmse_rf}, R2: {r2_rf}')

    # Plain-text copy of the test metrics.
    with open(f'{fold_dir}/test_results.txt', 'w') as f:
        f.write(f'Fold {i+1} - XGBoost RMSE: {rmse_xg}, R2: {r2_xg}\n')
        f.write(f'Fold {i+1} - Random Forest RMSE: {rmse_rf}, R2: {r2_rf}')

In [None]:
# Show the first row of the external training split.
combined_X_no_extreme_train.head(1)

In [None]:
# Show the first row of the extreme-target subset.
extreme.head(1)