In [None]:
from sklearn.metrics import silhouette_score, root_mean_squared_error, r2_score
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.ensemble import RandomForestRegressor
from yellowbrick.cluster import KElbowVisualizer
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import chisquare, pearsonr
from sklearn.model_selection import KFold
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from matplotlib import rcParams
from tqdm import tqdm
import seaborn as sns
import pickle as pkl
import pandas as pd
import numpy as np
import warnings
import sys
import os

In [None]:
# Notebook-wide setup: output folder, import path, warning filter and plot font.
os.makedirs('./submission/concatenated/combined/data', exist_ok=True)
sys.path.append('./')
warnings.filterwarnings('ignore')  # silences every warning for the rest of the session
rcParams.update({'font.family': 'DejaVu Sans'})

# All features

In [None]:
# Load the three descriptor tables; the unnamed first CSV column becomes `ID`
# and each table is ordered by it.
combined_X_dragon, combined_X_mordred, combined_X_morgan = (
    pd.read_csv(csv_path).rename(columns={'Unnamed: 0': 'ID'}).sort_values(by='ID')
    for csv_path in (
        './submission/concatenated/dragon/data/combined_X.csv',
        './submission/concatenated/mordred/data/combined_X.csv',
        './submission/concatenated/morgan/data/combined_X.csv',
    )
)

In [None]:
# Notebook preview of the mordred table right after loading.
combined_X_mordred

In [None]:
# Sanity check: the identifying columns and the target must agree element-wise
# between the dragon table and each of the other two descriptor tables.
shared_cols = ['ID', 'Dataset', 'Mixture 1', 'Mixture 2', 'expected']
for other in (combined_X_mordred, combined_X_morgan):
    vlas = combined_X_dragon[shared_cols] == other[shared_cols]
    print(f'VLA: {vlas.all().all()}')

In [None]:
# Keep one copy of the identifying columns plus the target, then strip the target
# from every descriptor table so only identifiers and descriptors remain there.
meta = combined_X_morgan.loc[:, ['ID', 'Dataset', 'Mixture 1', 'Mixture 2', 'expected']]
for frame in (combined_X_dragon, combined_X_mordred, combined_X_morgan):
    frame.drop(columns=['expected'], inplace=True)
meta

In [None]:
# First two rows of the mordred table, to confirm the target column is gone.
combined_X_mordred.head(2)

In [None]:
# First two rows of the morgan table, to confirm the target column is gone.
combined_X_morgan.head(2)

In [None]:
# Prefix every descriptor column with its source family so names stay unique once
# the three tables are concatenated. Columns 0-3 are left untouched
# (presumably ID, Dataset, Mixture 1, Mixture 2 -- confirm against the CSV layout).
#
# Each table is renamed once with a mapping built up front. Renaming column by
# column rebuilt the column index for every descriptor (quadratic in the number
# of columns) and could prefix a column twice when a freshly prefixed name
# matched an original name that had not been visited yet.
for frame, prefix in (
    (combined_X_dragon, 'dragon_'),
    (combined_X_mordred, 'mordred_'),
    (combined_X_morgan, 'morgan_'),
):
    frame.rename(columns={col: prefix + col for col in frame.columns[4:]}, inplace=True)

In [None]:
# Shape of each table restricted to the columns from position 4 onwards.
tuple(
    frame[frame.columns[4:]].shape
    for frame in (combined_X_dragon, combined_X_mordred, combined_X_morgan)
)

In [None]:
# Column counts from position 4 onwards: dragon + mordred first, then all three tables.
print(sum(
    frame[frame.columns[4:]].shape[1]
    for frame in (combined_X_dragon, combined_X_mordred)
))
print(sum(
    frame[frame.columns[4:]].shape[1]
    for frame in (combined_X_dragon, combined_X_mordred, combined_X_morgan)
))

In [None]:
# Notebook preview of the dragon columns from position 4 onwards.
combined_X_dragon[combined_X_dragon.columns[4:]]

In [None]:
# Assemble the full table: descriptor columns of all three families side by side,
# with the metadata columns in front.
combined_X = pd.concat(
    [
        frame[frame.columns[4:]]
        for frame in (combined_X_dragon, combined_X_mordred, combined_X_morgan)
    ],
    axis=1,
)
combined_X = pd.concat([meta, combined_X], axis=1)
# Move the target to the last position and discard the ID column.
expected = combined_X.pop('expected')
combined_X['expected'] = expected
combined_X = combined_X.drop(columns=['ID'])
combined_X

In [None]:
# Persist the combined table; index labels are not written to the file.
combined_X.to_csv('./submission/concatenated/combined/data/combined_X.csv', index=False)

In [None]:
# Histogram of the regression target over the whole combined table.
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(combined_X['expected'])
plt.show()

In [None]:
# Rows whose target is above 0.8 or below 0.2; saved to disk, index labels kept.
extreme = combined_X.loc[
    combined_X['expected'].gt(0.8) | combined_X['expected'].lt(0.2)
]
extreme.to_csv('./submission/concatenated/combined/data/extreme.csv', index=False)
extreme_idx = extreme.index.values
extreme.shape

In [None]:
# Rows with a target inside [0.2, 0.8], shuffled with a fixed seed.
combined_X_no_extreme = (
    combined_X
    .loc[combined_X['expected'].le(0.8) & combined_X['expected'].ge(0.2)]
    .sample(frac=1, random_state=42)
)
combined_X_no_extreme.shape

In [None]:
# Stored index labels of the external train / test split.
external_train_idx, external_test_idx = (
    np.load(npy_path)
    for npy_path in (
        './submission/concatenated/data/external_train.npy',
        './submission/concatenated/data/external_test.npy',
    )
)

In [None]:
# Materialise the external split by label lookup on the shuffled non-extreme table.
combined_X_no_extreme_train, combined_X_no_extreme_test = (
    combined_X_no_extreme.loc[labels]
    for labels in (external_train_idx, external_test_idx)
)
combined_X_no_extreme_train.shape, combined_X_no_extreme_test.shape

In [None]:
# First two rows of the external test split.
combined_X_no_extreme_test.head(2)

In [None]:
# Write both halves of the external split to disk (index labels are not written).
for split_frame, csv_path in (
    (combined_X_no_extreme_train, './submission/concatenated/combined/data/combined_X_no_extreme_train.csv'),
    (combined_X_no_extreme_test, './submission/concatenated/combined/data/combined_X_no_extreme_test.csv'),
):
    split_frame.to_csv(csv_path, index=False)

In [None]:
# K-fold splits stored on disk; each is indexed by fold position further down.
train_idx, test_idx = (
    np.load(npy_path)
    for npy_path in (
        './submission/concatenated/data/train_idx.npy',
        './submission/concatenated/data/test_idx.npy',
    )
)
train_idx.shape, test_idx.shape

In [None]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [None]:
# Pull out the split stored at position 1 to inspect its train / validation sizes.
train, val = (
    combined_X.loc[fold_labels[1]]
    for fold_labels in (train_idx, test_idx)
)
train.shape, val.shape

In [None]:
# Column names used as model inputs: everything except the first three columns
# and the last one.
train.columns[3:-1]

In [None]:
# Feature matrix / target vector for the inspected fold: features are every column
# except the first three and the last one, the target is `expected`.
X_train, y_train = train.loc[:, train.columns[3:-1]].values, train['expected'].values
X_val, y_val = val.loc[:, val.columns[3:-1]].values, val['expected'].values
X_train.shape, X_val.shape, y_train.shape, y_val.shape

In [None]:
# Same feature selection by position, previewed as a DataFrame.
train.iloc[:, 3:-1]

In [None]:
# Cross-validation over the first four stored splits: fit XGBoost and Random Forest
# on each training split, score them on the matching validation split, and write the
# predictions, plots, pickled models and metrics to results/fold_<k>/.
for i in tqdm(range(4)):
    fold_dir = f'./submission/concatenated/combined/results/fold_{i+1}'
    os.makedirs(fold_dir, exist_ok=True)

    train = combined_X.loc[train_idx[i]]
    val = combined_X.loc[test_idx[i]]

    # The first three columns are metadata and the last one is the target;
    # everything in between is a descriptor.
    y_train = train['expected'].values
    X_train = train.iloc[:, 3:-1].values
    y_val = val['expected'].values
    X_val = val.iloc[:, 3:-1].values
    print(f'\tTrain X Shape: {X_train.shape}, y Shape: {y_train.shape}')
    print(f'\tVal X Shape: {X_val.shape}, y Shape: {y_val.shape}')

    # Both estimators get a fixed seed so the saved models and metrics can be
    # reproduced on a re-run (unseeded, the forest differs every run).
    print('\tTraining XGBoost')
    xg_model = XGBRegressor(random_state=42)
    xg_model.fit(X_train, y_train)

    print('\tTraining RF')
    rf_model = RandomForestRegressor(random_state=42)
    rf_model.fit(X_train, y_train)

    y_pred_xg = xg_model.predict(X_val)
    y_pred_rf = rf_model.predict(X_val)

    # `index` keeps the original row labels so predictions can be traced back.
    result_df = pd.DataFrame({'index': val.index.values, 'expected': y_val, 'xg': y_pred_xg, 'rf': y_pred_rf})
    result_df.to_csv(f'{fold_dir}/result.csv', index=False)

    rmse_xg = root_mean_squared_error(y_val, y_pred_xg)
    rmse_rf = root_mean_squared_error(y_val, y_pred_rf)
    r2_xg = r2_score(y_val, y_pred_xg)
    r2_rf = r2_score(y_val, y_pred_rf)

    # One expected-vs-predicted plot per model; figures are closed after saving
    # so they do not accumulate across folds.
    for pred_col, model_label, png_name in (
        ('xg', 'XGBoost', 'xg.png'),
        ('rf', 'Random Forest', 'rf.png'),
    ):
        plt.figure(figsize=(6, 4))
        sns.regplot(x='expected', y=pred_col, data=result_df, line_kws={'color': 'red'})
        plt.title(f'{model_label} - Fold {i+1}')
        plt.xlabel('Expected')
        plt.ylabel('Predicted')
        plt.savefig(f'{fold_dir}/{png_name}')
        plt.close()

    print(f'Fold {i+1} - XGBoost RMSE: {rmse_xg}, R2: {r2_xg}')
    print(f'Fold {i+1} - Random Forest RMSE: {rmse_rf}, R2: {r2_rf}')

    with open(f'{fold_dir}/model_xg.pkl', 'wb') as f:
        pkl.dump(xg_model, f)
    with open(f'{fold_dir}/model_rf.pkl', 'wb') as f:
        pkl.dump(rf_model, f)

    # Plain-text copy of the fold metrics.
    with open(f'{fold_dir}/results.txt', 'w') as f:
        f.write(f'Fold {i+1} - XGBoost RMSE: {rmse_xg}, R2: {r2_xg}\n')
        f.write(f'Fold {i+1} - Random Forest RMSE: {rmse_rf}, R2: {r2_rf}')

In [None]:
# External test set: target vector, and features selected by the column names
# taken from the training split.
test_y = combined_X_no_extreme_test['expected'].values
test_X = combined_X_no_extreme_test.loc[:, combined_X_no_extreme_train.columns[3:-1]].values
test_X.shape

In [None]:
# Same external test matrices, this time selecting the features by position.
test_y = combined_X_no_extreme_test['expected'].to_numpy()
test_X = combined_X_no_extreme_test.iloc[:, 3:-1].to_numpy()
test_X.shape, test_y.shape

In [None]:
# Score every fold's saved models on the external test split and write the
# predictions, plots and metrics next to the fold's other results.
test_X = combined_X_no_extreme_test.iloc[:, 3:-1].values
test_y = combined_X_no_extreme_test['expected'].values
for i in tqdm(range(4)):
    fold_dir = f'./submission/concatenated/combined/results/fold_{i+1}'

    # Load the models pickled by the training loop. Context managers close the file
    # handles (the bare `open(...)` calls never did).
    # NOTE: unpickling runs arbitrary code -- only load files this notebook produced.
    with open(f'{fold_dir}/model_xg.pkl', 'rb') as f:
        xg_model = pkl.load(f)
    with open(f'{fold_dir}/model_rf.pkl', 'rb') as f:
        rf_model = pkl.load(f)

    y_pred_xg = xg_model.predict(test_X)
    y_pred_rf = rf_model.predict(test_X)

    # `index` keeps the original row labels so predictions can be traced back.
    result_df = pd.DataFrame({'index': combined_X_no_extreme_test.index.values, 'expected': test_y, 'xg': y_pred_xg, 'rf': y_pred_rf})
    result_df.to_csv(f'{fold_dir}/test_result.csv', index=False)

    rmse_xg = root_mean_squared_error(test_y, y_pred_xg)
    rmse_rf = root_mean_squared_error(test_y, y_pred_rf)
    r2_xg = r2_score(test_y, y_pred_xg)
    r2_rf = r2_score(test_y, y_pred_rf)

    # One expected-vs-predicted plot per model; figures are closed after saving
    # so they do not accumulate across folds.
    for pred_col, model_label, png_name in (
        ('xg', 'XGBoost', 'test_xg.png'),
        ('rf', 'Random Forest', 'test_rf.png'),
    ):
        plt.figure(figsize=(6, 4))
        sns.regplot(x='expected', y=pred_col, data=result_df, line_kws={'color': 'red'})
        plt.title(f'{model_label} - Fold {i+1}')
        plt.xlabel('Expected')
        plt.ylabel('Predicted')
        plt.savefig(f'{fold_dir}/{png_name}')
        plt.close()

    print(f'Fold {i+1} - XGBoost RMSE: {rmse_xg}, R2: {r2_xg}')
    print(f'Fold {i+1} - Random Forest RMSE: {rmse_rf}, R2: {r2_rf}')

    with open(f'{fold_dir}/test_results.txt', 'w') as f:
        f.write(f'Fold {i+1} - XGBoost RMSE: {rmse_xg}, R2: {r2_xg}\n')
        f.write(f'Fold {i+1} - Random Forest RMSE: {rmse_rf}, R2: {r2_rf}')

In [None]:
# For each fold, overlay the distributions of both models' test predictions
# and of the true target.
for i in range(4):
    result_df = pd.read_csv(f'./submission/concatenated/combined/results/fold_{i+1}/test_result.csv')
    plt.figure(figsize=(6, 4))
    for column, color, label in (
        ('xg', 'blue', 'XGBoost'),
        ('rf', 'red', 'Random Forest'),
        ('expected', 'green', 'Expected'),
    ):
        sns.histplot(result_df[column].values, color=color, label=label, kde=True)
    plt.title(f'Fold {i+1} - Prediction Distribution')
    plt.legend()
    plt.show()



# Ensemble over folds: average each model's external-test predictions across the
# four fold models and score the averaged prediction.
fold_results = [
    pd.read_csv(f'./submission/concatenated/combined/results/fold_{fold}/test_result.csv')
    for fold in range(1, 5)
]
# Average via np.mean over the per-fold arrays rather than accumulating with
# `+=` / `/=` into the array returned by `.values`: that wrote into the
# DataFrame's own buffer and raises on the read-only arrays pandas returns under
# Copy-on-Write. The divisor now also follows the number of files actually read.
xg_avr = np.mean([fold_df['xg'].to_numpy() for fold_df in fold_results], axis=0)
rf_avr = np.mean([fold_df['rf'].to_numpy() for fold_df in fold_results], axis=0)
# Every fold was scored on the same test rows, so any file's target column will do.
expected = fold_results[-1]['expected'].values
rmse_xg = root_mean_squared_error(expected, xg_avr)
rmse_rf = root_mean_squared_error(expected, rf_avr)
print(f'XGBoost RMSE: {rmse_xg}')
print(f'Random Forest RMSE: {rmse_rf}')

In [None]:
# Expected-vs-predicted scatter with a fitted regression line for the
# fold-averaged predictions of each model.
plt.figure(figsize=(6, 4))
sns.regplot(x=expected, y=xg_avr, line_kws={'color': 'red'})
plt.title('XGBoost - Average')
plt.xlabel('Expected')
plt.ylabel('Predicted')
# plt.savefig(f'./submission/concatenated/combined/results/average_xg.png')
# Show the figure: it used to be closed here without being shown or saved,
# so the XGBoost plot never appeared in the notebook.
plt.show()

plt.figure(figsize=(6, 4))
sns.regplot(x=expected, y=rf_avr, line_kws={'color': 'red'})
plt.title('Random Forest - Average')
plt.xlabel('Expected')
plt.ylabel('Predicted')
# plt.savefig(f'./submission/concatenated/combined/results/average_rf.png')
# TODO: annotate the plots with the RMSE.
plt.show()

In [None]:
# Overlay the distributions of both fold-averaged predictions and the true target.
fig, ax = plt.subplots(figsize=(6, 4))
for values, color, label in (
    (xg_avr, 'blue', 'XGBoost'),
    (rf_avr, 'red', 'Random Forest'),
    (expected, 'green', 'Expected'),
):
    sns.histplot(values, color=color, label=label, kde=True, ax=ax)
ax.set_title('Average - Prediction Distribution')
ax.legend()
plt.show()

In [None]:
# RMSE and R2 of the fold-averaged predictions against the true target.
rmse_xg_avr, rmse_rf_avr = (
    root_mean_squared_error(expected, averaged) for averaged in (xg_avr, rf_avr)
)
r2_xg_avr, r2_rf_avr = (
    r2_score(expected, averaged) for averaged in (xg_avr, rf_avr)
)
print(f'Average - XGBoost RMSE: {rmse_xg_avr}, R2: {r2_xg_avr}')
print(f'Average - Random Forest RMSE: {rmse_rf_avr}, R2: {r2_rf_avr}')

In [None]:
# Pearson correlation between the true target and each fold-averaged prediction;
# the full result object returned by scipy is printed.
corr_xg, corr_rf = (
    pearsonr(expected, averaged) for averaged in (xg_avr, rf_avr)
)
print(f'Average - XGBoost Correlation: {corr_xg}')
print(f'Average - Random Forest Correlation: {corr_rf}')