In [None]:
import os

import numpy as np 
import pandas as pd 
import seaborn as sns
import math

In [None]:
%matplotlib inline 
import matplotlib.pyplot as plt

In [None]:
# Silence FutureWarning messages for the rest of the session so that
# deprecation notices from the libraries do not clutter cell output.
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# Load the dataset into the main DataFrame used by the rest of the notebook.
# NOTE(review): `filepath` is not defined in any visible cell; it has to be
# set (e.g. in a configuration cell above) before this runs on a fresh kernel.
hlth_df = pd.read_csv(filepath)

In [None]:
# Summary statistics of the loaded data.
hlth_df.describe()

In [None]:
# Column dtypes and non-null counts of the loaded data.
hlth_df.info()

In [None]:
# Histograms (50 bins each) to eyeball the distribution of every numeric column.
hlth_df.hist(bins=50, figsize=(20, 15))
plt.show()

In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots of the loaded frame. The frame read from disk is
# `hlth_df`; no `hlth_data` is defined anywhere in the notebook.
# Assigning the result keeps the array-of-Axes repr out of the cell output.
axes = scatter_matrix(hlth_df, figsize=(32, 16))

In [None]:
# Correlation heatmap using DataFrame.corr()'s default method.
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(hlth_df.corr(), annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap of Transformer Health Metrics')
plt.show()

In [None]:
# Same heatmap, computed with Kendall rank correlation.
kendall_corr = hlth_df.corr("kendall")
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(kendall_corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap of Transformer Health Metrics')
plt.show()

In [None]:
# Same heatmap, computed with Spearman rank correlation.
spearman_corr = hlth_df.corr("spearman")
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(spearman_corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap of Transformer Health Metrics')
plt.show()

In [None]:
def find_top_k_correlated_features(data, k, correlation_type='pearson'):
    """
    Find the k features that are best correlated with all other features in the dataset.

    Each feature is scored by the sum of its absolute correlations with every
    other feature; the k highest-scoring feature names are returned.
    Non-numeric columns are ignored, because a correlation cannot be computed
    for them.

    Parameters:
    - data (pd.DataFrame): The dataset containing the features.
    - k (int): The number of top features to select.
    - correlation_type (str): The type of correlation to compute ('pearson', 'spearman', 'kendall').

    Returns:
    - list: The names of the top k correlated features, strongest first.

    Raises:
    - ValueError: If correlation_type is not one of the supported methods.
    """
    if correlation_type not in ['pearson', 'spearman', 'kendall']:
        raise ValueError("Invalid correlation_type. Choose from 'pearson', 'spearman', or 'kendall'.")

    # Restrict to numeric/boolean columns up front: with pandas >= 2.0,
    # DataFrame.corr() raises on string columns instead of skipping them.
    numeric_data = data.select_dtypes(include=['number', 'bool'])

    # Absolute correlation matrix
    abs_corr = numeric_data.corr(method=correlation_type).abs()

    # Remove each feature's self-correlation by subtracting the actual
    # diagonal. A fixed "- 1" is wrong for zero-variance columns, whose
    # diagonal is NaN (skipped by sum), which would leave them a score of -1.
    self_corr = pd.Series(np.diag(abs_corr.to_numpy()), index=abs_corr.index).fillna(0.0)
    absolute_corr_sum = abs_corr.sum() - self_corr

    # Select the top k features with the highest sum of absolute correlations
    return absolute_corr_sum.nlargest(k).index.tolist()

In [None]:
def find_top_k_features_to_targets(data, targets, k, correlation_type='pearson'):
    """
    Find the k features that are best correlated with a given set of target columns.

    Each non-target feature is scored by the sum of its absolute correlations
    with the target columns. Non-numeric feature columns are ignored, because
    a correlation cannot be computed for them.

    Parameters:
    - data (pd.DataFrame): The dataset containing the features.
    - targets (list of str): The names of the target columns to find correlations against.
    - k (int): The number of top features to select.
    - correlation_type (str): The type of correlation to compute ('pearson', 'spearman', 'kendall').

    Returns:
    - list: The names of the top k features most correlated with the target columns.

    Raises:
    - ValueError: If correlation_type is unsupported, a target column is
      missing from the dataset, or a target column is not numeric.
    """
    if correlation_type not in ['pearson', 'spearman', 'kendall']:
        raise ValueError("Invalid correlation_type. Choose from 'pearson', 'spearman', or 'kendall'.")

    # Ensure target columns are in the dataset
    missing_targets = [col for col in targets if col not in data.columns]
    if missing_targets:
        raise ValueError(f"Target columns {missing_targets} are not present in the dataset.")

    # Restrict to numeric/boolean columns up front: with pandas >= 2.0,
    # DataFrame.corr() raises on string columns instead of skipping them.
    numeric_data = data.select_dtypes(include=['number', 'bool'])

    # A target that is not numeric cannot be correlated against; fail with a
    # clear message rather than an opaque lookup error further down.
    non_numeric_targets = [col for col in targets if col not in numeric_data.columns]
    if non_numeric_targets:
        raise ValueError(f"Target columns {non_numeric_targets} are not numeric and cannot be correlated.")

    target_cols = list(targets)

    # Correlation of every numeric column against the target columns only
    target_corr = numeric_data.corr(method=correlation_type)[target_cols]

    # Sum the absolute correlations for each feature across all target columns
    absolute_corr_sum = target_corr.abs().sum(axis=1)

    # Exclude the target columns themselves from being selected
    absolute_corr_sum = absolute_corr_sum.drop(target_cols, errors='ignore')

    # Select the top k features with the highest sum of absolute correlations
    return absolute_corr_sum.nlargest(k).index.tolist()


In [None]:
# Columns the models below try to predict.
targets = ['Life expectation', 'Health index']

# Seven features with the largest summed absolute correlation to the targets.
top_correlated = find_top_k_features_to_targets(data=hlth_df, targets=targets, k=7)
top_correlated

In [None]:
def plot_most_correlated_grid(df, top_correlated_columns):
    """
    Plot regression plots for the most correlated columns, where each column is plotted
    against all other columns in the provided list (excluding itself).

    Every unordered pair of distinct columns gets one subplot; the subplots are
    laid out on a near-square grid with no fully empty rows.

    Parameters:
    - df (pd.DataFrame): The dataset containing the features.
    - top_correlated_columns (list of str): A list of feature names representing the most correlated columns.

    Raises:
    - ValueError: If fewer than two distinct columns are given (nothing to pair).
    """
    from itertools import combinations

    # Drop repeated names while keeping first-seen order, then take every
    # unordered pair once (same order the nested-loop dedupe used to produce).
    distinct_columns = list(dict.fromkeys(top_correlated_columns))
    unique_pairs = list(combinations(distinct_columns, 2))

    n = len(unique_pairs)
    if n == 0:
        raise ValueError("At least two distinct columns are required to plot pairwise regressions.")

    # Near-square layout: as many columns as a square grid would need, but
    # only as many rows as are actually used.
    n_cols = int(math.ceil(math.sqrt(n)))
    n_rows = int(math.ceil(n / n_cols))

    # squeeze=False guarantees a 2-D array of Axes even for a 1x1 grid, where
    # plt.subplots would otherwise return a bare Axes without .flatten().
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 5 * n_rows), squeeze=False)
    axes = axes.flatten()  # Flatten the axes array for easy indexing

    for ax, (feature1, feature2) in zip(axes, unique_pairs):
        sns.regplot(x=feature1, y=feature2, data=df, ax=ax)
        ax.set_title(f'{feature1} vs {feature2}')
        ax.grid()

    # Remove unused subplots if n doesn't exactly fill the grid
    for unused_ax in axes[n:]:
        fig.delaxes(unused_ax)
    plt.tight_layout()
    plt.show()



In [None]:
# Pairwise regression plots for the selected features together with the target columns.
plot_most_correlated_grid(hlth_df, top_correlated + targets)

In [None]:
from itertools import combinations

# Every unordered pair of the top-correlated features, each pair sorted
# internally and the whole list sorted as well. Going through a bare set and
# list() would make the pair order depend on string hash randomization, and
# the feature-engineering loop that consumes these pairs is order-sensitive
# (it decides which source column gets dropped), so the order must be stable.
column_pairs = sorted({tuple(sorted(pair)) for pair in combinations(top_correlated, 2)})

column_pairs

In [None]:
# Start from a copy of the loaded frame. `hlth_df` is the frame the pair
# values are read from below; no `hlth_data` is defined in the notebook.
enhancend_hlth_df = hlth_df.copy()
for (feat1, feat2) in column_pairs:
    # Geometric mean of the two features (NaN where their product is negative).
    enhancend_hlth_df[feat1 + '_' + feat2 + '_pondavg'] = np.sqrt(hlth_df[feat1] * hlth_df[feat2])
    # Drop one source column per pair: the first if it is still present,
    # otherwise the second (if that one is still present).
    if feat1 in enhancend_hlth_df.columns:
        enhancend_hlth_df.drop(columns=feat1, inplace=True)
    elif feat2 in enhancend_hlth_df.columns:
        enhancend_hlth_df.drop(columns=feat2, inplace=True)

In [None]:
# Column dtypes and non-null counts after adding the engineered features.
enhancend_hlth_df.info()

In [None]:
# Peek at the last rows of the engineered frame.
enhancend_hlth_df.tail()

In [None]:
# Replace every non-finite value with 0.0 before correlating/modelling.
# -inf is included alongside +inf and NaN: any infinity left in the frame
# would be rejected later by the scikit-learn estimators.
enhancend_hlth_df.replace([np.inf, -np.inf, np.nan], 0.0, inplace=True)

In [None]:
# Correlation heatmap of the engineered frame.
fig, ax = plt.subplots(figsize=(24, 12))
sns.heatmap(enhancend_hlth_df.corr(), annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap of Transformer Health Metrics')
plt.show()

In [None]:
# Re-rank features against the targets, now including the engineered columns.
new_top_correlated = find_top_k_features_to_targets(data=enhancend_hlth_df, targets=targets, k=7)
new_top_correlated

In [None]:
# Plot from the engineered frame: it is the one that holds the columns named
# in `new_top_correlated`; no `new_df` is defined anywhere in the notebook.
plot_most_correlated_grid(enhancend_hlth_df, new_top_correlated + targets)

In [None]:
# Model inputs: the 12 features most correlated with the targets, taken from
# the engineered frame (no `new_df` is defined anywhere in the notebook).
# .copy() so later row drops do not touch the source frame.
input_df = enhancend_hlth_df[find_top_k_features_to_targets(enhancend_hlth_df, targets, 12)].copy()

In [None]:
# Peek at the last rows of the selected model inputs.
input_df.tail()

In [None]:
from scipy import stats

# Report, per input column, how many rows lie more than 3 standard
# deviations away from the column mean.
for col in input_df:
    col_z = stats.zscore(input_df[col])
    is_outlier = (col_z < -3) | (col_z > 3)
    flagged_rows = input_df[is_outlier]
    print(f"Outliers in {col}: {len(flagged_rows)}")

In [None]:
# Collect the index labels of every row flagged as an outlier (|z| > 3) in
# at least one input column; the set removes rows flagged more than once.
flagged_labels = set()
for col in input_df:
    col_z = stats.zscore(input_df[col])
    is_outlier = (col_z < -3) | (col_z > 3)
    flagged_labels.update(input_df[is_outlier].index)
outlier_idxs = list(flagged_labels)

In [None]:
# Remove the flagged rows. errors="ignore" skips labels that are already
# gone, so re-running this cell is a no-op instead of raising KeyError.
input_df = input_df.drop(index=outlier_idxs, errors="ignore")

In [None]:
# Correlation heatmap of the outlier-filtered inputs next to the targets.
# The target columns are read from the engineered frame for the surviving
# rows (no `new_df` is defined anywhere in the notebook).
plt.figure(figsize=(16, 10))
sns.heatmap(
    pd.concat([input_df, enhancend_hlth_df.loc[input_df.index, targets]], axis=1).corr(),
    annot=True,
    cmap='coolwarm',
)
plt.title('Correlation Heatmap of Transformer Health Metrics')
plt.show()

In [None]:
# Pairwise regression plots on the outlier-filtered rows. The target columns
# come from the engineered frame (no `new_df` is defined in the notebook).
plot_most_correlated_grid(
    pd.concat([input_df, enhancend_hlth_df.loc[input_df.index, targets]], axis=1),
    new_top_correlated + targets,
)

In [None]:
# (rows, features) of the model input matrix after outlier removal.
input_df.to_numpy().shape

In [None]:
from sklearn.model_selection import train_test_split

# Features and target. The target is read from the engineered frame for the
# rows that survived outlier removal (no `new_df` is defined in the notebook).
X = input_df.to_numpy()
y = enhancend_hlth_df.loc[input_df.index, targets[1]].to_numpy().reshape((-1, 1))  # Health index

# Splitting the data (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.linear_model import LinearRegression

# Initialize and train the model
# Ordinary least-squares baseline fitted on the training split.
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Predictions
# Evaluate the linear model on the held-out test split.
y_pred = model.predict(X_test)

# Evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
# RMSE is the square root of the mean squared error.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"Mean Absolute Error: {mae}")
print(f"Root Mean Squared Error: {rmse}")

In [None]:
# Spot check: prediction for the first test row next to its true value.
model.predict(X_test[[0]]), y_test[[0]]

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Initialize and train a Random Forest model
# The target is flattened with ravel() before fitting.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train.ravel())

# Evaluate the Random Forest model
# Same metrics as for the linear model, so the two can be compared directly.
rf_pred = rf_model.predict(X_test)
rf_mae = mean_absolute_error(y_test, rf_pred)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))

print(f"Random Forest MAE: {rf_mae}")
print(f"Random Forest RMSE: {rf_rmse}")

In [None]:
# Spot check: Random Forest prediction for the first test row next to its true value.
rf_model.predict(X_test[[0]]), y_test[0]

In [None]:
# StandardScaler is imported here because the notebook's
# sklearn.preprocessing import cell comes after this one; on a fresh
# Restart & Run All this cell would otherwise raise NameError.
from sklearn.preprocessing import StandardScaler

# Standardize every feature column of X.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics leak into the scaling - confirm this is acceptable.
X_scaled = StandardScaler().fit_transform(X)

In [None]:
from sklearn.preprocessing import KBinsDiscretizer, StandardScaler, OrdinalEncoder

In [None]:
# Discretizer that maps a continuous column onto 5 equal-width bins
# (strategy='uniform') encoded as ordinal bin ids.
discretizer = KBinsDiscretizer(n_bins=5, encode="ordinal", strategy = 'uniform', random_state=42)

In [None]:
# Features and target
# Classification target: the first target column ('Life expectation') of the
# rows kept in input_df, binned into ordinal classes. The column is read from
# the engineered frame (no `new_df` is defined anywhere in the notebook).
y_discretized = discretizer.fit_transform(
    enhancend_hlth_df.loc[input_df.index, targets[0]].to_numpy().reshape((-1, 1))
).ravel()

In [None]:
# Inspect the bin boundaries learned by the discretizer.
discretizer.bin_edges_

In [None]:
# Single stratified shuffle split holding out 20% of the rows, with a fixed seed.
from sklearn.model_selection import StratifiedShuffleSplit 
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42) 

In [None]:
# Splitting the data
# Splitting the data: row positions for the stratified train/test split.
train_set_split, test_set_split = next(split.split(X_scaled, y_discretized))

# Materialize the classification train/test sets from those positions.
# The classifier cells below fit and evaluate on X_train / y_train / X_test /
# y_test; without this rebinding those names would still hold the unscaled
# regression split with the continuous 'Health index' target, which a
# classifier cannot be trained on.
X_train, X_test = X_scaled[train_set_split], X_scaled[test_set_split]
y_train, y_test = y_discretized[train_set_split], y_discretized[test_set_split]

In [None]:
from sklearn.linear_model import LogisticRegression

# Initialize and train the model
# NOTE(review): X_train / y_train must hold the scaled features and the
# discretized class labels at this point. If they still refer to the
# regression split, y_train is the continuous 'Health index' and
# LogisticRegression will reject it - verify against the split cell above.
clf = LogisticRegression(max_iter=1000, random_state=42)
clf.fit(X_train, y_train.ravel())

In [None]:
# Display names for the five ordinal bins, used to label the classification
# report and confusion matrix (presumably ordered to match bin ids 0-4 - confirm).
classes = ['Very Short', 'Short', 'Moderate', 'Long', 'Very Long']

In [204]:
# Display the discretized target labels.
y_discretized

array([0., 1., 0., 0., 0., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       1., 1., 1., 1., 0., 0., 0., 1., 1., 1., 1., 0., 0., 1., 1., 2., 0.,
       2., 0., 0., 0., 0., 1., 1., 1., 1., 1., 2., 0., 0., 0., 2., 2., 0.,
       2., 1., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 2., 1., 2., 1., 2.,
       2., 1., 2., 2., 0., 1., 2., 2., 2., 2., 0., 0., 1., 2., 1., 2., 2.,
       2., 1., 1., 0., 0., 0., 1., 1., 2., 1., 1., 0., 1., 2., 2., 2., 1.,
       2., 1., 2., 2., 1., 0., 1., 2., 2., 1., 0., 0., 2., 2., 2., 0., 1.,
       2., 0., 4., 0., 4., 1., 4., 1., 4., 4., 0., 4., 4., 4., 3., 4., 4.,
       4., 4., 4., 4., 4., 0., 0., 4., 4., 4., 4., 4., 4., 4., 4., 1., 4.,
       4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4.,
       4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4.,
       1., 4., 4., 4., 4., 4., 4., 4., 3., 4., 4., 4., 4., 4., 4., 4., 4.,
       4., 4., 4., 4., 4., 4., 4., 2., 4., 4., 4., 0., 0., 4., 4., 4., 4.,
       0., 4., 4., 4., 4.

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Predictions
y_pred = clf.predict(X_test)

# Pin the label set to one id per entry of `classes`. Without it scikit-learn
# infers the labels from y_test/y_pred, so a bin that is missing from the
# test split makes classification_report raise (5 target names vs. fewer
# labels) and misaligns the confusion-matrix tick labels.
# Assumes the class ids are the float ordinals 0.0 .. len(classes)-1 produced
# by the discretizer - TODO confirm.
class_labels = np.arange(len(classes), dtype=float)

# Evaluation metrics
print(classification_report(y_test, y_pred, labels=class_labels, target_names=classes, zero_division=1))

# Confusion matrix (rows: actual class, columns: predicted class)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=classes, yticklabels=classes)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
