In [35]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Load data from a TSV file


file_path = '/home/venkat/Projects/workbook/hb-predicition/data/readings.tsv'
data = pd.read_csv(file_path, sep=',')

# Assuming the data columns are 'label' and 'reading'
X = data[['reading']]  # features (independent variables)
y = data['label']  # target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest Regressor
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rf_regressor.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_regressor.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

def predict_with_confidence(model, samples):
    """
    Predict the labels for a given array of samples and estimate the confidence.

    Parameters:
    model: Trained Random Forest Regressor
    samples: An array of sample values for which to predict the labels

    Returns:
    A list of tuples, each containing the predicted label and the standard deviation
    of the predictions from individual trees as a measure of uncertainty for each sample.
    """
    results = []
    
    for sample in samples:
        # Convert the sample to a 2D array (if it is not already)
        if np.isscalar(sample):
            sample = np.array([[sample]])

        # Predict using all trees
        predictions = np.array([tree.predict(sample) for tree in model.estimators_])

        # Calculate the mean prediction and standard deviation
        mean_prediction = np.mean(predictions)
        std_deviation = np.std(predictions)

        results.append((mean_prediction, std_deviation))

    return results

# Example usage
sample_values = [19822, 15479, 24863]  # Replace with your array of sample values
predictions = predict_with_confidence(rf_regressor, sample_values)

for i, (label, confidence) in enumerate(predictions):
    print(f"Sample {i+1}: Predicted label: {label}, Confidence (std dev): {confidence}")


Mean Squared Error: 0.11798888888888877
R^2 Score: 0.9959087756849315
Sample 1: Predicted label: 11.73, Confidence (std dev): 0.4659399102888698
Sample 2: Predicted label: 16.34, Confidence (std dev): 0.4737087712930804
Sample 3: Predicted label: 7.9, Confidence (std dev): 0.3872983346207417


In [45]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
import scipy.stats as stats

file_path = '/home/venkat/Projects/workbook/hb-predicition/data/readings.tsv'
data = pd.read_csv(file_path, sep=',')

# Define the degree of the polynomial model
degree = 3  # You can adjust this based on your data

# Transform features into polynomial features
poly_features = PolynomialFeatures(degree=degree)
X_poly = poly_features.fit_transform(data[['reading']])

# Fit a linear regression model
model = LinearRegression()
model.fit(X_poly, data['label'])

# Function to make predictions and compute prediction intervals
def predict_with_interval(model, poly_features, x_new, confidence=0.95):
    # Transform the new sample to polynomial features
    x_new_poly = poly_features.transform(np.array([[x_new]]))
    
    # Predict
    y_new_pred = model.predict(x_new_poly)[0]

    # Compute the standard error and prediction interval
    sum_errors = np.sum((model.predict(X_poly) - data['label'])**2)
    residual_std_error = np.sqrt(sum_errors / (len(X_poly) - degree - 1))
    
    mean_x = np.mean(data['reading'])
    n = len(data)
    t_value = stats.t.ppf((1 + confidence) / 2, df=n - 2)
    interval = t_value * residual_std_error * np.sqrt(1/n + (x_new - mean_x)**2 / np.sum((data['reading'] - mean_x)**2))
    
    return y_new_pred, (y_new_pred - interval, y_new_pred + interval)

# Example usage
sample_reading = 19822  # Replace with your sample value
predicted_label, confidence_interval = predict_with_interval(model, poly_features, sample_reading)
print(f"Predicted label: {predicted_label}, Confidence Interval: {confidence_interval}")


Predicted label: 11.75148179916318, Confidence Interval: (11.675433535472264, 11.827530062854096)




In [44]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd

file_path = '/home/venkat/Projects/workbook/hb-predicition/data/readings.tsv'
data = pd.read_csv(file_path, sep=',')

# Convert labels to discrete classes if not already
# data['label'] = pd.cut(data['label'], bins, labels=class_labels) # Optional: Use if you need to discretize

X = data[['reading']]
y = data['label']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the classifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict probabilities for a new sample
sample_reading = [[19822]]  # Replace with your sample value
confidence_scores = clf.predict_proba(sample_reading)

confidence_scores



array([[0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.28, 0.71, 0.01,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ]])