In [11]:
from ucimlrepo import fetch_ucirepo

# Download the "Infrared Thermography Temperature" dataset from the UCI ML
# repository (925 is the repository id, echoed as 'uci_id' in the metadata).
infrared_thermography_temperature = fetch_ucirepo(id=925)

# Feature table and target table (pandas DataFrames)
X, y = (
    infrared_thermography_temperature.data.features,
    infrared_thermography_temperature.data.targets,
)

# Show the dataset-level metadata first, then the per-variable information
for dataset_info in (
    infrared_thermography_temperature.metadata,
    infrared_thermography_temperature.variables,
):
    print(dataset_info)

{'uci_id': 925, 'name': 'Infrared Thermography Temperature', 'repository_url': 'https://archive.ics.uci.edu/dataset/925/infrared+thermography+temperature+dataset', 'data_url': 'https://archive.ics.uci.edu/static/public/925/data.csv', 'abstract': 'The Infrared Thermography Temperature Dataset contains temperatures read from various locations of inferred images about patients, with the addition of oral temperatures measured for each individual. The 33 features consist of gender, age, ethnicity, ambiant temperature, humidity, distance, and other temperature readings from the thermal images. The dataset is intended to be used in a regression task to predict the oral temperature using the environment information as well as the thermal image readings. ', 'area': 'Health and Medicine', 'tasks': ['Regression'], 'characteristics': ['Tabular'], 'num_instances': 1020, 'num_features': 33, 'feature_types': ['Real', 'Categorical'], 'demographics': ['Gender', 'Age', 'Ethnicity'], 'target_col': ['aveO

In [12]:
# Preview the first rows of the features, then of the targets
for preview_frame in (X, y):
    print(preview_frame.head())


   Gender    Age                  Ethnicity  T_atm  Humidity  Distance  \
0    Male  41-50                      White   24.0      28.0       0.8   
1  Female  31-40  Black or African-American   24.0      26.0       0.8   
2  Female  21-30                      White   24.0      26.0       0.8   
3  Female  21-30  Black or African-American   24.0      27.0       0.8   
4    Male  18-20                      White   24.0      27.0       0.8   

   T_offset1  Max1R13_1  Max1L13_1  aveAllR13_1  ...  T_FHCC1  T_FHRC1  \
0     0.7025    35.0300    35.3775      34.4000  ...  33.5775  33.4775   
1     0.7800    34.5500    34.5200      33.9300  ...  34.0325  34.0550   
2     0.8625    35.6525    35.5175      34.2775  ...  34.9000  34.8275   
3     0.9300    35.2225    35.6125      34.3850  ...  34.4400  34.4225   
4     0.8950    35.5450    35.6650      34.9100  ...  35.0900  35.1600   

   T_FHLC1  T_FHBC1  T_FHTC1  T_FH_Max1  T_FHC_Max1   T_Max1    T_OR1  \
0  33.3725  33.4925  33.0025    34.53

In [13]:
# Column counts: predictors are the columns of X, responses the columns of y
num_independent_vars = len(X.columns)
num_dependent_vars = len(y.columns)

print(
    f'Number of Independent Variables: {num_independent_vars}',
    f'Number of Dependent Variables: {num_dependent_vars}',
    sep='\n',
)

Number of Independent Variables: 33
Number of Dependent Variables: 2


In [14]:
# Per-column tally of missing entries, for the features and for the targets
missing_counts_X = X.isna().sum()
missing_counts_y = y.isna().sum()

# Report both tallies under a single heading
print("\nCounts of missing values in each column:")
for missing_counts in (missing_counts_X, missing_counts_y):
    print(missing_counts)


Counts of missing values in each column:
Gender         0
Age            0
Ethnicity      0
T_atm          0
Humidity       0
Distance       2
T_offset1      0
Max1R13_1      0
Max1L13_1      0
aveAllR13_1    0
aveAllL13_1    0
T_RC1          0
T_RC_Dry1      0
T_RC_Wet1      0
T_RC_Max1      0
T_LC1          0
T_LC_Dry1      0
T_LC_Wet1      0
T_LC_Max1      0
RCC1           0
LCC1           0
canthiMax1     0
canthi4Max1    0
T_FHCC1        0
T_FHRC1        0
T_FHLC1        0
T_FHBC1        0
T_FHTC1        0
T_FH_Max1      0
T_FHC_Max1     0
T_Max1         0
T_OR1          0
T_OR_Max1      0
dtype: int64
aveOralF    0
aveOralM    0
dtype: int64


In [15]:
import pandas as pd
# Record which columns are features and which are targets *before* merging,
# so the two frames can be separated again by name instead of by a
# hard-coded column position.
feature_columns = list(X.columns)
target_columns = list(y.columns)

# Combine X and y into a single DataFrame so that a row with a missing value
# is removed from the features and the targets together
data_combined = pd.concat([X, y], axis=1)

# Drop rows with any missing values
data_combined = data_combined.dropna()

# Separate X and y after dropping missing values (selected by column name,
# so this stays correct if the number of target columns ever changes)
X = data_combined[feature_columns]
y = data_combined[target_columns]

# Check whether any NaN elements still exist
print(X.isnull().sum())
print(y.isnull().sum())

Gender         0
Age            0
Ethnicity      0
T_atm          0
Humidity       0
Distance       0
T_offset1      0
Max1R13_1      0
Max1L13_1      0
aveAllR13_1    0
aveAllL13_1    0
T_RC1          0
T_RC_Dry1      0
T_RC_Wet1      0
T_RC_Max1      0
T_LC1          0
T_LC_Dry1      0
T_LC_Wet1      0
T_LC_Max1      0
RCC1           0
LCC1           0
canthiMax1     0
canthi4Max1    0
T_FHCC1        0
T_FHRC1        0
T_FHLC1        0
T_FHBC1        0
T_FHTC1        0
T_FH_Max1      0
T_FHC_Max1     0
T_Max1         0
T_OR1          0
T_OR_Max1      0
dtype: int64
aveOralF    0
aveOralM    0
dtype: int64


### One-hot encoding using scikit-learn's `OneHotEncoder`

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder

# Select dependent and independent features
dependent_feature = 'aveOralM'
independent_features = ['Age', 'Distance', 'Humidity', 'T_atm', 'T_offset1']

# Extract the relevant columns
X_selected = X[independent_features]
y_selected = y[dependent_feature]

# One-hot encode the "Age" feature. drop='first' omits one category so the
# dummy columns are not perfectly collinear with the model's intercept.
cat_encoder = OneHotEncoder(drop='first', sparse_output=False)
X_encoded_age = cat_encoder.fit_transform(X_selected[['Age']])
X_remaining_features = X_selected.drop(columns=['Age'])

# Wrap the encoded array in a DataFrame that carries the ORIGINAL row labels.
# Rows with missing values were dropped earlier, so the index may have gaps;
# resetting it on the feature side only would leave X_final and y_selected
# with different labels, and any later label-aligned operation between them
# (or between X_train and y_train) would silently mismatch rows.
X_encoded_age_df = pd.DataFrame(
    X_encoded_age,
    columns=cat_encoder.get_feature_names_out(['Age']),
    index=X_selected.index,
)

# Combine the one-hot encoded "Age" with the other (numerical) features
X_final = pd.concat([X_encoded_age_df, X_remaining_features], axis=1)

# Features and target must describe the same rows, in the same order
assert X_final.index.equals(y_selected.index), "features/target rows are misaligned"

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_final, y_selected, test_size=0.2, random_state=0)

# Train a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Estimate the coefficients
coefficients = model.coef_

# List the estimated coefficients with 4 decimal points
for feature, coef in zip(X_final.columns, coefficients):
    print(f'Coefficient for {feature}: {coef:.4f}')

# Print the intercept with 4 decimal points
intercept = model.intercept_
print(f'Intercept: {intercept:.4f}')


Coefficient for Age_21-25: 0.0213
Coefficient for Age_21-30: 0.1344
Coefficient for Age_26-30: -0.1200
Coefficient for Age_31-40: -0.0316
Coefficient for Age_41-50: -0.1375
Coefficient for Age_51-60: -0.3805
Coefficient for Age_>60: 0.0132
Coefficient for Distance: -0.3523
Coefficient for Humidity: 0.0013
Coefficient for T_atm: -0.0016
Coefficient for T_offset1: 0.1939
Intercept: 37.0677


### One-hot encoding using pandas' `pd.get_dummies`

In [17]:
# Response column and predictor columns for this model
dependent_feature = 'aveOralM'
independent_features = ['Age', 'Distance', 'Humidity', 'T_atm', 'T_offset1']

# Pull the chosen columns out of the feature and target frames
X_selected = X.loc[:, independent_features]
y_selected = y.loc[:, dependent_feature]

# Let pandas expand "Age" into dummy columns (first category dropped)
X_final = pd.get_dummies(X_selected, columns=['Age'], drop_first=True)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y_selected, test_size=0.2, random_state=0
)

# Fit the linear regression on the training portion and read off its parameters
model = LinearRegression().fit(X_train, y_train)
coefficients = model.coef_
intercept = model.intercept_

# Report each estimated coefficient next to its column name, then the intercept
for position, feature in enumerate(X_final.columns):
    print(f'Coefficient for {feature}: {coefficients[position]}')
print(f'Intercept: {intercept}')


Coefficient for Distance: -0.3523080919385941
Coefficient for Humidity: 0.0012849527536311616
Coefficient for T_atm: -0.0015667543247122708
Coefficient for T_offset1: 0.19385383302856599
Coefficient for Age_21-25: 0.02134420060099466
Coefficient for Age_21-30: 0.13439604794404944
Coefficient for Age_26-30: -0.120031553987434
Coefficient for Age_31-40: -0.03160729518785476
Coefficient for Age_41-50: -0.13745863944369677
Coefficient for Age_51-60: -0.380531666333965
Coefficient for Age_>60: 0.013202726708834649
Intercept: 37.06774005940642


In [18]:
# Predictors for this model: four of the thermal-image temperature columns
independent_features = ['T_OR1', 'T_OR_Max1', 'T_FHC_Max1', 'T_FH_Max1']

# Pull the predictors and the 'aveOralM' response out of the frames
X_selected = X.loc[:, independent_features]
y_selected = y.loc[:, 'aveOralM']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y_selected, test_size=0.2, random_state=0
)

# Fit the linear regression on the training portion and read off its parameters
model = LinearRegression().fit(X_train, y_train)
coefficients = model.coef_
intercept = model.intercept_

# Report each estimated coefficient (4 decimals), then the intercept
for position, feature in enumerate(independent_features):
    print(f'Coefficient for {feature}: {coefficients[position]:.4f}')
print(f'Intercept: {intercept:.4f}')


Coefficient for T_OR1: 0.5034
Coefficient for T_OR_Max1: 0.0217
Coefficient for T_FHC_Max1: -0.0602
Coefficient for T_FH_Max1: 0.3594
Intercept: 7.6115


In [19]:
import numpy as np
import statsmodels.api as sm

# In-sample predictions: the fitted model applied to its own training rows
y_pred = model.predict(X_train)

# Residual sum of squares
residuals = y_train - y_pred
RSS = np.sum(residuals ** 2)

# Residual standard error: N observations, d predictors, and one more degree
# of freedom taken by the intercept
N = y_train.shape[0]
d = len(X_train.columns)
RSE = np.sqrt(RSS / (N - d - 1))

# Mean squared error over the training rows
MSE = RSS / N

# R^2 = 1 - RSS / TSS, with TSS the squared deviation from the mean response
deviations = y_train - np.mean(y_train)
TSS = np.sum(deviations ** 2)
R2 = 1 - (RSS / TSS)

# Refit the same regression with statsmodels, which also reports inference
# statistics; add_constant supplies the intercept column explicitly
X_train_sm = sm.add_constant(X_train)
model_sm = sm.OLS(y_train, X_train_sm).fit()

# Standard errors, t-statistics and p-values of the fitted parameters
std_errors, t_values, p_values = model_sm.bse, model_sm.tvalues, model_sm.pvalues

# Print the scalar fit metrics first, then the per-parameter tables
print(f"RSS: {RSS:.4f}")
print(f"RSE: {RSE:.4f}")
print(f"MSE: {MSE:.4f}")
print(f"R^2: {R2:.4f}")

print(f"Standard Errors: \n{std_errors}")
print(f"t-statistics: \n{t_values}")
print(f"p-values: \n{p_values}")


RSS: 75.0953
RSE: 0.3047
MSE: 0.0923
R^2: 0.6429
Standard Errors: 
const         0.793406
T_OR1         0.859272
T_OR_Max1     0.857762
T_FHC_Max1    0.043671
T_FH_Max1     0.048925
dtype: float64
t-statistics: 
const         9.593404
T_OR1         0.585833
T_OR_Max1     0.025287
T_FHC_Max1   -1.379061
T_FH_Max1     7.345249
dtype: float64
p-values: 
const         1.025961e-20
T_OR1         5.581512e-01
T_OR_Max1     9.798324e-01
T_FHC_Max1    1.682570e-01
T_FH_Max1     5.018475e-13
dtype: float64
