# 3 Linear regression on real world data

### 1. Loading the Dataset

In [137]:
# If package not installed, install it using pip install ucimlrepo
from ucimlrepo import fetch_ucirepo

# fetch dataset
infrared_thermography_temperature = fetch_ucirepo(id=925)

# data (as pandas dataframes)
X = infrared_thermography_temperature.data.features
y = infrared_thermography_temperature.data.targets

# metadata
print(infrared_thermography_temperature.metadata)

# variable information
print(infrared_thermography_temperature.variables)

{'uci_id': 925, 'name': 'Infrared Thermography Temperature', 'repository_url': 'https://archive.ics.uci.edu/dataset/925/infrared+thermography+temperature+dataset', 'data_url': 'https://archive.ics.uci.edu/static/public/925/data.csv', 'abstract': 'The Infrared Thermography Temperature Dataset contains temperatures read from various locations of inferred images about patients, with the addition of oral temperatures measured for each individual. The 33 features consist of gender, age, ethnicity, ambiant temperature, humidity, distance, and other temperature readings from the thermal images. The dataset is intended to be used in a regression task to predict the oral temperature using the environment information as well as the thermal image readings. ', 'area': 'Health and Medicine', 'tasks': ['Regression'], 'characteristics': ['Tabular'], 'num_instances': 1020, 'num_features': 33, 'feature_types': ['Real', 'Categorical'], 'demographics': ['Gender', 'Age', 'Ethnicity'], 'target_col': ['aveO

### 2. Independent and Dependent Variables
- Independent variables: These are the features in X.
- Dependent variables: These are the target values in y.

In [138]:
print(infrared_thermography_temperature.data)

{'ids':       SubjectID
0      161117-1
1      161117-2
2      161117-3
3      161117-4
4      161117-5
...         ...
1015  180425-05
1016  180425-06
1017  180502-01
1018  180507-01
1019  180514-01

[1020 rows x 1 columns], 'features':       Gender    Age                  Ethnicity  T_atm  Humidity  Distance  \
0       Male  41-50                      White   24.0      28.0       0.8   
1     Female  31-40  Black or African-American   24.0      26.0       0.8   
2     Female  21-30                      White   24.0      26.0       0.8   
3     Female  21-30  Black or African-American   24.0      27.0       0.8   
4       Male  18-20                      White   24.0      27.0       0.8   
...      ...    ...                        ...    ...       ...       ...   
1015  Female  21-25                      Asian   25.7      50.8       0.6   
1016  Female  21-25                      White   25.7      50.8       0.6   
1017  Female  18-20  Black or African-American   28.0      24.3      

In [139]:
print(f"Number of Independent Variables: {X.shape[1]}")
print(f"Number of Dependent Variables: {y.shape[1]}")
print(X,y)

Number of Independent Variables: 33
Number of Dependent Variables: 2
      Gender    Age                  Ethnicity  T_atm  Humidity  Distance  \
0       Male  41-50                      White   24.0      28.0       0.8   
1     Female  31-40  Black or African-American   24.0      26.0       0.8   
2     Female  21-30                      White   24.0      26.0       0.8   
3     Female  21-30  Black or African-American   24.0      27.0       0.8   
4       Male  18-20                      White   24.0      27.0       0.8   
...      ...    ...                        ...    ...       ...       ...   
1015  Female  21-25                      Asian   25.7      50.8       0.6   
1016  Female  21-25                      White   25.7      50.8       0.6   
1017  Female  18-20  Black or African-American   28.0      24.3       0.6   
1018    Male  26-30            Hispanic/Latino   25.0      39.8       0.6   
1019  Female  18-20                      White   23.8      45.6       0.6   

      

### 3. Is it possible to apply linear regression?

In this dataset, we have non-numeric data such as age ranges, sex, and other categorical variables. To apply linear regression to these types of data, they need to be converted into a numerical format. This can be achieved using the following methods:

1. **Label Encoding**: Assigns a unique integer to each category. This is suitable for ordinal data where categories have a meaningful order, such as 'low', 'medium', and 'high'.

2. **One-Hot Encoding**: Creates binary columns for each category, indicating the presence or absence of each category. This method is ideal for nominal data without an inherent order, such as 'sex' or 'ethnicity'.

3. **Ordinal Encoding**: Assigns integer values to categories based on their inherent order. This method is appropriate for ordinal variables where the sequence of categories carries significance, such as 'age ranges'.

4. **Binning**: Converts continuous variables into discrete categories or bins. This is useful for grouping continuous data, like 'age', into meaningful ranges.

By employing these encoding techniques, non-numeric data can be effectively transformed into a numerical format suitable for linear regression analysis.

**If the dataset doesn't meet these criteria, preprocessing steps could include:**
- Checking for and transforming non-linear relationships.
- Using techniques like PCA to address multicollinearity.
- Scaling features if needed.
- Removing outliers.

### 4. Handling NaN/Missing Values

The provided code is not correct. Because we must remove both the X and y values corresponding to a missing value.

table.dropna() ensures that we remove rows with any missing values across the entire dataset, maintaining consistency and alignment.
X.dropna() and y.dropna() separately might lead to mismatched data and additional complexity, especially when dealing with feature and target data.

In [141]:
import pandas as pd

table = pd.concat([X, y], axis = 1)
# Count missing values for each column
missing_values_per_column = table.isnull().sum()
print("Missing values per column:")
print(missing_values_per_column)
# Count the total number of missing values in the DataFrame
total_missing_values = table.isnull().sum().sum()
print(f"Total number of missing values in the DataFrame: {total_missing_values}")


Missing values per column:
Gender         0
Age            0
Ethnicity      0
T_atm          0
Humidity       0
Distance       2
T_offset1      0
Max1R13_1      0
Max1L13_1      0
aveAllR13_1    0
aveAllL13_1    0
T_RC1          0
T_RC_Dry1      0
T_RC_Wet1      0
T_RC_Max1      0
T_LC1          0
T_LC_Dry1      0
T_LC_Wet1      0
T_LC_Max1      0
RCC1           0
LCC1           0
canthiMax1     0
canthi4Max1    0
T_FHCC1        0
T_FHRC1        0
T_FHLC1        0
T_FHBC1        0
T_FHTC1        0
T_FH_Max1      0
T_FHC_Max1     0
T_Max1         0
T_OR1          0
T_OR_Max1      0
aveOralF       0
aveOralM       0
dtype: int64
Total number of missing values in the DataFrame: 2


In [142]:
table = table.dropna()

In [143]:
# Count missing values for each column
missing_values_per_column = table.isnull().sum()
print("Missing values per column:")
print(missing_values_per_column)
# Count the total number of missing values in the DataFrame
total_missing_values = table.isnull().sum().sum()
print(f"Total number of missing values in the DataFrame: {total_missing_values}")

Missing values per column:
Gender         0
Age            0
Ethnicity      0
T_atm          0
Humidity       0
Distance       0
T_offset1      0
Max1R13_1      0
Max1L13_1      0
aveAllR13_1    0
aveAllL13_1    0
T_RC1          0
T_RC_Dry1      0
T_RC_Wet1      0
T_RC_Max1      0
T_LC1          0
T_LC_Dry1      0
T_LC_Wet1      0
T_LC_Max1      0
RCC1           0
LCC1           0
canthiMax1     0
canthi4Max1    0
T_FHCC1        0
T_FHRC1        0
T_FHLC1        0
T_FHBC1        0
T_FHTC1        0
T_FH_Max1      0
T_FHC_Max1     0
T_Max1         0
T_OR1          0
T_OR_Max1      0
aveOralF       0
aveOralM       0
dtype: int64
Total number of missing values in the DataFrame: 0


### 5. and 6. Selecting Features and Splitting Data

In [144]:
# Selecting 'aveOralM' as the dependent variable
y = y[['aveOralM']]

# Selecting 'Age' and four other features based on preference
X = X[['Age', 'T_OR1', 'T_OR_Max1', 'T_FHC_Max1', 'T_FH_Max1']]

print(X,y)

# Splitting the data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


        Age    T_OR1  T_OR_Max1  T_FHC_Max1  T_FH_Max1
0     41-50  35.6350    35.6525     34.0075    34.5300
1     31-40  35.0925    35.1075     34.6600    34.6825
2     21-30  35.8600    35.8850     35.2225    35.3450
3     21-30  34.9650    34.9825     35.3150    35.6025
4     18-20  35.5875    35.6175     35.3725    35.4175
...     ...      ...        ...         ...        ...
1015  21-25  35.6775    35.7100     35.7475    35.8525
1016  21-25  36.4525    36.4900     35.5525    35.7650
1017  18-20  35.9650    35.9975     35.7100    36.3750
1018  26-30  35.4150    35.4350     35.3100    35.4150
1019  18-20  35.8900    35.9175     35.1175    35.1525

[1020 rows x 5 columns]       aveOralM
0        36.59
1        37.19
2        37.34
3        37.09
4        37.04
...        ...
1015     36.99
1016     37.19
1017     37.59
1018     37.29
1019     37.19

[1020 rows x 1 columns]


### 7. Training a Linear Regression Model

In [145]:
print(X.columns)

Index(['Age', 'T_OR1', 'T_OR_Max1', 'T_FHC_Max1', 'T_FH_Max1'], dtype='object')


In [146]:
print(X.Age)

0       41-50
1       31-40
2       21-30
3       21-30
4       18-20
        ...  
1015    21-25
1016    21-25
1017    18-20
1018    26-30
1019    18-20
Name: Age, Length: 1020, dtype: object


In [147]:
def convert_age_range(age_range):
    """Converts the age range to a single average value"""
    if '>' in age_range:
        return int(age_range.replace('>', '').strip())
    lower, upper = map(int, age_range.split('-'))
    return (lower + upper) / 2

In [148]:
X.Age = X.Age.apply(convert_age_range)
print(X.Age)

0       45.5
1       35.5
2       25.5
3       25.5
4       19.0
        ... 
1015    23.0
1016    23.0
1017    19.0
1018    28.0
1019    19.0
Name: Age, Length: 1020, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.Age = X.Age.apply(convert_age_range)


In [150]:
from sklearn.linear_model import LinearRegression

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

# Coefficients corresponding to independent variables
coefficients = model.coef_
print(f"Estimated Coefficients: {coefficients}")

Estimated Coefficients: [[ 0.00113644  0.05647584  0.49937613 -0.08398371  0.36994022]]


### 8. Identifying the Most Contributing Variable
The variable with the highest absolute value in the coefficient array contributes the most:

In [151]:
import numpy as np

max_contributor_index = np.argmax(np.abs(coefficients))
most_contributing_feature = X.columns[max_contributor_index]
print(f"Most contributing feature: {most_contributing_feature}")

Most contributing feature: T_OR_Max1


### 9. Additional Feature Selection and Model Training

In [154]:
X = X[['T_OR1', 'T_OR_Max1', 'T_FHC_Max1', 'T_FH_Max1']]
print(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model.fit(X_train, y_train)
coefficients = model.coef_
print(f"Estimated Coefficients: {coefficients}")

        T_OR1  T_OR_Max1  T_FHC_Max1  T_FH_Max1
0     35.6350    35.6525     34.0075    34.5300
1     35.0925    35.1075     34.6600    34.6825
2     35.8600    35.8850     35.2225    35.3450
3     34.9650    34.9825     35.3150    35.6025
4     35.5875    35.6175     35.3725    35.4175
...       ...        ...         ...        ...
1015  35.6775    35.7100     35.7475    35.8525
1016  36.4525    36.4900     35.5525    35.7650
1017  35.9650    35.9975     35.7100    36.3750
1018  35.4150    35.4350     35.3100    35.4150
1019  35.8900    35.9175     35.1175    35.1525

[1020 rows x 4 columns]
Estimated Coefficients: [[ 0.09199696  0.4640698  -0.08733171  0.37088645]]


### 10. Calculating Statistical Measures

In [155]:
from sklearn.metrics import mean_squared_error

# Residual sum of squares (RSS)
y_pred = model.predict(X_test)
RSS = np.sum(np.square(y_test - y_pred))

# Residual Standard Error (RSE)
N = len(y_test)
d = X_train.shape[1]
RSE = np.sqrt(RSS / (N - d - 1))

# Mean Squared Error (MSE)
MSE = mean_squared_error(y_test, y_pred)

# R-squared statistic
R_squared = model.score(X_test, y_test)

# Standard Error, t-statistic, p-value
import statsmodels.api as sm

X_train_with_const = sm.add_constant(X_train)
ols_model = sm.OLS(y_train, X_train_with_const).fit()
standard_errors = ols_model.bse
t_statistics = ols_model.tvalues
p_values = ols_model.pvalues

print(f"RSS: {RSS}")
print(f"RSE: {RSE}")
print(f"MSE: {MSE}")
print(f"R-squared: {R_squared}")
print(f"Standard Errors: {standard_errors}")
print(f"t-statistics: {t_statistics}")
print(f"p-values: {p_values}")

  return reduction(axis=axis, out=out, **passkwargs)


RSS: aveOralM    15.170504
dtype: float64
RSE: aveOralM    0.276104
dtype: float64
MSE: 0.07436521744807979
R-squared: 0.6468420800555861
Standard Errors: const         0.803926
T_OR1         0.883501
T_OR_Max1     0.882069
T_FHC_Max1    0.044464
T_FH_Max1     0.049258
dtype: float64
t-statistics: const         8.753146
T_OR1         0.104128
T_OR_Max1     0.526115
T_FHC_Max1   -1.964102
T_FH_Max1     7.529419
dtype: float64
p-values: const         1.191574e-17
T_OR1         9.170938e-01
T_OR_Max1     5.989521e-01
T_FHC_Max1    4.985945e-02
T_FH_Max1     1.358512e-13
dtype: float64


### 11. Significant and Insignificant features
In linear regression, we consider a feature significant if its p-value is less than 0.05. Conversely, if the p-value is greater than or equal to 0.05, we regard the feature as insignificant.

In [156]:
significant_features = p_values[p_values < 0.05]
insignificant_features = p_values[p_values >= 0.05]

print(f"Significant Features: {significant_features}")
print(f"Insignificant Features: {insignificant_features}")

Significant Features: const         1.191574e-17
T_FHC_Max1    4.985945e-02
T_FH_Max1     1.358512e-13
dtype: float64
Insignificant Features: T_OR1        0.917094
T_OR_Max1    0.598952
dtype: float64
