In [1]:
#Q1 start

import numpy as np
import cv2
import os

# Build the (n_images, n_pixels) design matrix and the label column from the
# ORL face images stored next to the notebook.
notebook_path = os.path.abspath("Q1.ipynb")
path_to_folder = os.path.join(os.path.dirname(notebook_path), "data/ORL_dataset/")

# Files whose name starts with one of these prefixes get label 0; every other
# file gets label 1.
# NOTE(review): presumably the leading number is the subject id and these four
# subjects form one gender class — confirm against the dataset description.
label_zero_prefixes = ('1_', '8_', '10_', '32_')

pixel_rows = []
labels = []

# Sort the listing so that row order does not depend on the file system.
for filename in sorted(os.listdir(path_to_folder)):
    # Read the image as a single-channel grayscale array
    loaded = cv2.imread(os.path.join(path_to_folder, filename), cv2.IMREAD_GRAYSCALE)
    if loaded is None:
        # cv2.imread returns None for anything it cannot decode (hidden files,
        # sub-directories, ...); skip those instead of crashing on .flatten().
        continue
    img = loaded

    # One image becomes one row of pixels
    pixel_rows.append(img.flatten())
    labels.append(0 if filename.startswith(label_zero_prefixes) else 1)

if not pixel_rows:
    raise FileNotFoundError("No readable images found in " + path_to_folder)

# Stack the rows; np.vstack raises if the images do not all have the same size.
data = np.vstack(pixel_rows).astype(float)

# Convert the labels list to a column vector with one entry per loaded image
labels = np.array(labels).reshape((-1, 1))

# Concatenate the labels column to the data matrix (labels end up in the last column)
data_with_labels = np.concatenate((data, labels), axis=1)

print('Data shape:', data.shape)
print('Labels shape:', labels.shape)
print('Data with Labels', data_with_labels.shape)


Data shape: (400, 2576)
Labels shape: (400, 1)
Data with Labels (400, 2577)


In [2]:
import statsmodels.api as sm

# Response: the label stored in the last column of data_with_labels.
y = data_with_labels[:, -1]

# Predictors: every remaining (pixel) column, with a constant added so the
# regression can fit an intercept.
X = sm.add_constant(data_with_labels[:, :-1])

# Ordinary least squares fit of the label on all pixels.
# Note: there are far more pixel columns than the 400 images, so the fit
# interpolates the labels exactly; the summary below reports R-squared 1.000
# with 0 residual degrees of freedom, and no standard errors can be estimated.
model = sm.OLS(y, X).fit()

# Show the full regression report
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                    nan
Method:                 Least Squares   F-statistic:                       nan
Date:                Sun, 12 Mar 2023   Prob (F-statistic):                nan
Time:                        18:24:08   Log-Likelihood:                 12603.
No. Observations:                 400   AIC:                        -2.441e+04
Df Residuals:                       0   BIC:                        -2.281e+04
Df Model:                         399                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       4.245e-06        inf          0        n

In [3]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector

In [4]:
# Pixels are every column but the last; the label is the last column.
X, Y = data_with_labels[:, :-1], data_with_labels[:, -1]

# Fit the scikit-learn linear regression on all pixels at once
# (fit() returns the fitted estimator itself).
model = LinearRegression().fit(X, Y)

# In-sample coefficient of determination
print("R-squared:", model.score(X, Y))

R-squared: 1.0


In [None]:
# Greedy forward selection: starting from no pixels, keep adding the pixel that
# most improves the 5-fold cross-validated R^2 until 10 pixels are selected.
sfs = SequentialFeatureSelector(
    model,
    n_features_to_select=10,
    direction='forward',
    scoring='r2',
    cv=5,
).fit(X, Y)

# Column indices of X (i.e. flattened pixel positions) that were kept
important_pixels = sfs.get_support(indices=True)
print("Important pixels:", important_pixels)

In [None]:
#Q1 end
# Draw the selected pixels in white on a black canvas of the same shape as the
# face images.
# `img` is the last image read by the loading loop. Each row of X came from
# `img.flatten()` (row-major), so a flat index maps back to
# (index // n_cols, index % n_cols) with n_cols taken from the image itself
# rather than from a hard-coded size.
image_shape = img.shape
n_cols = image_shape[1]

# uint8 so that 0 / 255 are displayed as black / white without relying on how
# imshow rescales floating-point images.
canvas = np.zeros(image_shape, dtype=np.uint8)

for idx in important_pixels:
    row, col = divmod(int(idx), n_cols)
    canvas[row, col] = 255

cv2.imshow("Important Pixels", canvas)
cv2.waitKey(0)
# Close the window once a key has been pressed so it does not linger.
cv2.destroyAllWindows()

In [None]:
#Q2 start

import pandas as pd
import numpy as np
import os
from sklearn.linear_model import LinearRegression

# Hill-climb on the volcano elevation grid: repeatedly fit a plane to the
# neighbourhood of the current point and step one cell in the uphill direction.

# Load the volcano dataset
notebook_path = os.path.abspath("Q2.ipynb")
data_file = os.path.join(os.path.dirname(notebook_path), "Volcano.csv")

volcano = pd.read_csv(data_file, header=None)

# 1-based grid coordinates. The regression window below slices the CSV as
# [x2 range, x1 range], i.e. x1 runs along columns and x2 along rows.
# NOTE(review): assumes Volcano.csv holds 61 rows x 87 columns — confirm.
x1 = np.arange(1, 88)
x2 = np.arange(1, 62)

# Valid range for (x1, x2) points; used to keep the search on the grid.
lower_bound = np.array([x1[0], x2[0]])
upper_bound = np.array([x1[-1], x2[-1]])

# Initialize the starting point
current_point = np.array([87, 1])

# Half-width (in cells) of the square window used for each local regression
domain_size = 5

# Points already visited. Revisiting one means the search is cycling around a
# local maximum, which would otherwise loop forever.
visited = {tuple(int(c) for c in current_point)}

# Iterate until convergence
while True:
    # 0-based indices of the current point
    i = int(current_point[0]) - 1
    j = int(current_point[1]) - 1

    # Window around the current point, truncated at the grid edges
    i_start = max(i - domain_size, 0)
    i_end = min(i + domain_size, len(x1) - 1)
    j_start = max(j - domain_size, 0)
    j_end = min(j + domain_size, len(x2) - 1)
    X1, X2 = np.meshgrid(x1[i_start:i_end+1], x2[j_start:j_end+1])
    Y = volcano.values[j_start:j_end+1, i_start:i_end+1].ravel()

    # Fit the local plane: elevation ~ x1 + x2
    model = LinearRegression()
    X = np.column_stack((X1.ravel(), X2.ravel()))
    model.fit(X, Y)

    # The sign of each slope gives the uphill direction along that axis
    slope = model.coef_
    direction = np.sign(slope).astype(int)

    # Step one cell uphill, but never leave the grid
    new_point = np.clip(current_point + direction, lower_bound, upper_bound)
    new_key = tuple(int(c) for c in new_point)
    if new_key in visited:
        # Either the point did not move or the search started cycling:
        # treat the current point as the local maximum.
        break
    visited.add(new_key)
    current_point = new_point

# Print the final result
print("The highest point is:", tuple(int(c) for c in current_point))

#Q2 end

In [None]:
#Q3 start
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
import statsmodels.api as sm

# Generate a synthetic regression problem with two features.
# The generator is seeded so that a Restart & Run All reproduces the same
# sample and therefore the same fitted coefficients.
X, y = make_regression(n_samples=50000, n_features=2, noise=10, random_state=42)

# Add intercept to X
X = sm.add_constant(X)

# Fit linear regression model (reference solution for the gradient descent below)
model = sm.OLS(y, X).fit()

# Print model summary
print(model.summary())

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression

# Reuse X and y from the previous cell instead of drawing a new random sample:
# the gradient-descent coefficients are compared with `model.params` below, and
# that comparison is only meaningful if both are fitted to the same data.
# X already carries the intercept column added by sm.add_constant.
assert X.shape[1] == 3 and np.all(X[:, 0] == 1), "expected [1, x1, x2] design matrix"
assert X.shape[0] == y.shape[0], "X and y must have one row per sample"

# Define learning rate and number of iterations
learning_rate = 0.01
n_iterations = 1000

# Initialize coefficients (one per column of X, intercept included)
beta = np.zeros(X.shape[1])
# Define error function and gradient function
def error(X, y, beta):
    """Return the mean squared residual of the linear fit ``X @ beta`` to ``y``."""
    residuals = y - X @ beta
    return np.mean(residuals ** 2)

def gradient(X, y, beta):
    """Return the gradient of the mean squared error with respect to ``beta``.

    For ``mean((y - X @ beta) ** 2)`` the gradient is
    ``-2 * X.T @ (y - X @ beta) / n`` with ``n`` the number of rows of ``X``.
    Computing it as a matrix-vector product avoids building the
    (features x samples) array of per-sample products just to average it.
    """
    residuals = y - X @ beta
    return -2 * (X.T @ residuals) / X.shape[0]

# Gradient descent: take n_iterations fixed-size steps against the gradient,
# recording the coefficients and the mean squared error after every step.
# `betas` starts with the initial guess so the path includes the start point.
errors, betas = [], [beta]
for step in range(n_iterations):
    step_gradient = gradient(X, y, beta)
    beta = beta - learning_rate * step_gradient
    betas.append(beta)
    errors.append(error(X, y, beta))

# Report the final coefficients next to the regression-package estimate
for caption, values in (
    ("Coefficients:", beta),
    ("Regression package:", model.params),
    ("Gradient descent:", beta),
):
    print(caption, values)

In [None]:
import matplotlib.pyplot as plt

# Figure 1: mean squared error after each gradient-descent step
fig, ax = plt.subplots()
ax.plot(errors)
ax.set_title("Evolution of error during gradient descent")
ax.set_xlabel("Iteration")
ax.set_ylabel("Mean squared error")
plt.show()

# Figure 2: path of the two slope coefficients (columns 1 and 2 of the
# recorded betas; column 0 is the first coefficient and is not drawn)
betas = np.array(betas)
fig, ax = plt.subplots()
ax.plot(betas[:, 1], betas[:, 2], 'bo-')
ax.set_title("Searching path in the domain of the error function")
ax.set_xlabel("beta1")
ax.set_ylabel("beta2")
plt.show()

#Q3 end