In [1]:
# Importing necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Step 1: Data Exploration

# a) Load the dataset and examine its structure.
data = {'Study Hours': [3.7, 9.5, 7.3, 6.0, 1.6, 1.6, 0.6, 8.7, 6.0, 7.1,
                        0.2, 9.7, 8.3, 2.1, 1.8, 1.8, 3.0, 5.2, 4.3, 2.9,
                        6.1, 1.4, 2.9, 3.7, 4.6, 7.9, 2.0, 5.1, 5.9, 0.5,
                        6.1, 1.7, 0.7, 9.5, 9.7, 8.1, 3.0, 1.0, 6.8, 4.4,
                        1.2, 5.0, 0.3, 9.1, 2.6, 6.6, 3.1, 5.2, 5.5, 1.8,
                        9.7, 7.8, 9.4, 8.9, 6.0, 9.2, 0.9, 2.0, 0.5, 3.3,
                        3.9, 2.7, 8.3, 3.6, 2.8, 5.4, 1.4, 8.0, 0.7, 9.9,
                        7.7, 2.0, 0.1, 8.2, 7.1, 7.3, 7.7, 0.7, 3.6, 1.2,
                        8.6, 6.2, 3.3, 0.6, 3.1, 3.3, 7.3, 6.4, 8.9, 4.7,
                        1.2, 7.1, 7.6, 5.6, 7.7, 4.9, 5.2, 4.3, 0.3, 1.1],
        'Exam Scores': [87.9, 143.6, 123.7, 99.9, 64.5, 67.4, 63.2, 134.0, 106.1, 118.3,
                        56.6, 148.6, 130.6, 73.8, 68.7, 73.2, 76.9, 100.8, 91.2, 71.8,
                        112.7, 65.3, 79.2, 85.5, 88.5, 126.4, 68.3, 121.0, 63.3, 53.2,
                        133.0, 121.9, 124.6, 123.7, 58.6, 87.3, 58.0, 145.6, 114.7,
                        77.1, 59.6, 76.2, 86.5, 128.8, 109.7, 143.5, 99.3, 66.1,
                        130.8, 124.9, 102.4, 122.6, 95.3, 101.9, 94.5, 53.9, 64.9]}

df = pd.DataFrame(data)

# Preview the first rows to see the table layout.
print(df.head())

# b) Separate the response column from the single predictor.
#    The predictor is selected with a list of column names so it stays a
#    two-dimensional DataFrame rather than a Series.
target = df['Exam Scores']
features = df[['Study Hours']]

# c) Report how many missing entries each column contains.
missing_per_column = df.isna().sum()
print(missing_per_column)

# Nothing is imputed or dropped here; the data is used as loaded.

# d) Scatter plot of exam score against study hours.
hours = df['Study Hours']
scores = df['Exam Scores']
plt.scatter(hours, scores)
plt.xlabel('Study Hours')
plt.ylabel('Exam Scores')
plt.title('Study Hours vs Exam Scores')
plt.show()

# Step 2: Data Preprocessing

# a) Hold out 20% of the rows for testing; the fixed seed keeps the split
#    reproducible between runs.
split = train_test_split(features, target, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

# b) Standardize the predictor. The scaling statistics are learned from the
#    training split only and then applied unchanged to both splits, so no
#    information from the test rows influences the transformation.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 3: Linear Regression Model

# a) Fit ordinary least squares on the scaled training predictor.
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# b) Score the fitted model on the held-out rows.
y_pred = model.predict(X_test_scaled)

mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'R-squared: {r2}')

# c) Show the fitted parameters. The model was trained on X_train_scaled,
#    so the coefficient is expressed per unit of the scaled predictor, not
#    per raw study hour.
intercept = model.intercept_
coefficients = model.coef_

print(f'Coefficients: {coefficients}')
print(f'Intercept: {intercept}')

# Step 4: Model Improvement

# a) No feature engineering is applied in this simple example.

# b) + c) Fit the model again and predict on the test rows.
#    The inputs are the same X_train_scaled / y_train used for the first
#    fit, so this step re-estimates the model on unchanged data.
y_pred_updated = model.fit(X_train_scaled, y_train).predict(X_test_scaled)

r2_updated = r2_score(y_test, y_pred_updated)
mse_updated = mean_squared_error(y_test, y_pred_updated)
mae_updated = mean_absolute_error(y_test, y_pred_updated)

print(f'Improved Model - MAE: {mae_updated}')
print(f'Improved Model - MSE: {mse_updated}')
print(f'Improved Model - R-squared: {r2_updated}')


ValueError: All arrays must be of the same length