# Aim: To demonstrate Emperically the effect of scaling on Correlation coefficient and model coefficients

In [182]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Step 1: Generate synthetic data
np.random.seed(42)  # For reproducibility
X = 2 * np.random.rand(100, 1)
noise = np.random.randn(100, 1)
y = 3 * X + 4 + noise

# Step 2: Create a pandas DataFrame
data = pd.DataFrame(np.hstack([X, y]), columns=['X', 'y'])

# Step 3: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data[['X']], data['y'], test_size=0.2, random_state=42)

# Step 4: Perform linear regression
model = LinearRegression()
model.fit(X_train, y_train)

# Step 5: Evaluate the model
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

train_mse = mean_squared_error(y_train, y_pred_train)
test_mse = mean_squared_error(y_test, y_pred_test)
train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)


# Coefficients and intercept
coefficients = model.coef_
intercept = model.intercept_

model_info = {
    "Coefficients": coefficients,
    "Intercept": intercept,
    "Training MSE": train_mse,
    "Testing MSE": test_mse,
    "Training R^2": train_r2,
    "Testing R^2": test_r2
}

model_info


{'Coefficients': array([2.79932366]),
 'Intercept': 4.142913319458566,
 'Training MSE': 0.8476788564209705,
 'Testing MSE': 0.6536995137170021,
 'Training R^2': 0.7582381034538057,
 'Testing R^2': 0.8072059636181392}

In [183]:
pearson_corr_before = np.corrcoef(X_train.values.flatten(), y_train.values)[0, 1]
print("Pearson correlation coefficient:", pearson_corr_before)

Pearson correlation coefficient: 0.8707686853888381


In [184]:
# Scale The data
scale_x = StandardScaler()

X_train_scaled =  scale_x.fit(X_train).transform(X_train)
X_test_scaled = scale_x.transform(X_test)




# Perform linear regression
model_scaled = LinearRegression()
model_scaled.fit(X_train_scaled, y_train)

#Evaluate the model
y_pred_train = model_scaled.predict(X_train_scaled)
y_pred_test = model_scaled.predict(X_test_scaled)

train_mse = mean_squared_error(X_train_scaled, y_pred_train)
test_mse = mean_squared_error(y_test, y_pred_test)
train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)


# Coefficients and intercept
coefficients_scaled = model_scaled.coef_
intercept_scaled = model_scaled.intercept_

model_info = {
    "Coefficients": coefficients_scaled,
    "Intercept": intercept_scaled,
    "Training MSE": train_mse,
    "Testing MSE": test_mse,
    "Training R^2": train_r2,
    "Testing R^2": test_r2
}

model_info




{'Coefficients': array([1.63051407]),
 'Intercept': 6.763632074988782,
 'Training MSE': 46.14426683913695,
 'Testing MSE': 0.653699513717002,
 'Training R^2': 0.7582381034538057,
 'Testing R^2': 0.8072059636181392}

In [185]:
pearson_corr_after = np.corrcoef(X_train_scaled.flatten(), y_train.values)[0, 1]
print("Pearson correlation coefficient:", pearson_corr_after)

Pearson correlation coefficient: 0.870768685388838


In [186]:
coef = [coefficients,coefficients_scaled]
inter = [intercept,intercept_scaled]
corr = [pearson_corr_before,pearson_corr_after]

res_dict = {"Coefficient": coef, "Intercept": inter, "Pearson Correlation": corr}
df_res = pd.DataFrame(res_dict,index = ['before Scaling','after Scaling'])

df_res

Unnamed: 0,Coefficient,Intercept,Pearson Correlation
before Scaling,[2.7993236574802762],4.142913,0.870769
after Scaling,[1.6305140706755856],6.763632,0.870769


# Conclusion: The model cefficient changes while correlation coefficient remains the same

---------------------------------------------------------------------------------------

# Aim: To demonstrate Emperically the effect of increasing the range on Correlation coefficient and model coefficients

In [187]:
# Step 1: Generate synthetic data
np.random.seed(42)  # For reproducibility
X = 2 *100*np.random.rand(100, 1)
noise = np.random.randn(100, 1)
y = 3 * X + 4 + noise

# Step 2: Create a pandas DataFrame
data_range = pd.DataFrame(np.hstack([X, y]), columns=['X', 'y'])

# Step 3: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data_range[['X']], data_range['y'], test_size=0.2, random_state=42)

# Step 4: Perform linear regression
model_range = LinearRegression()
model_range.fit(X_train, y_train)

# Step 5: Evaluate the model
y_pred_train = model_range.predict(X_train)
y_pred_test = model_range.predict(X_test)

train_mse = mean_squared_error(y_train, y_pred_train)
test_mse = mean_squared_error(y_test, y_pred_test)
train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)


# Coefficients and intercept
coefficients_range = model_range.coef_
intercept_range = model_range.intercept_

model_info = {
    "Coefficients": coefficients_range,
    "Intercept": intercept_range,
    "Training MSE": train_mse,
    "Testing MSE": test_mse,
    "Training R^2": train_r2,
    "Testing R^2": test_r2
}
model_info




{'Coefficients': array([2.99799324]),
 'Intercept': 4.1429133194585575,
 'Training MSE': 0.8476788564209754,
 'Testing MSE': 0.6536995137169872,
 'Training R^2': 0.9999722019003462,
 'Testing R^2': 0.9999815648311831}

In [188]:
pearson_corr_range = np.corrcoef(X_train.values.flatten(), y_train.values)[0, 1]
print("Pearson correlation coefficient:", pearson_corr_range)

Pearson correlation coefficient: 0.9999861008535798


In [189]:
coef2 = [coefficients,coefficients_range]
inter2 = [intercept,intercept_range]
corr2 = [pearson_corr_before,pearson_corr_range]

res_dict2 = {"Coefficient": coef2, "Intercept": inter2, "Pearson Correlation": corr2}
df_res2 = pd.DataFrame(res_dict2,index = ['before Ranging','after Ranging'])

df_res2

Unnamed: 0,Coefficient,Intercept,Pearson Correlation
before Ranging,[2.7993236574802762],4.142913,0.870769
after Ranging,[2.997993236574803],4.142913,0.999986


#Conclusion: The model cefficient doesnt change while correlation coefficient increases

-----------------------