In [None]:
###### number 7

import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Load the 'Overweight' sheet used for exercise 7.
data = pd.read_excel(r'/content/Copy of jaggia_ba_2e_ch08_data.xlsx', sheet_name='Overweight', engine='openpyxl')

# Fail fast and name exactly which columns are absent, so the sheet can be
# fixed without guessing which of the required columns is the problem.
required_columns = ['BMI', 'Female', 'Black']
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    raise ValueError(f"Missing one or more required columns: {missing_columns}")

# BMI regressed on both indicators plus their interaction.
# NOTE(review): the group labels below assume Female and Black are 0/1 dummies,
# which makes white males the reference group -- confirm against the sheet.
model = ols('BMI ~ Female + Black + Female:Black', data=data).fit()

# A group's predicted BMI is the sum of the coefficients switched on for it.
white_male_bmi = model.params['Intercept']
white_female_bmi = model.params['Intercept'] + model.params['Female']
black_male_bmi = model.params['Intercept'] + model.params['Black']
black_female_bmi = model.params['Intercept'] + model.params['Female'] + model.params['Black'] + model.params['Female:Black']

print("\na. Predicted BMI for different groups:")
print(f"White Males: {white_male_bmi:.2f}")
print(f"White Females: {white_female_bmi:.2f}")
print(f"Black Males: {black_male_bmi:.2f}")
print(f"Black Females: {black_female_bmi:.2f}")

# Parts b and c each read one main-effect coefficient. Because the model has an
# interaction term, 'Female' is the gap among Black == 0 and 'Black' is the gap
# among Female == 0.
SIGNIFICANCE_LEVEL = 0.05
for part_label, comparison, term in (
    ("b", "white females and white males", 'Female'),
    ("c", "white males and Black males", 'Black'),
):
    print(f"\n{part_label}. Difference between {comparison}:")
    print(f"Coefficient: {model.params[term]:.4f}")
    print(f"P-value: {model.pvalues[term]:.4f}")
    print("Statistically significant at 5% level:", "Yes" if model.pvalues[term] < SIGNIFICANCE_LEVEL else "No")


a. Predicted BMI for different groups:
White Males: 28.26
White Females: 24.82
Black Males: 26.95
Black Females: 30.17

b. Difference between white females and white males:
Coefficient: -3.4456
P-value: 0.0000
Statistically significant at 5% level: Yes

c. Difference between white males and Black males:
Coefficient: -1.3139
P-value: 0.0030
Statistically significant at 5% level: Yes


In [None]:
###### number 11

import pandas as pd
import statsmodels.api as sm
import numpy as np

# Load the data
data = pd.read_excel(r'/content/Copy of jaggia_ba_2e_ch08_data.xlsx', sheet_name='Urban', engine='openpyxl')

# Part a: Simple linear regression of Consumption on Income
X = data['Income']
y = data['Consumption']
X = sm.add_constant(X)
model_a = sm.OLS(y, X).fit()

# Predict consumption for income of $75,000.
# Each row handed to predict() is laid out like the design matrix:
# the constant first, then the regressors in the order they were selected.
income_75k = 75000
predicted_consumption_a = model_a.predict([1, income_75k])[0]

print("Part a:")
print(f"Predicted consumption for income $75,000: ${predicted_consumption_a:.2f}")

# Part b: Including dummy variable for Urban (1 when Location == 'Urban', else 0)
data['Urban'] = (data['Location'] == 'Urban').astype(int)
X_b = sm.add_constant(data[['Income', 'Urban']])
model_b = sm.OLS(y, X_b).fit()

# Predict consumption for income of $75,000 in urban and rural areas
predicted_consumption_b_urban = model_b.predict([1, income_75k, 1])[0]
predicted_consumption_b_rural = model_b.predict([1, income_75k, 0])[0]

print("\nPart b:")
print(f"Predicted consumption for income $75,000 in urban area: ${predicted_consumption_b_urban:.2f}")
print(f"Predicted consumption for income $75,000 in rural area: ${predicted_consumption_b_rural:.2f}")

# Part c: Including interaction term (Income x Urban)
data['Income_Urban'] = data['Income'] * data['Urban']
X_c = sm.add_constant(data[['Income', 'Urban', 'Income_Urban']])
model_c = sm.OLS(y, X_c).fit()

# Predict consumption for income of $75,000 in urban and rural areas.
# The interaction column equals income for an urban household and 0 for a rural one.
predicted_consumption_c_urban = model_c.predict([1, income_75k, 1, income_75k])[0]
predicted_consumption_c_rural = model_c.predict([1, income_75k, 0, 0])[0]

print("\nPart c:")
print(f"Predicted consumption for income $75,000 in urban area: ${predicted_consumption_c_urban:.2f}")
print(f"Predicted consumption for income $75,000 in rural area: ${predicted_consumption_c_rural:.2f}")

# Part d: Model comparison
aic_a = model_a.aic
aic_b = model_b.aic
aic_c = model_c.aic

r2_a = model_a.rsquared
r2_b = model_b.rsquared
r2_c = model_c.rsquared

# Models A, B and C are nested, so plain R-squared can only rise from A to C.
# Adjusted R-squared penalises the extra regressors and is the fair yardstick.
adj_r2_a = model_a.rsquared_adj
adj_r2_b = model_b.rsquared_adj
adj_r2_c = model_c.rsquared_adj

print("\nPart d:")
print("Model comparison:")
print(f"Model A - AIC: {aic_a:.2f}, R-squared: {r2_a:.4f}, Adj. R-squared: {adj_r2_a:.4f}")
print(f"Model B - AIC: {aic_b:.2f}, R-squared: {r2_b:.4f}, Adj. R-squared: {adj_r2_b:.4f}")
print(f"Model C - AIC: {aic_c:.2f}, R-squared: {r2_c:.4f}, Adj. R-squared: {adj_r2_c:.4f}")

# Lowest AIC wins; the tuples sort on AIC first, so [1] is the winning label.
best_model = min([(aic_a, "A"), (aic_b, "B"), (aic_c, "C")])[1]
print(f"\nThe most suitable model based on AIC is Model {best_model}.")
print("Explanation: The model with the lowest AIC provides the best balance between model fit and complexity.")
print("However, consider also the adjusted R-squared values and the specific research question when making the final decision.")

Part a:
Predicted consumption for income $75,000: $49300.29

Part b:
Predicted consumption for income $75,000 in urban area: $52880.16
Predicted consumption for income $75,000 in rural area: $46335.73

Part c:
Predicted consumption for income $75,000 in urban area: $55995.18
Predicted consumption for income $75,000 in rural area: $48050.56

Part d:
Model comparison:
Model A - AIC: 1060.31, R-squared: 0.5892
Model B - AIC: 1058.82, R-squared: 0.6169
Model C - AIC: 1055.48, R-squared: 0.6557

The most suitable model based on AIC is Model C.
Explanation: The model with the lowest AIC provides the best balance between model fit and complexity.
However, consider also the R-squared values and the specific research question when making the final decision.


In [None]:
###### number 13

import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Load the data
data = pd.read_excel(r'/content/Copy of jaggia_ba_2e_ch08_data.xlsx', sheet_name='BMI', engine='openpyxl')

# Part a: Estimate model with BMI and White as predictor variables
model_a = ols('Salary ~ BMI + White', data=data).fit()
print("Part a: Model with BMI and White as predictors")
print(model_a.summary())

# Part b: Re-estimate model with BMI, White, and their interaction.
# The interaction is stored as an explicit column so it can be named in the formula.
data['BMI_White'] = data['BMI'] * data['White']
model_b = ols('Salary ~ BMI + White + BMI_White', data=data).fit()
print("\nPart b: Model with BMI, White, and their interaction")
print(model_b.summary())

# Part c: Model comparison and salary estimates.
# Model B nests Model A, so its plain R-squared can never be lower; adjusted
# R-squared is reported as well because it accounts for the extra regressor.
print("\nPart c: Model Comparison")
print(f"Model A - R-squared: {model_a.rsquared:.4f}, Adj. R-squared: {model_a.rsquared_adj:.4f}, AIC: {model_a.aic:.2f}")
print(f"Model B - R-squared: {model_b.rsquared:.4f}, Adj. R-squared: {model_b.rsquared_adj:.4f}, AIC: {model_b.aic:.2f}")

# Determine which model is more suitable (lower AIC; ties go to the simpler Model A)
if model_b.aic < model_a.aic:
    better_model = model_b
    print("Model B (with interaction) is more suitable based on lower AIC.")
else:
    better_model = model_a
    print("Model A (without interaction) is more suitable based on lower AIC.")

print("\nExplanation: The model with the lower AIC is generally considered more suitable as it balances goodness of fit with model simplicity.")

# The scenario frames always carry BMI_White so they work for either model;
# the formula interface only reads the columns the chosen model actually uses.

# Estimate salary for a white man with BMI of 30
white_salary = better_model.predict(pd.DataFrame({'BMI': [30], 'White': [1], 'BMI_White': [30]})).values[0]
print(f"\nEstimated salary for a white college-educated man with BMI of 30: ${white_salary:.2f}")

# Estimate salary for a non-white man with BMI of 30
nonwhite_salary = better_model.predict(pd.DataFrame({'BMI': [30], 'White': [0], 'BMI_White': [0]})).values[0]
print(f"Estimated salary for a non-white college-educated man with BMI of 30: ${nonwhite_salary:.2f}")

Part a: Model with BMI and White as predictors
                            OLS Regression Results                            
Dep. Variable:                 Salary   R-squared:                       0.728
Model:                            OLS   Adj. R-squared:                  0.708
Method:                 Least Squares   F-statistic:                     36.19
Date:                Sun, 08 Sep 2024   Prob (F-statistic):           2.29e-08
Time:                        00:03:45   Log-Likelihood:                -79.576
No. Observations:                  30   AIC:                             165.2
Df Residuals:                      27   BIC:                             169.4
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Inter

In [9]:
###### number 15

import pandas as pd
import statsmodels.formula.api as sm

# Read the 'IPO' sheet
df = pd.read_excel(r'/content/Copy of jaggia_ba_2e_ch08_data.xlsx', sheet_name='IPO', engine='openpyxl')

# Show the distinct HighTech labels so the levels used in the scenarios below are visible
print(df['HighTech'].unique())

# Baseline specification: Revision and HighTech enter additively
model1 = sm.ols('Return ~ Revision + HighTech', data=df).fit()
print(model1.summary())

# Extended specification: 'Revision * HighTech' expands to both main effects
# plus their interaction
model2 = sm.ols('Return ~ Revision * HighTech', data=df).fit()
print(model2.summary())

# Scenario inputs for a 15% price revision, one per firm type.
# NOTE(review): 0.15 assumes Revision is stored as a fraction; if the sheet
# stores percentage points the value should be 15 -- confirm against the data.
high_tech_case = {'Revision': [0.15], 'HighTech': ['Yes']}
non_high_tech_case = {'Revision': [0.15], 'HighTech': ['No']}

prediction_high_tech = model2.predict(high_tech_case)
prediction_non_high_tech = model2.predict(non_high_tech_case)

# Each prediction holds a single value; report it
print(f'Predicted Initial Return for High-Tech Firm: {prediction_high_tech[0]}')
print(f'Predicted Initial Return for Non-High-Tech Firm: {prediction_non_high_tech[0]}')


['No' 'Yes']
                            OLS Regression Results                            
Dep. Variable:                 Return   R-squared:                       0.165
Model:                            OLS   Adj. R-squared:                  0.158
Method:                 Least Squares   F-statistic:                     25.77
Date:                Sun, 08 Sep 2024   Prob (F-statistic):           6.13e-11
Time:                        14:26:58   Log-Likelihood:                -979.18
No. Observations:                 264   AIC:                             1964.
Df Residuals:                     261   BIC:                             1975.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept           8.4199   

In [10]:
###### number 18

import pandas as pd
import numpy as np
from statsmodels.formula.api import ols

# Read the 'College' sheet
data = pd.read_excel(r'/content/Copy of jaggia_ba_2e_ch08_data.xlsx', sheet_name='College', engine='openpyxl')

# Explicit Cost x Grad product column so the interaction can be named in the formula
data['Cost_Grad'] = data['Cost'] * data['Grad']

# a. Fit Earnings on Cost, Grad, Debt, City and the Cost x Grad interaction
model = ols('Earnings ~ Cost + Grad + Debt + City + Cost_Grad', data=data).fit()
print(model.summary())

# Describe the direction in which Grad shifts the Cost effect
print("\nInterpretation of the interaction coefficient:")
print(f"The coefficient of Cost_Grad is {model.params['Cost_Grad']:.4f}")
interaction_is_positive = model.params['Cost_Grad'] > 0
print(
    "As the graduation rate increases, the effect of cost on earnings becomes more positive (or less negative)."
    if interaction_is_positive
    else "As the graduation rate increases, the effect of cost on earnings becomes more negative (or less positive)."
)


# b. Scenario helper
def predict_earnings(cost, grad=60, debt=80, city=1):
    """Return the fitted model's earnings prediction for one scenario.

    The Cost_Grad column is rebuilt from ``cost`` and ``grad`` so it stays
    consistent with the other inputs.
    """
    scenario = pd.DataFrame({
        'Cost': [cost],
        'Grad': [grad],
        'Debt': [debt],
        'City': [city],
        'Cost_Grad': [cost * grad],
    })
    return model.predict(scenario)[0]


costs = [20000, 30000, 40000]

# b. Predictions at the helper's default graduation rate (60)
print("\nb. Predictions for 60% graduation rate:")
for cost in costs:
    earnings_at_60 = predict_earnings(cost)
    print(f"Cost: ${cost}, Predicted Earnings: ${earnings_at_60:.2f}")

# c. Same costs with the graduation rate raised to 80
print("\nc. Predictions for 80% graduation rate:")
for cost in costs:
    earnings_at_80 = predict_earnings(cost, grad=80)
    print(f"Cost: ${cost}, Predicted Earnings: ${earnings_at_80:.2f}")

# Gap between the two graduation-rate scenarios at each cost level
print("\nComparison of 60% vs 80% graduation rates:")
for cost in costs:
    diff = predict_earnings(cost, grad=80) - predict_earnings(cost, grad=60)
    print(f"Cost: ${cost}, Difference in Earnings: ${diff:.2f}")

                            OLS Regression Results                            
Dep. Variable:               Earnings   R-squared:                       0.462
Model:                            OLS   Adj. R-squared:                  0.437
Method:                 Least Squares   F-statistic:                     18.86
Date:                Sun, 08 Sep 2024   Prob (F-statistic):           1.68e-13
Time:                        14:42:16   Log-Likelihood:                -1160.8
No. Observations:                 116   AIC:                             2334.
Df Residuals:                     110   BIC:                             2350.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   3.417e+04    1.2e+04      2.847      0.0

In [21]:
###### number 19

import pandas as pd
import numpy as np
from statsmodels.formula.api import ols

# Read the 'Rental' sheet
data = pd.read_excel(r'/content/Copy of jaggia_ba_2e_ch08_data.xlsx', sheet_name='Rental', engine='openpyxl')

# Explicit Bed x Sqft product column so the interaction can be named in the formula
data['Bed_Sqft'] = data['Bed'] * data['Sqft']

# a. Fit Rent on Bed, Bath, Sqft and the Bed x Sqft interaction
model = ols('Rent ~ Bed + Bath + Sqft + Bed_Sqft', data=data).fit()
print(model.summary())

# Describe the direction in which Sqft shifts the Bed effect
print("\na. Interpretation of the interaction coefficient:")
print(f"The coefficient of Bed_Sqft is {model.params['Bed_Sqft']:.4f}")
interaction_is_positive = model.params['Bed_Sqft'] > 0
print(
    "As the square footage increases, the effect of the number of bedrooms on rent becomes more positive."
    if interaction_is_positive
    else "As the square footage increases, the effect of the number of bedrooms on rent becomes more negative."
)


# b. Scenario helper
def predict_rent(sqft, bath, bed):
    """Return the fitted model's rent prediction for one home.

    The Bed_Sqft column is rebuilt from ``bed`` and ``sqft`` so it stays
    consistent with the other inputs.
    """
    home = pd.DataFrame({
        'Sqft': [sqft],
        'Bath': [bath],
        'Bed': [bed],
        'Bed_Sqft': [bed * sqft],
    })
    return model.predict(home)[0]


# b. Homes with 1,600 sqft, 2 baths, and 2, 3, 4 bedrooms
print("\nb. Predictions for homes with 1,600 sqft and 2 baths:")
for bed in (2, 3, 4):
    rent = predict_rent(1600, 2, bed)
    print(f"{bed} bedrooms: ${rent:.2f}")

# Extra rent attributed to each additional bedroom at 1,600 sqft
rent_2to3 = predict_rent(1600, 2, 3) - predict_rent(1600, 2, 2)
rent_3to4 = predict_rent(1600, 2, 4) - predict_rent(1600, 2, 3)
print(f"\nIncremental rent from 2 to 3 bedrooms: ${rent_2to3:.2f}")
print(f"Incremental rent from 3 to 4 bedrooms: ${rent_3to4:.2f}")

# c. Same homes at 2,400 sqft
print("\nc. Predictions for homes with 2,400 sqft and 2 baths:")
for bed in (2, 3, 4):
    rent = predict_rent(2400, 2, bed)
    print(f"{bed} bedrooms: ${rent:.2f}")

# Extra rent attributed to each additional bedroom at 2,400 sqft
rent_2to3_2400 = predict_rent(2400, 2, 3) - predict_rent(2400, 2, 2)
rent_3to4_2400 = predict_rent(2400, 2, 4) - predict_rent(2400, 2, 3)
print(f"\nIncremental rent from 2 to 3 bedrooms: ${rent_2to3_2400:.2f}")
print(f"Incremental rent from 3 to 4 bedrooms: ${rent_3to4_2400:.2f}")

                            OLS Regression Results                            
Dep. Variable:                   Rent   R-squared:                       0.777
Model:                            OLS   Adj. R-squared:                  0.765
Method:                 Least Squares   F-statistic:                     65.47
Date:                Sun, 08 Sep 2024   Prob (F-statistic):           1.03e-23
Time:                        15:04:22   Log-Likelihood:                -579.11
No. Observations:                  80   AIC:                             1168.
Df Residuals:                      75   BIC:                             1180.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    950.4669    299.648      3.172      0.0

In [12]:
###### number 21

import numpy as np

# Define the models
def linear_model(x, intercept, coefficient):
    """Evaluate the straight-line model ``intercept + coefficient * x`` at ``x``."""
    slope_term = coefficient * x
    return intercept + slope_term

def quadratic_model(x, intercept, coefficient1, coefficient2):
    """Evaluate ``intercept + coefficient1 * x + coefficient2 * x**2`` at ``x``."""
    linear_part = intercept + coefficient1 * x
    curvature_part = coefficient2 * x**2
    return linear_part + curvature_part

# Estimated coefficients of the two candidate models
linear_params = dict(intercept=13.3087, coefficient=0.3392)
quadratic_params = dict(intercept=1.7656, coefficient1=4.0966, coefficient2=-0.2528)

# Fit statistic quoted for each model.
# NOTE(review): the quadratic model nests the linear one, so a plain R-squared
# could never favour the linear model -- confirm whether these figures are
# adjusted R-squared values.
r_squared = dict(linear=0.1317, quadratic=0.5844)


# a. Pick the better-fitting model
def best_model():
    """Return "quadratic" when its r_squared entry is strictly larger, else "linear"."""
    return "quadratic" if r_squared['quadratic'] > r_squared['linear'] else "linear"


# b. Predict y for given x values
def predict_y(x_values):
    """Return the chosen model's prediction for every value in ``x_values``."""
    if best_model() == "linear":
        model_fn, params = linear_model, linear_params
    else:
        model_fn, params = quadratic_model, quadratic_params
    return [model_fn(x, **params) for x in x_values]


# Main execution
if __name__ == "__main__":
    print("a. Model comparison:")
    better_model = best_model()
    print(f"The {better_model} model fits the data better based on the higher R-squared value.")

    print("\nb. Predictions:")
    x_values = [4, 8, 12]
    y_predictions = predict_y(x_values)
    for x, y in zip(x_values, y_predictions):
        print(f"For x = {x}, predicted y = {y:.4f}")

a. Model comparison:
The quadratic model fits the data better based on the higher R-squared value.

b. Predictions:
For x = 4, predicted y = 14.1072
For x = 8, predicted y = 18.3592
For x = 12, predicted y = 14.5216


In [13]:
###### number 22


import math

def linear_model(x, intercept, coefficient):
    """Return the linear-model prediction ``intercept + coefficient * x``."""
    predicted = intercept + coefficient * x
    return predicted

def logarithmic_model(x, intercept, coefficient):
    """Return the prediction ``intercept + coefficient * ln(x)``.

    ``math.log`` raises ``ValueError`` when ``x`` is not positive.
    """
    log_x = math.log(x)
    return intercept + coefficient * log_x

def exponential_model(x, intercept, coefficient, se=0.0):
    """Predict y from an exponential model, ``ln(y) = intercept + coefficient * x``.

    Args:
        x: Value of the predictor.
        intercept: Estimated intercept of the ln(y) regression.
        coefficient: Estimated slope on x.
        se: Standard error of the estimate of the ln(y) regression. When
            supplied, the prediction includes the ``se**2 / 2`` correction for
            back-transforming a log response. The default of 0.0 reproduces
            the plain ``exp`` back-transform.

    Returns:
        The predicted y on its original (un-logged) scale.
    """
    log_prediction = intercept + coefficient * x
    # exp() of the predicted log understates the mean of y; with normally
    # distributed errors the mean is exp(mu + se^2 / 2).
    return math.exp(log_prediction + se ** 2 / 2)

def log_log_model(x, intercept, coefficient, se=0.0):
    """Predict y from a log-log model, ``ln(y) = intercept + coefficient * ln(x)``.

    Args:
        x: Value of the predictor; must be positive (``math.log`` raises
            ``ValueError`` otherwise).
        intercept: Estimated intercept of the ln(y) regression.
        coefficient: Estimated slope on ln(x).
        se: Standard error of the estimate of the ln(y) regression. When
            supplied, the prediction includes the ``se**2 / 2`` correction for
            back-transforming a log response. The default of 0.0 reproduces
            the plain ``exp`` back-transform.

    Returns:
        The predicted y on its original (un-logged) scale.
    """
    log_prediction = intercept + coefficient * math.log(x)
    # exp() of the predicted log understates the mean of y; with normally
    # distributed errors the mean is exp(mu + se^2 / 2).
    return math.exp(log_prediction + se ** 2 / 2)

# Estimated intercept and slope of each candidate model
models = {
    'Linear': {'intercept': 18.52, 'coefficient': 1.68},
    'Logarithmic': {'intercept': -6.74, 'coefficient': 29.96},
    'Exponential': {'intercept': 1.48, 'coefficient': 0.06},
    'Log-Log': {'intercept': 1.02, 'coefficient': 0.96}
}

# Predict y for x = 50
x = 50

# Each model's parameter dict is unpacked into the matching prediction function
predictions = {
    'Linear': linear_model(x, **models['Linear']),
    'Logarithmic': logarithmic_model(x, **models['Logarithmic']),
    'Exponential': exponential_model(x, **models['Exponential']),
    'Log-Log': log_log_model(x, **models['Log-Log'])
}

# The loop variable is deliberately not called `model`: other cells in this
# notebook keep a fitted regression under that name, and a top-level loop
# variable would overwrite it in the shared kernel namespace.
for model_name, prediction in predictions.items():
    print(f"{model_name} model prediction for x = 50: {prediction:.2f}")

Linear model prediction for x = 50: 102.52
Logarithmic model prediction for x = 50: 110.46
Exponential model prediction for x = 50: 88.23
Log-Log model prediction for x = 50: 118.57


In [14]:
###### number 24

import math

def log_log_model(x, intercept, coefficient, se=0.0):
    """Predict y from a log-log model, ``ln(y) = intercept + coefficient * ln(x)``.

    Args:
        x: Value of the predictor; must be positive (``math.log`` raises
            ``ValueError`` otherwise).
        intercept: Estimated intercept of the ln(y) regression.
        coefficient: Estimated slope on ln(x).
        se: Standard error of the estimate of the ln(y) regression. When
            supplied, the prediction includes the ``se**2 / 2`` correction for
            back-transforming a log response. The default of 0.0 reproduces
            the plain ``exp`` back-transform.

    Returns:
        The predicted y on its original (un-logged) scale.
    """
    log_prediction = intercept + coefficient * math.log(x)
    # exp() of the predicted log understates the mean of y; with normally
    # distributed errors the mean is exp(mu + se^2 / 2).
    return math.exp(log_prediction + se ** 2 / 2)

def exponential_model(x, intercept, coefficient, se=0.0):
    """Predict y from an exponential model, ``ln(y) = intercept + coefficient * x``.

    Args:
        x: Value of the predictor.
        intercept: Estimated intercept of the ln(y) regression.
        coefficient: Estimated slope on x.
        se: Standard error of the estimate of the ln(y) regression. When
            supplied, the prediction includes the ``se**2 / 2`` correction for
            back-transforming a log response. The default of 0.0 reproduces
            the plain ``exp`` back-transform.

    Returns:
        The predicted y on its original (un-logged) scale.
    """
    log_prediction = intercept + coefficient * x
    # exp() of the predicted log understates the mean of y; with normally
    # distributed errors the mean is exp(mu + se^2 / 2).
    return math.exp(log_prediction + se ** 2 / 2)

# Estimated intercept and slope of each candidate model
models = {
    'Log-Log': dict(intercept=1.8826, coefficient=0.3663),
    'Exponential': dict(intercept=2.0219, coefficient=0.0513),
}

# R-squared quoted for each model
r_squared = {'Log-Log': 0.5187, 'Exponential': 0.6660}


def best_model():
    """Return the name of the model with the largest r_squared entry (first wins on a tie)."""
    top_name, _top_score = max(r_squared.items(), key=lambda entry: entry[1])
    return top_name


# Predict y for given x
def predict_y(x):
    """Return the prediction at ``x`` from whichever model ``best_model`` selects."""
    better_model = best_model()
    predictor = log_log_model if better_model == 'Log-Log' else exponential_model
    chosen_params = models['Log-Log'] if better_model == 'Log-Log' else models['Exponential']
    return predictor(x, **chosen_params)


# Main execution
if __name__ == "__main__":
    print("a. Model comparison:")
    better_model = best_model()
    print(f"The {better_model} model fits the data better based on the higher R-squared value.")

    print("\nb. Prediction:")
    x = 20
    y_prediction = predict_y(x)
    print(f"For x = {x}, predicted y = {y_prediction:.4f}")

a. Model comparison:
The Exponential model fits the data better based on the higher R-squared value.

b. Prediction:
For x = 20, predicted y = 21.0710


In [19]:
###### number 36

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import math

# Read the 'Happiness' sheet
data = pd.read_excel(r'/content/Copy of jaggia_ba_2e_ch08_data.xlsx', sheet_name='Happiness', engine='openpyxl')

# Derived regressors: a squared age term and the natural log of income
data['Age_squared'] = data['Age'] ** 2
data['ln_Income'] = np.log(data['Income'])

# Feature matrix and target
feature_columns = ['Age', 'Age_squared', 'ln_Income']
X = data[feature_columns]
y = data['Happiness']

# Fit the model
model = LinearRegression()
model.fit(X, y)


def predict_happiness(age, income):
    """Return the fitted model's happiness prediction for one age/income pair.

    The derived regressors are rebuilt from the raw inputs the same way they
    were built for the training data.
    """
    X_new = pd.DataFrame({
        'Age': [age],
        'Age_squared': [age ** 2],
        'ln_Income': [math.log(income)],
    })
    return model.predict(X_new)[0]


# Report the fitted equation; coef_ follows the column order of X
age_coef, age_squared_coef, ln_income_coef = model.coef_
print("Regression Model:")
print(f"Intercept: {model.intercept_:.4f}")
print(f"Coefficient for Age: {age_coef:.4f}")
print(f"Coefficient for Age^2: {age_squared_coef:.4f}")
print(f"Coefficient for ln(Income): {ln_income_coef:.4f}")

# Part b: hold income fixed and vary age
income_b = 80000
ages_b = [30, 45, 60]
print("\nPredictions for part b:")
for age in ages_b:
    happiness = predict_happiness(age, income_b)
    print(f"Age: {age}, Income: ${income_b}, Predicted Happiness: {happiness:.2f}")

# Part c: hold age fixed and vary income
age_c = 60
incomes_c = [25000, 75000, 125000]
print("\nPredictions for part c:")
for income in incomes_c:
    happiness = predict_happiness(age_c, income)
    print(f"Age: {age_c}, Income: ${income}, Predicted Happiness: {happiness:.2f}")

Regression Model:
Intercept: -13.3021
Coefficient for Age: -2.4296
Coefficient for Age^2: 0.0241
Coefficient for ln(Income): 12.7210

Predictions for part b:
Age: 30, Income: $80000, Predicted Happiness: 79.09
Age: 45, Income: $80000, Predicted Happiness: 69.72
Age: 60, Income: $80000, Predicted Happiness: 71.18

Predictions for part c:
Age: 60, Income: $25000, Predicted Happiness: 56.39
Age: 60, Income: $75000, Predicted Happiness: 70.36
Age: 60, Income: $125000, Predicted Happiness: 76.86
