# Core Statistics Using Python
### Hana Choi, Simon Business School, University of Rochester


# Handling Nonlinearity   

## Topics covered

- Quadratic regression
- Logarithms
- Dummies and interactions

## Required packages

In [None]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns

# Quadratic regression: CA School Data

## Load data

In [None]:
# Read the CA school data (caschool.csv) into a DataFrame.
# NOTE(review): the path is absolute and machine-specific; point it at your local copy.
caschool_csv_path = "/Users/hanachoi/Dropbox/teaching/core_statistics/Data/caschool.csv"
caschool = pd.read_csv(caschool_csv_path)

# Preview the first few rows as the cell output
caschool.head()

## Scatter plot of income vs. test scores

In [None]:
# Pull out the two columns being compared: income (x-axis) and test scores (y-axis)
income_values = caschool['income']
testscr_values = caschool['testscr']
plt.scatter(income_values, testscr_values)

# Title and axis labels, then render the figure
plt.title('Scatter plot of Income vs Test Scores')
plt.xlabel('Income')
plt.ylabel('Test Scores')
plt.show()

## Regression analysis

In [None]:
# OLS of test scores on income (straight-line specification)
linear_model = smf.ols(formula='testscr ~ income', data=caschool).fit()

# Print only tables[1] of the summary, i.e. the coefficient estimates
linear_coef_table = linear_model.summary().tables[1]
print(linear_coef_table)

In [None]:
# OLS of test scores on income and income squared.
# I(...) makes the formula parser treat income**2 as plain arithmetic.
quadratic_model = smf.ols(formula='testscr ~ income + I(income**2)', data=caschool).fit()

# Print only tables[1] of the summary, i.e. the coefficient estimates
quadratic_coef_table = quadratic_model.summary().tables[1]
print(quadratic_coef_table)

In [None]:
# Compare the linear and quadratic fits against the raw data.
# Every layer shares the same x-axis (income); layers are drawn in list order.
income_axis = caschool['income']
fit_layers = [
    (caschool['testscr'], 'black', 'Data Points', 9),
    (linear_model.fittedvalues, 'green', 'Linear Fit', 10),
    (quadratic_model.fittedvalues, 'red', 'Quadratic Fit', 10),
]
for layer_y, layer_color, layer_label, layer_size in fit_layers:
    plt.scatter(income_axis, layer_y, color=layer_color, label=layer_label, s=layer_size)

plt.legend()
plt.show()

# Logarithms and interpretations

## Example1: Sales data

### Load and describe data

In [None]:
# Read the sales/advertising data (SalesAdvert.csv) into a DataFrame.
# NOTE(review): the path is absolute and machine-specific; point it at your local copy.
salesadvert_csv_path = "/Users/hanachoi/Dropbox/teaching/core_statistics/Data/SalesAdvert.csv"
salesadvert = pd.read_csv(salesadvert_csv_path)

# Preview the first few rows as the cell output
salesadvert.head()

In [None]:
# Display summary statistics for the salesadvert DataFrame.
# The value of describe() is the cell output, so it must stay the last expression.
salesadvert.describe()

### Level-level regression

In [None]:
# Level-level specification: sales and advert both enter in levels
model_ll = smf.ols(formula='sales ~ advert', data=salesadvert).fit()

# Coefficient table, a separator line, then the R-squared (one item per line)
print(
    model_ll.summary().tables[1],
    '----',
    f"Level-level model R-squared: {model_ll.rsquared}",
    sep='\n',
)

### Level-log regression

In [None]:
# Level-log specification: sales in levels, advert in natural logs
model_ll_log = smf.ols(formula='sales ~ np.log(advert)', data=salesadvert).fit()

# Coefficient table, a separator line, then the R-squared (one item per line)
print(
    model_ll_log.summary().tables[1],
    '----',
    f"Level-log model R-squared: {model_ll_log.rsquared}",
    sep='\n',
)

### Log-level regression

In [None]:
# Log-level specification: sales in natural logs, advert in levels
model_log_ll = smf.ols(formula='np.log(sales) ~ advert', data=salesadvert).fit()

# Coefficient table, a separator line, then the R-squared (one item per line)
print(
    model_log_ll.summary().tables[1],
    '----',
    f"Log-level model R-squared: {model_log_ll.rsquared}",
    sep='\n',
)

### Log-log regression

In [None]:
# Log-log specification: both sales and advert in natural logs
model_log_log = smf.ols(formula='np.log(sales) ~ np.log(advert)', data=salesadvert).fit()

# Coefficient table, a separator line, then the R-squared (one item per line)
print(
    model_log_log.summary().tables[1],
    '----',
    f"Log-log model R-squared: {model_log_log.rsquared}",
    sep='\n',
)

## Example2: CA school data, again

### Level-level regression

- $R^2 = 0.51$
- Interpretation: A \$1000 increase in income is expected to increase test score by 1.88 points

In [None]:
# Level-level results for the CA school data.
# Reuses linear_model (testscr ~ income) fitted in the quadratic-regression section.
print(
    linear_model.summary().tables[1],
    '----',
    f"Level-level model R-squared: {linear_model.rsquared}",
    sep='\n',
)

### Level-log regression

- $R^2 = 0.56$
- Interpretation: A 1% increase in income is associated with an increase in test scores of 0.01*36.4 = 0.364.
- mean(income) is 15.3K, so a 1% increase in income is about a \$153 increase in income.
- So the level-log model says that an increase in income of about \$153 is associated with an increase in test scores of 0.364.
- Let's compare this result with the level-level model.
- From the level-level model, a \$153 increase in income is associated with a 0.153*1.88=0.288 increase in test scores.
- Which model and result should we use?
- The $R^2$ is higher for the level-log model (0.56) than for the level-level model (0.51). The two can be compared because both models have the same dependent variable (testscr), so the level-log model is preferred. Also, OLS Assumption 1 is better satisfied (see the plot below).


In [None]:
# Level-log specification: test scores in levels, income in natural logs
caschool_ll_log = smf.ols(formula='testscr ~ np.log(income)', data=caschool).fit()

# Sample mean of income, printed so a 1% change can be expressed in dollars
mean_income = caschool['income'].mean()

# Coefficient table, R-squared and mean income, separated by '----' lines
print(
    caschool_ll_log.summary().tables[1],
    '----',
    f"Level-log model R-squared: {caschool_ll_log.rsquared}",
    '----',
    f"Mean Income: {mean_income}",
    sep='\n',
)

In [None]:
# Quadratic vs. level-log fitted values: the two curves are pretty similar.
# The linear fit is deliberately left out of this comparison.
income_axis = caschool['income']
curve_layers = [
    (caschool['testscr'], 'black', 'Data Points', 9),
    (quadratic_model.fittedvalues, 'red', 'Quadratic Fit', 10),
    (caschool_ll_log.fittedvalues, 'blue', 'Level-log Fit', 10),
]
for layer_y, layer_color, layer_label, layer_size in curve_layers:
    plt.scatter(income_axis, layer_y, color=layer_color, label=layer_label, s=layer_size)

plt.legend()
plt.show()

### Log-level regression

- $R^2= 0.50$
- Interpretation: A \$1000 increase in income is associated with a 100*0.00284=0.284 percent increase in test scores.
- mean(test score) is 654.2. Evaluated at the mean test score, this is a 0.00284*654.16=1.86 point increase in test scores.
- The log-level result is quite similar to the level-level model result.
- Note that we cannot compare the log-level model $R^2$ with either level-level or level-log model $R^2$, because $R^2$ can only be used to compare regressions with the same dependent variable.

In [None]:
# Log-level specification: test scores in natural logs, income in levels
caschool_log_ll = smf.ols(formula='np.log(testscr) ~ income', data=caschool).fit()

# Sample mean of test scores, printed so a percent change can be expressed in points
mean_testscr = caschool['testscr'].mean()

# Coefficient table, R-squared and mean test score, separated by '----' lines
print(
    caschool_log_ll.summary().tables[1],
    '----',
    f"Log-level model R-squared: {caschool_log_ll.rsquared}",
    '----',
    f"Mean Test Scores: {mean_testscr}",
    sep='\n',
)

### Log-log regression

- $R^2= 0.56$
- Interpretation: A 1% increase in income is associated with a 0.055% increase in test scores.
- mean(test scores) is 654.2. Evaluated at the mean level, this is a 0.00055*654.16=0.36 point increase in test scores, which is similar in magnitude to the level-log model.
- The $R^2$ is higher for the log-log model (0.56) than the log-level model (0.50). We can make this comparison, because the dependent variable is the same, log(testscr). Therefore the log-log model is preferred to the log-level model.

In [None]:
# Log-log specification: both test scores and income in natural logs
caschool_log_log = smf.ols(formula='np.log(testscr) ~ np.log(income)', data=caschool).fit()

# Coefficient table, a separator line, then the R-squared (one item per line)
print(
    caschool_log_log.summary().tables[1],
    '----',
    f"Log-log model R-squared: {caschool_log_log.rsquared}",
    sep='\n',
)

## Example3: RFJ data

### Load data

In [None]:
# Read the RFJ data (rfj_small.csv) into a DataFrame.
# NOTE(review): the path is absolute and machine-specific; point it at your local copy.
rfj_csv_path = "/Users/hanachoi/Dropbox/teaching/core_statistics/Data/rfj_small.csv"
rfj_small = pd.read_csv(rfj_csv_path)

### Level-level regression

In [None]:
# Linear demand model: quantity q1 regressed on price p1, both in levels
model_rfj = smf.ols(formula='q1 ~ p1', data=rfj_small).fit()
print(model_rfj.summary().tables[1], '----', sep='\n')

# In a linear model the elasticity is not the slope itself, so evaluate it
# at the sample means: slope * mean(price) / mean(quantity)
avg_price = rfj_small['p1'].mean()
avg_quantity = rfj_small['q1'].mean()
slope_estimate = model_rfj.params['p1']
elasticity = (slope_estimate * avg_price) / avg_quantity
print("Price Elasticity Level-level model:", elasticity)

### Log-log regression

- Note that elasticity is computed automatically (since it's the slope here)
- However, it's a different value than what you found in the problem set.
- This is because the demand model is different: the log-log model (namely "Constant Elasticity Demand Model") is curved, but the linear model is straight.

In [None]:
# Constant-elasticity demand model: log quantity regressed on log price
model_rfj_log_log = smf.ols(formula='np.log(q1) ~ np.log(p1)', data=rfj_small).fit()
print(model_rfj_log_log.summary().tables[1], '----', sep='\n')

# In the log-log model the slope on np.log(p1) is itself the price elasticity
# (the original notebook reports an estimate of -2.8965 for this data)
elasticity_log_log = model_rfj_log_log.params['np.log(p1)']
print("Price Elasticity Log-log model:", elasticity_log_log)

# Dummies and interactions: Earnings Data

## Load data

In [None]:
# Read the earnings data (earnings.csv) into a DataFrame.
# NOTE(review): the path is absolute and machine-specific; point it at your local copy.
earnings_csv_path = "/Users/hanachoi/Dropbox/teaching/core_statistics/Data/earnings.csv"
earnings = pd.read_csv(earnings_csv_path)

# Lowercase every column name so the regression formulas are easier to type
earnings.columns = earnings.columns.str.lower()

# Preview the first few rows as the cell output
earnings.head()

## Various regressions with interactions

In [None]:
# Four wage regressions of increasing richness:
#   1. gender dummy only
#   2. years of education only
#   3. gender dummy and education (different intercepts, common slope)
#   4. adds the female:yrseduc interaction (different intercepts and slopes)
model_wage_female = smf.ols(formula='wage ~ female', data=earnings).fit()
model_wage_yrseduc = smf.ols(formula='wage ~ yrseduc', data=earnings).fit()
model_wage_female_yrseduc = smf.ols(formula='wage ~ female + yrseduc', data=earnings).fit()
model_wage_interaction = smf.ols(formula='wage ~ female + yrseduc + female:yrseduc', data=earnings).fit()

# Print the full summary of each model, in the order they were fitted
wage_models = (
    model_wage_female,
    model_wage_yrseduc,
    model_wage_female_yrseduc,
    model_wage_interaction,
)
for fitted_wage_model in wage_models:
    print(fitted_wage_model.summary())

## (Optional) Earnings Plot 

- Creating a pretty (advanced) plot with the seaborn package
- seaborn allows us to adjust labels, legends, ticks, grids, etc easily and flexibly.

In [None]:
# Seaborn theme: white background with grid lines
sns.set(style="whitegrid")

# Figure-level plot: scatter points plus one fitted regression line per value
# of `female`; ci=None turns off the confidence bands
plot = sns.lmplot(
    x='yrseduc',
    y='wage',
    hue='female',
    data=earnings,
    palette=['blue', 'grey'],
    ci=None,
    legend=True,
    height=6,
    aspect=1.6,
)

# Legend tweaks.
# NOTE(review): `_legend` is a private seaborn attribute; newer seaborn releases
# appear to expose public `legend` / `figure` accessors -- confirm before switching.
legend_box = plot._legend
legend_box.set_title('')  # remove the legend title

# Relabel the hue levels from (0, 1) to ('Male', 'Female');
# assumes the legend entries are ordered 0 first, then 1
new_labels = ['Male', 'Female']
for legend_text, gender_label in zip(legend_box.texts, new_labels):
    legend_text.set_text(gender_label)

# Move the legend to the top left, positioned in axes coordinates
legend_box.set_bbox_to_anchor((0.15, 0.9, 0, 0), transform=plt.gca().transAxes)

# Overall title and axis labels
plot.fig.suptitle('Impact of Education and Gender on Earnings', fontsize=20, y=1.03)
plot.set_xlabels('Education Years', fontsize=15)
plot.set_ylabels('Avg Hourly Wage ($)', fontsize=15)

# Render the figure
plt.show()