In [1]:
import pandas as pd    #import pandas for data
from patsy import dmatrices    #import patsy for easy matrix specification
import statsmodels.api as sm    #import statsmodels for the stats models
import statsmodels.formula.api as smf    #import .formula for R-like formula specification
from sklearn.linear_model import LinearRegression    #import linear regression from sklearn for yet another way to do it

In [2]:
# Load the whitespace-delimited salary data set straight from its URL.
url = "http://data.princeton.edu/wws509/datasets/salary.dat"    # location of the raw data file
# Raw string for the regex separator: '\s' is not a valid escape in a normal
# string literal and triggers a DeprecationWarning/SyntaxWarning on recent Pythons.
data = pd.read_csv(url, sep=r'\s+')    # columns are separated by runs of whitespace
data.head()    # preview the first rows (sx, rk, yr, dg, yd, sl)

Unnamed: 0,sx,rk,yr,dg,yd,sl
0,male,full,25,doctorate,35,36350
1,male,full,13,doctorate,22,35350
2,male,full,10,doctorate,23,28200
3,female,full,7,doctorate,27,26775
4,male,full,19,masters,30,33696


In [3]:
# Build the dependent-variable matrix (y) and the design matrix (X) from an
# R-style formula: salary (sl) explained by sex (sx), years (yr) and rank (rk).
y, X = dmatrices(
    'sl ~ sx + yr + rk',
    data=data,
    return_type='dataframe',
)

In [4]:
# Preview both matrices, one after the other:
#   y - the dependent variable (salary, sl)
#   X - the independent variables: constant, gender dummy, associate dummy,
#       full dummy, years at current rank
print(y.head(), X.head(), sep='\n')

      sl
0  36350
1  35350
2  28200
3  26775
4  33696
   Intercept  sx[T.male]  rk[T.associate]  rk[T.full]  yr
0          1           1                0           1  25
1          1           1                0           1  13
2          1           1                0           1  10
3          1           0                0           1   7
4          1           1                0           1  19


In [5]:
# Specify the statsmodels OLS model (y is the response, X the design matrix) ...
model = sm.OLS(endog=y, exog=X)
# ... and estimate it.
results = model.fit()
# Double-check which kinds of objects were just created.
print('type of model:', type(model))
print('type of results:', type(results))

type of model: <class 'statsmodels.regression.linear_model.OLS'>
type of results: <class 'statsmodels.regression.linear_model.RegressionResultsWrapper'>


In [6]:
results.summary()    # display the OLS regression table: fit statistics, coefficients, standard errors and diagnostics

0,1,2,3
Dep. Variable:,sl,R-squared:,0.846
Model:,OLS,Adj. R-squared:,0.833
Method:,Least Squares,F-statistic:,64.64
Date:,"Fri, 15 Apr 2016",Prob (F-statistic):,1.64e-18
Time:,13:08:50,Log-Likelihood:,-476.26
No. Observations:,52,AIC:,962.5
Df Residuals:,47,BIC:,972.3
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,1.643e+04,737.966,22.265,0.000,1.49e+04 1.79e+04
sx[T.male],-524.1492,834.687,-0.628,0.533,-2203.323 1155.024
rk[T.associate],4373.9154,906.124,4.827,0.000,2551.030 6196.801
rk[T.full],9483.8419,912.795,10.390,0.000,7647.536 1.13e+04
yr,390.9358,75.383,5.186,0.000,239.285 542.587

0,1,2,3
Omnibus:,23.039,Durbin-Watson:,1.832
Prob(Omnibus):,0.0,Jarque-Bera (JB):,38.727
Skew:,1.41,Prob(JB):,3.9e-09
Kurtosis:,6.15,Cond. No.,32.3


In [None]:
# Simple regression of salary (sl) on years (yr) only, using the R-like
# formula interface; this rebinds `results` to the new fit.
results = (
    smf.ols(formula="sl ~ yr", data=data)
    .fit()
)
results.summary()    # show the regression table for this fit

In [None]:
# Fit the same specification that was passed to dmatrices (sl ~ sx + yr + rk),
# this time through the formula interface; `results` is rebound again.
results = (
    smf.ols(formula="sl ~ sx + yr + rk", data=data)
    .fit()
)
results.summary()    # show the regression table for this fit

In [None]:
# Estimate the regression with scikit-learn as a third approach.
# Note: this rebinds `model`, replacing the statsmodels OLS object created earlier.
model = LinearRegression()
# NOTE(review): X already carries patsy's Intercept column while LinearRegression
# fits its own intercept by default — confirm the redundant constant column is intended.
model.fit(X=X, y=y)

In [None]:
model.score(X, y)    # R^2 of the fitted sklearn model, evaluated on the same X and y it was fitted on

In [None]:
model.coef_    # estimated coefficients of the sklearn model, one per column of X

In [None]:
model.intercept_    # the constant term estimated by LinearRegression itself (reported separately from coef_)