In [1]:
import pandas as pd

In [2]:
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [3]:
from ucimlrepo import fetch_ucirepo

# Download the UCI repository entry registered under id 1.
abalone = fetch_ucirepo(id=1)

# Pull the feature frame and the target frame out of the fetched payload.
X, y = abalone.data.features, abalone.data.targets

# Print the dataset metadata first, then the per-variable description.
for section in (abalone.metadata, abalone.variables):
    print(section)


{'uci_id': 1, 'name': 'Abalone', 'repository_url': 'https://archive.ics.uci.edu/dataset/1/abalone', 'data_url': 'https://archive.ics.uci.edu/static/public/1/data.csv', 'abstract': 'Predict the age of abalone from physical measurements', 'area': 'Biology', 'tasks': ['Classification', 'Regression'], 'characteristics': ['Tabular'], 'num_instances': 4177, 'num_features': 8, 'feature_types': ['Categorical', 'Integer', 'Real'], 'demographics': [], 'target_col': ['Rings'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1994, 'last_updated': 'Mon Aug 28 2023', 'dataset_doi': '10.24432/C55C7W', 'creators': ['Warwick Nash', 'Tracy Sellers', 'Simon Talbot', 'Andrew Cawthorn', 'Wes Ford'], 'intro_paper': None, 'additional_info': {'summary': 'Predicting the age of abalone from physical measurements.  The age of abalone is determined by cutting the shell through the cone, staining it, and counting the number of rings through a microscope -- 

In [4]:
# Inspect the target frame: a single "Rings" column (pandas abbreviates the long display).
y

Unnamed: 0,Rings
0,15
1,7
2,9
3,10
4,7
...,...
4172,11
4173,10
4174,9
4175,10


In [5]:
# Put features and target side by side in one frame so formulas can reference every column.
df = pd.concat([X, y], axis="columns")

In [6]:
# Preview the first five rows of the combined frame.
df.head(n=5)

Unnamed: 0,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [7]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

In [8]:
# Interaction-only specification: each of Length, Height and Viscera_weight
# is crossed with the categorical Sex factor.
# An alternative formula kept from an earlier attempt: 'Rings ~ Length+Height:Viscera_weight'
interaction_formula = 'Rings ~ (Length+Height+Viscera_weight):C(Sex)'
model = ols(formula=interaction_formula, data=df).fit()

In [9]:
# Type II ANOVA table (typ=2) for the fitted interaction model.
sm.stats.anova_lm(model, typ=2)

Unnamed: 0,sum_sq,df,F,PR(>F)
Length:C(Sex),1165.169542,3.0,59.610786,9.971382e-38
Height:C(Sex),1355.129851,3.0,69.329272,9.539669999999999e-44
Viscera_weight:C(Sex),640.705886,3.0,32.778905,6.196963e-21
Residual,27149.792687,4167.0,,


# Exercício

Usando os dados conhecidos do molusco abalone, encontre um modelo (relação entre a variável dependente e as variáveis independentes) em que todos os termos sejam estatisticamente significativos e que reduza o resíduo usando o menor número possível de termos.

[Exemplo](https://www.statology.org/two-way-anova-python/)



Modelo intermediário (menos interações)

In [10]:
# Main-effects specification: Sex as a categorical factor plus three numeric predictors,
# with no interaction terms.
formula_mid = "Rings ~ C(Sex) + Length + Height + Viscera_weight"
model_mid = ols(formula_mid, data=df).fit()

# Full regression report as text, then the type II ANOVA table as the cell's rich output.
print(model_mid.summary())
anova_mid = sm.stats.anova_lm(model_mid, typ=2)
anova_mid


                            OLS Regression Results                            
Dep. Variable:                  Rings   R-squared:                       0.361
Model:                            OLS   Adj. R-squared:                  0.360
Method:                 Least Squares   F-statistic:                     470.6
Date:                Sat, 29 Nov 2025   Prob (F-statistic):               0.00
Time:                        00:40:44   Log-Likelihood:                -9882.0
No. Observations:                4177   AIC:                         1.978e+04
Df Residuals:                    4171   BIC:                         1.981e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          3.2252      0.311     10.

Unnamed: 0,sum_sq,df,F,PR(>F)
C(Sex),861.131587,2.0,64.709178,2.110262e-28
Length,741.520675,1.0,111.442187,9.967308999999999e-26
Height,1110.057613,1.0,166.829129,1.887865e-37
Viscera_weight,125.357745,1.0,18.839854,1.455344e-05
Residual,27753.248747,4171.0,,


Modelo parcimonioso

In [11]:
# Candidate specification: Sex as a categorical factor plus Length, Diameter and Whole_weight.
# NOTE(review): in the ANOVA table recorded for this cell, Whole_weight has PR(>F) of about 0.105,
# i.e. it is not significant at the 5% level -- consider dropping it and re-checking the other terms.
formula_pars = "Rings ~ C(Sex) + Length + Diameter + Whole_weight"
model_pars = ols(formula_pars, data=df).fit()

# Full regression report as text, then the type II ANOVA table as the cell's rich output.
print(model_pars.summary())
anova_pars = sm.stats.anova_lm(model_pars, typ=2)
anova_pars


                            OLS Regression Results                            
Dep. Variable:                  Rings   R-squared:                       0.353
Model:                            OLS   Adj. R-squared:                  0.352
Method:                 Least Squares   F-statistic:                     454.9
Date:                Sat, 29 Nov 2025   Prob (F-statistic):               0.00
Time:                        00:41:33   Log-Likelihood:                -9907.3
No. Observations:                4177   AIC:                         1.983e+04
Df Residuals:                    4171   BIC:                         1.986e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        4.7347      0.336     14.105   

Unnamed: 0,sum_sq,df,F,PR(>F)
C(Sex),750.195637,2.0,55.69496,1.3468799999999999e-24
Length,161.838007,1.0,24.029895,9.845678e-07
Diameter,709.341564,1.0,105.323859,2.0194909999999998e-24
Whole_weight,17.71424,1.0,2.630231,0.104922
Residual,28091.105701,4171.0,,
