In [1]:
# Import dependencies
import pandas as pd
import scipy.stats as stats
import statsmodels.api as sm
from statsmodels.formula.api import ols

In [2]:
# Load the data and create a DataFrame for the data
short_filtered_cleaned_byarea_df = pd.read_csv('short_filtered_cleaned_byarea.csv')
short_filtered_cleaned_byarea_df.head()

Unnamed: 0,AREA,EVENT_TYPE,RACE,SEX,SITE,YEAR,AGE_ADJUSTED_RATE
0,Alabama,Incidence,American Indian/Alaska Native,Female,Brain and Other Nervous System,1999,0.0
1,Alabama,Mortality,American Indian/Alaska Native,Female,Brain and Other Nervous System,1999,0.0
2,Alabama,Incidence,American Indian/Alaska Native,Female,Brain and Other Nervous System,2000,0.0
3,Alabama,Mortality,American Indian/Alaska Native,Female,Brain and Other Nervous System,2000,0.0
4,Alabama,Incidence,American Indian/Alaska Native,Female,Brain and Other Nervous System,2001,0.0


In [3]:
# Check shape of DataFrame
short_filtered_cleaned_byarea_df.shape

(653591, 7)

In [4]:
# Check for info DataFrame
short_filtered_cleaned_byarea_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 653591 entries, 0 to 653590
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   AREA               653591 non-null  object 
 1   EVENT_TYPE         653591 non-null  object 
 2   RACE               653591 non-null  object 
 3   SEX                653591 non-null  object 
 4   SITE               653591 non-null  object 
 5   YEAR               653591 non-null  int64  
 6   AGE_ADJUSTED_RATE  653591 non-null  float64
dtypes: float64(1), int64(1), object(5)
memory usage: 34.9+ MB


In [5]:
# Convert YEAR to category
short_filtered_cleaned_byarea_df['YEAR'] = short_filtered_cleaned_byarea_df['YEAR'].astype('object')

In [6]:
# Check Datatypes of DataFrame
short_filtered_cleaned_byarea_df.dtypes

AREA                  object
EVENT_TYPE            object
RACE                  object
SEX                   object
SITE                  object
YEAR                  object
AGE_ADJUSTED_RATE    float64
dtype: object

In [7]:
# Generate our categorical variable list
short_filtered_cleaned_byarea_df_cat = short_filtered_cleaned_byarea_df.dtypes[
    short_filtered_cleaned_byarea_df.dtypes == "object"].index.tolist()


# Check the number of unique values in each column
short_filtered_cleaned_byarea_df[short_filtered_cleaned_byarea_df_cat].nunique()

AREA          51
EVENT_TYPE     2
RACE           5
SEX            3
SITE          26
YEAR          21
dtype: int64

In [8]:
# Check unique values of categories in DataFrame
print(short_filtered_cleaned_byarea_df['AREA'].unique())
print(short_filtered_cleaned_byarea_df['EVENT_TYPE'].unique())
print(short_filtered_cleaned_byarea_df['RACE'].unique())
print(short_filtered_cleaned_byarea_df['SEX'].unique())
print(short_filtered_cleaned_byarea_df['SITE'].unique())
print(short_filtered_cleaned_byarea_df['YEAR'].unique())

['Alabama' 'Alaska' 'Arizona' 'Arkansas' 'California' 'Colorado'
 'Connecticut' 'Delaware' 'District of Columbia' 'Florida' 'Georgia'
 'Hawaii' 'Idaho' 'Illinois' 'Indiana' 'Iowa' 'Kansas' 'Kentucky'
 'Louisiana' 'Maine' 'Maryland' 'Massachusetts' 'Michigan' 'Minnesota'
 'Mississippi' 'Missouri' 'Montana' 'Nebraska' 'Nevada' 'New Hampshire'
 'New Jersey' 'New Mexico' 'New York' 'North Carolina' 'North Dakota'
 'Ohio' 'Oklahoma' 'Oregon' 'Pennsylvania' 'Rhode Island' 'South Carolina'
 'South Dakota' 'Tennessee' 'Texas' 'Utah' 'Vermont' 'Virginia'
 'Washington' 'West Virginia' 'Wisconsin' 'Wyoming']
['Incidence' 'Mortality']
['American Indian/Alaska Native' 'Asian/Pacific Islander' 'Black'
 'Hispanic' 'White']
['Female' 'Male' 'Male and Female']
['Brain and Other Nervous System' 'Cervix' 'Colon and Rectum'
 'Corpus and Uterus, NOS' 'Esophagus' 'Female Breast'
 'Female Breast, <i>in situ</i>' 'Hodgkin Lymphoma' 'Kaposi Sarcoma'
 'Kidney and Renal Pelvis' 'Larynx' 'Leukemias'
 'Liver and I

In [9]:
# ANOVA Variables vs AGE_ADJUSTED_RATE
model = ols('AGE_ADJUSTED_RATE ~ AREA + EVENT_TYPE + RACE + SEX + SITE + YEAR', data = short_filtered_cleaned_byarea_df).fit()
anova_table = sm.stats.anova_lm(model)
anova_table

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
AREA,50.0,659243.0,13184.86,481.926115,0.0
EVENT_TYPE,1.0,1892231.0,1892231.0,69163.836219,0.0
RACE,4.0,1011297.0,252824.4,9241.104794,0.0
SEX,2.0,139407.6,69703.81,2547.777328,0.0
SITE,25.0,10022750.0,400910.1,14653.855704,0.0
YEAR,20.0,19303.39,965.1697,35.278382,1.759682e-136
Residual,653488.0,17878560.0,27.35867,,


In [12]:
# ANOVA Features Interactions vs AGE_ADJUSTED_RATE
model = ols('AGE_ADJUSTED_RATE ~ AREA:EVENT_TYPE + AREA:RACE + AREA:SEX', data = short_filtered_cleaned_byarea_df).fit()
anova_table = sm.stats.anova_lm(model)
anova_table

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
AREA:EVENT_TYPE,101.0,2621552.0,25955.965234,631.048868,0.0
AREA:RACE,204.0,1973529.0,9674.162616,235.201015,0.0
AREA:SEX,102.0,161342.0,1581.78457,38.4568,0.0
Residual,653183.0,26866370.0,41.131466,,


In [13]:
# ANOVA Features Interactions vs AGE_ADJUSTED_RATE
model = ols('AGE_ADJUSTED_RATE ~ EVENT_TYPE:RACE + EVENT_TYPE:SEX + EVENT_TYPE:SITE + EVENT_TYPE:YEAR', data = short_filtered_cleaned_byarea_df).fit()
anova_table = sm.stats.anova_lm(model)
anova_table

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
EVENT_TYPE:RACE,9.0,2983125.0,331458.298217,12804.329023,0.0
EVENT_TYPE:SEX,4.0,146412.5,36603.114904,1413.988816,0.0
EVENT_TYPE:SITE,50.0,11519340.0,230386.88256,8899.911279,0.0
EVENT_TYPE:YEAR,40.0,57546.71,1438.667828,55.57615,0.0
Residual,653489.0,16916490.0,25.886425,,


In [14]:
# ANOVA Features Interactions vs AGE_ADJUSTED_RATE
model = ols('AGE_ADJUSTED_RATE ~ RACE:SEX + RACE:SITE + RACE:YEAR', data = short_filtered_cleaned_byarea_df).fit()
anova_table = sm.stats.anova_lm(model)
anova_table

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
RACE:SEX,14.0,1203170.0,85940.688184,2847.748204,0.0
RACE:SITE,125.0,10666680.0,85333.408564,2827.625263,0.0
RACE:YEAR,100.0,35817.95,358.179485,11.868709,4.165162e-185
Residual,653351.0,19717130.0,30.178472,,


In [16]:
# ANOVA Features Interactions vs AGE_ADJUSTED_RATE
model = ols('AGE_ADJUSTED_RATE ~ SEX:SITE + SEX:YEAR', data = short_filtered_cleaned_byarea_df).fit()
anova_table = sm.stats.anova_lm(model)
anova_table

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
SEX:SITE,77.0,9297732.0,120749.762373,3537.743416,0.0
SEX:YEAR,60.0,21331.54,355.52559,10.416238,6.133182e-95
Residual,653460.0,22303810.0,34.13186,,


In [17]:
# ANOVA Features Interactions vs AGE_ADJUSTED_RATE
model = ols('AGE_ADJUSTED_RATE ~ SITE:YEAR', data = short_filtered_cleaned_byarea_df).fit()
anova_table = sm.stats.anova_lm(model)
anova_table

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
SITE:YEAR,545.0,8619676.0,15815.918536,449.004541,0.0
Residual,653045.0,23003120.0,35.224407,,


In [18]:
# ANOVA Features Interactions vs AGE_ADJUSTED_RATE
model = ols('AGE_ADJUSTED_RATE ~ AREA:SITE', data = short_filtered_cleaned_byarea_df).fit()
anova_table = sm.stats.anova_lm(model)
anova_table

MemoryError: Unable to allocate 6.46 GiB for an array with shape (1326, 653591) and data type float64