In [None]:
from linearmodels import PanelOLS
import pandas as pd

# Load dataset
df = pd.read_csv('merged_dataset.csv')

# Ensure numeric fiscal year (fy)
df['fy'] = pd.to_numeric(df['fy'], errors='coerce')

year = pd.Categorical(df['fy'])
data = df.set_index(['ticker', 'fy'])
data['fy'] = year
# Sanity check: confirm column present
assert 'gross_profit' in df.columns, "gross_profit not found in dataframe columns"

# Define model
m = PanelOLS(dependent=df['revenue'],
             exog=df[['boycotted','operating_expenses','r_and_d']],
             entity_effects=True,
             time_effects=False,
             other_effects= None)
# Fit model with robust clustered SEs


results = m.fit(cov_type='clustered', cluster_entity=True)

print(results.summary)



ValueError: Series can only be used with a 2-level MultiIndex

In [None]:
# Formula API (easiest)
mod = PanelOLS.from_formula(
    'revenue ~ 1 + boycotted + operating_expenses + r_and_d + year',
    data=data
)

# Or 2-D endogenous:
y = df[['gross_profit']]              # DataFrame, not Series
X = df[['boycotted','operating_expenses','r_and_d']]
mod = PanelOLS(y, X, entity_effects=True, time_effects=True)


ValueError: Columns with duplicate values are not supported in stack

In [None]:
from linearmodels import PanelOLS
import pandas as pd

df = pd.read_csv('merged_dataset.csv')

# panel index
df['fy'] = pd.to_numeric(df['fy'], errors='coerce')
df = df.dropna(subset=['ticker','fy'])
df = df.set_index(['ticker','fy']).sort_index()

# Clean any accidental duplicate columns (just in case)
df = df.loc[:, ~df.columns.duplicated()]

mod = PanelOLS.from_formula(
    'revenue ~ 1 + boycotted + operating_expenses + r_and_d + EntityEffects + TimeEffects',
    data=df
)
res = mod.fit(cov_type='clustered', cluster_entity=True)
print(res.summary)


Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)


ValueError: exog does not have full column rank. If you wish to proceed with model estimation irrespective of the numerical accuracy of coefficient estimates, you can set check_rank=False.

In [None]:
# Does "boycotted" vary within firm over time?
vary_within_entity = (
    df.reset_index()
      .groupby('ticker')['boycotted'].nunique()
      .gt(1).mean()
)
print(f"Share of firms where 'boycotted' changes over time: {vary_within_entity:.2%}")

# Does "boycotted" vary across firms *within the same year*?
vary_within_time = (
    df.reset_index()
      .groupby('fy')['boycotted'].nunique()
      .gt(1).mean()
)
print(f"Share of years with cross-sectional variation in 'boycotted': {vary_within_time:.2%}")


Share of firms where 'boycotted' changes over time: 0.00%
Share of years with cross-sectional variation in 'boycotted': 100.00%
