In [20]:
%reset -f
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
from tabulate import tabulate
from matplotlib import pyplot as plt
import scipy.stats as st
import statsmodels.api as sm
import seaborn as sns


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
df = pd.read_csv('dataset.csv')
Nobs=df['ID'].count()
df['const']=np.ones((Nobs,1))
data = df[df['Market share'] != 0]
data.head(20)

Unnamed: 0,ID,Year,Market share,Manufacturer,Model,Range,Price,HP,Chargetime,Type,Segment,Country,Sales,const
8,1,2021,0.010373,Aiways,U5,400,284621.7,201,34,SUV,C,CH,257,1.0
9,1,2022,0.005976,Aiways,U5,400,313681.829,201,34,SUV,C,CH,183,1.0
10,1,2023,0.00286,Aiways,U5,400,264524.0,201,34,SUV,C,CH,177,1.0
21,2,2023,4.8e-05,Aiways,U6,405,360638.0,214,34,SUV,C,CH,3,1.0
28,3,2019,0.04063,Audi,e-tron,375,979704.475,402,17,SUV,F,DE,222,1.0
29,3,2020,0.03468,Audi,e-tron,375,890101.41,402,17,SUV,F,DE,491,1.0
30,3,2021,0.010494,Audi,e-tron,375,800035.193,402,17,SUV,F,DE,260,1.0
31,3,2022,0.01757,Audi,e-tron,375,789723.656,402,17,SUV,F,DE,538,1.0
32,3,2023,0.001099,Audi,e-tron,375,673037.728,402,17,SUV,F,DE,68,1.0
41,4,2021,0.003391,Audi,e-tron GT,472,1278896.11,522,17,Sedan,F,DE,84,1.0


# Creating dummies

In [22]:
# Copy the dataframe
df2 = data.copy()

In [23]:
# Creating dummies for each segment
df2 = pd.get_dummies(df2, columns=['Segment'], drop_first=True)

# Creating dummies for each year
df2 = pd.get_dummies(df2, columns=['Year'], drop_first=True)

# Creating dummy for china
df2['China'] = (df2['Country'] == 'CH').astype(int)

In [24]:
df2.head(20)

Unnamed: 0,ID,Market share,Manufacturer,Model,Range,Price,HP,Chargetime,Type,Country,...,Year_2015,Year_2016,Year_2017,Year_2018,Year_2019,Year_2020,Year_2021,Year_2022,Year_2023,China
8,1,0.010373,Aiways,U5,400,284621.7,201,34,SUV,CH,...,0,0,0,0,0,0,1,0,0,1
9,1,0.005976,Aiways,U5,400,313681.829,201,34,SUV,CH,...,0,0,0,0,0,0,0,1,0,1
10,1,0.00286,Aiways,U5,400,264524.0,201,34,SUV,CH,...,0,0,0,0,0,0,0,0,1,1
21,2,4.8e-05,Aiways,U6,405,360638.0,214,34,SUV,CH,...,0,0,0,0,0,0,0,0,1,1
28,3,0.04063,Audi,e-tron,375,979704.475,402,17,SUV,DE,...,0,0,0,0,1,0,0,0,0,0
29,3,0.03468,Audi,e-tron,375,890101.41,402,17,SUV,DE,...,0,0,0,0,0,1,0,0,0,0
30,3,0.010494,Audi,e-tron,375,800035.193,402,17,SUV,DE,...,0,0,0,0,0,0,1,0,0,0
31,3,0.01757,Audi,e-tron,375,789723.656,402,17,SUV,DE,...,0,0,0,0,0,0,0,1,0,0
32,3,0.001099,Audi,e-tron,375,673037.728,402,17,SUV,DE,...,0,0,0,0,0,0,0,0,1,0
41,4,0.003391,Audi,e-tron GT,472,1278896.11,522,17,Sedan,DE,...,0,0,0,0,0,0,1,0,0,0


# Creating log market share

In [25]:
# Take the log of the market share
df2['log_market_share'] = np.log(df2['Market share'])

# Independent OLS

In [26]:
y_ols = df2['log_market_share']
z_ols = df2[['const', 'Range', 'Price', 'HP', 'Chargetime']]
dummies = df2[['Segment_B', 'Segment_C', 'Segment_D', 'Segment_E', 'Segment_F', 'Segment_M', 'Segment_J',
                #'year_2014', 'year_2015', 'year_2016', 'year_2017', 'year_2018', 'year_2019', 'year_2020', 'year_2021', 'year_2022', 'year_2023',
                'China']]
X_ols = pd.concat([z_ols, dummies], axis=1)

In [27]:
OLS_model = sm.OLS(y_ols, X_ols)
OLS_result = OLS_model.fit()
# Print summary of the regression results
print(OLS_result.summary())

                            OLS Regression Results                            
Dep. Variable:       log_market_share   R-squared:                       0.266
Model:                            OLS   Adj. R-squared:                  0.239
Method:                 Least Squares   F-statistic:                     9.704
Date:                Wed, 06 Mar 2024   Prob (F-statistic):           3.78e-16
Time:                        16:26:31   Log-Likelihood:                -703.51
No. Observations:                 334   AIC:                             1433.
Df Residuals:                     321   BIC:                             1483.
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -6.4982      0.787     -8.258      0.0

# IV-Regression

In [28]:
y_iv = df2['log_market_share']
z_iv = df2[['Range', 'HP', 'Chargetime']].sum(axis=1)
constant = df2['const']
dummies = df2[['Segment_B', 'Segment_C', 'Segment_D', 'Segment_E', 'Segment_F', 'Segment_M', 'Segment_J',
                #'year_2012', 'year_2013', 'year_2014', 'year_2015', 'year_2016', 
                'China']]

In [29]:
# Step 1: First Stage
# Regress 'Price' on the sum of car attributes
X_first_stage = df2[['Range', 'HP', 'Chargetime']].sum(axis=1)  # Sum of car attributes
X_first_stage = sm.add_constant(X_first_stage)  # Add constant term
model_first_stage = sm.OLS(df2['Price'], X_first_stage)
results_first_stage = model_first_stage.fit()
predicted_price = results_first_stage.predict(X_first_stage)  # Predicted values of 'Price'
residuals_first_stage = results_first_stage.resid  # Residuals from the first stage
print(results_first_stage.summary())

# Step 2: Second Stage
# Include predicted 'Price' from the first stage along with other exogenous variables
X_second_stage = df2[['Range', 'HP', 'Chargetime', 'Segment_B', 'Segment_C', 'Segment_D', 'Segment_E', 'Segment_F', 'Segment_M', 'Segment_J',
                #'year_2014', 'year_2015', 'year_2016', 'year_2017', 'year_2018', 'year_2019', 'year_2020', 'year_2021', 'year_2022', 'year_2023',
                'China']]
X_second_stage['Predicted_Price'] = predicted_price  # Add predicted 'Price' from the first stage
X_second_stage = sm.add_constant(X_second_stage)  # Add constant term
model_second_stage = sm.OLS(df2['log_market_share'], X_second_stage)
results_second_stage = model_second_stage.fit()
print(results_second_stage.summary())

# Optionally, you can check for instrument validity and other diagnostics.


                            OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.251
Model:                            OLS   Adj. R-squared:                  0.249
Method:                 Least Squares   F-statistic:                     111.1
Date:                Wed, 06 Mar 2024   Prob (F-statistic):           1.32e-22
Time:                        16:26:31   Log-Likelihood:                -4772.7
No. Observations:                 334   AIC:                             9549.
Df Residuals:                     332   BIC:                             9557.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -1.137e+05   6.07e+04     -1.872      0.0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_second_stage['Predicted_Price'] = predicted_price  # Add predicted 'Price' from the first stage


In [30]:
# Calculate the total count for all models
total_count_all_models = data['Sales'].sum()

# Group by Model and sum the counts
model_counts = data.groupby('Model')['Sales'].sum()

# Calculate the market share proportion
data['Market share proportion'] = data['Sales'] / total_count_all_models

# Print the modified DataFrame
print(data)


       ID  Year  Market share Manufacturer   Model  Range       Price   HP  \
8       1  2021      0.010373       Aiways      U5    400  284621.700  201   
9       1  2022      0.005976       Aiways      U5    400  313681.829  201   
10      1  2023      0.002860       Aiways      U5    400  264524.000  201   
21      2  2023      0.000048       Aiways      U6    405  360638.000  214   
28      3  2019      0.040630         Audi  e-tron    375  979704.475  402   
...   ...   ...           ...          ...     ...    ...         ...  ...   
1174  189  2023      0.000065        Volvo    EX30    475  368245.000  268   
1183  190  2021      0.014652        Volvo    XC40    457  462060.600  402   
1184  190  2022      0.033310        Volvo    XC40    457  416263.400  402   
1185  190  2023      0.031752        Volvo    XC40    457  439266.600  402   
1196  192  2023      0.000129        Voyah    Free    500  504768.500  482   

      Chargetime Type Segment Country  Sales  const  Market sha

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Market share proportion'] = data['Sales'] / total_count_all_models
