# Data Cleaning

Master thesis of Nikolai Popov, MAE-2025


In [None]:
# Libraries import
import pandas as pd # dataframes
import glob # for reading several files one by one
import numpy as np # for matrices/vectors
from tqdm import tqdm # for progress bar
import gc # to delete a dataframe from the memory
import re # regular expression to work with strings
import ast # for correcting the list type
import warnings # to supress warninings
warnings.simplefilter(action='ignore', category=Warning)
import dask.dataframe as dd # to optimize the merging
import duckdb # to optimize the merging

In [None]:
# Mount Google Drive in Colab so that the CSV files under "My Drive" can be
# read and written through the /content/drive path used in the cells below.
# NOTE(review): google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Locations of the two merged raw extracts on Google Drive
file_path = "/content/drive/My Drive/Coding/Merged_raw_data_SPARK.csv"
file_path_labor = "/content/drive/My Drive/Coding/Merged_raw_data_SVETLANA.csv"

# Both files are comma-separated. Malformed rows are skipped instead of
# aborting the load, and low_memory=False lets pandas infer each column's
# dtype from the whole file rather than chunk by chunk.
csv_read_options = dict(sep=",", on_bad_lines="skip", low_memory=False)
raw_dataset = pd.read_csv(file_path, **csv_read_options)
raw_dataset_labor = pd.read_csv(file_path_labor, **csv_read_options)

# Peek at the first two rows of each frame
print(raw_dataset.head(2))
print(raw_dataset_labor.head(2))

   SparkID ShortNameEn  INN                okveds  OKVED  \
0      325        SSZH  NaN  ['68.32', '68.32.2']  01.19   
1      325        SSZH  NaN  ['68.32', '68.32.2']  01.19   

                                     Address_Address Address_Region  \
0  г. Москва, проспект Олимпийский, д. 10 корп. 1...      г. Москва   
1  г. Москва, проспект Олимпийский, д. 10 корп. 1...      г. Москва   

  Address_City Address_Longitude Address_Latitude  ...  Status_Code  \
0    г. Москва         37,622423        55,777127  ...           36   
1    г. Москва         37,622423        55,777127  ...           36   

   Status_Date  Year  Staff  Source Form_1_Field_1150  Form_2_Field_2120  \
0          NaN  2011    NaN     CUR               NaN                NaN   
1          NaN  2011    NaN     NXT               NaN                NaN   

   Form_2_Field_2400  Form_4_Field_4121  Form_2_Field_2110  
0                NaN                NaN                NaN  
1                NaN                NaN 

## Variables description

### SPARK
'Address_Address' - address of the enterprise headquarters; <br>
'Address_City' - city of the enterprise headquarters; <br>
'Address_Latitude' - latitude of the headquarters;  <br>
'Address_Longitude' - longitude of the headquarters; <br>
'Address_Region' - region of the headquarters;<br>
'Form_1_Field_1150' - fixed assets, RUB (form of the financial statement - proxy for capital); <br>
'Form_2_Field_2110' - revenue, RUB (proxy for output); <br>
'Form_2_Field_2120' - cost of sales, RUB;<br>
'Form_2_Field_2400' - net profit, RUB; <br>
'Form_4_Field_4121' - payments to suppliers and contractors for raw materials, goods, work, and services, RUB;<br>
'INN' - taxpayer number (Russian Federation); <br>
'OKVED' - code of the firm's main activity;<br>
'okveds' - codes of all activities of the firm;<br>
'ShortNameEn' - short name of a company in English;<br>
'Source' - initial source of data; CUR means that the data for the year t was reported at the beginning of year t+1; NXT - corrected and reported at the beginning of year t+2; <br>
'SparkID' - SPARK's identifier for a firm;<br>
'Staff' - number of employees (available only from 2018);<br>
'Status_Code' - detailed firm's status (see SPARK's documentation);<br>
'Status_Date' - date when the status was verified;<br>
'Status_IsActing' - dummy for the firm being active;<br>
'Year' - year of observation and financial statement report.<br>

### SVETLANA
'Company Name' - firm's name in English;<br>
'Tax number (INN/Tax/BIN)' - company's taxpayer number, INN;<br>
'Number of Employees'- number of firm's employees;<br>
'Year' - year of observation.<br>


## Cleaning: SVETLANA

### Columns

#### Labor columns

For each row, if multiple 'Number of Employees XXXX' columns have non-NaN values, take the mean of those values (row-wise) and assign the rounded integer to a single column, 'Number of Employees'. If all values are NaN, the result stays NaN. Non-NaN values are picked over NaN values.

In [None]:
# Step 1: Find the year-specific 'Number of Employees XXXX' columns.
# The consolidated target column is excluded on purpose: it also starts with
# "Number of Employees", so without the exclusion a re-run of this cell would
# select it in Step 1 and then drop it in Step 3.
target_col = "Number of Employees"
employee_cols = [
    col for col in raw_dataset_labor.columns
    if col.startswith(target_col) and col != target_col
]

# Nothing to do when the year-specific columns are already consolidated
if employee_cols:
    # Step 2: Row-wise mean of the non-NaN values across those columns,
    # rounded to an integer; rows where every column is NaN stay missing
    raw_dataset_labor[target_col] = (
        raw_dataset_labor[employee_cols]
        .mean(axis=1, skipna=True)
        .round()
        .astype('Int64')  # nullable integer dtype keeps missing values
    )

    # Step 3: Drop the original year-specific columns
    raw_dataset_labor = raw_dataset_labor.drop(columns=employee_cols)

raw_dataset_labor.head()

Unnamed: 0.1,Unnamed: 0,Company Name,Tax number (INN/Tax/BIN),Year,Number of Employees
0,0,SPORTMASTER LTD.,7728552000.0,2015,1998
1,1,JSC FORTUM,7203163000.0,2015,2771
2,2,VTORCHERMET NLMK,7705742000.0,2015,410
3,3,GAZPROMNEFT-URAL,6661002000.0,2015,2419
4,4,LIMITED LIABILITY COMPANY ULYANOVSKY AVTOMOBIL...,7327077000.0,2015,6759


### Rename and exclude

In [None]:
# Keep only the merge keys and the labor measure, and give them the same
# names as the corresponding SPARK columns
raw_dataset_labor = (
    raw_dataset_labor
    .loc[:, ['Tax number (INN/Tax/BIN)', 'Year', 'Number of Employees']]
    .rename(columns={
        "Tax number (INN/Tax/BIN)": "INN",
        "Number of Employees": "Labor"
    })
)

# INN arrives as a float column; coerce unparsable entries to missing and
# store the identifier as a nullable integer
inn_as_integer = pd.to_numeric(raw_dataset_labor["INN"], errors="coerce").astype("Int64")
raw_dataset_labor = raw_dataset_labor.assign(INN=inn_as_integer)
raw_dataset_labor.head()

Unnamed: 0,INN,Year,Labor
0,7728551528,2015,1998
1,7203162698,2015,2771
2,7705741770,2015,410
3,6661002209,2015,2419
4,7327077188,2015,6759


In [None]:
# Inspect the column dtypes after the casts above (INN and Labor were
# converted to the nullable Int64 dtype)
raw_dataset_labor.dtypes

Unnamed: 0,0
INN,Int64
Year,int64
Labor,Int64


### Duplicate check

In [None]:
# Number of rows per (INN, Year) pair; any count above one is a duplicate
duplicates_labor = raw_dataset_labor.groupby(['INN', 'Year']).size()
duplicates_labor.loc[duplicates_labor.gt(1)]

Unnamed: 0_level_0,Unnamed: 1_level_0,0
INN,Year,Unnamed: 2_level_1
101000021,2007,2
101000021,2008,2
101000021,2009,2
101000021,2010,2
101000021,2011,2
...,...,...
991242005967,2015,2
991242006777,2015,2
991242006787,2015,2
991242011288,2015,2


In [None]:
# Show every row recorded for a single firm (INN 101000021) to see what its
# observations look like year by year
raw_dataset_labor[(raw_dataset_labor['INN'] == 101000021)]

Unnamed: 0,INN,Year,Labor
790413,101000021,2015,41
3541197,101000021,2016,46
6367190,101000021,2007,2
6925382,101000021,2008,5
7635390,101000021,2009,12
8315714,101000021,2010,12
8998857,101000021,2011,19
9489670,101000021,2012,24
10442116,101000021,2013,31
11961383,101000021,2014,36


Let's take the mean value of labor, based on the several sources of data.

In [None]:
# Collapse repeated (INN, Year) rows into one by averaging their Labor values
raw_dataset_labor = (
    raw_dataset_labor
    .groupby(['INN', 'Year'], as_index=False)['Labor']
    .agg('mean')
)

# Re-run the duplicate check: the result should be empty
duplicates_labor = raw_dataset_labor.groupby(['INN', 'Year']).size()
duplicates_labor.loc[duplicates_labor.gt(1)]  # no duplicates found

Unnamed: 0_level_0,Unnamed: 1_level_0,0
INN,Year,Unnamed: 2_level_1


## Cleaning: SPARK

Test for the correctness of columns.

In [None]:
# Column names expected in the merged SPARK extract, copied by hand from one
# of the source CSV files
columns_set_original = set([
    "SparkID", "ShortNameEn", "INN", "okveds", "OKVED",
    "Address_Address", "Address_Region", "Address_City",
    "Address_Longitude", "Address_Latitude",
    "Status_IsActing", "Status_Code", "Status_Date",
    "Year", "Staff", "Source",
    "Form_1_Field_1150", "Form_2_Field_2120", "Form_2_Field_2400",
    "Form_4_Field_4121", "Form_2_Field_2110",
])

# Column names actually present in the loaded dataframe
columns_set_dataframe = set(raw_dataset.columns)

# Names that appear in exactly one of the two sets (empty set means a perfect match)
differences = columns_set_original.symmetric_difference(columns_set_dataframe)
print(f"Differences between sets: {differences}")

Differences between sets: set()


Well, no discrepancies.

In [None]:
# Preview the first three rows of the SPARK dataframe
raw_dataset.head(3)

Unnamed: 0,SparkID,ShortNameEn,INN,okveds,OKVED,Address_Address,Address_Region,Address_City,Address_Longitude,Address_Latitude,...,Status_Code,Status_Date,Year,Staff,Source,Form_1_Field_1150,Form_2_Field_2120,Form_2_Field_2400,Form_4_Field_4121,Form_2_Field_2110
0,325,SSZH,,"['68.32', '68.32.2']",1.19,"г. Москва, проспект Олимпийский, д. 10 корп. 1...",г. Москва,г. Москва,37622423,55777127,...,36,,2011,,CUR,,,,,
1,325,SSZH,,"['68.32', '68.32.2']",1.19,"г. Москва, проспект Олимпийский, д. 10 корп. 1...",г. Москва,г. Москва,37622423,55777127,...,36,,2011,,NXT,,,,,
2,325,SSZH,,"['68.32', '68.32.2']",1.19,"г. Москва, проспект Олимпийский, д. 10 корп. 1...",г. Москва,г. Москва,37622423,55777127,...,36,,2012,,CUR,,,,,


In [None]:
# Row count of the raw SPARK frame, before any filtering
print(f"Number of observations in the raw merged dataset: {raw_dataset.shape[0]}")

Number of observations in the raw merged dataset: 8131760


### Coordinates

In [None]:
# Coordinates are stored as text with a decimal comma (e.g. "37,622423").
# Swap the comma for a dot and convert to float. Missing values pass through
# astype(str) as the text "nan", which the float conversion turns back into NaN.
for coord_col in ("Address_Longitude", "Address_Latitude"):
    raw_dataset[coord_col] = (
        raw_dataset[coord_col]
        .astype(str)
        .str.replace(",", ".", regex=False)
        .astype(float)
    )

# Create the "latitude, longitude" text column.
# Rows lacking either coordinate get a real missing value instead of the
# literal text "nan, nan", so that later isna()/dropna() calls can detect
# firms without a location.
has_both_coords = (
    raw_dataset["Address_Latitude"].notna() & raw_dataset["Address_Longitude"].notna()
)
raw_dataset["coord"] = (
    raw_dataset["Address_Latitude"].astype(str)
    + ", "
    + raw_dataset["Address_Longitude"].astype(str)
).where(has_both_coords)

# The separate coordinate columns are no longer needed
raw_dataset = raw_dataset.drop(columns=["Address_Longitude", "Address_Latitude"])

raw_dataset["coord"].sample(3)

Unnamed: 0,coord
7272552,"59.766692, 60.177463"
4195342,"59.663631, 30.523767"
5945091,"45.052695, 42.020498"


### Data types

In [None]:
# Inspect the dtype of every column before converting the ones that were
# read in an unsuitable format
raw_dataset.dtypes

Unnamed: 0,0
SparkID,int64
ShortNameEn,object
INN,float64
okveds,object
OKVED,object
Address_Address,object
Address_Region,object
Address_City,object
Status_IsActing,bool
Status_Code,int64


In [None]:
# Parse the status date (expected as YYYY-MM-DD) into a datetime column
raw_dataset["Status_Date"] = pd.to_datetime(raw_dataset["Status_Date"], format="%Y-%m-%d")

# Staff and INN hold whole numbers with gaps: coerce unparsable entries to
# missing and store them in the nullable Int64 dtype
for integer_col in ("Staff", "INN"):
    raw_dataset[integer_col] = (
        pd.to_numeric(raw_dataset[integer_col], errors="coerce").astype("Int64")
    )

### Renaming columns
Note that fixed assets = Capital <br>
revenue = Output

In [None]:
# Map the financial-statement field codes (and the main activity code) to
# readable variable names
readable_names = {
    "OKVED": "Main_OKVED",
    "Form_1_Field_1150": "Capital",
    "Form_2_Field_2110": "Output",
    "Form_2_Field_2120": "Cost_of_sales",
    "Form_2_Field_2400": "Net_Profit",
    "Form_4_Field_4121": "Payments",
}
raw_dataset = raw_dataset.rename(columns=readable_names)

### Removing duplicate obs

Apparently, there are INN-Year duplicate observations: same data but from different sources ('Source' - for the description see the Variables Description section).

As 'Source' = 'NXT' is considered a bit more accurate than 'CUR' (the data could be corrected during the additional year), let's exclude the duplicate observations for which 'Source' = 'CUR'.

In [None]:
# Keep only the rows reported with Source == "NXT"; every other row is dropped
is_nxt_source = raw_dataset["Source"].eq("NXT")
raw_dataset = raw_dataset.loc[is_nxt_source]

## Datasets Formation

### Merging two datasets

In [None]:
# NOTE: this whole cell is disabled (every line is commented out). It is an
# earlier Dask-based version of the left merge on (INN, Year); the DuckDB
# cell that follows produces `whole_merged_dataset` instead.

# # Step 1: Convert to Dask DataFrames with 100 partitions
# raw_dataset = dd.from_pandas(raw_dataset, npartitions=100)
# raw_dataset_labor = dd.from_pandas(raw_dataset_labor, npartitions=40)

# # Step 2: LEFT merge using Dask
# merged = dd.merge(raw_dataset, raw_dataset_labor, on=['INN', 'Year'], how='left')

# # Step 3: Initialize empty list to collect chunks
# chunks = []

# # Step 4: Process each partition individually with tqdm progress bar
# for delayed_partition in tqdm(merged.to_delayed(), desc="Merging partitions"):
#     df_part = delayed_partition.compute()  # Load one partition
#     chunks.append(df_part)
#     del df_part  # Free memory

# # Step 5: Concatenate all chunks into a single final DataFrame
# whole_merged_dataset = pd.concat(chunks, ignore_index=True)

# # Step 6: Cleanup
# del chunks
# del raw_dataset
# del raw_dataset_labor
# gc.collect()

In [None]:
# Step 1: Create a DuckDB in-memory connection
con = duckdb.connect(database=':memory:')

try:
    # Step 2: Register the in-memory pandas DataFrames under SQL-visible names
    con.register('df1', raw_dataset)
    con.register('df2', raw_dataset_labor)

    # Step 3: LEFT JOIN on INN and Year: every SPARK row is kept and Labor is
    # attached where a matching (INN, Year) exists in the labor data
    query = """
    SELECT *
    FROM df1
    LEFT JOIN df2
    USING (INN, Year)
"""

    # Step 4: Execute and fetch merged result into pandas
    whole_merged_dataset = con.execute(query).fetchdf()
finally:
    # Step 5a: Always release the connection, even if the join fails
    con.close()

# The labor data holds one row per (INN, Year), so a left join must return
# exactly as many rows as the SPARK frame; anything else means the join
# duplicated or lost observations.
assert len(whole_merged_dataset) == len(raw_dataset), (
    f"Left join changed the row count: {len(raw_dataset)} -> {len(whole_merged_dataset)}"
)

# Step 5b: Drop the inputs and reclaim their memory
del raw_dataset, raw_dataset_labor
gc.collect()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

30

### Whole cleaned dataset

In [None]:
# List the columns of the merged frame (SPARK columns plus Labor from the join)
whole_merged_dataset.columns

Index(['SparkID', 'ShortNameEn', 'INN', 'okveds', 'Main_OKVED',
       'Address_Address', 'Address_Region', 'Address_City', 'Status_IsActing',
       'Status_Code', 'Status_Date', 'Year', 'Staff', 'Source', 'Capital',
       'Cost_of_sales', 'Net_Profit', 'Payments', 'Output', 'coord', 'Labor'],
      dtype='object')

In [None]:
# Row count of the merged dataset
print(f"The number of observations in the whole dataset: {whole_merged_dataset.shape[0]}")

The number of observations in the whole dataset: 4065880


In [None]:
# Persist the full merged dataset to Drive without the index column.
# utf-8-sig writes a byte-order mark at the start of the file
# (presumably so the Cyrillic text opens correctly in spreadsheet software).
whole_merged_dataset.to_csv('/content/drive/My Drive/Coding/whole_dataset_file_collab.csv', index=False, encoding="utf-8-sig")

### Balanced panel


Apparently, there are INN-Year duplicate observations: same data but from different sources ('Source').

#### Necessary columns
Exclude all unnecessary columns

In [None]:
# Keep only the columns needed for the balanced panel.
# .copy() makes balanced_df an independent frame, so the column assignments
# and in-place edits in the following cells do not operate on a slice of
# whole_merged_dataset (the situation pandas reports with SettingWithCopyWarning).
balanced_df = whole_merged_dataset[
    ['INN', 'Year', 'okveds', 'coord', 'Capital', 'Labor', 'Output', 'Staff']
].copy()

# Delete the old datasets
# del whole_merged_dataset

# Run garbage collection to free up memory
gc.collect() # returns the number of unreachable objects it collected

0

#### Use both "Staff" and "Labor"

Combine the columns Labor and Staff

If both are non-NaN → take the mean, rounded to integer

If only one is non-NaN → take that value

If both are NaN → keep NaN

Store the result back into the Labor column

In [None]:
# Step 1: Merge the two employment measures into 'Labor': the row-wise mean of
# whichever of 'Labor' / 'Staff' is present, rounded to an integer and stored
# as nullable Int64 (rows with neither value stay missing)
combined_labor = (
    balanced_df[['Labor', 'Staff']]
    .mean(axis=1, skipna=True)
    .round()
    .astype('Int64')
)

# Step 2: Store the combined measure and discard 'Staff'
balanced_df = balanced_df.assign(Labor=combined_labor).drop(columns='Staff')

# Step 3: Parse 'okveds' values that are still the text form of a list
# (strings starting with "["); anything else is left untouched
parsed_okveds = balanced_df['okveds'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) and raw.startswith("[") else raw
)
balanced_df = balanced_df.assign(okveds=parsed_okveds)

# Step 4: Keep only the rows whose 'okveds' is a list with at least one code,
# Step 5: and renumber the index from zero
has_codes = balanced_df['okveds'].apply(lambda codes: isinstance(codes, list) and len(codes) > 0)
balanced_df = balanced_df[has_codes].reset_index(drop=True)

# Preview
balanced_df.head()

Unnamed: 0,INN,Year,okveds,coord,Capital,Labor,Output
0,5515003072,2011,"[01.11.1, 01.11.2, 01.19.1, 01.42, 01.61, 10.6...","55.044426, 74.454369",16824000.0,100,49878000.0
1,5515003072,2012,"[01.11.1, 01.11.2, 01.19.1, 01.42, 01.61, 10.6...","55.044426, 74.454369",4755000.0,167,46245000.0
2,5515003072,2013,"[01.11.1, 01.11.2, 01.19.1, 01.42, 01.61, 10.6...","55.044426, 74.454369",2711000.0,156,31557000.0
3,5523003421,2011,"[01.11.1, 01.11.2, 01.42]","55.527, 70.808037",,21,
4,5523003421,2012,"[01.11.1, 01.11.2, 01.42]","55.527, 70.808037",20091000.0,28,376000.0


In [None]:
# Count the observations whose 'okveds' is an empty list; zero means the
# filter in the previous cell removed all of them
balanced_df['okveds'].apply(lambda x: isinstance(x, list) and len(x) == 0).sum()

np.int64(0)

#### Years range

In [None]:
# List the distinct years present in the data to see the available range
balanced_df['Year'].unique()

array([2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021,
       2022, 2023])

From below, the analysis is bounded by data limitations, while from above it is bounded by the local validity of the DID: the further a period is from the treatment period, the less believable the parallel-trends assumption becomes (due to new shocks, structural breaks, etc.). As my empirical model is a spatial DID with the treatment year of 2014, I take the years from 2011 to 2017: 3 years prior to 2014 and after. Actually, I might have taken 2018 into account as well (but not 2019 — COVID and the pension reform are structural breaks which violate the parallel-trends assumption for sure), but due to the requirement of my dataset being a balanced panel, I decided not to lose an additional 10% of the firms. Also, SPARK data starts from 2011, while SVETLANA data ends at 2016, so the filter below keeps the years 2011–2016.

In [None]:
# Restrict the sample to the years 2011-2016 (both bounds included)
in_study_window = balanced_df["Year"].between(2011, 2016)
balanced_df = balanced_df[in_study_window]

2011 - 2016: 8492 firms

2011 - 2017: 5016 firms

2011 - 2018: 4682 firms

#### Balancing

Balanced panel is required for spatial DID models.

Due to the specification of the empirical model, Capital, Output should be strictly greater than zero, as
they are used in ln().

In [None]:
# Keep only the observations where Capital, Output and Labor are all
# strictly positive
positive_capital = balanced_df['Capital'] > 0
positive_output = balanced_df['Output'] > 0
positive_labor = balanced_df['Labor'] > 0
balanced_df = balanced_df[positive_capital & positive_output & positive_labor]
print(f'Dataframe shape: {balanced_df.shape}')

Dataframe shape: (185673, 7)


Print the number of missing values in each column

In [None]:
# Report the frame's shape and the number of missing values in each column
print(f'Dataframe shape: {balanced_df.shape}')
print(balanced_df.isna().sum())

Dataframe shape: (185673, 7)
INN        0
Year       0
okveds     0
coord      0
Capital    0
Labor      0
Output     0
dtype: int64


See the number of nans in each columns by years

In [None]:
# Flag every missing cell, then add the flags up within each year to get the
# number of NaNs per column and year
missing_flags = balanced_df.isna()
nan_table = missing_flags.groupby(balanced_df['Year']).sum()

# Display the table
nan_table

Unnamed: 0_level_0,INN,Year,okveds,coord,Capital,Labor,Output
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2011,0,0,0,0,0,0,0
2012,0,0,0,0,0,0,0
2013,0,0,0,0,0,0,0
2014,0,0,0,0,0,0,0
2015,0,0,0,0,0,0,0
2016,0,0,0,0,0,0,0


In [None]:
# Number of rows per (INN, Year) pair; an empty result means no duplicates
duplicates = balanced_df.groupby(['INN', 'Year']).size()
duplicates.loc[duplicates.gt(1)]

Unnamed: 0_level_0,Unnamed: 1_level_0,0
INN,Year,Unnamed: 2_level_1


In [None]:
# Drop every observation (row) that has a NaN in any column
balanced_df = balanced_df.dropna()

# Spatial weights must be time-invariant, so each firm should have a single
# location: count the distinct 'coord' values recorded per firm
num_adress = balanced_df.groupby("INN")["coord"].nunique()

# Firms with more than one distinct location (empty result = nobody moved)
changed_adress_firms = num_adress.loc[num_adress.gt(1)]
changed_adress_firms

Unnamed: 0_level_0,coord
INN,Unnamed: 1_level_1


All firms stayed at the same address and did not change it throughout the observation period (most likely, this is just a specific feature of SPARK data collection).

Now let's find the firms which don't have observations for each year of observation.

In [None]:
# Years every firm must cover: the full span between the first and the last
# year present in the data
first_year = balanced_df["Year"].min()
last_year = balanced_df["Year"].max()
expected_years = set(range(first_year, last_year + 1))

# For each firm, the set of expected years it has no observation for
missing_years_firms = (
    balanced_df
    .groupby("INN")["Year"]
    .apply(lambda firm_years: expected_years - set(firm_years))
)

# Firms with at least one missing year
has_gap = missing_years_firms.apply(len) > 0
firms_to_exclude = set(missing_years_firms[has_gap].index)

# Keep only the firms observed in every year
balanced_df = balanced_df[~balanced_df["INN"].isin(firms_to_exclude)]

# Summary: the expected size of a balanced panel must equal the actual size
n_firms_before = len(missing_years_firms)
n_firms_excluded = len(firms_to_exclude)
n_firms_after = n_firms_before - n_firms_excluded
n_years = len(balanced_df['Year'].unique())

print(f"Previous number of firms: {n_firms_before}")
print(f"Number of firms excluded: {n_firms_excluded}")
print(f"New number of firms: {n_firms_after}")
print(f"Number of years: {n_years}")
print(f"Expected number of obs. (balanced panel):\n # years * # firms: {n_firms_after * n_years}")
print(f"Actual number of obs.: {balanced_df.shape[0]}")

Previous number of firms: 52152
Number of firms excluded: 43660
New number of firms: 8492
Number of years: 6
Expected number of obs. (balanced panel):
 # years * # firms: 50952
Actual number of obs.: 50952


In [None]:
# The panel is balanced when every firm has the same number of distinct years
panel_counts = balanced_df.groupby('INN')['Year'].nunique()
is_balanced = panel_counts.nunique() == 1
if is_balanced:
    print("Balanced panel")
else:
    print("Unbalanced panel")

Balanced panel


In [None]:
# Summary statistics of the numeric columns of the balanced panel
balanced_df.describe()

Unnamed: 0,INN,Year,Capital,Labor,Output
count,50952.0,50952.0,50952.0,50952.0,50952.0
mean,4281890096.536858,2013.5,197779900.0,154.489657,492561200.0
std,2156444340.007967,1.707842,826217300.0,320.262696,2594794000.0
min,101000776.0,2011.0,1000.0,1.0,1000.0
25%,2353004726.5,2012.0,9145000.0,36.0,15608250.0
50%,4421005618.5,2013.5,41207000.0,64.0,69587500.0
75%,6119007606.25,2015.0,125614500.0,155.0,247118000.0
max,8912002578.0,2016.0,36044920000.0,21634.0,141109000000.0


#### Post dummy

In [None]:
# Post-treatment indicator: 1 for observations from 2014 onwards, 0 before
is_post_period = balanced_df["Year"].ge(2014)
balanced_df["Post_d"] = is_post_period.astype(int)
balanced_df.loc[:, ["Year", "Post_d"]].head()

Unnamed: 0,Year,Post_d
61,2011,0
62,2012,0
63,2013,0
64,2014,1
65,2015,1


#### Treatment dummy

OKVED is a firm's activity code; TNVED is a good's code. They have to be matched manually. The corresponding international standards are NACE for OKVED and HS for TNVED (first 6 digits). Note that there is no direct correspondence between TNVED and OKVED. There is also OKPD (the European analogue is CPA) — the code linking the goods produced to the company's activity code. It has a direct correspondence with TNVED (I used this website to convert TNVED to OKPD: https://classinform.ru/perevod-tnved-v-okpd2/search.html). Even though there is still no direct correspondence between OKPD and OKVED, they coincide up to the 4th digit, meaning that they can be matched, albeit only on the first levels of classification.


Steps to implement the mapping:
1) Convert TNVED to OKPD;
2) Convert OKPD to OKVED;

The procedure is done manually to increase the precision of the ambiguous matching. Note that, because the matching uses only 4 digits, exclusions from the sanctioned categories cannot be taken into account; the whole category is considered embargoed instead.

See the link to the official Russian Government statement about the TN_VED codes (the first 6 digits of which correspond to the HS system) of the embargoed goods:

https://base.garant.ru/70712500/53f89421bbdaf741eb2d1ecc4ddb4c33/

In [None]:
# Hand-matched OKVED codes (XX.XX level) of the activities of firms that were
# directly affected by the embargo (counter-sanctions), listed in ascending order
embargoed_sectors = {
    '01.11', '01.13',
    '01.21', '01.22', '01.23', '01.24', '01.25', '01.26', '01.28',
    '01.46',
    '03.11', '03.12', '03.22',
    '08.93',
    '10.11', '10.12', '10.13',
    '10.20',
    '10.31', '10.39',
    '10.41',
    '10.51',
}

In [None]:
# Preview the activity-code lists before they are truncated
balanced_df['okveds'].head()

Unnamed: 0,okveds
61,"[01.11.1, 01.11.2, 01.41, 01.42, 01.61, 10.11,..."
62,"[01.11.1, 01.11.2, 01.41, 01.42, 01.61, 10.11,..."
63,"[01.11.1, 01.11.2, 01.41, 01.42, 01.61, 10.11,..."
64,"[01.11.1, 01.11.2, 01.41, 01.42, 01.61, 10.11,..."
65,"[01.11.1, 01.11.2, 01.41, 01.42, 01.61, 10.11,..."


Truncate all the okveds to 4 digits (for the proper matching).

In [None]:
# Make sure an 'okveds' value is an actual Python object rather than its
# string representation.
def ensure_list(x):
    """Parse `x` with ast.literal_eval when it is a string; otherwise return it as is.

    Returns the parsed literal for string input, or None (after printing the
    error) when the string cannot be parsed. Non-string input is returned
    unchanged.
    """
    if not isinstance(x, str):
        return x
    try:
        parsed = ast.literal_eval(x)
    except Exception as e:
        # Best effort: report the problem and mark the value as unusable
        print("Conversion error:", e)
        return None
    return parsed
# Run ensure_list over every 'okveds' value so string-encoded lists become lists
balanced_df['okveds'] = balanced_df['okveds'].apply(ensure_list)

def modify_okved(codes_list):
    """Cut three-part codes ('XX.XX.X') down to their first two parts ('XX.XX').

    Codes that do not consist of exactly three dot-separated digit groups are
    returned unchanged. The result is a new list in the original order.
    """
    truncated = []
    for code in codes_list:
        truncated.append(re.sub(r'^(\d+\.\d+)\.\d+$', r'\1', code))
    return truncated

# Truncate the codes in every row's list with modify_okved
balanced_df['okveds'] = balanced_df['okveds'].apply(modify_okved)

# Print the first rows to check the truncated codes
print(balanced_df['okveds'].head())

61    [01.11, 01.11, 01.41, 01.42, 01.61, 10.11, 10....
62    [01.11, 01.11, 01.41, 01.42, 01.61, 10.11, 10....
63    [01.11, 01.11, 01.41, 01.42, 01.61, 10.11, 10....
64    [01.11, 01.11, 01.41, 01.42, 01.61, 10.11, 10....
65    [01.11, 01.11, 01.41, 01.42, 01.61, 10.11, 10....
Name: okveds, dtype: object


See whether firms have the same okveds over the years:

In [None]:
# Canonical form of a code list: sorted and frozen into a tuple, so two lists
# with the same codes compare equal regardless of their order.
def canonical_okved(okved_list):
    """Return the codes of `okved_list` as a sorted tuple, or None for None input."""
    if okved_list is not None:
        return tuple(sorted(okved_list))
    return None

# Temporary helper column holding the order-independent form of each code list
balanced_df['okveds_canonical'] = balanced_df['okveds'].apply(canonical_okved)

# Used per INN group: does the firm report the same set of codes in every year?
def is_consistent(group):
    """Return True when all rows of `group` share one 'okveds_canonical' value."""
    distinct_code_sets = group['okveds_canonical'].nunique()
    return distinct_code_sets == 1

# One row per firm: its INN and whether its code set is the same in all years
consistency = (
    balanced_df
    .groupby('INN')
    .apply(is_consistent)
    .reset_index()
    .set_axis(['INN', 'consistent'], axis=1)
)

# Firms whose code set changed at least once
num_inconsistent = consistency['consistent'].eq(False).sum()

# The helper column has served its purpose
balanced_df = balanced_df.drop(columns=["okveds_canonical"])
print("Number of inconsistent firms:", num_inconsistent)

Number of inconsistent firms: 0


With that out of the way, let's mark treated (embargoed) firms. Note that if at least one of a firm's okveds coincides with an element from the embargoed list, the firm is treated as embargoed (this is a simplification).

In [None]:
# Treatment indicator: 1 when at least one of the firm's codes belongs to the
# embargoed sectors, 0 otherwise
balanced_df['Treated_d'] = balanced_df['okveds'].apply(
    lambda codes: int(not embargoed_sectors.isdisjoint(codes))
)

# Renumber the rows from zero
balanced_df = balanced_df.reset_index(drop=True)

# Display a few rows to verify the results
print(balanced_df[['INN', 'Year', 'okveds', 'Treated_d']].head())

          INN  Year                                             okveds  \
0  2435000715  2011  [01.11, 01.11, 01.41, 01.42, 01.61, 10.11, 10....   
1  2435000715  2012  [01.11, 01.11, 01.41, 01.42, 01.61, 10.11, 10....   
2  2435000715  2013  [01.11, 01.11, 01.41, 01.42, 01.61, 10.11, 10....   
3  2435000715  2014  [01.11, 01.11, 01.41, 01.42, 01.61, 10.11, 10....   
4  2435000715  2015  [01.11, 01.11, 01.41, 01.42, 01.61, 10.11, 10....   

   Treated_d  
0          1  
1          1  
2          1  
3          1  
4          1  


In [None]:
# Report the panel size and how many firm-year observations are flagged as treated
print(f"Total number of observations: {balanced_df.shape[0]}")
print(f"Number of treated observations: {balanced_df['Treated_d'].sum()}")

Total number of observations: 50952
Number of treated observations: 39144


In [None]:
# Spot-check three random rows of the final panel (no seed is set, so the
# rows differ between runs)
balanced_df.sample(3)

Unnamed: 0,INN,Year,okveds,coord,Capital,Labor,Output,Post_d,Treated_d
22436,3406006496,2012,"[01.11, 01.11, 10.41, 10.61, 10.61, 10.71, 46.2]","51.081456, 44.168045",89656000.0,52,190646000.0,0,1
11633,3651008894,2011,"[01.11, 01.19, 01.47, 01.49, 01.50, 01.6, 01.6...","51.271369, 39.202446",280860000.0,24,14500000.0,0,1
3443,3245508453,2016,"[01.11, 01.11, 01.11, 01.13, 01.19, 01.19, 01....","53.1661, 34.083259",31146000.0,102,61665000.0,1,1


In [None]:
# Persist the balanced panel to Drive without the index column.
# NOTE(review): 'okveds' holds Python lists, which CSV stores as their text
# form — a reader of this file has to parse them back into lists.
balanced_df.to_csv('/content/drive/My Drive/Coding/balanced_dataset_file_collab.csv', index=False, encoding="utf-8-sig")

### The list of INN

In [None]:
# Distinct firm identifiers (INN) of the balanced panel, converted from the
# nullable Int64 values to a plain NumPy int64 array
unique_firms = balanced_df["INN"].unique().astype(np.int64)

# Print results
print(f"Number of unique firms: {len(unique_firms)}")

Number of unique firms: 8492


In [None]:
# Wrap the identifiers in a one-column frame and save it to Drive
unique_firms_df = pd.DataFrame({"INN": unique_firms})

unique_firms_df.to_csv('/content/drive/My Drive/Coding/unique_firms_df_collab.csv', index=False, encoding="utf-8-sig")