In [2]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sbn


In [4]:
pd.set_option('display.max_rows', None)      # Показывает все строки
pd.set_option('display.max_columns', None)   # Показывает все столбцы
pd.set_option('display.width', 1000)         # Расширяет консольную ширину
pd.set_option('display.colheader_justify', 'left')  # Выравнивание заголовков


In [6]:
df = pd.read_csv('data.csv', sep=';')


In [31]:
df = df.astype(str).apply(lambda col: col.map(lambda x: x.encode('latin1').decode('utf-8')))


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4109 entries, 0 to 4108
Data columns (total 77 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   region                4109 non-null   object 
 1   municipality          4109 non-null   object 
 2   settlement            4109 non-null   object 
 3   oktmo                 4109 non-null   int64  
 4   latitude_dd           4109 non-null   float64
 5   longitude_dd          4109 non-null   float64
 6   year                  4109 non-null   int64  
 7   assets                2134 non-null   float64
 8   assets_depreciation   2134 non-null   float64
 9   assets_exhausted      2130 non-null   float64
 10  assets_new            1790 non-null   float64
 11  birth                 4104 non-null   float64
 12  build_flat            3133 non-null   float64
 13  catering              4017 non-null   float64
 14  catering_growth       2715 non-null   float64
 15  collective_foreign   

In [37]:
text_cols = ['region', 'municipality', 'settlement']

In [10]:
missing_count = df.isna().sum()

# Подсчёт процента пропусков
missing_percent = (missing_count / len(df)) * 100

# Объединение в одну таблицу
missing_summary = pd.DataFrame({
    'Missing_Count': missing_count,
    'Missing_Percent': missing_percent.round(2)
}).sort_values(by='Missing_Percent', ascending=False)

print(missing_summary)


                      Missing_Count  Missing_Percent
volume_water          3525           85.79          
preschool_waiting     3425           83.35          
collective_foreign    3281           79.85          
collective_people     3255           79.22          
collective_russian    3255           79.22          
preschool_coverage    3073           74.79          
industry              2848           69.31          
volume_mining         2768           67.36          
comp_mining           2600           63.28          
invest_reg            2546           61.96          
migration             2319           56.44          
assets_new            2319           56.44          
job_regist            2220           54.03          
job_regist_unempl     2216           53.93          
servises              2187           53.22          
domesticserv          2150           52.32          
comp_manufact         2138           52.03          
comp_electr           2132           51.89    

In [12]:
columns_to_drop = [
    'assets_depreciation', 'assets_exhausted', 'assets_new',
    'catering', 'catering_growth',
    'collective_foreign', 'collective_people', 'collective_russian',
    'comp_electr', 'comp_manufact', 'comp_mining',
    'crimes', 'criminals',
    'doctors_per10', 'domesticserv', 'hospital_beds_per10',
    'invest_reg',
    'job_regist', 'job_seeker', 'job_seeker_unempl', 'migration',
    'nurses_per10', 'polycl_visits_per10',
    'pop_old', 'preschool_coverage', 'preschool_waiting',
    'retail_growth',
    'volume_electr', 'volume_manufact', 'volume_mining', 'volume_water'
]
df = df.drop(columns=columns_to_drop)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4109 entries, 0 to 4108
Data columns (total 46 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   region                4109 non-null   object 
 1   municipality          4109 non-null   object 
 2   settlement            4109 non-null   object 
 3   oktmo                 4109 non-null   int64  
 4   latitude_dd           4109 non-null   float64
 5   longitude_dd          4109 non-null   float64
 6   year                  4109 non-null   int64  
 7   assets                2134 non-null   float64
 8   birth                 4104 non-null   float64
 9   build_flat            3133 non-null   float64
 10  construction          3913 non-null   float64
 11  death                 4101 non-null   float64
 12  doctors               4015 non-null   float64
 13  hospital_beds         4008 non-null   float64
 14  hospitals             3938 non-null   float64
 15  industry             

In [16]:
df_var = df.copy()
df_med = df.copy()

In [18]:
fill_zero_cols = [
    'industry', 'servises', 'assets', 'retail', 'invest_fed', 'invest_budg',
    'investment', 'job_regist_unempl', 'workers',
    'pupils', 'schools', 'preschool', 'preschool_places', 'preschool_child',
    'polyclinic', 'hospitals', 'polycl_visits', 'hospital_beds', 'doctors', 'nurses',
    'build_flat', 'new_hospital_beds', 'new_school_places', 'new_polycl_visits',
    'new_preschool_places', 'construction', 'new_housing'
]

fill_median_cols = ['wage', 'pension']

# --- df_var: по логике ---
# 1. Заполняем 0
df_var[fill_zero_cols] = df_var[fill_zero_cols].fillna(0)

# 2. Медиана по региону
for col in fill_median_cols:
    df_var[col] = df_var.groupby('region')[col].transform(lambda x: x.fillna(x.median()))

# 3. Остальные пропуски остаются как есть

# --- df_med: медиана по колонке ---
df_med = df_med.fillna(df_med.median(numeric_only=True))

In [20]:
df_var.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4109 entries, 0 to 4108
Data columns (total 46 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   region                4109 non-null   object 
 1   municipality          4109 non-null   object 
 2   settlement            4109 non-null   object 
 3   oktmo                 4109 non-null   int64  
 4   latitude_dd           4109 non-null   float64
 5   longitude_dd          4109 non-null   float64
 6   year                  4109 non-null   int64  
 7   assets                4109 non-null   float64
 8   birth                 4104 non-null   float64
 9   build_flat            4109 non-null   float64
 10  construction          4109 non-null   float64
 11  death                 4101 non-null   float64
 12  doctors               4109 non-null   float64
 13  hospital_beds         4109 non-null   float64
 14  hospitals             4109 non-null   float64
 15  industry             

In [22]:
df_med.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4109 entries, 0 to 4108
Data columns (total 46 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   region                4109 non-null   object 
 1   municipality          4109 non-null   object 
 2   settlement            4109 non-null   object 
 3   oktmo                 4109 non-null   int64  
 4   latitude_dd           4109 non-null   float64
 5   longitude_dd          4109 non-null   float64
 6   year                  4109 non-null   int64  
 7   assets                4109 non-null   float64
 8   birth                 4109 non-null   float64
 9   build_flat            4109 non-null   float64
 10  construction          4109 non-null   float64
 11  death                 4109 non-null   float64
 12  doctors               4109 non-null   float64
 13  hospital_beds         4109 non-null   float64
 14  hospitals             4109 non-null   float64
 15  industry             

In [24]:
# Контекст (location + time)
context_cols = ['region', 'municipality', 'settlement', 'oktmo', 'latitude_dd', 'longitude_dd', 'year']
demography_cols = ['oktmo', 'year', 'pop_1_6', 'pop_work', 'pop_young', 'pens', 'rni', 'death', 'birth', 'population']
economy_cols = ['oktmo', 'year', 'industry', 'servises', 'assets', 'retail', 'invest_fed', 'invest_budg',
                'investment', 'wage', 'pension', 'job_regist_unempl', 'n_companies', 'workers']
education_cols = ['oktmo', 'year', 'pupils', 'schools', 'preschool', 'preschool_places', 'preschool_child']
healthcare_cols = ['oktmo', 'year', 'polyclinic', 'hospitals', 'polycl_visits', 'hospital_beds', 'doctors', 'nurses']
infrastructure_cols = ['oktmo', 'year', 'build_flat', 'new_hospital_beds', 'new_school_places',
                       'new_polycl_visits', 'new_preschool_places', 'construction', 'new_housing', 'living_space']

# Разделение для df_var
context_var = df_var[context_cols].drop_duplicates()
demography_var = df_var[demography_cols]
economy_var = df_var[economy_cols]
education_var = df_var[education_cols]
healthcare_var = df_var[healthcare_cols]
infrastructure_var = df_var[infrastructure_cols]

# Разделение для df_med
context_med = df_med[context_cols].drop_duplicates()
demography_med = df_med[demography_cols]
economy_med = df_med[economy_cols]
education_med = df_med[education_cols]
healthcare_med = df_med[healthcare_cols]
infrastructure_med = df_med[infrastructure_cols]


In [26]:
tables = {
    "context_var": context_var,
    "demography_var": demography_var,
    "economy_var": economy_var,
    "education_var": education_var,
    "healthcare_var": healthcare_var,
    "infrastructure_var": infrastructure_var,
    
    "context_med": context_med,
    "demography_med": demography_med,
    "economy_med": economy_med,
    "education_med": education_med,
    "healthcare_med": healthcare_med,
    "infrastructure_med": infrastructure_med
}

In [46]:
import os

# Создадим папку для экспорта (если нет)
os.makedirs("csv_export_2", exist_ok=True)

for name, df in tables.items():
    df.to_csv(f"csv_export_2/{name}.csv", index=False)
    print(f"Экспортировано: {name}.csv")


Экспортировано: context_var.csv
Экспортировано: demography_var.csv
Экспортировано: economy_var.csv
Экспортировано: education_var.csv
Экспортировано: healthcare_var.csv
Экспортировано: infrastructure_var.csv
Экспортировано: context_med.csv
Экспортировано: demography_med.csv
Экспортировано: economy_med.csv
Экспортировано: education_med.csv
Экспортировано: healthcare_med.csv
Экспортировано: infrastructure_med.csv


In [39]:
df_var.replace({'«': '"', '»': '"'}, regex=True, inplace=True)
df_med.replace({'«': '"', '»': '"'}, regex=True, inplace=True)


In [43]:
import pandas as pd
import os
import re
from collections import Counter

folder = "csv_export_2"
files_with_issues = []  # Список файлов с подозрительными символами

# Функция для проверки символов
def has_weird_chars(s):
    return isinstance(s, str) and bool(re.search(r'[^\u0000-\u052F]', s))

# Проходим по всем файлам
for filename in os.listdir(folder):
    if filename.endswith(".csv"):
        filepath = os.path.join(folder, filename)
        
        try:
            df = pd.read_csv(filepath, encoding='utf-8')
            print(f"\n📂 Проверяется файл: {filename}")
            
            mask = df.applymap(has_weird_chars)
            if mask.any().any():
                print("⚠️ Найдены подозрительные символы!")
                files_with_issues.append(filename)

                # Показать примеры
                print("Примеры строк с подозрительными символами:")
                print(df[mask.any(axis=1)].head())

                # Подсчёт подозрительных символов
                char_counts = Counter(''.join(df.select_dtypes(include='object').stack().dropna().astype(str)))
                for char, count in char_counts.items():
                    if ord(char) > 127 and not ('А' <= char <= 'я' or char in 'ёЁ'):
                        print(f"'{char}' — {count} раз (код: {ord(char)})")
            else:
                print("✅ Подозрительных символов нет.")

        except UnicodeDecodeError as e:
            print(f"💥 Ошибка кодировки в файле {filename}: {e}")

# Вывод финального результата
print("\n=== Результаты проверки ===")
if files_with_issues:
    print(f"⚠️ В {len(files_with_issues)} файлах найдены подозрительные символы:")
    for file in files_with_issues:
        print(f"- {file}")
else:
    print("✅ Все файлы чистые, подозрительных символов нет.")



=== Результаты проверки ===
✅ Все файлы чистые, подозрительных символов нет.


In [None]:
import os
import pandas as pd
from sqlalchemy import create_engine

USER = "postgres"
PASSWORD = "123"
HOST = "localhost"
PORT = 5432

def load_csv_to_postgres(folder_path, db_name):
    engine = create_engine(f"postgresql+psycopg2://{USER}:{PASSWORD}@{HOST}:{PORT}/{db_name}")

    for file in os.listdir(folder_path):
        if file.endswith(".csv"):
            table_name = file.replace("_bom.csv", "")
            csv_path = os.path.join(folder_path, file)
            
            print(f"📥 Загружаем {table_name} → {db_name}...")

            try:
                df = pd.read_csv(csv_path, encoding="utf-8")
            except UnicodeDecodeError:
                print(f"⚠️ Ошибка чтения {file} с utf-8. Пробуем cp1251...")
                df = pd.read_csv(csv_path, encoding="cp1251")

            df.to_sql(table_name, engine, if_exists="replace", index=False)
            print(f"✅ Готово: {table_name}")

var_dir = r"D:\data analytics\аналитика 6\powerBI\проект\csv_export_2\var"
med_dir = r"D:\data analytics\аналитика 6\powerBI\проект\csv_export_2\med"

load_csv_to_postgres(var_dir, "project_var")