In [17]:
import pandas as pd
import glob
import os

# Directory holding the exported vacancy CSV files.
# Written as a raw string: the non-raw form only worked because "\p", "\Y",
# "\D", "\e" and "\d" are not recognised escape sequences, which Python now
# warns about; a folder starting with e.g. "t" or "n" would corrupt the path.
path = r'C:\programming\Year3\DataViz\extra\data' # use your path

# glob returns matches in an unspecified order; sort so that the row order of
# the concatenated frame is reproducible between runs and machines.
all_files = sorted(glob.glob(os.path.join(path , "*.csv")))
print(all_files)

# Fail with a clear message instead of pd.concat's "No objects to concatenate".
if not all_files:
    raise FileNotFoundError(f"No CSV files found in {path!r}")

# One DataFrame per file; the first line of every file is its header row.
li = [pd.read_csv(filename, index_col=None, header=0) for filename in all_files]

# Stack all files vertically and renumber the rows 0..n-1.
frame = pd.concat(li, axis=0, ignore_index=True)

frame.info()

['C:\\programming\\Year3\\DataViz\\extra\\data\\vacancies_detailed(1).csv', 'C:\\programming\\Year3\\DataViz\\extra\\data\\vacancies_detailed(2).csv', 'C:\\programming\\Year3\\DataViz\\extra\\data\\vacancies_detailed(3).csv', 'C:\\programming\\Year3\\DataViz\\extra\\data\\vacancies_detailed(4).csv']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1040 entries, 0 to 1039
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      1040 non-null   int64  
 1   name                    1040 non-null   object 
 2   area                    1040 non-null   object 
 3   salary_from             828 non-null    float64
 4   salary_to               479 non-null    float64
 5   currency                878 non-null    object 
 6   snippet_requirement     1011 non-null   object 
 7   snippet_responsibility  994 non-null    object 
 8   experience              1040 non-null   object 
 9   empl

In [18]:
import json
from collections import Counter

frame = frame.drop_duplicates('id')  # Remove duplicate jobs


def _parse_skills(raw):
    """Return the skills in ``raw`` as a list of stripped, non-empty strings.

    ``raw`` may be the comma-separated string read from the CSV, a list/tuple
    left by an earlier run of this cell, or a missing value (which yields an
    empty list).
    """
    if isinstance(raw, str):
        raw = raw.split(',')
    elif not isinstance(raw, (list, tuple)):
        # NaN / None / any other non-text value: no skills listed
        return []
    return [skill.strip() for skill in raw if isinstance(skill, str) and skill.strip()]


# Convert key_skills from string to list.
# The parser accepts already-parsed lists, so re-running this cell is safe
# (calling pd.notna() on a list returns an array and breaks the `if` test).
frame['key_skills'] = frame['key_skills'].apply(_parse_skills)

# Analyze skills: flatten the per-vacancy lists and count each skill
all_skills = [skill for skills in frame['key_skills'] for skill in skills]
skill_counts = Counter(all_skills)
top_skills = [{'skill': skill, 'count': count} for skill, count in skill_counts.most_common(15)]

top_skills

[{'skill': 'Ветеринария', 'count': 159},
 {'skill': 'Работа в команде', 'count': 97},
 {'skill': 'Пользователь ПК', 'count': 84},
 {'skill': 'Ответственность', 'count': 80},
 {'skill': 'Работа с животными', 'count': 71},
 {'skill': 'Грамотная речь', 'count': 60},
 {'skill': 'Лечение животных', 'count': 49},
 {'skill': 'Активные продажи', 'count': 49},
 {'skill': 'Знание ветеринарных препаратов', 'count': 45},
 {'skill': 'Консультативные продажи', 'count': 45},
 {'skill': 'Исполнительность', 'count': 45},
 {'skill': 'Организаторские навыки', 'count': 39},
 {'skill': 'Уход за животными', 'count': 37},
 {'skill': 'Вакцинация животных', 'count': 37},
 {'skill': 'Деловое общение', 'count': 36}]

In [33]:
# Analyze experience levels
# Maps the experience labels found in the data to short keys.
experience_mapping = {
        'Нет опыта': 'no_exp',
        'От 1 года до 3 лет': '1-3',
        'От 3 до 6 лет': '3-6',
        'Более 6 лет': '6+'
}

# Normalize once and derive the counts from the normalized column, so the
# column and the summary always agree. A label missing from the mapping keeps
# its original text instead of turning into NaN (Series.map with a dict would
# drop it from every later group-by).
frame['experience_normalized'] = frame['experience'].map(
    lambda label: experience_mapping.get(label, label)
)

# value_counts() ignores missing values and orders by descending frequency.
experience_counts = frame['experience_normalized'].value_counts().to_dict()
experience_data = dict(experience_counts)
experience_data

{'1-3': 446, 'no_exp': 428, '3-6': 120, '6+': 24}

In [34]:
# Analyze employment types: number of vacancies per employment label,
# most frequent first (missing labels are not counted).
employment_column = frame['employment']
employment_tally = employment_column.value_counts()
employment_counts = employment_tally.to_dict()
employment_counts

{'Полная занятость': 972,
 'Частичная занятость': 42,
 'Проектная работа': 3,
 'Стажировка': 1}

In [26]:
from forex_python.converter import CurrencyRates
from datetime import datetime

# Approximate value in RUB of one unit of each currency, used only when the
# live rate lookup fails. Keys are the currency codes as they appear in the data.
_FALLBACK_RATES_TO_RUB = {
    # Was 0.0032, which contradicted its own comment by a factor of ~10^4.
    # NOTE(review): assumes 'BYR' in the data denotes the current Belarusian
    # ruble (BYN) - confirm against the data source.
    'BYR': 30.0,    # 1 BYN ≈ 30 RUB (Belarusian Ruble)
    'KZT': 0.18,    # 1 KZT ≈ 0.18 RUB (Kazakhstani Tenge)
    'UZS': 0.0076,  # 1 UZS ≈ 0.0076 RUB (Uzbekistani Som)
    'USD': 90.0,    # Approximate rates
    'EUR': 100.0
}


def convert_to_rub(amount, currency, date=None):
    """Convert ``amount`` from ``currency`` to RUB using forex-python.

    Args:
        amount: Numeric amount; a missing value (NaN/None) is returned as is.
        currency: Currency code of ``amount``. 'RUR', 'RUB' and a missing
            code are treated as "already in RUB" and skip the rate lookup.
        date: Optional date for the exchange rate; defaults to the current
            date and time.

    Returns:
        ``amount`` multiplied by the live rate, or by the approximate fallback
        rate when the lookup fails. A currency without a fallback rate is
        converted with a factor of 1.
    """
    if pd.isna(amount) or pd.isna(currency) or currency in ('RUR', 'RUB'):
        return amount

    try:
        rate = CurrencyRates().get_rate(currency, 'RUB', date_obj=date or datetime.now())
    except Exception:
        # Best effort: any lookup failure falls back to the static table.
        # `Exception` rather than a bare `except`, so KeyboardInterrupt and
        # SystemExit still stop a long-running conversion.
        rate = _FALLBACK_RATES_TO_RUB.get(currency, 1)  # Default to 1 if unknown currency
    return amount * rate

In [28]:
# Currency conversion for salaries: every salary bound gets a RUB-denominated
# twin column, converted row by row with the row's own currency code.
rub_columns = {'salary_from': 'salary_from_rub', 'salary_to': 'salary_to_rub'}
for source_col, target_col in rub_columns.items():
    frame[target_col] = frame.apply(
        lambda row, col=source_col: convert_to_rub(row[col], row['currency']),
        axis=1,
    )

# Calculate midpoint salary for analysis.
# The row-wise mean skips missing values, so a vacancy with a single bound
# uses that bound; it is NaN only when both bounds are missing.
rub_bounds = frame[list(rub_columns.values())]
frame['salary_mid_rub'] = rub_bounds.mean(axis=1)
frame['salary_mid_rub']

0            NaN
1        30000.0
2        85000.0
3        67500.0
4            NaN
          ...   
1035     35000.0
1036    125000.0
1037     39000.0
1038     39000.0
1039     35000.0
Name: salary_mid_rub, Length: 1018, dtype: float64

In [35]:
# Salary analysis (only for jobs with salary data):
# keep the rows whose RUB midpoint could be computed.
has_salary = frame['salary_mid_rub'].notna()
salary_df = frame[has_salary]

# Currency usage is counted over all vacancies, not only the salaried ones.
currency_breakdown = frame['currency'].value_counts().to_dict()

salary_stats = {
    'average_from': salary_df['salary_from_rub'].mean(),
    'average_to': salary_df['salary_to_rub'].mean(),
    'median': salary_df['salary_mid_rub'].median(),
    'currency_original': currency_breakdown,
    'currency_used': 'RUB',
    'jobs_with_salary': len(salary_df),
}

# Salary by experience: mean midpoint and number of vacancies per level
mid_by_experience = salary_df.groupby('experience_normalized')['salary_mid_rub']
salary_by_exp = mid_by_experience.agg(['mean', 'count']).to_dict()
salary_by_exp

{'mean': {'1-3': 84469.22619946091,
  '3-6': 124028.26216216217,
  '6+': 90532.8705882353,
  'no_exp': 56110.80243654823},
 'count': {'1-3': 371, '3-6': 74, '6+': 17, 'no_exp': 394}}

In [36]:
# Prepare output
def _json_safe(value):
    """Return a copy of ``value`` with every float NaN replaced by None.

    json.dump writes NaN as the bare token ``NaN``, which is not valid JSON
    and is rejected by strict parsers; None is written as ``null`` instead.
    Dicts and lists are processed recursively, other values pass through.
    """
    if isinstance(value, dict):
        return {key: _json_safe(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [_json_safe(item) for item in value]
    if isinstance(value, float) and pd.isna(value):
        return None
    return value


# A vacancy counts as having salary data when either bound is given, matching
# salary_stats['jobs_with_salary'] (which is based on the from/to midpoint).
# Counting only salary_from missed vacancies that state just an upper bound.
has_salary = frame[['salary_from', 'salary_to']].notna().any(axis=1)

output = _json_safe({
        'metadata': {
            'total_jobs': len(frame),
            # NOTE(review): assumes 'name' holds the employer name rather than
            # the vacancy title - confirm against the CSV schema.
            'unique_companies': frame['name'].nunique(),
            'unique_locations': frame['area'].nunique(),
            'jobs_with_salary': int(has_salary.sum())
        },
        'skills': top_skills,
        'experience': experience_data,
        'employment': employment_counts,
        'salary': salary_stats,
        'raw_data_sample': frame.head(3).to_dict('records')  # For debugging
})

# Save to JSON (UTF-8, Cyrillic text kept readable rather than \u-escaped)
with open('output.json', 'w', encoding='utf-8') as f:
    json.dump(output, f, ensure_ascii=False, indent=2)