In [1]:
import pandas as pd
import json
import ast
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
import scipy.stats as stats
import sklearn.linear_model as lm
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.preprocessing import MultiLabelBinarizer

%matplotlib inline
# !gdown --id 12ZK2_dILTS_22sNFtIOT7MjaojKPOQoo
# !gdown --id 1TeAXhAVS7TjFVpibo1Td_i-Xo5ZW3kt2

pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.options.display.max_columns = 20
pd.options.display.max_rows = 10
pd.options.display.min_rows = 10

In [2]:
df = pd.read_csv('curses.csv')

In [3]:
df['price'] = df['price'].apply(lambda x: np.nan if (x in [' нет данных ', ' бесплатно ']) | (x is np.nan) else x.split(' ')[0])\
                         .astype(float)
df['skills'] = df['skills'].apply(lambda x : len(ast.literal_eval(x)))
df['Сертификат'] = df['Сертификат'].apply(lambda x: 1 if x in ['Да', 'для получения']  else 0)
df['Трудоустройство'] = df['Трудоустройство'].apply(lambda x: 1 if x == 'Гарантия'
                                                                else 0.5 if x == 'Содействие' 
                                                                else 0)
df['Сложность'] = df['Сложность'].apply(lambda x: 1 if x == 'Продвинутый'  else 0)

In [4]:
def clear_time(data):
    if data is np.nan:
        return np.nan
    result = []
    for i in data.strip(',').split():
        if 'год' in i:
            result.append(' * 8760 +')
            if i[-1] == ',':
                break
        elif 'мес' in i:
            result.append(' * 732 +')
            if i[-1] == ',':
                break
        elif 'нед' in i:
            result.append(' * 168 +')
            if i[-1] == ',':
                break
        elif 'дн' in i:
            result.append(' * 24 +')
            if i[-1] == ',':
                break
        elif 'час' in i:
            result.append(' * 1 +')
            if i[-1] == ',':
                break
        elif 'мин' in i:
            result.append(' / 60 +')
            if i[-1] == ',':
                break
        elif (i == 'по') | ('ак' in i) | ('уро' in i):
            break
        elif (',' in i):
            result.append(i.replace(',','.'))
        else:
            result.append(i)
    result = ''.join(result)
    try:
        if result[-1] == '+':
            result = eval(result[:-1])
        else:
            result = eval(result)
    except:
        result = np.nan
    return result if result >= 0 else np.nan
df['time'] = df['time'].apply(clear_time)

In [5]:
df = df[['name', 'price', 'time', 'skills', 'Сертификат', 'Трудоустройство','Сложность']]

In [6]:
df_regression = pd.concat([ df['price'], df['skills'], df['time'], df['Сертификат'],df['Сложность']],axis=1)
resultsm = smf.ols(f'price ~ {" + ".join(df_regression.columns[1:])}', data=df_regression).fit()
resultsm.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.633
Model:,OLS,Adj. R-squared:,0.633
Method:,Least Squares,F-statistic:,1297.0
Date:,"Tue, 01 Nov 2022",Prob (F-statistic):,0.0
Time:,12:29:42,Log-Likelihood:,-36225.0
No. Observations:,3009,AIC:,72460.0
Df Residuals:,3004,BIC:,72490.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-5720.2852,2147.823,-2.663,0.008,-9931.638,-1508.933
skills,1188.6145,144.249,8.240,0.000,905.777,1471.452
time,14.5498,0.212,68.712,0.000,14.135,14.965
Сертификат,8318.0224,2014.461,4.129,0.000,4368.159,1.23e+04
Сложность,1.228e+04,1605.865,7.648,0.000,9133.234,1.54e+04

0,1,2,3
Omnibus:,3086.444,Durbin-Watson:,2.009
Prob(Omnibus):,0.0,Jarque-Bera (JB):,324404.403
Skew:,4.817,Prob(JB):,0.0
Kurtosis:,52.947,Cond. No.,15700.0


In [64]:
coef = pd.read_html(resultsm.summary().tables[1].as_html(),header=0,index_col=0)[0]['coef'].to_frame().T

In [13]:
coef.to_csv('coeft.csv', header=True)

Unnamed: 0,coef
Intercept,-5720.285
skills,1188.614
time,14.55
Сертификат,8318.022
Сложность,12280.0
