In [1]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import SVR, SVC
from xgboost import XGBRegressor, XGBClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error, roc_auc_score, f1_score, recall_score
import pandas as pd
import numpy as np

In [2]:
# 데이터 불러오기
df = pd.read_csv('주최측데이터.csv')

In [3]:
df = df.dropna(subset = ['age','gender','genre','place','running_time','price'])
df = df[['age','gender','genre','place','running_time','price']]
df

Unnamed: 0,age,gender,genre,place,running_time,price
0,50.0,F,교향곡,콘서트홀,120,10000
4,50.0,M,독주,콘서트홀,90,180000
5,30.0,F,교향곡,콘서트홀,100,144000
8,30.0,F,실내악,IBK챔버홀,80,56000
11,50.0,F,오페라,콘서트홀,120,24000
...,...,...,...,...,...,...
1920853,60.0,F,교향곡,콘서트홀,110,90000
1920855,50.0,M,교향곡,콘서트홀,120,10000
1920860,60.0,M,클래식,IBK챔버홀,100,15000
1920864,20.0,F,교향곡,콘서트홀,120,21000


In [6]:
df['genre'].value_counts()/len(df)

교향곡      0.418964
클래식      0.207970
독주       0.165467
실내악      0.115601
합창       0.035088
성악       0.027004
콘서트      0.010771
오페라      0.008828
복합장르     0.007929
재즈       0.001291
기타       0.000553
크로스오버    0.000486
가족극      0.000031
전시       0.000016
무용       0.000002
Name: genre, dtype: float64

In [7]:
lb = LabelEncoder()
a = lb.fit_transform(df['gender'])
df['gender'] = a

df = pd.concat([df,pd.get_dummies(df['place'])], axis=1)
df = df.drop('place',axis=1)

df['new_genre'] = df['genre'].apply(lambda x: 1 if x == '교향곡' else 0)
df = df.drop('genre',axis=1)

In [8]:
x1 = df.drop('price',axis=1)
x2 = df.drop('new_genre',axis=1)
y1 = df['price']
y2 = df['new_genre']

In [9]:
x1_train, x1_test, y1_train, y1_test = train_test_split(x1, y1)
x2_train, x2_test, y2_train, y2_test = train_test_split(x2, y2, stratify = y1)

In [10]:
ms = MinMaxScaler()
ms.fit(x1_train)
x1_train_ms = ms.transform(x1_train)
x1_test_ms = ms.transform(x1_test)

In [11]:
ms = MinMaxScaler()
ms.fit(x2_train)
x2_train_ms = ms.transform(x2_train)
x2_test_ms = ms.transform(x2_test)

In [12]:
lr_model = LinearRegression()
lr_model.fit(x1_train_ms, y1_train)
pred_lr = lr_model.predict(x1_test_ms)

In [13]:
glm_model = LogisticRegression()
glm_model.fit(x2_train_ms, y2_train)
pred_glm = glm_model.predict(x2_test_ms)

In [14]:
xgbrg_model = XGBRegressor()
xgbrg_model.fit(x1_train_ms, y1_train)
pred_xgbrg = xgbrg_model.predict(x1_test_ms)

In [15]:
xgbcf_model = XGBClassifier()
xgbcf_model.fit(x2_train_ms, y2_train)
pred_xgbcf = xgbcf_model.predict(x2_test_ms)

In [16]:
f1_score(y2_test, pred_glm)

0.6423807005171603

In [17]:
f1_score(y2_test, pred_xgbcf)

0.7914110429447854

In [18]:
mean_squared_error(y1_test, pred_lr)

2981805238.0968914

In [19]:
mean_squared_error(y1_test, pred_xgbrg)

2557886903.5370226

In [20]:
x1_train

Unnamed: 0,age,gender,running_time,IBK챔버홀,리사이틀홀,콘서트홀,new_genre
405838,40.0,0,70,0,1,0,0
1043228,60.0,0,120,0,0,1,1
1780827,40.0,0,120,0,0,1,1
897572,20.0,0,120,1,0,0,0
928419,30.0,0,120,0,0,1,1
...,...,...,...,...,...,...,...
511114,30.0,0,105,0,0,1,0
832411,60.0,0,120,0,0,1,0
1654907,40.0,0,90,0,0,1,1
248365,40.0,0,105,0,0,1,1


In [21]:
x2_train

Unnamed: 0,age,gender,running_time,price,IBK챔버홀,리사이틀홀,콘서트홀
37756,40.0,0,100,99000,0,0,1
1641511,50.0,1,120,90000,0,0,1
1781712,40.0,0,100,80000,0,0,1
564817,20.0,1,110,66000,0,0,1
1783824,30.0,1,120,70000,0,0,1
...,...,...,...,...,...,...,...
141064,30.0,1,85,152000,0,0,1
1375155,40.0,0,120,30000,0,0,1
1788646,60.0,1,100,20000,1,0,0
1140701,50.0,1,120,105000,0,0,1


In [30]:
lr_model.coef_

array([-2.12008630e+03, -5.44554184e+03,  2.40800417e+04,  1.93601004e+14,
        1.93601004e+14,  1.93601004e+14, -1.23884802e+04])

In [31]:
glm_model.coef_

array([[ 0.59246784,  0.37237905,  3.1931001 , -1.6786284 , -1.15803445,
        -2.91423062,  1.50058373]])

In [32]:
xgbrg_model.feature_importances_

array([1.8172659e-02, 5.8546336e-03, 9.5153034e-02, 3.0036462e-02,
       4.2266838e-04, 6.3064283e-01, 2.1971773e-01], dtype=float32)

In [33]:
xgbcf_model.feature_importances_

array([0.00564923, 0.00820833, 0.0354966 , 0.04299395, 0.01748492,
       0.00991943, 0.88024753], dtype=float32)