#자동차 시장 세분화

* 자동차 회사는 새로운 전략을 수립하기 위해 4개의 시장으로 세분화 했다
* 기존 고객 분류 자료를 바탕으로 신규 고객이 어떤 분류에 속할지 예측해라
* 예측할 값(y) : "Segmentation" (1,2,3,4)
* 평가 : Marco f1-score


In [121]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score, precision_score,recall_score

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [75]:
# 데이터 불러오기
import pandas as pd
train = pd.read_csv('https://raw.githubusercontent.com/HyeonSu-Kang/data/refs/heads/main/train_04.csv')
test = pd.read_csv('https://raw.githubusercontent.com/Soyoung-Yoon/bigdata/refs/heads/main/test_04.csv')

display(train.head(3))
display(test.head(3))

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,4
1,466315,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,2
2,461735,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,2


Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1
0,458989,Female,Yes,36,Yes,Engineer,0.0,Low,1.0,Cat_6
1,458994,Male,Yes,37,Yes,Healthcare,8.0,Average,4.0,Cat_6
2,459000,Male,Yes,59,No,Executive,11.0,High,2.0,Cat_6


In [76]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6665 entries, 0 to 6664
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               6665 non-null   int64  
 1   Gender           6665 non-null   object 
 2   Ever_Married     6665 non-null   object 
 3   Age              6665 non-null   int64  
 4   Graduated        6665 non-null   object 
 5   Profession       6665 non-null   object 
 6   Work_Experience  6665 non-null   float64
 7   Spending_Score   6665 non-null   object 
 8   Family_Size      6665 non-null   float64
 9   Var_1            6665 non-null   object 
 10  Segmentation     6665 non-null   int64  
dtypes: float64(2), int64(3), object(6)
memory usage: 572.9+ KB


In [77]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2154 entries, 0 to 2153
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               2154 non-null   int64  
 1   Gender           2154 non-null   object 
 2   Ever_Married     2154 non-null   object 
 3   Age              2154 non-null   int64  
 4   Graduated        2154 non-null   object 
 5   Profession       2154 non-null   object 
 6   Work_Experience  2154 non-null   float64
 7   Spending_Score   2154 non-null   object 
 8   Family_Size      2154 non-null   float64
 9   Var_1            2154 non-null   object 
dtypes: float64(2), int64(2), object(6)
memory usage: 168.4+ KB


In [78]:
train['Segmentation'].unique()

array([4, 2, 3, 1])

In [79]:
X = train.drop(columns='Segmentation')
X_submission = test.copy()
Y = train[['ID','Segmentation']]

In [80]:
dfX = pd.concat([X,X_submission])
print(dfX.isna().sum())
print(dfX.info())

ID                 0
Gender             0
Ever_Married       0
Age                0
Graduated          0
Profession         0
Work_Experience    0
Spending_Score     0
Family_Size        0
Var_1              0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Index: 8819 entries, 0 to 2153
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               8819 non-null   int64  
 1   Gender           8819 non-null   object 
 2   Ever_Married     8819 non-null   object 
 3   Age              8819 non-null   int64  
 4   Graduated        8819 non-null   object 
 5   Profession       8819 non-null   object 
 6   Work_Experience  8819 non-null   float64
 7   Spending_Score   8819 non-null   object 
 8   Family_Size      8819 non-null   float64
 9   Var_1            8819 non-null   object 
dtypes: float64(2), int64(2), object(6)
memory usage: 757.9+ KB
None


In [81]:
# Gender,Ever_Married,Graduated,Profession,Spending_Score,Var_1 Label Encoding
print(dfX['Gender'].unique())
dfX['Gender'] = dfX['Gender'].replace(['Male','Female'],[0,1])

print(dfX['Ever_Married'].unique())
dfX['Ever_Married'] = dfX['Ever_Married'].replace(['No','Yes'],[0,1])

print(dfX['Graduated'].unique())
dfX['Graduated'] = dfX['Graduated'].replace(['No','Yes'],[0,1])

print(dfX['Profession'].unique())
dfX['Profession'] = dfX['Profession'].replace(['Healthcare','Engineer','Lawyer','Artist','Doctor','Homemaker','Entertainment','Marketing','Executive']
                          ,[0,1,2,3,4,5,6,7,8])

print(dfX['Spending_Score'].unique())
dfX['Spending_Score'] = dfX['Spending_Score'].replace(['Low','High','Average'],[0,1,2])

print(dfX['Var_1'].nunique())
dfX['Var_1'] = dfX['Var_1'].replace(['Cat_4','Cat_6','Cat_7','Cat_3','Cat_1','Cat_2','Cat_5']
                                    ,[0,1,2,3,4,5,6])

['Male' 'Female']
['No' 'Yes']
['No' 'Yes']
['Healthcare' 'Engineer' 'Lawyer' 'Artist' 'Doctor' 'Homemaker'
 'Entertainment' 'Marketing' 'Executive']
['Low' 'High' 'Average']
7


  dfX['Gender'] = dfX['Gender'].replace(['Male','Female'],[0,1])
  dfX['Ever_Married'] = dfX['Ever_Married'].replace(['No','Yes'],[0,1])
  dfX['Graduated'] = dfX['Graduated'].replace(['No','Yes'],[0,1])
  dfX['Profession'] = dfX['Profession'].replace(['Healthcare','Engineer','Lawyer','Artist','Doctor','Homemaker','Entertainment','Marketing','Executive']
  dfX['Spending_Score'] = dfX['Spending_Score'].replace(['Low','High','Average'],[0,1,2])
  dfX['Var_1'] = dfX['Var_1'].replace(['Cat_4','Cat_6','Cat_7','Cat_3','Cat_1','Cat_2','Cat_5']


In [82]:
dfX.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8819 entries, 0 to 2153
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               8819 non-null   int64  
 1   Gender           8819 non-null   int64  
 2   Ever_Married     8819 non-null   int64  
 3   Age              8819 non-null   int64  
 4   Graduated        8819 non-null   int64  
 5   Profession       8819 non-null   int64  
 6   Work_Experience  8819 non-null   float64
 7   Spending_Score   8819 non-null   int64  
 8   Family_Size      8819 non-null   float64
 9   Var_1            8819 non-null   int64  
dtypes: float64(2), int64(8)
memory usage: 757.9 KB


In [112]:
def get_scores(model,xtrain,xtest,ytrain,ytest):
  A = model.score(xtrain,ytrain)
  B = model.score(xtest,ytest)
  pred = model.predict(xtest)
  C = f1_score(ytest,pred,average='macro')
  proba = model.predict_proba(xtest)
  D = roc_auc_score(ytest,proba,average='macro',multi_class='ovo')
  return f'ACC : {A:.4f} {B:.4f} f1_score : {C:.4f} roc : {D:.4f}'

In [129]:
def make_models(xtrain,xtest,ytrain,ytest):
  model_1 = LogisticRegression().fit(xtrain,ytrain)
  print('model_1',get_scores(model_1,xtrain,xtest,ytrain,ytest))

  for i in range(1,10):
    model_2 = KNeighborsClassifier(i).fit(xtrain,ytrain)
    print('model_2',get_scores(model_2,xtrain,xtest,ytrain,ytest))

  model_3 = DecisionTreeClassifier(random_state=1).fit(xtrain,ytrain)
  print('model_3',get_scores(model_3,xtrain,xtest,ytrain,ytest))

  for k in range(3,8):
    model_3 = DecisionTreeClassifier(random_state=1,max_depth=k).fit(xtrain,ytrain)
    print('model_3',k,get_scores(model_3,xtrain,xtest,ytrain,ytest))

  model_4 = RandomForestClassifier(500,random_state=1).fit(xtrain,ytrain)
  print('model_4',get_scores(model_4,xtrain,xtest,ytrain,ytest))

  for t in range(3,8):
    model_4 = RandomForestClassifier(500,random_state=1,max_depth=t).fit(xtrain,ytrain)
    print('model_4',t,get_scores(model_4,xtrain,xtest,ytrain,ytest))




In [106]:
def get_data(dfX,Y):
  X = dfX.drop(columns='ID')
  X_use = dfX.iloc[:6665,:]
  X_submission = dfX.iloc[6665:,:]
  Y1 = Y['Segmentation']
  scaler = MinMaxScaler()
  X1_use = scaler.fit_transform(X_use)
  X1_submission = scaler.transform(X_submission)
  return X1_use,X1_submission,Y1

In [130]:
 X1_use,X1_submission,Y1= get_data(dfX,Y)
 xtrain1,xtest1,ytrain1,ytest1 = train_test_split(X1_use,Y1,stratify=Y1,
                                                  random_state=4231,
                                                  test_size=0.2)

 make_models(xtrain1,xtest1,ytrain1,ytest1)

model_1 ACC : 0.4709 0.4711 f1_score : 0.4363 roc : 0.7308
model_2 ACC : 1.0000 0.4471 f1_score : 0.4429 roc : 0.6288
model_2 ACC : 0.7192 0.4119 f1_score : 0.4148 roc : 0.6707
model_2 ACC : 0.6731 0.4561 f1_score : 0.4527 roc : 0.6926
model_2 ACC : 0.6437 0.4561 f1_score : 0.4538 roc : 0.7076
model_2 ACC : 0.6245 0.4689 f1_score : 0.4652 roc : 0.7128
model_2 ACC : 0.6108 0.4696 f1_score : 0.4648 roc : 0.7190
model_2 ACC : 0.6018 0.4614 f1_score : 0.4553 roc : 0.7210
model_2 ACC : 0.5928 0.4764 f1_score : 0.4711 roc : 0.7239
model_2 ACC : 0.5876 0.4591 f1_score : 0.4512 roc : 0.7250
model_3 ACC : 1.0000 0.4291 f1_score : 0.4264 roc : 0.6166
model_3 3 ACC : 0.4871 0.4936 f1_score : 0.4629 roc : 0.7278
model_3 4 ACC : 0.5191 0.4906 f1_score : 0.4843 roc : 0.7545
model_3 5 ACC : 0.5358 0.5079 f1_score : 0.4965 roc : 0.7618
model_3 6 ACC : 0.5550 0.5056 f1_score : 0.4993 roc : 0.7575
model_3 7 ACC : 0.5722 0.4989 f1_score : 0.4897 roc : 0.7473
model_4 ACC : 1.0000 0.5079 f1_score : 0.4991 

In [134]:
#model_4 7 ACC : 0.5932 0.5169 f1_score : 0.5049 roc : 0.7834

final_model = RandomForestClassifier(500,random_state=1,max_depth=7).fit(xtrain1,ytrain1)
print('final_model',get_scores(final_model,xtrain1,xtest1,ytrain1,ytest1))

final_model ACC : 0.5932 0.5169 f1_score : 0.5049 roc : 0.7834


In [137]:
pred = final_model.predict_proba(X1_submission)[:,1]
submission = pd.DataFrame({'ID' : X_submission['ID'],
                           'Segmentation' : pred})
submission

Unnamed: 0,ID,Segmentation
0,458989,0.333355
1,458994,0.168903
2,459000,0.284474
3,459003,0.377576
4,459005,0.296947
...,...,...
2149,467950,0.203532
2150,467954,0.082544
2151,467958,0.306125
2152,467961,0.442716


In [138]:
submission.to_csv('number.csv',index=False)