# Feature Engineering and Selection

***
**Author:** Jiacheng

**Create Time:**  2020-01-06

**Update Time:**  2020-01-09
***

## [0. EDA](#0.EDA)
## [1. 特征工程 (Feature Engineering)](#一、特征工程)
## [2. 特征选择 (Feature Selection)](#二、特征选择)

In [1]:
# 导入所需模块
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import skew
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.ensemble import IsolationForest

import warnings
# Install a global "ignore" filter so that no warning of any category is shown.
# NOTE(review): this also hides pandas deprecation and chained-assignment
# warnings raised by the cells below - consider narrowing it while debugging.
warnings.filterwarnings('ignore')

In [2]:
# Load the raw training and test sets, tagging every row with its origin
data_train = pd.read_csv('./Data_Set/train_data.csv').assign(Type='Train')
data_test = pd.read_csv('./Data_Set/test_a.csv').assign(Type='Test')

# Stack both sets into one frame with a fresh 0..n-1 index
data_all = pd.concat((data_train, data_test), axis=0, ignore_index=True)

# Report the row and column counts of each set
print("Train: ", len(data_train), "sales, and ", len(data_train.columns), "features")
print("Test: ", len(data_test), "sales, and ", len(data_test.columns), "features")

Train:  41440 sales, and  52 features
Test:  2469 sales, and  51 features


## 0.EDA

我们将之前EDA的函数整合，用来处理原始数据，随后再进行特征工程

In [3]:
def preprocessingData(df):
    """Apply the EDA-stage preprocessing to a raw train or test frame.

    The frame is modified in place and the same object is returned.

    Steps:
      * drop the ``city`` and ``ID`` columns;
      * fill missing ``pv`` / ``uv`` values with the column mean and cast both
        columns to int (the fractional part is truncated);
      * replace the ``'--'`` placeholder in ``rentType`` with ``'未知方式'``;
      * replace the ``'暂无信息'`` placeholder in ``buildYear`` with the most
        frequent remaining value, then cast the column to int;
      * split ``tradeTime`` on ``'/'`` into integer ``month`` and ``day``
        columns and drop ``tradeTime``.

    Categorical columns are intentionally left as strings here; no label
    encoding is performed in this function.

    Args:
        df: DataFrame containing at least the columns ``city``, ``ID``,
            ``pv``, ``uv``, ``rentType``, ``buildYear`` and ``tradeTime``.

    Returns:
        The same DataFrame object, after modification.

    Raises:
        KeyError: if one of the required columns is missing.
        IndexError: if ``buildYear`` holds no value other than the
            ``'暂无信息'`` placeholder, so no mode can be computed.
    """
    # Remove fields that are not used as features
    df.drop(columns=["city", "ID"], inplace=True)

    # Page views / unique visitors: mean-impute, then store as whole numbers.
    # Assigning the result back (instead of an in-place fillna on the column
    # selection) keeps this working under pandas copy-on-write.
    for col in ("pv", "uv"):
        df[col] = df[col].fillna(df[col].mean()).astype("int")

    # rentType: map the '--' placeholder to an explicit "unknown" category.
    # A single .loc call avoids chained assignment, which may write to a copy.
    df.loc[df["rentType"] == "--", "rentType"] = "未知方式"

    # buildYear: impute the "no information" placeholder with the mode of the
    # known values, then convert the column to int
    unknown_year = df["buildYear"] == "暂无信息"
    most_common_year = df.loc[~unknown_year, "buildYear"].mode().iloc[0]
    df.loc[unknown_year, "buildYear"] = most_common_year
    df["buildYear"] = df["buildYear"].astype("int")

    # tradeTime is split on '/'; the first field (the year) is discarded -
    # per the original author's note it is the same for every row
    date_parts = df["tradeTime"].str.split("/")
    df["month"] = date_parts.str[1].astype("int")
    df["day"] = date_parts.str[2].astype("int")
    df.drop(columns="tradeTime", inplace=True)

    return df

In [4]:
# Region-specific outlier rules. A row is dropped when its ``region`` equals
# the first element and every (column, comparison, value) triple holds.
# Comparisons are pandas Series method names (lt, le, gt, ge, eq).
# An empty condition tuple drops every row of that region.
# NOTE: several thresholds (e.g. tradeMoney > 40000 or area > 300) cannot match
# once the global area / tradeMoney limits in cleanData have been applied; they
# are kept verbatim so the rule set stays complete if those limits change.
_REGION_DROP_RULES = (
    ("RG00001", (("tradeMoney", "le", 1000), ("area", "ge", 50))),
    ("RG00002", (("tradeMoney", "gt", 60000), ("area", "lt", 100))),
    ("RG00003", (("tradeMoney", "gt", 30000), ("area", "lt", 300))),
    ("RG00003", (("tradeMoney", "lt", 500), ("area", "lt", 50))),
    ("RG00003", (("tradeMoney", "lt", 1500), ("area", "gt", 100))),
    ("RG00003", (("tradeMoney", "lt", 2000), ("area", "gt", 300))),
    ("RG00003", (("tradeMoney", "gt", 5000), ("area", "lt", 20))),
    ("RG00004", (("tradeMoney", "lt", 1000), ("area", "gt", 80))),
    ("RG00004", (("tradeMoney", "gt", 20000), ("area", "gt", 400))),
    ("RG00004", (("tradeMoney", "gt", 8000), ("area", "lt", 80))),
    ("RG00005", (("tradeMoney", "lt", 2000), ("area", "gt", 180))),
    ("RG00005", (("tradeMoney", "gt", 50000), ("area", "lt", 200))),
    ("RG00005", (("tradeMoney", "gt", 30000), ("area", "lt", 100))),
    ("RG00005", (("tradeMoney", "lt", 50000), ("area", "gt", 600))),
    ("RG00005", (("tradeMoney", "gt", 50000), ("area", "gt", 350))),
    ("RG00006", (("tradeMoney", "gt", 4000), ("area", "lt", 100))),
    ("RG00006", (("tradeMoney", "lt", 600), ("area", "gt", 100))),
    ("RG00006", (("area", "gt", 165),)),
    ("RG00006", (("tradeMoney", "lt", 200),)),
    ("RG00006", (("tradeMoney", "lt", 2000), ("area", "gt", 200))),
    ("RG00007", (("tradeMoney", "lt", 2500), ("area", "gt", 100))),
    ("RG00007", (("tradeMoney", "lt", 1100), ("area", "gt", 50))),
    ("RG00008", (("tradeMoney", "lt", 2000), ("area", "gt", 80))),
    ("RG00008", (("tradeMoney", "gt", 15000), ("area", "lt", 110))),
    ("RG00008", (("tradeMoney", "gt", 20000), ("area", "gt", 110))),
    ("RG00008", (("tradeMoney", "lt", 1500), ("area", "lt", 50))),
    ("RG00008", (("rentType", "eq", "合租"), ("area", "gt", 50))),
    ("RG00009", (("tradeMoney", "gt", 40000),)),
    ("RG00009", (("area", "gt", 300),)),
    ("RG00009", (("tradeMoney", "lt", 2000), ("area", "gt", 100))),
    ("RG00010", (("tradeMoney", "gt", 25000), ("area", "gt", 200))),
    ("RG00010", (("tradeMoney", "lt", 15000), ("area", "gt", 400))),
    ("RG00010", (("tradeMoney", "lt", 3000), ("area", "gt", 200))),
    ("RG00010", (("tradeMoney", "gt", 7000), ("area", "lt", 75))),
    ("RG00010", (("tradeMoney", "gt", 12500), ("area", "lt", 100))),
    ("RG00011", (("tradeMoney", "lt", 10000), ("area", "gt", 390))),
    ("RG00012", (("tradeMoney", "lt", 5000), ("area", "gt", 120))),
    ("RG00012", (("tradeMoney", "lt", 800), ("area", "lt", 30))),
    ("RG00013", (("tradeMoney", "gt", 40000), ("area", "lt", 100))),
    ("RG00013", (("tradeMoney", "gt", 50000), ("area", "gt", 400))),
    ("RG00013", (("tradeMoney", "lt", 2000), ("area", "gt", 80))),
    ("RG00014", (("tradeMoney", "gt", 40000), ("area", "gt", 300))),
    ("RG00014", (("tradeMoney", "lt", 1300), ("area", "gt", 80))),
    ("RG00014", (("tradeMoney", "lt", 8000), ("area", "gt", 200))),
    ("RG00014", (("tradeMoney", "lt", 1000), ("area", "gt", 20))),
    ("RG00014", (("tradeMoney", "gt", 25000), ("area", "gt", 200))),
    ("RG00014", (("tradeMoney", "lt", 20000), ("area", "gt", 250))),
    ("RG00015", ()),
)

# (region, area threshold): '合租' listings of that region whose area exceeds
# the threshold are relabelled '整租'.
_RELABEL_SHARED_ABOVE_AREA = (("RG00002", 50), ("RG00014", 60))


def _region_rule_mask(data, region, conditions):
    """Return a boolean Series marking rows of ``region`` that meet all conditions."""
    mask = data["region"] == region
    for column, comparison, value in conditions:
        mask = mask & getattr(data[column], comparison)(value)
    return mask


def cleanData(df, contamination=0.01, random_state=None):
    """Remove outlier rows from the training data and fix mislabelled rent types.

    The cleaning runs in four stages:
      1. an ``IsolationForest`` fitted on ``tradeMoney`` alone flags outliers,
         which are dropped;
      2. global limits: keep ``area <= 200``, drop ``totalFloor == 0`` and keep
         ``500 <= tradeMoney <= 16000``;
      3. the per-region rules in ``_REGION_DROP_RULES`` are applied in order;
      4. '合租' listings above the per-region area thresholds in
         ``_RELABEL_SHARED_ABOVE_AREA`` are relabelled '整租'.

    Side effect: the rows flagged in stage 1 are also removed from the passed
    frame in place (kept for backward compatibility). Stages 2-4 operate on a
    new frame, so callers must use the return value.

    Args:
        df: DataFrame with at least the columns ``tradeMoney``, ``area``,
            ``totalFloor``, ``region`` and ``rentType``.
        contamination: expected outlier fraction passed to ``IsolationForest``.
        random_state: seed passed to ``IsolationForest``. The default ``None``
            leaves the forest unseeded, so the set of removed rows can vary
            between runs; pass an int for reproducible cleaning.

    Returns:
        A new DataFrame with the surviving rows and a fresh 0..n-1 index.
    """
    # Stage 1: isolation forest on the target column only
    money = df["tradeMoney"].values.reshape(-1, 1)
    forest = IsolationForest(contamination=contamination, random_state=random_state)
    forest.fit(money)
    is_outlier = forest.predict(money) == -1  # predict() labels outliers as -1
    df.drop(df.loc[is_outlier].index, inplace=True)

    # Stage 2: global sanity limits (reassignment instead of an in-place drop
    # on a filtered slice)
    cleaned = df[df["area"] <= 200]
    cleaned = cleaned.drop(cleaned[cleaned["totalFloor"] == 0].index)
    cleaned = cleaned[(cleaned["tradeMoney"] <= 16000) & (cleaned["tradeMoney"] >= 500)]

    # Stage 3: region-specific outlier rules, applied one after another
    for region, conditions in _REGION_DROP_RULES:
        matched = _region_rule_mask(cleaned, region, conditions)
        cleaned = cleaned.drop(cleaned[matched].index)

    # Stage 4: large "shared" rentals in these regions are treated as whole rentals
    for region, area_limit in _RELABEL_SHARED_ABOVE_AREA:
        relabel = (
            (cleaned["region"] == region)
            & (cleaned["area"] > area_limit)
            & (cleaned["rentType"] == "合租")
        )
        cleaned.loc[relabel, "rentType"] = "整租"

    return cleaned.reset_index(drop=True)

In [5]:
# Preprocess the training set first, then the test set
data_train, data_test = preprocessingData(data_train), preprocessingData(data_test)

# Outlier cleaning is applied to the training set only
data_train = cleanData(data_train)

# Report the row and column counts after processing
print("Train: ", len(data_train), "sales, and ", len(data_train.columns), "features")
print("Test: ", len(data_test), "sales, and ", len(data_test.columns), "features")

Train:  40160 sales, and  51 features
Test:  2469 sales, and  50 features


In [6]:
# Preview the first five rows of the processed training set
data_train.head(n=5)

Unnamed: 0,area,rentType,houseType,houseFloor,totalFloor,houseToward,houseDecoration,communityName,region,plate,...,totalWorkers,newWorkers,residentPopulation,pv,uv,lookNum,tradeMoney,Type,month,day
0,68.06,未知方式,2室1厅1卫,低,16,暂无数据,其他,XQ00051,RG00001,BK00064,...,28248,614,111546,1124,284,0,2000.0,Train,11,28
1,125.55,未知方式,3室2厅2卫,中,14,暂无数据,简装,XQ00130,RG00002,BK00049,...,14823,148,157552,701,22,1,2000.0,Train,12,16
2,132.0,未知方式,3室2厅2卫,低,32,暂无数据,其他,XQ00179,RG00002,BK00050,...,77645,520,131744,57,20,1,16000.0,Train,12,22
3,57.0,未知方式,1室1厅1卫,中,17,暂无数据,精装,XQ00313,RG00002,BK00051,...,8750,1665,253337,888,279,9,1600.0,Train,12,21
4,129.0,未知方式,3室2厅3卫,低,2,暂无数据,毛坯,XQ01257,RG00003,BK00044,...,800,117,125309,2038,480,0,2900.0,Train,11,18


## 一、特征工程

数据处理完毕，接下来可以构造一下新的特征  
选择部分特征用热图查看一下相关性  

## 二、特征选择