In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from collections import Counter
import lightgbm as lgb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
train = pd.read_csv("tcd-ml-1920-group-income-train.csv")
test = pd.read_csv("tcd-ml-1920-group-income-test.csv")
train = train.sample(frac = 1)
train.shape

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


(1048574, 17)

In [3]:
train_missing = (train.isnull().sum()/len(train))*100
train_missing = train_missing.drop(train_missing[train_missing==0].index).sort_values(ascending=False)
miss_data = pd.DataFrame({'缺失百分比':train_missing})
miss_data

Unnamed: 0,缺失百分比
University Degree,7.68663
Gender,7.069315
Hair Color,6.695856
Satisfation with employer,3.632266
Year of Record,0.38271
Profession,0.272084


In [4]:
train.head()

Unnamed: 0,Instance,Year of Record,Housing Situation,Crime Level in the City of Employement,Work Experience in Current Job [years],Satisfation with employer,Gender,Age,Country,Size of City,Profession,University Degree,Wears Glasses,Hair Color,Body Height [cm],Yearly Income in addition to Salary (e.g. Rental Income),Total Yearly Income [EUR]
877481,820617,2006.0,Large Apartment,60,12,Average,male,25,Lebanon,770272,public relations specialist,Bachelor,1,Red,139,0 EUR,87910.18
429170,429171,1974.0,Castle,144,34,Happy,other,93,Slovakia,686288,office clerk,No,1,Blond,175,0 EUR,15511.54
130518,130519,1950.0,0,81,17,Happy,male,42,Nicaragua,49609,management analyst,Master,0,Black,163,0 EUR,1019.35
518277,486004,1979.0,Large House,111,14,Average,female,28,Guinea-Bissau,859039,security specialist,Bachelor,0,Brown,147,0 EUR,56975.39
30111,30112,1942.0,0,93,20,Average,other,45,Kyrgyzstan,41946,statistical assistant,Master,1,Black,165,0 EUR,415.01


In [5]:
data = pd.concat([train,test],ignore_index=True)

data['University Degree']=data['University Degree'].fillna('Bachelor')

data['Gender']=data['Gender'].replace('m','male')
data['Gender']=data['Gender'].replace('f','female')
data['Gender']=data['Gender'].replace('unknown','other')
data['Gender']=data['Gender'].fillna('female')

data['Housing Situation']=data['Housing Situation'].replace('nA','0')

data['Hair Color']=data['Hair Color'].fillna(method='bfill')

data['Satisfation with employer']=data['Satisfation with employer'].fillna('Average')

data.fillna(value={'Year of Record':data['Year of Record'].mean()}, inplace=True)

data['Profession']=data['Profession'].fillna(method='bfill')

data['Country']=data['Country'].fillna(method='bfill')

data.shape

(1418012, 17)

In [6]:
#构造等级特征
data['Satisfation with employer'] = data['Satisfation with employer'].map \
     ({'Average':2, 'Happy':4, 'Somewhat Happy':3, 'Unhappy':1})

In [7]:
data.isnull().any()

Instance                                                    False
Year of Record                                              False
Housing Situation                                           False
Crime Level in the City of Employement                      False
Work Experience in Current Job [years]                      False
Satisfation with employer                                   False
Gender                                                      False
Age                                                         False
Country                                                     False
Size of City                                                False
Profession                                                  False
University Degree                                           False
Wears Glasses                                               False
Hair Color                                                  False
Body Height [cm]                                            False
Yearly Inc

In [8]:
#对于每个country和profession特征，用其特征值下收入均值来替换
country_income = dict(train.groupby('Country').mean()['Total Yearly Income [EUR]']/10000)
data.Country = data.Country.map(country_income)
data.Country = data.Country.fillna(data.Country.mean())
country_income = dict(train.groupby('Profession').mean()['Total Yearly Income [EUR]']/10000)
data.Profession = data.Profession.map(country_income)
country_income = dict(train.groupby('Profession').mean()['Total Yearly Income [EUR]']/10000)
data.Profession = data.Profession.map(country_income)

#前面的254287数据用来构造均值特征
sp = 254287

In [9]:
#转换成数值
data.iloc[:,-2] = data.iloc[:,-2].map(lambda x: float(x[:-3]))

In [10]:
data.head()

Unnamed: 0,Instance,Year of Record,Housing Situation,Crime Level in the City of Employement,Work Experience in Current Job [years],Satisfation with employer,Gender,Age,Country,Size of City,Profession,University Degree,Wears Glasses,Hair Color,Body Height [cm],Yearly Income in addition to Salary (e.g. Rental Income),Total Yearly Income [EUR]
0,820617,2006.0,Large Apartment,60,12,2,male,25,5.613706,770272,,Bachelor,1,Red,139,0.0,87910.18
1,429171,1974.0,Castle,144,34,4,other,93,5.544517,686288,,No,1,Blond,175,0.0,15511.54
2,130519,1950.0,0,81,17,4,male,42,5.641801,49609,,Master,0,Black,163,0.0,1019.35
3,486004,1979.0,Large House,111,14,2,female,28,16.573602,859039,,Bachelor,0,Brown,147,0.0,56975.39
4,30112,1942.0,0,93,20,2,other,45,5.659443,41946,,Master,1,Black,165,0.0,415.01


In [11]:
cats = ['Year of Record', 'Housing Situation','Country','Size of City',
        'Crime Level in the City of Employement','Work Experience in Current Job [years]']
cons = ['Satisfation with employer','Gender','Age',
       'University Degree','Body Height [cm]','Profession']
data['Work Experience in Current Job [years]'] = data['Work Experience in Current Job [years]'].astype(str)

In [12]:
#This is the inspiration I got from the best code. 
#I added and constructed mean features, cross mean features and Category Characteristics

def create_feature(df,cats,cons,normalize=True):
    for cat in cats:
        value = df[cat].value_counts(dropna=False, normalize=normalize).to_dict()
        num = cat + '_FE_FULL'
        df[num + num] = df[cat].map(value)
        #构造均值特征mean features
        df[num] = df[cat].map( dict(df.iloc[:sp].groupby(cat).mean()['Total Yearly Income [EUR]']/10000))
        df[num] = df[num].fillna(df[num].mean())
        df[num] = df[num].astype('float32')
        for con in cons:
            new_col = cat +'_'+ con
            df[new_col] = df[cat].astype(str)+'_'+df[con].astype(str)
            temp_df = df[new_col]
            fq_encode = temp_df.value_counts(normalize=True).to_dict()
            #构造交叉均值特征cross mean features
            df[new_col] = df[new_col].map( dict(df.iloc[:sp+1].groupby( \
                               new_col).mean()['Total Yearly Income [EUR]']/10000))
            df[new_col] = df[new_col].fillna(df[new_col].mean())
    return df

data = create_feature(data,cats,cons)
data['Work Experience in Current Job [years]'] = data['Work Experience in Current Job [years]' \
                                                     ].replace('#NUM!', data.iloc[:, -1].mean()).astype(float)

#构造类别特征 Category Characteristics
for col in data.dtypes[data.dtypes == 'object'].index.tolist():
    feat_le = LabelEncoder()
    feat_le.fit(data[col].unique().astype(str))
    data[col] = feat_le.transform(data[col].astype(str))

del_col = set(['Total Yearly Income [EUR]','Instance'])
features_col =  list(set(data) - del_col)
features_col

['Crime Level in the City of Employement_Gender',
 'Size of City_University Degree',
 'Size of City_Profession',
 'Hair Color',
 'Work Experience in Current Job [years]_Satisfation with employer',
 'Year of Record_University Degree',
 'Size of City_FE_FULLSize of City_FE_FULL',
 'Size of City_Age',
 'Housing Situation_Profession',
 'Country_Age',
 'Year of Record_Age',
 'Crime Level in the City of Employement_Profession',
 'Housing Situation_Satisfation with employer',
 'Country_FE_FULLCountry_FE_FULL',
 'Satisfation with employer',
 'Size of City',
 'Country_Satisfation with employer',
 'Housing Situation_FE_FULLHousing Situation_FE_FULL',
 'Work Experience in Current Job [years]_Age',
 'Wears Glasses',
 'Size of City_Gender',
 'Crime Level in the City of Employement_Age',
 'Housing Situation_Gender',
 'Work Experience in Current Job [years]_Gender',
 'Crime Level in the City of Employement',
 'Crime Level in the City of Employement_Body Height [cm]',
 'Housing Situation_Body Height [

In [13]:
data.head()

Unnamed: 0,Instance,Year of Record,Housing Situation,Crime Level in the City of Employement,Work Experience in Current Job [years],Satisfation with employer,Gender,Age,Country,Size of City,...,Crime Level in the City of Employement_Body Height [cm],Crime Level in the City of Employement_Profession,Work Experience in Current Job [years]_FE_FULLWork Experience in Current Job [years]_FE_FULL,Work Experience in Current Job [years]_FE_FULL,Work Experience in Current Job [years]_Satisfation with employer,Work Experience in Current Job [years]_Gender,Work Experience in Current Job [years]_Age,Work Experience in Current Job [years]_University Degree,Work Experience in Current Job [years]_Body Height [cm],Work Experience in Current Job [years]_Profession
0,820617,2006.0,2,60,12.0,2,2,25,5.613706,770272,...,7.517658,7.098897,0.023664,8.260832,7.677081,8.217012,8.135961,7.897454,11.302215,8.260831
1,429171,1974.0,1,144,34.0,4,3,93,5.544517,686288,...,3.138196,6.119682,0.000415,13.872695,17.136728,12.514886,8.96764,13.309363,15.275991,13.872695
2,130519,1950.0,0,81,17.0,4,2,42,5.641801,49609,...,7.781809,6.993411,0.040724,5.576781,5.406021,5.755681,4.799894,6.136034,6.190228,5.577078
3,486004,1979.0,3,111,14.0,2,1,28,16.573602,859039,...,10.101427,6.507448,0.022589,8.799112,8.12076,8.5229,8.546618,8.273142,7.353583,8.799113
4,30112,1942.0,0,93,20.0,2,3,45,5.659443,41946,...,5.127805,6.931308,0.031696,6.100109,5.675211,6.114139,5.940158,6.93202,7.031269,6.100109


In [14]:
from sklearn.ensemble import RandomForestRegressor

param = {'num_trees':20000, 
         'max_depth': 21, 
         'objective':'regression', 
         "verbosity": -1,
         'metric': 'mae',
         'bagging_fraction': 0.8, 
         'learning_rate': 0.01,}
X_train,X_test  = data[features_col].iloc[:1048573],data[features_col].iloc[1048574:]
Y_train = data['Total Yearly Income [EUR]'].iloc[:1048573]
x_train,x_val,y_train,y_val = X_train.iloc[sp+1: ,:],  X_train.iloc[:sp,:],  \
                    Y_train.iloc[sp+1: ],  Y_train.iloc[:sp ]
train_data = lgb.Dataset(x_train, label=y_train, feature_name='auto')#categorical_feature=cat
val_data = lgb.Dataset(x_val, label=y_val, feature_name='auto')

bst = lgb.train(param, train_data, 20000, verbose_eval = 100, valid_sets=[val_data])



[100]	valid_0's l1: 31816.5
[200]	valid_0's l1: 18554.4
[300]	valid_0's l1: 13930.3
[400]	valid_0's l1: 12022.2
[500]	valid_0's l1: 11166.8
[600]	valid_0's l1: 10751.6
[700]	valid_0's l1: 10472.5
[800]	valid_0's l1: 10279.8
[900]	valid_0's l1: 10121.6
[1000]	valid_0's l1: 9988.73
[1100]	valid_0's l1: 9880.02
[1200]	valid_0's l1: 9784.73
[1300]	valid_0's l1: 9692.19
[1400]	valid_0's l1: 9599.73
[1500]	valid_0's l1: 9522.1
[1600]	valid_0's l1: 9454.18
[1700]	valid_0's l1: 9400.2
[1800]	valid_0's l1: 9355.37
[1900]	valid_0's l1: 9312.18
[2000]	valid_0's l1: 9273.14
[2100]	valid_0's l1: 9242.3
[2200]	valid_0's l1: 9211.32
[2300]	valid_0's l1: 9178.55
[2400]	valid_0's l1: 9150.71
[2500]	valid_0's l1: 9125.63
[2600]	valid_0's l1: 9100.49
[2700]	valid_0's l1: 9073.34
[2800]	valid_0's l1: 9055.25
[2900]	valid_0's l1: 9033.86
[3000]	valid_0's l1: 9018.09
[3100]	valid_0's l1: 9003.82
[3200]	valid_0's l1: 8985.03
[3300]	valid_0's l1: 8970.87
[3400]	valid_0's l1: 8948.77
[3500]	valid_0's l1: 8937.

In [15]:
from sklearn.metrics import mean_absolute_error
predict = bst.predict(x_val)
val_mae = mean_absolute_error(y_val,predict)
val_mae

8289.147295391711

In [16]:
#生成结果
#rfr.fit(X_train, Y_train)
predict = bst.predict(X_test)
result=pd.DataFrame([range(1,1+len(predict)), predict]).T
result.columns = ['Instance', 'Total Yearly Income [EUR]']
result.to_csv("sub191125_10.csv",index=False)
result.head()

Unnamed: 0,Instance,Total Yearly Income [EUR]
0,1.0,40184.478156
1,2.0,8360.89466
2,3.0,2813.397953
3,4.0,81933.348388
4,5.0,3604.240365
