In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from collections import Counter
import lightgbm as lgb
import seaborn as sns  

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
train = pd.read_csv("/Users/BarryFitzpatrick/Machine Learning/Kaggle Group/tcd-ml-comp-201920-income-pred-group/tcd-ml-1920-group-income-train.csv")
test = pd.read_csv("/Users/BarryFitzpatrick/Machine Learning/Kaggle Group/tcd-ml-comp-201920-income-pred-group/tcd-ml-1920-group-income-test.csv")

train = train.sample(frac = 1)
train.shape

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


(1048574, 17)

In [3]:
train_missing = (train.isnull().sum()/len(train))*100
train_missing = train_missing.drop(train_missing[train_missing==0].index).sort_values(ascending=False)
miss_data = pd.DataFrame({'缺失百分比':train_missing})
miss_data

Unnamed: 0,缺失百分比
University Degree,7.68663
Gender,7.069315
Hair Color,6.695856
Satisfation with employer,3.632266
Year of Record,0.38271
Profession,0.272084


In [4]:
train.head()

Unnamed: 0,Instance,Year of Record,Housing Situation,Crime Level in the City of Employement,Work Experience in Current Job [years],Satisfation with employer,Gender,Age,Country,Size of City,Profession,University Degree,Wears Glasses,Hair Color,Body Height [cm],Yearly Income in addition to Salary (e.g. Rental Income),Total Yearly Income [EUR]
295732,295733,1963.0,Castle,159,16,Somewhat Happy,other,39,Honduras,1953901,personal care and service worker,Bachelor,1,Red,160,0 EUR,6752.25
884146,827282,2007.0,Large Apartment,13,10,Somewhat Happy,female,22,Austria,2446593,liaison,Bachelor,0,Brown,160,0 EUR,119814.66
63516,63517,1945.0,0,109,13,Average,male,27,Congo,2770131,program assistant,No,1,Black,181,0 EUR,2200.84
311326,311327,1965.0,Castle,81,16,Average,male,33,El Salvador,475374,risk management,Master,1,Blond,197,0 EUR,6062.7
618417,561573,1985.0,Medium House,16,18,Happy,other,44,Kuwait,28331,naval architect,No,1,Black,198,0 EUR,37415.91


In [5]:
data = pd.concat([train,test],ignore_index=True)

data['University Degree']=data['University Degree'].fillna('Bachelor')

data['Gender']=data['Gender'].replace('m','male')
data['Gender']=data['Gender'].replace('f','female')
data['Gender']=data['Gender'].replace('unknown','other')
data['Gender']=data['Gender'].fillna('female')

#data['Housing Situation']=data['Housing Situation'].replace('nA','0')
data['Housing Situation']=np.where(data['Housing Situation']=='0', 'nA', data['Housing Situation'])
data['Housing Situation']=np.where(data['Housing Situation']==0, 'nA', data['Housing Situation'])
data['Hair Color']=data['Hair Color'].fillna(method='bfill')

data['Satisfation with employer']=data['Satisfation with employer'].fillna('Average')

data.fillna(value={'Year of Record':data['Year of Record'].mean()}, inplace=True)

data['Profession']=data['Profession'].fillna(method='bfill')

data['Country']=data['Country'].fillna(method='bfill')

data.shape

(1418012, 17)

In [6]:
#构造等级特征
data['Satisfation with employer'] = data['Satisfation with employer'].map \
     ({'Average':2, 'Happy':4, 'Somewhat Happy':3, 'Unhappy':1})

In [None]:
data.isnull().any()

In [7]:
#对于每个country和profession特征，用其特征值下收入均值来替换
country_income = dict(train.groupby('Country').mean()['Total Yearly Income [EUR]']/10000)
data.Country = data.Country.map(country_income)
data.Country = data.Country.fillna(data.Country.mean())
country_income = dict(train.groupby('Profession').mean()['Total Yearly Income [EUR]']/10000)
data.Profession = data.Profession.map(country_income)
country_income = dict(train.groupby('Profession').mean()['Total Yearly Income [EUR]']/10000)
data.Profession = data.Profession.map(country_income)
#前面的254287数据用来构造均值特征
sp = 254287

In [8]:
#转换成数值
data.iloc[:,-2] = data.iloc[:,-2].map(lambda x: float(x[:-3]))

In [9]:
data['BigCity'] = np.where(data['Size of City']>7335190, 1, 0)
data['SmallCity'] = np.where(data['Size of City']<7335190, 1, 0)
#data = data.drop(columns=["Size of City"])

In [None]:
#data = data.drop(columns=["Wears Glasses"])

In [None]:
#data = data.drop(columns=["Hair Color"])

In [None]:
data

In [None]:
sns.boxplot(x=data['Wears Glasses'], y=data["Total Yearly Income [EUR]"], data=data, palette="Set1")

In [10]:
data['Crime Level in the City of Employement']=data['Crime Level in the City of Employement'].replace(0,data['Crime Level in the City of Employement'].mean())


In [None]:
sns.boxplot(x=data['BigCity'], y=data["Total Yearly Income [EUR]"], data=data, palette="Set1")


In [None]:
# Remove outliers in Size of City
#indexBigCityOutliers = data[ (data["BigCity"] == 1)  & (data["Total Yearly Income [EUR]"] > 1500000) ].index
#indexBigCityOutliers

In [None]:
#data = data.drop(indexBigCityOutliers)

In [None]:
data['Housing Situation'].value_counts()

In [None]:
# Have changed 0's and '0' to nA
sns.boxplot(x=data['Housing Situation'], y=data["Total Yearly Income [EUR]"], data=data, palette="Set1")


In [None]:
sns.boxplot(x=data['Hair Color'], y=data["Total Yearly Income [EUR]"], data=data, palette="Set1")


In [None]:
sns.boxplot(x=data['Work Experience in Current Job [years]'], y=data["Total Yearly Income [EUR]"], data=data, palette="Set1")


In [None]:
sns.boxplot(x=data['University Degree'], y=data["Total Yearly Income [EUR]"], data=data, palette="Set1")


In [None]:
sns.boxplot(x=data['Body Height [cm]'], y=data["Total Yearly Income [EUR]"], data=data, palette="Set1")



In [None]:
sns.boxplot(x=data['Crime Level in the City of Employement'], y=data["Total Yearly Income [EUR]"], data=data, palette="Set1")



In [None]:
sns.boxplot(x = data['Crime Level in the City of Employement']==0,y=data["Total Yearly Income [EUR]"])

In [None]:
sns.boxplot(x=data['Gender'], y=data["Total Yearly Income [EUR]"], data=data, palette="Set1")


In [None]:
data.head()

In [11]:
cats = ['Year of Record', 'Housing Situation','Country', 'Size of City',
        'Crime Level in the City of Employement','Work Experience in Current Job [years]']
cons = ['Satisfation with employer','Gender','Age',
       'University Degree','Body Height [cm]','Profession']
data['Work Experience in Current Job [years]'] = data['Work Experience in Current Job [years]'].astype(str)

In [12]:
#This is the inspiration I got from the best code. 
#I added and constructed mean features, cross mean features and Category Characteristics

def create_feature(df,cats,cons,normalize=True):
    for cat in cats:
        value = df[cat].value_counts(dropna=False, normalize=normalize).to_dict()
        num = cat + '_FE_FULL'
        df[num + num] = df[cat].map(value)
        #构造均值特征mean features
        df[num] = df[cat].map( dict(df.iloc[:sp].groupby(cat).mean()['Total Yearly Income [EUR]']/10000))
        df[num] = df[num].fillna(df[num].mean())
        df[num] = df[num].astype('float32')
        for con in cons:
            new_col = cat +'_'+ con
            df[new_col] = df[cat].astype(str)+'_'+df[con].astype(str)
            temp_df = df[new_col]
            fq_encode = temp_df.value_counts(normalize=True).to_dict()
            #构造交叉均值特征cross mean features
            df[new_col] = df[new_col].map( dict(df.iloc[:sp+1].groupby( \
                               new_col).mean()['Total Yearly Income [EUR]']/10000))
            df[new_col] = df[new_col].fillna(df[new_col].mean())
    return df

data = create_feature(data,cats,cons)
data['Work Experience in Current Job [years]'] = data['Work Experience in Current Job [years]' \
                                                     ].replace('#NUM!', data.iloc[:, -1].mean()).astype(float)

#构造类别特征 Category Characteristics
for col in data.dtypes[data.dtypes == 'object'].index.tolist():
    feat_le = LabelEncoder()
    feat_le.fit(data[col].unique().astype(str))
    data[col] = feat_le.transform(data[col].astype(str))

del_col = set(['Total Yearly Income [EUR]','Instance'])
features_col =  list(set(data) - del_col)
features_col

['Country_Satisfation with employer',
 'Work Experience in Current Job [years]_Body Height [cm]',
 'Yearly Income in addition to Salary (e.g. Rental Income)',
 'Year of Record_Profession',
 'Housing Situation_Body Height [cm]',
 'Housing Situation',
 'Country_Gender',
 'Country_Profession',
 'Country_FE_FULL',
 'Size of City_Profession',
 'Housing Situation_Satisfation with employer',
 'Size of City_FE_FULLSize of City_FE_FULL',
 'Size of City',
 'Housing Situation_FE_FULLHousing Situation_FE_FULL',
 'Work Experience in Current Job [years]_Satisfation with employer',
 'Crime Level in the City of Employement_Satisfation with employer',
 'Housing Situation_Profession',
 'Year of Record_Age',
 'Year of Record_FE_FULLYear of Record_FE_FULL',
 'Size of City_Body Height [cm]',
 'Size of City_FE_FULL',
 'Size of City_Gender',
 'Body Height [cm]',
 'Year of Record_FE_FULL',
 'BigCity',
 'Satisfation with employer',
 'Crime Level in the City of Employement_Body Height [cm]',
 'Year of Record_Bo

In [None]:
data.shape

In [13]:
from sklearn.ensemble import RandomForestRegressor

param = {'num_iterations':20000, 
         'max_depth': 20, 
         'feature_fraction':0.85,
         'objective':'regression', 
         "verbosity": -1,
         'metric': 'mae',
         'bagging_fraction': 0.8, 
         'learning_rate': 0.005,}
X_train,X_test  = data[features_col].iloc[:1048573],data[features_col].iloc[1048574:]
Y_train = data['Total Yearly Income [EUR]'].iloc[:1048573]
x_train,x_val,y_train,y_val = X_train.iloc[sp+1: ,:],  X_train.iloc[:sp,:],  \
                    Y_train.iloc[sp+1: ],  Y_train.iloc[:sp ]
train_data = lgb.Dataset(x_train, label=y_train, feature_name='auto')#categorical_feature=cat
val_data = lgb.Dataset(x_val, label=y_val, feature_name='auto')

bst = lgb.train(param, train_data, 20000, verbose_eval = 100, valid_sets=[val_data])



[100]	valid_0's l1: 47156.8
[200]	valid_0's l1: 32138.2
[300]	valid_0's l1: 23658.5
[400]	valid_0's l1: 18782.2
[500]	valid_0's l1: 15851.5
[600]	valid_0's l1: 14123.6
[700]	valid_0's l1: 13019.4
[800]	valid_0's l1: 12263.8
[900]	valid_0's l1: 11734.5
[1000]	valid_0's l1: 11376.6
[1100]	valid_0's l1: 11138.9
[1200]	valid_0's l1: 10944
[1300]	valid_0's l1: 10788.5
[1400]	valid_0's l1: 10664.1
[1500]	valid_0's l1: 10556.7
[1600]	valid_0's l1: 10463.4
[1700]	valid_0's l1: 10372.5
[1800]	valid_0's l1: 10296.1
[1900]	valid_0's l1: 10224.1
[2000]	valid_0's l1: 10158.4
[2100]	valid_0's l1: 10100.2
[2200]	valid_0's l1: 10045.4
[2300]	valid_0's l1: 10000
[2400]	valid_0's l1: 9952.5
[2500]	valid_0's l1: 9909.16
[2600]	valid_0's l1: 9862.52
[2700]	valid_0's l1: 9825.64
[2800]	valid_0's l1: 9789.11
[2900]	valid_0's l1: 9757.76
[3000]	valid_0's l1: 9727.14
[3100]	valid_0's l1: 9701.36
[3200]	valid_0's l1: 9672.18
[3300]	valid_0's l1: 9645.74
[3400]	valid_0's l1: 9617.69
[3500]	valid_0's l1: 9591.57

In [None]:
from sklearn.metrics import mean_absolute_error
predict = bst.predict(x_val)
val_mae = mean_absolute_error(y_val,predict)
val_mae

In [None]:
#生成结果
#rfr.fit(X_train, Y_train)
predict = bst.predict(X_test)
result=pd.DataFrame([range(1,1+len(predict)), predict]).T
result.columns = ['Instance', 'Total Yearly Income [EUR]']
result.to_csv("lgb7.csv",index=False)
result.head()