# Poverty Prediction for Costa Rica
## Import Libraries and dataframe

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Keep the input path in a name so it is easy to repoint at another file.
train_csv = 'train.csv'
df = pd.read_csv(train_csv)

In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(9557, 143)

## Clean Data

In [4]:
# Preview the first rows to get a feel for the columns and spot obvious gaps.
df.head()

Unnamed: 0,Id,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,...,SQBescolari,SQBage,SQBhogar_total,SQBedjefe,SQBhogar_nin,SQBovercrowding,SQBdependency,SQBmeaned,agesq,Target
0,ID_279628684,190000.0,0,3,0,1,1,0,,0,...,100,1849,1,100,0,1.0,0.0,100.0,1849,4
1,ID_f29eb3ddd,135000.0,0,4,0,1,1,1,1.0,0,...,144,4489,1,144,0,1.0,64.0,144.0,4489,4
2,ID_68de51c94,,0,8,0,1,1,0,,0,...,121,8464,1,0,0,0.25,64.0,121.0,8464,4
3,ID_d671db89c,180000.0,0,5,0,1,1,1,1.0,0,...,81,289,16,121,4,1.777778,1.0,121.0,289,4
4,ID_d56d6f5f5,180000.0,0,5,0,1,1,1,1.0,0,...,121,1369,16,121,4,1.777778,1.0,121.0,1369,4


In [5]:
# Summarise the index range, column count, dtype mix and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9557 entries, 0 to 9556
Columns: 143 entries, Id to Target
dtypes: float64(8), int64(130), object(5)
memory usage: 10.4+ MB


### Observation
I can already see some NaN values. With a dataset this large it is hard to spot them all by eye, so the next step is to list exactly which columns contain missing values.

In [6]:
# Names of the columns that hold at least one missing value.
df.columns[df.isna().any()]

Index(['v2a1', 'v18q1', 'rez_esc', 'meaneduc', 'SQBmeaned'], dtype='object')

In [7]:
# Fill each missing cell with the next valid value further down the same column.
# DataFrame.bfill() replaces fillna(method='bfill'), whose `method` argument is
# deprecated in recent pandas releases; reassigning instead of inplace=True keeps
# the cell's effect explicit.
# NOTE(review): a back-fill borrows the value from whatever row happens to come
# next, which may describe an unrelated person/household -- consider a
# column-specific imputation instead.
df = df.bfill()

In [9]:
# Re-check which columns still contain missing values.
df.columns[df.isna().any()]

Index(['v18q1', 'rez_esc'], dtype='object')

In [10]:
# A back-fill cannot fill gaps that sit at the very end of a column, so carry the
# last valid value forward for whatever is still missing.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument is
# deprecated in recent pandas releases.
df = df.ffill()

In [11]:
# Final check: an empty Index here means no column has missing values left.
df.columns[df.isna().any()]

Index([], dtype='object')

### Identify conflicted categorical and numerical columns

In [13]:
# Numeric columns are picked by dtype; whatever is left over is non-numeric
# and needs a closer look before modelling.
num_cols = list(df.select_dtypes(include=np.number).columns)
cat_cols = set(df.columns) - set(num_cols)
print(cat_cols)

{'edjefa', 'idhogar', 'Id', 'edjefe', 'dependency'}


#### Education
edjefa and edjefe are the years of education for women and men respectively. Looking at the actual CSV data, there are a lot of numbers (e.g. 12, 9, 4, 6, etc.). There are also many 'no' values. This likely means they do not have a record of education, so I will encode 'no' as 0. Any 'yes' values are encoded as 1.

In [14]:
# Map the 'no'/'yes' markers to 0/1, then cast the whole column to float.
# The same treatment is applied to both head-of-household education columns.
for head_col in ('edjefa', 'edjefe'):
    df[head_col] = df[head_col].replace({'no': 0, 'yes': 1}).astype(float)

In [15]:
# Re-run the non-numeric column check to confirm the conversion worked.
num_cols = list(df.select_dtypes(include=np.number).columns)
cat_cols = set(df.columns) - set(num_cols)
print(cat_cols)

{'Id', 'idhogar', 'dependency'}


In [18]:
# It worked. Check the unique values of the two converted columns to make sure
# there aren't any weird outliers (edjefa first).
df['edjefa'].unique()

array([ 0., 11.,  4., 10.,  9., 15.,  7., 14., 13.,  8., 17.,  6.,  5.,
        3., 16., 19.,  1., 21., 12.,  2., 20., 18.])

In [17]:
# Distinct values present in edjefe.
df['edjefe'].unique()

array([10., 12.,  0., 11.,  9., 15.,  4.,  6.,  8., 17.,  7., 16., 14.,
        5., 21.,  2., 19.,  1.,  3., 18., 13., 20.])

In [20]:
# How often each edjefa value occurs, most frequent first.
df['edjefa'].value_counts()

0.0     6230
6.0      947
11.0     399
9.0      237
8.0      217
15.0     188
7.0      179
5.0      176
3.0      152
4.0      136
14.0     120
16.0     113
10.0      96
2.0       84
17.0      76
12.0      72
1.0       69
13.0      52
21.0       5
19.0       4
18.0       3
20.0       2
Name: edjefa, dtype: int64

In [21]:
# How often each edjefe value occurs, most frequent first.
df['edjefe'].value_counts()

0.0     3762
6.0     1845
11.0     751
9.0      486
3.0      307
15.0     285
8.0      257
7.0      234
5.0      222
14.0     208
17.0     202
2.0      194
4.0      137
16.0     134
1.0      123
12.0     113
10.0     111
13.0     103
21.0      43
18.0      19
19.0      14
20.0       7
Name: edjefe, dtype: int64

#### Dependency
Dependency represents the ratio of household members under 19 or over 64 to those between 19 and 64. It is a calculated rate equal to (number of household members younger than 19 or older than 64) / (number of household members between 19 and 64).

The column is inconsistent: it contains a lot of 'yes' and 'no' values, which are obviously not rates.

A related column is SQBdependency, which is the dependency rate squared. One way to replace the inconsistent values in this column is to take the square root of the SQBdependency column.

In [22]:
# Recover the numeric dependency rate from its squared counterpart, replacing
# the mixed text/number column with a purely numeric one.
squared_rate = df['SQBdependency']
df['dependency'] = np.sqrt(squared_rate)

In [23]:
# Total number of missing cells across the whole frame (per-column counts, summed).
df.isna().sum().sum()

0

In [24]:
# Re-run the non-numeric column check to confirm dependency is numeric now.
num_cols = list(df.select_dtypes(include=np.number).columns)
cat_cols = set(df.columns) - set(num_cols)
print(cat_cols)

{'Id', 'idhogar'}


#### Id and idhogar
These are both IDs. Id is a unique identifier for each row and idhogar is an identifier for the household; neither is helpful for fitting our data, so both are dropped.

In [28]:
# Keep the identifier columns aside in na_cols, then remove them from df.
# drop() expects column *labels*; the previous call handed it the whole
# sub-frame, which only worked because iterating a DataFrame yields its
# column names. Passing the labels explicitly states the intent.
na_cols = df[['Id', 'idhogar']]
df = df.drop(columns=na_cols.columns)

In [29]:
# Re-run the non-numeric column check; an empty set means every remaining
# column is numeric.
num_cols = list(df.select_dtypes(include=np.number).columns)
cat_cols = set(df.columns) - set(num_cols)
print(cat_cols)

set()


## Identifying Features

In [30]:
# NOTE: plain assignment binds a second name to the same DataFrame object
# (no copy is made), so columns later added through df are also visible
# through train_set.
train_set = df

#### Rent Per....
I am going to calculate the rent per adult and the rent per household member.

In [32]:
# Rent (v2a1) normalised by two different head-counts.
rent_denominators = {
    'rent_per_adult': 'hogar_adul',
    'rent_per_person': 'hogar_total',
}
for feature_name, count_col in rent_denominators.items():
    df[feature_name] = df['v2a1'] / df[count_col]

#### Overcrowding
I will calculate the average of overcrowding by rooms and overcrowding by bedrooms.  These are either 1 for yes or 0 for no. 

In [33]:
# Average of the two 0/1 overcrowding indicators (hacdor and hacapo).
# The result is stored under its own name: the dataset appears to already
# contain an `overcrowding` column (its square, SQBovercrowding, shows up in
# the head() preview), and assigning to that name would silently overwrite
# the original feature.
# NOTE(review): confirm against the data dictionary that `overcrowding`
# exists in train.csv.
df['overcrowding_flag_avg'] = (df['hacdor'] + df['hacapo']) / 2

#### Gender Under 12
I am going to look at the rates by gender for those under 12

In [39]:
# Share of each gender that is under 12: the denominator is that gender's
# own total (r4m3 for females, r4h3 for males).
# If a household has none of that gender the denominator is 0 and the
# result is NaN/inf.
df['percent_female_under12'] = df['r4m1'] / df['r4m3']
df['percent_male_under12'] = df['r4h1'] / df['r4h3']

# Under-12 counts relative to the whole household (r4t3).
# The female ratio previously divided by r4m3, which made it an exact
# duplicate of percent_female_under12 and inconsistent with the male ratio.
df['female_under12_to_total'] = df['r4m1'] / df['r4t3']
df['male_under12_to_total'] = df['r4h1'] / df['r4t3']
df['all_under12_to_total'] = df['r4t1'] / df['r4t3']

#### Dependency
Dependency is related to those living in the household under 19 and over 64. 

Let's look at the age groups and their share of the rest of the household...

In [44]:
# Pull the household head-count columns into short names once.
members = df['hogar_total']
children, elders, grown_ups = df['hogar_nin'], df['hogar_mayor'], df['hogar_adul']

# 'adults' and 'total_dependents' must exist before the rate that uses them.
df['adults'] = grown_ups - elders
df['total_dependents'] = children + elders
df['dependency rate'] = df['total_dependents'] / df['adults']

# Composition of the household, plus the children that are not under 12.
df['dep_child_percent'] = children / members
df['age_12_19'] = children - df['r4t1']
df['dep_elder_percent'] = elders / members
df['adults_percent'] = grown_ups / members

#### Bedrooms
Let's look at the number of bedrooms and how it relates to rent and to some other features we've already looked at.

In [40]:
# Rent and head-counts normalised by the number of bedrooms.
# Requires the 'adults' column created in the dependency-features cell.
df['rent_per_bedroom'] = df['v2a1'] / df['bedrooms']
df['adults_per_bedroom'] = df['adults'] / df['bedrooms']
df['children_per_bedroom'] = df['hogar_nin'] / df['bedrooms']
df['females_per_bedroom'] = df['r4m3'] / df['bedrooms']
# Males use r4h3 (male total), mirroring r4m3 for females; the previous
# r4t3 is the count of all persons, not of males.
df['males_per_bedroom'] = df['r4h3'] / df['bedrooms']
df['total_people_per_bedroom'] = df['hogar_total'] / df['bedrooms']

#### Technology
Next I will add the features related to technology and what is in the household per person

In [43]:
# Devices owned by the household, divided by the number of people in it (r4t3).
household_size = df['r4t3']
df['phones_per_person'] = df['qmobilephone'] / household_size
df['tablets_per_person'] = df['v18q1'] / household_size

#### Per Room....
Let's look at different factors per room, not just per bedroom.

In [45]:
# Rent, bedrooms and head-counts normalised by the total number of rooms.
# Requires the 'adults' column created in the dependency-features cell.
df['rent_per_room'] = df['v2a1'] / df['rooms']
df['bedrooms_per_room'] = df['bedrooms'] / df['rooms']
df['adults_per_room'] = df['adults'] / df['rooms']
df['child_per_room'] = df['hogar_nin'] / df['rooms']
df['elder_per_room'] = df['hogar_mayor'] / df['rooms']
df['females_per_room'] = df['r4m3'] / df['rooms']
# Males use r4h3 (male total), mirroring r4m3 for females; the previous
# r4t3 is the count of all persons, not of males.
df['males_per_room'] = df['r4h3'] / df['rooms']
df['total_people_per_room'] = df['hogar_total'] / df['rooms']

#### Schooling
Let's look at the levels of schooling by various factors.

In [47]:
# Years of schooling relative to the person's age.
df['school_years_to_age'] = df['escolari'] / df['age']

# Years behind in school (rez_esc) relative to several different denominators.
years_behind = df['rez_esc']
behind_ratio_denominators = (
    ('years_behind_to_total_years', 'escolari'),
    ('years_behind_under12', 'r4t1'),
    ('years_behind_over12', 'r4t2'),
    ('years_behind_house_total', 'r4t3'),
    ('years_behind_to_age', 'age'),
)
for ratio_name, denom_col in behind_ratio_denominators:
    df[ratio_name] = years_behind / df[denom_col]

In [53]:
!pip install lightgbm



In [54]:
import lightgbm as lgb

OSError: dlopen(/Users/hillaryweaver/anaconda3/lib/python3.10/site-packages/lightgbm/lib_lightgbm.so, 0x0006): Library not loaded: '/usr/local/opt/libomp/lib/libomp.dylib'
  Referenced from: '/Users/hillaryweaver/anaconda3/lib/python3.10/site-packages/lightgbm/lib_lightgbm.so'
  Reason: tried: '/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/usr/local/lib/libomp.dylib' (no such file), '/usr/lib/libomp.dylib' (no such file)