In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor

In [2]:
data = pd.read_csv("data/final_data.csv")

In [3]:
X = data.drop(columns="INCOME")
y = data["INCOME"]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Linear Regression

In [5]:
LR = linear_model.LinearRegression().fit(X_train, y_train)
LR.score(X_test, y_test)

-2.7751720305407978e+17

### Ridge Regression

In [6]:
RR = linear_model.Ridge(alpha=85, random_state=42).fit(X_train, y_train)
RR.score(X_test, y_test)

0.15757294603748428

### Lasso

In [7]:
LAS = linear_model.Lasso(alpha=85, random_state=42).fit(X_train, y_train)
LAS.score(X_test, y_test)

0.15091859140242336

### Random Forest

In [8]:
RF = RandomForestRegressor(max_depth=6, random_state=42).fit(X_train, y_train)
RF.score(X_test, y_test)

0.14987449437771827

# Without FE for Major

In [9]:
data2 = pd.read_csv("data/no_major_final.csv")
data2.head()

Unnamed: 0,SIBS,DEGREE_BACHELOR,DEGREE_GRADUATE,DEGREE_HIGH SCHOOL,DEGREE_JUNIOR COLLEGE,PADEG_BACHELOR,PADEG_GRADUATE,PADEG_HIGH SCHOOL,PADEG_JUNIOR COLLEGE,PADEG_LT HIGH SCHOOL,...,GRANBORN_4.0,GRANBORN_ALL IN U.S,POLVIEWS_CONSERVATIVE,POLVIEWS_EXTREMELY LIBERAL,POLVIEWS_EXTRMLY CONSERVATIVE,POLVIEWS_LIBERAL,POLVIEWS_MODERATE,POLVIEWS_SLGHTLY CONSERVATIVE,POLVIEWS_SLIGHTLY LIBERAL,INCOME
0,0.0,1,0,0,0,0,0,0,1,0,...,0,1,0,0,0,1,0,0,0,30000
1,2.0,1,0,0,0,0,0,1,0,0,...,0,1,0,0,0,1,0,0,0,6000
2,3.0,0,0,0,1,0,0,1,0,0,...,1,0,0,0,0,1,0,0,0,90000
3,4.0,1,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,60000
4,2.0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,30000


In [10]:
X2 = data2.drop(columns="INCOME")
y2 = data2["INCOME"]

In [11]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42)

### Linear Regression

In [12]:
LR2 = linear_model.LinearRegression().fit(X2_train, y2_train)
LR2.score(X2_test, y2_test)

-5.515405562852804e+18

### Ridge Regression

In [13]:
RR2 = linear_model.Ridge(alpha=90, random_state=42).fit(X2_train, y2_train)
RR2.score(X2_test, y2_test)

0.11425950601755575

### Lasso

In [14]:
LAS2 = linear_model.Lasso(alpha=500, random_state=42).fit(X2_train, y2_train)
LAS2.score(X2_test, y2_test)

0.1114772292514451

### Random Forest

In [15]:
RF2 = RandomForestRegressor(max_depth=2, random_state=42).fit(X2_train, y2_train)
RF2.score(X2_test, y2_test)

0.07222040405741348

# TF/Keras

In [16]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from tensorflow.keras.utils import plot_model
from sklearn import metrics
import matplotlib.pyplot as plt

In [17]:
modelff = Sequential()
modelff.add(Dense(units=512, activation='linear', input_dim=128))
modelff.add(Dropout(0.5))
modelff.add(Dense(units=256, activation='relu'))
modelff.add(Dropout(0.5))
modelff.add(Dense(units=128, activation='linear'))
modelff.add(Dropout(0.5))
modelff.add(Dense(1, activation='linear'))
modelff.compile(loss='mse', optimizer="adam", metrics=['accuracy'])

In [18]:
modelff.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               66048     
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               131328    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               32896     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 1

In [19]:
num_epochs = 5000
history = modelff.fit(X_train, y_train.values)



In [20]:
modelff.predict(X_test)

array([[ 45606.914],
       [ 40525.324],
       [ 47346.766],
       [ 40354.08 ],
       [ 49915.914],
       [ 45383.2  ],
       [ 36169.33 ],
       [ 58126.45 ],
       [ 42909.777],
       [ 37805.023],
       [ 45742.96 ],
       [ 50937.023],
       [ 44008.645],
       [ 42773.562],
       [ 37381.625],
       [ 39109.41 ],
       [ 47952.95 ],
       [ 44945.46 ],
       [ 33971.867],
       [ 39303.91 ],
       [ 50808.47 ],
       [ 39022.312],
       [ 47015.36 ],
       [ 41763.676],
       [ 57367.242],
       [ 37160.715],
       [ 49093.77 ],
       [ 41257.574],
       [ 41002.562],
       [ 47064.438],
       [ 53924.9  ],
       [ 51073.36 ],
       [ 43055.285],
       [ 39875.758],
       [ 47995.703],
       [ 43244.1  ],
       [ 65481.277],
       [ 39262.18 ],
       [ 46370.035],
       [ 43659.54 ],
       [ 50894.617],
       [ 47964.65 ],
       [ 34383.945],
       [ 46647.633],
       [ 48806.65 ],
       [ 41714.984],
       [ 44238.324],
       [ 4211

## XGBoost

In [9]:
import xgboost as xgb 
import graphviz

In [10]:
XG = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bynode = 0.5,colsample_bylevel=0.5, learning_rate = 0.05,
                max_depth = 5, alpha = 10, n_estimators = 100, gamma=0.5)

XG.fit(X_train, y_train) 

XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=0.5,
             colsample_bynode=0.5, colsample_bytree=1, gamma=0.5, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.05, max_delta_step=0, max_depth=5,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
             reg_alpha=10, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [11]:
XG.score(X_test, y_test)

0.15566510863040006

### Hyperparameter tuning

In [None]:
xgb1 = xgb.XGBRegressor()
parameters = {'objective':['reg:squarederror'],
              'learning_rate': [0.03, 0.05, 0.1], 
              'max_depth': [5, 6, 7],
              'min_child_weight': [1, 2, 4],
              'subsample': [0.7, 0.9, 1],
               'gamma':[0, 0.1, 0.5],
              'colsample_bylevel': [0.5, 0.7, 1],
              'colsample_bynode': [0.5, 0.7, 1],
              'colsample_bytree': [0.5, 0.7, 1],
              'n_estimators': [100, 200]}

xgb_grid = GridSearchCV(xgb1,
                        parameters,
                        cv = 2,
                        n_jobs = -1,
                        verbose=True)

xgb_grid.fit(X_train,
         y_train)

print(xgb_grid.best_score_)
print(xgb_grid.best_params_)

In [13]:
best_params={'colsample_bylevel': 0.5, 'colsample_bynode': 0.5, 'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 5, 'min_child_weight': 4, 'n_estimators': 100, 'objective': 'reg:squarederror', 'subsample': 0.9}

In [14]:
XG = xgb.XGBRegressor(**best_params)
XG.fit(X_train, y_train) 
XG.score(X_test, y_test)

0.16381963132089417

EXPORTING THE MODEL

In [15]:
with open('model_pickle', 'wb') as f:
    pickle.dump(XG, f)